mirror of
https://github.com/hwchase17/langchain
synced 2024-11-02 09:40:22 +00:00
dc71fcfabf
<!-- Thank you for contributing to LangChain! Please title your PR "<package>: <description>", where <package> is whichever of langchain, community, core, experimental, etc. is being modified. Replace this entire comment with: - **Description:** a description of the change, - **Issue:** the issue # it fixes if applicable, - **Dependencies:** any dependencies required for this change, - **Twitter handle:** we announce bigger features on Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out! Please make sure your PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` from the root of the package you've modified to check this locally. See contribution guidelines for more information on how to write/run tests, lint, etc: https://python.langchain.com/docs/contributing/ If you're adding a new integration, please include: 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17. -->
46 lines
1.8 KiB
Python
46 lines
1.8 KiB
Python
from typing import List
|
|
|
|
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
|
|
|
|
|
class UnstructuredMarkdownLoader(UnstructuredFileLoader):
    """Load `Markdown` files using `Unstructured`.

    You can run the loader in one of two modes: "single" and "elements".
    If you use "single" mode, the document will be returned as a single
    langchain Document object. If you use "elements" mode, the unstructured
    library will split the document into elements such as Title and NarrativeText.
    You can pass in additional unstructured kwargs after mode to apply
    different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredMarkdownLoader

    loader = UnstructuredMarkdownLoader(
        "example.md", mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/core/partition.html#partition-md
    """

    def _get_elements(self) -> List:
        """Partition the Markdown file at ``self.file_path`` into elements.

        Returns:
            The result of ``unstructured.partition.md.partition_md`` called with
            ``filename=self.file_path`` and ``self.unstructured_kwargs``.

        Raises:
            ValueError: If the installed ``unstructured`` version is older than
                0.4.16.
        """
        import re

        from unstructured.__version__ import __version__ as __unstructured_version__
        from unstructured.partition.md import partition_md

        # NOTE(MthwRobinson) - enables the loader to work when you're using pre-release
        # versions of unstructured like 0.4.17-dev1
        base_version = __unstructured_version__.split("-")[0]

        # Keep only the leading numeric release components so that suffixes glued
        # onto a component ("0.11.0rc1") or extra non-numeric components
        # ("0.4.17.dev1") do not make int() blow up with an unrelated ValueError.
        release = []
        for component in base_version.split("."):
            leading_digits = re.match(r"[0-9]+", component)
            if leading_digits is None:
                break
            release.append(int(leading_digits.group()))
            if leading_digits.end() != len(component):
                # A suffix follows the digits; nothing after it belongs to the
                # numeric release segment.
                break
        unstructured_version = tuple(release)

        if unstructured_version < (0, 4, 16):
            raise ValueError(
                f"You are on unstructured version {__unstructured_version__}. "
                "Partitioning markdown files is only supported in unstructured>=0.4.16."
            )

        return partition_md(filename=self.file_path, **self.unstructured_kwargs)
|