2023-12-11 21:53:30 +00:00
|
|
|
from typing import List
|
|
|
|
|
|
|
|
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
|
|
|
|
|
|
|
|
|
|
|
class UnstructuredMarkdownLoader(UnstructuredFileLoader):
    """Load `Markdown` files using `Unstructured`.

    The loader runs in one of two modes: "single" or "elements".
    In "single" mode the document is returned as a single langchain
    Document object. In "elements" mode the unstructured library splits
    the document into elements such as Title and NarrativeText.
    Additional unstructured kwargs may be passed after mode to apply
    different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredMarkdownLoader

    loader = UnstructuredMarkdownLoader(
        "example.md", mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/core/partition.html#partition-md
    """

    def _get_elements(self) -> List:
        """Partition the markdown file at ``self.file_path`` into elements.

        Returns:
            Whatever ``unstructured.partition.md.partition_md`` returns when
            called with ``filename=self.file_path`` and the stored
            ``self.unstructured_kwargs``.

        Raises:
            ValueError: If the installed ``unstructured`` version is older
                than 0.4.16.
        """
        # Imported lazily so `unstructured` is only required when the loader
        # is actually used.
        from unstructured.__version__ import __version__ as __unstructured_version__
        from unstructured.partition.md import partition_md

        # NOTE(MthwRobinson) - pre-release versions of unstructured look like
        # 0.4.17-dev1; keep only the part before the first "-" so the numeric
        # comparison below works for them too.
        release_part, _, _ = __unstructured_version__.partition("-")
        installed_version = tuple(int(piece) for piece in release_part.split("."))
        minimum_version = (0, 4, 16)

        if installed_version >= minimum_version:
            return partition_md(filename=self.file_path, **self.unstructured_kwargs)

        raise ValueError(
            f"You are on unstructured version {__unstructured_version__}. "
            "Partitioning markdown files is only supported in unstructured>=0.4.16."
        )
|