mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
fix: apply unstructured preprocess functions (#9473)
### Summary Fixes a bug from #7850 where post processing functions in Unstructured loaders were not applied. Adds an assertion to the test to verify the post processing function was applied and also updates the explanation in the example notebook.
This commit is contained in:
parent
292ae8468e
commit
83d2a871eb
@ -299,7 +299,7 @@
|
||||
"id": "1cf27fc8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you need to post process the `unstructured` elements after extraction, you can pass in a list of `Element` -> `Element` functions to the `post_processors` kwarg when you instantiate the `UnstructuredFileLoader`. This applies to other Unstructured loaders as well. Below is an example. Post processors are only applied if you run the loader in `\"elements\"` mode."
|
||||
"If you need to post process the `unstructured` elements after extraction, you can pass in a list of `str` -> `str` functions to the `post_processors` kwarg when you instantiate the `UnstructuredFileLoader`. This applies to other Unstructured loaders as well. Below is an example."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -495,7 +495,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.13"
|
||||
"version": "3.8.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -74,7 +74,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||
|
||||
def _post_process_elements(self, elements: list) -> list:
|
||||
"""Applies post processing functions to extracted unstructured elements.
|
||||
Post processing functions are Element -> Element callables are passed
|
||||
Post processing functions are str -> str callables are passed
|
||||
in using the post_processors kwarg when the loader is instantiated."""
|
||||
for element in elements:
|
||||
for post_processor in self.post_processors:
|
||||
@ -84,6 +84,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||
def load(self) -> List[Document]:
|
||||
"""Load file."""
|
||||
elements = self._get_elements()
|
||||
self._post_process_elements(elements)
|
||||
if self.mode == "elements":
|
||||
docs: List[Document] = list()
|
||||
for element in elements:
|
||||
|
@ -12,18 +12,20 @@ EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/")
|
||||
|
||||
|
||||
def test_unstructured_loader_with_post_processor() -> None:
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
def add_the_end(text: str) -> str:
|
||||
return text + "THE END!"
|
||||
|
||||
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
|
||||
loader = UnstructuredFileLoader(
|
||||
file_path=file_path,
|
||||
pos_processors=[clean_extra_whitespace],
|
||||
post_processors=[add_the_end],
|
||||
strategy="fast",
|
||||
mode="elements",
|
||||
)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) > 1
|
||||
assert docs[0].page_content.endswith("THE END!")
|
||||
|
||||
|
||||
def test_unstructured_api_file_loader() -> None:
|
||||
|
Loading…
Reference in New Issue
Block a user