fix: apply unstructured post-processing functions (#9473)

### Summary

Fixes a bug from #7850 where post processing functions in Unstructured
loaders were not applied. Adds an assertion to the test to verify the post
processing function was applied and also updates the explanation in the
example notebook.
This commit is contained in:
Matt Robinson 2023-08-18 21:54:28 -04:00 committed by GitHub
parent 292ae8468e
commit 83d2a871eb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 8 additions and 5 deletions

View File

@ -299,7 +299,7 @@
"id": "1cf27fc8",
"metadata": {},
"source": [
"If you need to post process the `unstructured` elements after extraction, you can pass in a list of `Element` -> `Element` functions to the `post_processors` kwarg when you instantiate the `UnstructuredFileLoader`. This applies to other Unstructured loaders as well. Below is an example. Post processors are only applied if you run the loader in `\"elements\"` mode."
"If you need to post process the `unstructured` elements after extraction, you can pass in a list of `str` -> `str` functions to the `post_processors` kwarg when you instantiate the `UnstructuredFileLoader`. This applies to other Unstructured loaders as well. Below is an example."
]
},
{
@ -495,7 +495,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
"version": "3.8.10"
}
},
"nbformat": 4,

View File

@ -74,7 +74,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
def _post_process_elements(self, elements: list) -> list:
"""Applies post processing functions to extracted unstructured elements.
Post processing functions are Element -> Element callables are passed
Post processing functions are str -> str callables are passed
in using the post_processors kwarg when the loader is instantiated."""
for element in elements:
for post_processor in self.post_processors:
@ -84,6 +84,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
def load(self) -> List[Document]:
"""Load file."""
elements = self._get_elements()
self._post_process_elements(elements)
if self.mode == "elements":
docs: List[Document] = list()
for element in elements:

View File

@ -12,18 +12,20 @@ EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/")
def test_unstructured_loader_with_post_processor() -> None:
from unstructured.cleaners.core import clean_extra_whitespace
def add_the_end(text: str) -> str:
return text + "THE END!"
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
loader = UnstructuredFileLoader(
file_path=file_path,
pos_processors=[clean_extra_whitespace],
post_processors=[add_the_end],
strategy="fast",
mode="elements",
)
docs = loader.load()
assert len(docs) > 1
assert docs[0].page_content.endswith("THE END!")
def test_unstructured_api_file_loader() -> None: