feat: optional post-processing for Unstructured loaders (#7850)

### Summary

Adds a post-processing method for Unstructured loaders that allows users
to optionally modify or clean extracted elements.

### Testing

```python
from langchain.document_loaders import UnstructuredFileLoader
from unstructured.cleaners.core import clean_extra_whitespace

loader = UnstructuredFileLoader(
    "./example_data/layout-parser-paper.pdf",
    mode="elements",
    post_processors=[clean_extra_whitespace],
)

docs = loader.load()
docs[:5]
```


### Reviewers
  - @rlancemartin
  - @eyurtsev
  - @hwchase17
pull/7329/head
Matt Robinson 1 year ago committed by GitHub
parent 2a315dbee9
commit 3c489be773
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -295,6 +295,74 @@
"docs[:5]" "docs[:5]"
] ]
}, },
{
"cell_type": "markdown",
"id": "1cf27fc8",
"metadata": {},
"source": [
"If you need to post process the `unstructured` elements after extraction, you can pass in a list of `Element` -> `Element` functions to the `post_processors` kwarg when you instantiate the `UnstructuredFileLoader`. This applies to other Unstructured loaders as well. Below is an example. Post processors are only applied if you run the loader in `\"elements\"` mode."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "112e5538",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import UnstructuredFileLoader\n",
"from unstructured.cleaners.core import clean_extra_whitespace"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b9c5ac8d",
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredFileLoader(\n",
" \"./example_data/layout-parser-paper.pdf\",\n",
" mode=\"elements\",\n",
" post_processors=[clean_extra_whitespace],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c44d5def",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b6f27929",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((157.62199999999999, 114.23496279999995), (157.62199999999999, 146.5141628), (457.7358962799999, 146.5141628), (457.7358962799999, 114.23496279999995)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'Title'}),\n",
" Document(page_content='Zejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain Lee4, Jacob Carlson3, and Weining Li5', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((134.809, 168.64029940800003), (134.809, 192.2517444), (480.5464199080001, 192.2517444), (480.5464199080001, 168.64029940800003)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n",
" Document(page_content='1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((207.23000000000002, 202.57205439999996), (207.23000000000002, 311.8195408), (408.12676, 311.8195408), (408.12676, 202.57205439999996)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n",
" Document(page_content='1 2 0 2', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n",
" Document(page_content='n u J', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 258.36), (16.34, 286.14), (36.34, 286.14), (36.34, 258.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'Title'})]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[:5]"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "b066cb5a", "id": "b066cb5a",

@ -1,7 +1,7 @@
"""Loader that uses unstructured to load files.""" """Loader that uses unstructured to load files."""
import collections import collections
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import IO, Any, Dict, List, Sequence, Union from typing import IO, Any, Callable, Dict, List, Sequence, Union
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
@ -36,7 +36,12 @@ def validate_unstructured_version(min_unstructured_version: str) -> None:
class UnstructuredBaseLoader(BaseLoader, ABC): class UnstructuredBaseLoader(BaseLoader, ABC):
"""Loader that uses unstructured to load files.""" """Loader that uses unstructured to load files."""
def __init__(self, mode: str = "single", **unstructured_kwargs: Any): def __init__(
self,
mode: str = "single",
post_processors: List[Callable] = [],
**unstructured_kwargs: Any,
):
"""Initialize with file path.""" """Initialize with file path."""
try: try:
import unstructured # noqa:F401 import unstructured # noqa:F401
@ -57,6 +62,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
unstructured_kwargs.pop("strategy") unstructured_kwargs.pop("strategy")
self.unstructured_kwargs = unstructured_kwargs self.unstructured_kwargs = unstructured_kwargs
self.post_processors = post_processors
@abstractmethod @abstractmethod
def _get_elements(self) -> List: def _get_elements(self) -> List:
@ -66,6 +72,15 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
def _get_metadata(self) -> dict: def _get_metadata(self) -> dict:
"""Get metadata.""" """Get metadata."""
def _post_process_elements(self, elements: list) -> list:
"""Applies post processing functions to extracted unstructured elements.
Post processing functions are Element -> Element callables are passed
in using the post_processors kwarg when the loader is instantiated."""
for element in elements:
for post_processor in self.post_processors:
element.apply(post_processor)
return elements
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load file.""" """Load file."""
elements = self._get_elements() elements = self._get_elements()

@ -2,14 +2,30 @@ import os
from contextlib import ExitStack from contextlib import ExitStack
from pathlib import Path from pathlib import Path
from unstructured.cleaners.core import clean_extra_whitespace
from langchain.document_loaders import ( from langchain.document_loaders import (
UnstructuredAPIFileIOLoader, UnstructuredAPIFileIOLoader,
UnstructuredAPIFileLoader, UnstructuredAPIFileLoader,
UnstructuredFileLoader,
) )
EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/") EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/")
def test_unstructured_loader_with_post_processor() -> None:
    """Test loading a PDF in "elements" mode with a post processor configured."""
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredFileLoader(
        file_path=file_path,
        # The keyword must be spelled exactly ``post_processors`` to match the
        # loader's parameter; any other spelling would not register the cleaner.
        post_processors=[clean_extra_whitespace],
        strategy="fast",
        mode="elements",
    )
    docs = loader.load()

    # The example PDF is expected to yield more than one element document.
    assert len(docs) > 1
def test_unstructured_api_file_loader() -> None: def test_unstructured_api_file_loader() -> None:
"""Test unstructured loader.""" """Test unstructured loader."""
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf") file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

Loading…
Cancel
Save