From 0998577dfe663d2ddfb4e55c36861f3d4537a681 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sun, 12 Feb 2023 07:36:11 -0800 Subject: [PATCH] Harrison/unstructured structured (#1004) --- .../document_loaders/examples/email.ipynb | 55 +++++++- .../examples/microsoft_word.ipynb | 55 +++++++- .../document_loaders/examples/pdf.ipynb | 54 ++++++-- .../examples/powerpoint.ipynb | 55 +++++++- .../examples/unstructured_file.ipynb | 118 +++++++++++++++++- langchain/document_loaders/docx.py | 24 +--- langchain/document_loaders/email.py | 24 +--- langchain/document_loaders/html.py | 24 +--- langchain/document_loaders/pdf.py | 22 +--- langchain/document_loaders/powerpoint.py | 24 +--- langchain/document_loaders/unstructured.py | 29 ++++- 11 files changed, 363 insertions(+), 121 deletions(-) diff --git a/docs/modules/document_loaders/examples/email.ipynb b/docs/modules/document_loaders/examples/email.ipynb index ef04f4bb..1ad2c590 100644 --- a/docs/modules/document_loaders/examples/email.ipynb +++ b/docs/modules/document_loaders/examples/email.ipynb @@ -61,10 +61,61 @@ "data" ] }, + { + "cell_type": "markdown", + "id": "8bf50cba", + "metadata": {}, + "source": [ + "## Retain Elements\n", + "\n", + "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b9592eaf", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredEmailLoader('example_data/fake-email.eml', mode=\"elements\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0b16d03f", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d7bdc5e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='This is a test email to use for unit tests.', lookup_str='', metadata={'source': 'example_data/fake-email.eml'}, lookup_index=0)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[0]" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "4ef9a5f4", + "id": "6a074515", "metadata": {}, "outputs": [], "source": [] @@ -86,7 +137,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/modules/document_loaders/examples/microsoft_word.ipynb b/docs/modules/document_loaders/examples/microsoft_word.ipynb index ae1c35ab..4eef6deb 100644 --- a/docs/modules/document_loaders/examples/microsoft_word.ipynb +++ b/docs/modules/document_loaders/examples/microsoft_word.ipynb @@ -61,10 +61,61 @@ "data" ] }, + { + "cell_type": "markdown", + "id": "5d1472e9", + "metadata": {}, + "source": [ + "## Retain Elements\n", + "\n", + "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "93abf60b", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredDocxLoader('example_data/fake.docx', mode=\"elements\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c35cdbcc", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fae2d730", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'example_data/fake.docx'}, lookup_index=0)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "61953c83", + "id": "961a7b1d", "metadata": {}, "outputs": [], "source": [] @@ -86,7 +137,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/modules/document_loaders/examples/pdf.ipynb b/docs/modules/document_loaders/examples/pdf.ipynb index 51e27581..d008721d 100644 --- a/docs/modules/document_loaders/examples/pdf.ipynb +++ b/docs/modules/document_loaders/examples/pdf.ipynb @@ -139,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "id": "0cc0cd42", "metadata": {}, "outputs": [], @@ -149,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "id": "082d557c", "metadata": {}, "outputs": [], @@ -159,14 +159,54 @@ }, { "cell_type": "code", - "execution_count": 3, - "id": "5c41106f", + "execution_count": null, + "id": "df11c953", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "09957371", + "metadata": {}, + "source": [ + "### Retain Elements\n", + "\n", + "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fab833b", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredPDFLoader(\"example_data/layout-parser-paper.pdf\", mode=\"elements\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3e8ff1b", "metadata": {}, "outputs": [], "source": [ "data = loader.load()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "43c23d2d", + "metadata": {}, + "outputs": [], + "source": [ + "data[0]" + ] + }, { "cell_type": "markdown", "id": "21998d18", @@ -177,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 7, "id": "2f0cc9ff", "metadata": {}, "outputs": [], @@ -187,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 8, "id": "42b531e8", "metadata": {}, "outputs": [], @@ -197,7 +237,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "id": "010d5cdd", "metadata": {}, "outputs": [], diff --git a/docs/modules/document_loaders/examples/powerpoint.ipynb b/docs/modules/document_loaders/examples/powerpoint.ipynb index dec8c1d4..6255dfb9 100644 --- a/docs/modules/document_loaders/examples/powerpoint.ipynb +++ b/docs/modules/document_loaders/examples/powerpoint.ipynb @@ -61,10 +61,61 @@ "data" ] }, + { + "cell_type": "markdown", + "id": "525d6b67", + "metadata": {}, + "source": [ + "## Retain Elements\n", + "\n", + "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "064f9162", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredPowerPointLoader(\"example_data/fake-power-point.pptx\", mode=\"elements\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "abefbbdb", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a547c534", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='Adding a Bullet Slide', lookup_str='', metadata={'source': 'example_data/fake-power-point.pptx'}, lookup_index=0)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[0]" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "0c55f1cf", + "id": "381d4139", "metadata": {}, "outputs": [], "source": [] @@ -86,7 +137,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/modules/document_loaders/examples/unstructured_file.ipynb b/docs/modules/document_loaders/examples/unstructured_file.ipynb index bb363903..92f043c4 100644 --- a/docs/modules/document_loaders/examples/unstructured_file.ipynb +++ b/docs/modules/document_loaders/examples/unstructured_file.ipynb @@ -12,6 +12,40 @@ { "cell_type": "code", "execution_count": 1, + "id": "2886982e", + "metadata": {}, + "outputs": [], + "source": [ + "# # Install package\n", + "# !pip install unstructured" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "54d62efd", + "metadata": {}, + "outputs": [], + "source": [ + "# # Install other dependencies\n", + "# # https://github.com/Unstructured-IO/unstructured/blob/main/docs/source/installing.rst\n", + "# !brew install libmagic" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "af6a64f5", + "metadata": {}, + "outputs": [], + "source": [ + "# import nltk\n", + "# nltk.download('punkt')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, "id": "79d3e549", "metadata": {}, "outputs": [], @@ -21,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "id": "2593d1dc", "metadata": {}, "outputs": [], @@ -31,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "id": "fe34e941", "metadata": {}, "outputs": [], @@ -39,10 +73,86 @@ "docs = loader.load()" ] }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ee449788", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.\\n\\nLast year COVID-19 kept us apart. This year we are finally together again.\\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.\\n\\nWith a duty to one another to the American people to the Constit'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0].page_content[:400]" + ] + }, + { + "cell_type": "markdown", + "id": "7874d01d", + "metadata": {}, + "source": [ + "## Retain Elements\n", + "\n", + "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ff5b616d", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredFileLoader(\"../../state_of_the_union.txt\", mode=\"elements\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "feca3b6c", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fec5bbac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n", + " Document(page_content='Last year COVID-19 kept us apart. This year we are finally together again.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n", + " Document(page_content='Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n", + " Document(page_content='With a duty to one another to the American people to the Constitution.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n", + " Document(page_content='And with an unwavering resolve that freedom will always triumph over tyranny.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0)]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[:5]" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "24e577e5", + "id": "8ca8a648", "metadata": {}, "outputs": [], "source": [] @@ -64,7 +174,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/langchain/document_loaders/docx.py b/langchain/document_loaders/docx.py index 0b595ece..8edc10da 100644 --- a/langchain/document_loaders/docx.py +++ b/langchain/document_loaders/docx.py @@ -1,29 +1,13 @@ """Loader that loads Microsoft Word files.""" from typing import List -from langchain.docstore.document import Document -from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.unstructured import UnstructuredFileLoader -class UnstructuredDocxLoader(BaseLoader): +class UnstructuredDocxLoader(UnstructuredFileLoader): """Loader that uses unstructured to load Microsoft Word files.""" - def __init__(self, file_path: str): - """Initialize with file path.""" - try: - import unstructured # noqa:F401 - except ImportError: - raise ValueError( - "unstructured package not found, please install it with " - "`pip install unstructured`" - ) - self.file_path = file_path - - def load(self) -> List[Document]: - """Load file.""" + def _get_elements(self) -> List: from unstructured.partition.docx import partition_docx - elements = partition_docx(filename=self.file_path) - text = "\n\n".join([str(el) for el in elements]) - metadata = {"source": self.file_path} - return [Document(page_content=text, metadata=metadata)] + return partition_docx(filename=self.file_path) diff --git a/langchain/document_loaders/email.py b/langchain/document_loaders/email.py index ec22601f..2c3ecd88 100644 --- a/langchain/document_loaders/email.py +++ b/langchain/document_loaders/email.py @@ -1,29 +1,13 @@ """Loader that loads email files.""" from typing import List -from langchain.docstore.document import Document -from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.unstructured import UnstructuredFileLoader -class UnstructuredEmailLoader(BaseLoader): +class UnstructuredEmailLoader(UnstructuredFileLoader): """Loader that uses unstructured to load email files.""" - def __init__(self, file_path: str): - """Initialize with file path.""" - try: - import unstructured # noqa:F401 - except ImportError: - raise ValueError( - "unstructured package not found, please install it with " - "`pip install unstructured`" - ) - self.file_path = file_path - - def load(self) -> List[Document]: - """Load file.""" + def _get_elements(self) -> List: from unstructured.partition.email import partition_email - elements = partition_email(filename=self.file_path) - text = "\n\n".join([str(el) for el in elements]) - metadata = {"source": self.file_path} - return [Document(page_content=text, metadata=metadata)] + return partition_email(filename=self.file_path) diff --git a/langchain/document_loaders/html.py b/langchain/document_loaders/html.py index 5c7afa91..a65edba2 100644 --- a/langchain/document_loaders/html.py +++ b/langchain/document_loaders/html.py @@ -1,29 +1,13 @@ """Loader that loads PDF files.""" from typing import List -from langchain.docstore.document import Document -from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.unstructured import UnstructuredFileLoader -class UnstructuredHTMLLoader(BaseLoader): +class UnstructuredHTMLLoader(UnstructuredFileLoader): """Loader that uses unstructured to load HTML files.""" - def __init__(self, file_path: str): - """Initialize with file path.""" - try: - import unstructured # noqa:F401 - except ImportError: - raise ValueError( - "unstructured package not found, please install it with " - "`pip install unstructured`" - ) - self.file_path = file_path - - def load(self) -> List[Document]: - """Load file.""" + def _get_elements(self) -> List: from unstructured.partition.html import partition_html - elements = partition_html(filename=self.file_path) - text = "\n\n".join([str(el) for el in elements]) - metadata = {"source": self.file_path} - return [Document(page_content=text, metadata=metadata)] + return partition_html(filename=self.file_path) diff --git a/langchain/document_loaders/pdf.py b/langchain/document_loaders/pdf.py index d6a9810c..ca7b7e58 100644 --- a/langchain/document_loaders/pdf.py +++ b/langchain/document_loaders/pdf.py @@ -3,30 +3,16 @@ from typing import List from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.unstructured import UnstructuredFileLoader -class UnstructuredPDFLoader(BaseLoader): +class UnstructuredPDFLoader(UnstructuredFileLoader): """Loader that uses unstructured to load PDF files.""" - def __init__(self, file_path: str): - """Initialize with file path.""" - try: - import unstructured # noqa:F401 - except ImportError: - raise ValueError( - "unstructured package not found, please install it with " - "`pip install unstructured`" - ) - self.file_path = file_path - - def load(self) -> List[Document]: - """Load file.""" + def _get_elements(self) -> List: from unstructured.partition.pdf import partition_pdf - elements = partition_pdf(filename=self.file_path) - text = "\n\n".join([str(el) for el in elements]) - metadata = {"source": self.file_path} - return [Document(page_content=text, metadata=metadata)] + return partition_pdf(filename=self.file_path) class PDFMinerLoader(BaseLoader): diff --git a/langchain/document_loaders/powerpoint.py b/langchain/document_loaders/powerpoint.py index 40bb9825..d8709b9b 100644 --- a/langchain/document_loaders/powerpoint.py +++ b/langchain/document_loaders/powerpoint.py @@ -1,29 +1,13 @@ """Loader that loads powerpoint files.""" from typing import List -from langchain.docstore.document import Document -from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.unstructured import UnstructuredFileLoader -class UnstructuredPowerPointLoader(BaseLoader): +class UnstructuredPowerPointLoader(UnstructuredFileLoader): """Loader that uses unstructured to load powerpoint files.""" - def __init__(self, file_path: str): - """Initialize with file path.""" - try: - import unstructured # noqa:F401 - except ImportError: - raise ValueError( - "unstructured package not found, please install it with " - "`pip install unstructured`" - ) - self.file_path = file_path - - def load(self) -> List[Document]: - """Load file.""" + def _get_elements(self) -> List: from unstructured.partition.pptx import partition_pptx - elements = partition_pptx(filename=self.file_path) - text = "\n\n".join([str(el) for el in elements]) - metadata = {"source": self.file_path} - return [Document(page_content=text, metadata=metadata)] + return partition_pptx(filename=self.file_path) diff --git a/langchain/document_loaders/unstructured.py b/langchain/document_loaders/unstructured.py index 8f3b931f..079af736 100644 --- a/langchain/document_loaders/unstructured.py +++ b/langchain/document_loaders/unstructured.py @@ -8,7 +8,7 @@ from langchain.document_loaders.base import BaseLoader class UnstructuredFileLoader(BaseLoader): """Loader that uses unstructured to load files.""" - def __init__(self, file_path: str): + def __init__(self, file_path: str, mode: str = "single"): """Initialize with file path.""" try: import unstructured # noqa:F401 @@ -17,13 +17,30 @@ class UnstructuredFileLoader(BaseLoader): "unstructured package not found, please install it with " "`pip install unstructured`" ) + _valid_modes = {"single", "elements"} + if mode not in _valid_modes: + raise ValueError( + f"Got {mode} for `mode`, but should be one of `{_valid_modes}`" + ) self.file_path = file_path + self.mode = mode - def load(self) -> List[Document]: - """Load file.""" + def _get_elements(self) -> List: from unstructured.partition.auto import partition - elements = partition(filename=self.file_path) - text = "\n\n".join([str(el) for el in elements]) + return partition(filename=self.file_path) + + def load(self) -> List[Document]: + """Load file.""" + elements = self._get_elements() metadata = {"source": self.file_path} - return [Document(page_content=text, metadata=metadata)] + if self.mode == "elements": + docs = [ + Document(page_content=str(el), metadata=metadata) for el in elements + ] + elif self.mode == "single": + text = "\n\n".join([str(el) for el in elements]) + docs = [Document(page_content=text, metadata=metadata)] + else: + raise ValueError(f"mode of {self.mode} not supported.") + return docs