Harrison/unstructured structured (#1004)

2024-11-06 03:20:49 +00:00 · 2023-02-12 07:36:11 -08:00 · 2023-02-12 07:36:11 -08:00 · 0998577dfe
commit 0998577dfe
parent bbb06ca4cf
11 changed files with 363 additions and 121 deletions
--- a/docs/modules/document_loaders/examples/email.ipynb
+++ b/docs/modules/document_loaders/examples/email.ipynb
@ -61,10 +61,61 @@
    "data"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "8bf50cba",
+   "metadata": {},
+   "source": [
+    "## Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "b9592eaf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredEmailLoader('example_data/fake-email.eml', mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "0b16d03f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "d7bdc5e5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Document(page_content='This is a test email to use for unit tests.', lookup_str='', metadata={'source': 'example_data/fake-email.eml'}, lookup_index=0)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[0]"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "4ef9a5f4",
+   "id": "6a074515",
   "metadata": {},
   "outputs": [],
   "source": []
@ -86,7 +137,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
  }
 },
 "nbformat": 4,
--- a/docs/modules/document_loaders/examples/microsoft_word.ipynb
+++ b/docs/modules/document_loaders/examples/microsoft_word.ipynb
@ -61,10 +61,61 @@
    "data"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "5d1472e9",
+   "metadata": {},
+   "source": [
+    "## Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "93abf60b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredDocxLoader('example_data/fake.docx', mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "c35cdbcc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "fae2d730",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'example_data/fake.docx'}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "61953c83",
+   "id": "961a7b1d",
   "metadata": {},
   "outputs": [],
   "source": []
@ -86,7 +137,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
  }
 },
 "nbformat": 4,
--- a/docs/modules/document_loaders/examples/pdf.ipynb
+++ b/docs/modules/document_loaders/examples/pdf.ipynb
@ -139,7 +139,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
   "id": "0cc0cd42",
   "metadata": {},
   "outputs": [],
@ -149,7 +149,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
   "id": "082d557c",
   "metadata": {},
   "outputs": [],
@ -159,14 +159,54 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
-   "id": "5c41106f",
+   "execution_count": null,
+   "id": "df11c953",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = loader.load()"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "09957371",
+   "metadata": {},
+   "source": [
+    "### Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0fab833b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredPDFLoader(\"example_data/layout-parser-paper.pdf\", mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c3e8ff1b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43c23d2d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data[0]"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "21998d18",
@ -177,7 +217,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 7,
   "id": "2f0cc9ff",
   "metadata": {},
   "outputs": [],
@ -187,7 +227,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 8,
   "id": "42b531e8",
   "metadata": {},
   "outputs": [],
@ -197,7 +237,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 9,
   "id": "010d5cdd",
   "metadata": {},
   "outputs": [],
--- a/docs/modules/document_loaders/examples/powerpoint.ipynb
+++ b/docs/modules/document_loaders/examples/powerpoint.ipynb
@ -61,10 +61,61 @@
    "data"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "525d6b67",
+   "metadata": {},
+   "source": [
+    "## Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "064f9162",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredPowerPointLoader(\"example_data/fake-power-point.pptx\", mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "abefbbdb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "a547c534",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Document(page_content='Adding a Bullet Slide', lookup_str='', metadata={'source': 'example_data/fake-power-point.pptx'}, lookup_index=0)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[0]"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "0c55f1cf",
+   "id": "381d4139",
   "metadata": {},
   "outputs": [],
   "source": []
@ -86,7 +137,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
  }
 },
 "nbformat": 4,
--- a/docs/modules/document_loaders/examples/unstructured_file.ipynb
+++ b/docs/modules/document_loaders/examples/unstructured_file.ipynb
@ -12,6 +12,40 @@
  {
   "cell_type": "code",
   "execution_count": 1,
+   "id": "2886982e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # Install package\n",
+    "# !pip install unstructured"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "54d62efd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # Install other dependencies\n",
+    "# # https://github.com/Unstructured-IO/unstructured/blob/main/docs/source/installing.rst\n",
+    "# !brew install libmagic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "af6a64f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import nltk\n",
+    "# nltk.download('punkt')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
   "id": "79d3e549",
   "metadata": {},
   "outputs": [],
@ -21,7 +55,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
   "id": "2593d1dc",
   "metadata": {},
   "outputs": [],
@ -31,7 +65,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
   "id": "fe34e941",
   "metadata": {},
   "outputs": [],
@ -39,10 +73,86 @@
    "docs = loader.load()"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "ee449788",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.\\n\\nLast year COVID-19 kept us apart. This year we are finally together again.\\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.\\n\\nWith a duty to one another to the American people to the Constit'"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs[0].page_content[:400]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7874d01d",
+   "metadata": {},
+   "source": [
+    "## Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ff5b616d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredFileLoader(\"../../state_of_the_union.txt\", mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "feca3b6c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "fec5bbac",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
+       " Document(page_content='Last year COVID-19 kept us apart. This year we are finally together again.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
+       " Document(page_content='Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
+       " Document(page_content='With a duty to one another to the American people to the Constitution.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
+       " Document(page_content='And with an unwavering resolve that freedom will always triumph over tyranny.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs[:5]"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "24e577e5",
+   "id": "8ca8a648",
   "metadata": {},
   "outputs": [],
   "source": []
@ -64,7 +174,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
  }
 },
 "nbformat": 4,
--- a/langchain/document_loaders/docx.py
+++ b/langchain/document_loaders/docx.py
@ -1,29 +1,13 @@
 """Loader that loads Microsoft Word files."""
 from typing import List

-from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader


-class UnstructuredDocxLoader(BaseLoader):
+class UnstructuredDocxLoader(UnstructuredFileLoader):
    """Loader that uses unstructured to load Microsoft Word files."""

-    def __init__(self, file_path: str):
-        """Initialize with file path."""
-        try:
-            import unstructured  # noqa:F401
-        except ImportError:
-            raise ValueError(
-                "unstructured package not found, please install it with "
-                "`pip install unstructured`"
-            )
-        self.file_path = file_path
-
-    def load(self) -> List[Document]:
-        """Load file."""
+    def _get_elements(self) -> List:
        from unstructured.partition.docx import partition_docx

-        elements = partition_docx(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
-        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        return partition_docx(filename=self.file_path)
--- a/langchain/document_loaders/email.py
+++ b/langchain/document_loaders/email.py
@ -1,29 +1,13 @@
 """Loader that loads email files."""
 from typing import List

-from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader


-class UnstructuredEmailLoader(BaseLoader):
+class UnstructuredEmailLoader(UnstructuredFileLoader):
    """Loader that uses unstructured to load email files."""

-    def __init__(self, file_path: str):
-        """Initialize with file path."""
-        try:
-            import unstructured  # noqa:F401
-        except ImportError:
-            raise ValueError(
-                "unstructured package not found, please install it with "
-                "`pip install unstructured`"
-            )
-        self.file_path = file_path
-
-    def load(self) -> List[Document]:
-        """Load file."""
+    def _get_elements(self) -> List:
        from unstructured.partition.email import partition_email

-        elements = partition_email(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
-        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        return partition_email(filename=self.file_path)
--- a/langchain/document_loaders/html.py
+++ b/langchain/document_loaders/html.py
@ -1,29 +1,13 @@
 """Loader that loads HTML files."""
 from typing import List

-from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader


-class UnstructuredHTMLLoader(BaseLoader):
+class UnstructuredHTMLLoader(UnstructuredFileLoader):
    """Loader that uses unstructured to load HTML files."""

-    def __init__(self, file_path: str):
-        """Initialize with file path."""
-        try:
-            import unstructured  # noqa:F401
-        except ImportError:
-            raise ValueError(
-                "unstructured package not found, please install it with "
-                "`pip install unstructured`"
-            )
-        self.file_path = file_path
-
-    def load(self) -> List[Document]:
-        """Load file."""
+    def _get_elements(self) -> List:
        from unstructured.partition.html import partition_html

-        elements = partition_html(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
-        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        return partition_html(filename=self.file_path)
--- a/langchain/document_loaders/pdf.py
+++ b/langchain/document_loaders/pdf.py
@ -3,30 +3,16 @@ from typing import List

 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader


-class UnstructuredPDFLoader(BaseLoader):
+class UnstructuredPDFLoader(UnstructuredFileLoader):
    """Loader that uses unstructured to load PDF files."""

-    def __init__(self, file_path: str):
-        """Initialize with file path."""
-        try:
-            import unstructured  # noqa:F401
-        except ImportError:
-            raise ValueError(
-                "unstructured package not found, please install it with "
-                "`pip install unstructured`"
-            )
-        self.file_path = file_path
-
-    def load(self) -> List[Document]:
-        """Load file."""
+    def _get_elements(self) -> List:
        from unstructured.partition.pdf import partition_pdf

-        elements = partition_pdf(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
-        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        return partition_pdf(filename=self.file_path)


 class PDFMinerLoader(BaseLoader):
--- a/langchain/document_loaders/powerpoint.py
+++ b/langchain/document_loaders/powerpoint.py
@ -1,29 +1,13 @@
 """Loader that loads powerpoint files."""
 from typing import List

-from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader


-class UnstructuredPowerPointLoader(BaseLoader):
+class UnstructuredPowerPointLoader(UnstructuredFileLoader):
    """Loader that uses unstructured to load powerpoint files."""

-    def __init__(self, file_path: str):
-        """Initialize with file path."""
-        try:
-            import unstructured  # noqa:F401
-        except ImportError:
-            raise ValueError(
-                "unstructured package not found, please install it with "
-                "`pip install unstructured`"
-            )
-        self.file_path = file_path
-
-    def load(self) -> List[Document]:
-        """Load file."""
+    def _get_elements(self) -> List:
        from unstructured.partition.pptx import partition_pptx

-        elements = partition_pptx(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
-        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        return partition_pptx(filename=self.file_path)
--- a/langchain/document_loaders/unstructured.py
+++ b/langchain/document_loaders/unstructured.py
@ -8,7 +8,7 @@ from langchain.document_loaders.base import BaseLoader
 class UnstructuredFileLoader(BaseLoader):
    """Loader that uses unstructured to load files."""

-    def __init__(self, file_path: str):
+    def __init__(self, file_path: str, mode: str = "single"):
        """Initialize with file path."""
        try:
            import unstructured  # noqa:F401
@ -17,13 +17,30 @@ class UnstructuredFileLoader(BaseLoader):
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )
+        _valid_modes = {"single", "elements"}
+        if mode not in _valid_modes:
+            raise ValueError(
+                f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
+            )
        self.file_path = file_path
+        self.mode = mode
+
+    def _get_elements(self) -> List:
+        from unstructured.partition.auto import partition
+
+        return partition(filename=self.file_path)

    def load(self) -> List[Document]:
        """Load file."""
-        from unstructured.partition.auto import partition
-
-        elements = partition(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
+        elements = self._get_elements()
        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        if self.mode == "elements":
+            docs = [
+                Document(page_content=str(el), metadata=metadata) for el in elements
+            ]
+        elif self.mode == "single":
+            text = "\n\n".join([str(el) for el in elements])
+            docs = [Document(page_content=text, metadata=metadata)]
+        else:
+            raise ValueError(f"mode of {self.mode} not supported.")
+        return docs