From 0998577dfe663d2ddfb4e55c36861f3d4537a681 Mon Sep 17 00:00:00 2001
From: Harrison Chase <hw.chase.17@gmail.com>
Date: Sun, 12 Feb 2023 07:36:11 -0800
Subject: [PATCH] Harrison/unstructured structured (#1004)

---
 .../document_loaders/examples/email.ipynb     |  55 +++++++-
 .../examples/microsoft_word.ipynb             |  55 +++++++-
 .../document_loaders/examples/pdf.ipynb       |  54 ++++++--
 .../examples/powerpoint.ipynb                 |  55 +++++++-
 .../examples/unstructured_file.ipynb          | 118 +++++++++++++++++-
 langchain/document_loaders/docx.py            |  24 +---
 langchain/document_loaders/email.py           |  24 +---
 langchain/document_loaders/html.py            |  24 +---
 langchain/document_loaders/pdf.py             |  22 +---
 langchain/document_loaders/powerpoint.py      |  24 +---
 langchain/document_loaders/unstructured.py    |  29 ++++-
 11 files changed, 363 insertions(+), 121 deletions(-)

diff --git a/docs/modules/document_loaders/examples/email.ipynb b/docs/modules/document_loaders/examples/email.ipynb
index ef04f4bb..1ad2c590 100644
--- a/docs/modules/document_loaders/examples/email.ipynb
+++ b/docs/modules/document_loaders/examples/email.ipynb
@@ -61,10 +61,61 @@
     "data"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "8bf50cba",
+   "metadata": {},
+   "source": [
+    "## Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "b9592eaf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredEmailLoader('example_data/fake-email.eml', mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "0b16d03f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "d7bdc5e5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Document(page_content='This is a test email to use for unit tests.', lookup_str='', metadata={'source': 'example_data/fake-email.eml'}, lookup_index=0)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[0]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4ef9a5f4",
+   "id": "6a074515",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -86,7 +137,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
   }
  },
  "nbformat": 4,
diff --git a/docs/modules/document_loaders/examples/microsoft_word.ipynb b/docs/modules/document_loaders/examples/microsoft_word.ipynb
index ae1c35ab..4eef6deb 100644
--- a/docs/modules/document_loaders/examples/microsoft_word.ipynb
+++ b/docs/modules/document_loaders/examples/microsoft_word.ipynb
@@ -61,10 +61,61 @@
     "data"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "5d1472e9",
+   "metadata": {},
+   "source": [
+    "## Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "93abf60b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredDocxLoader('example_data/fake.docx', mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "c35cdbcc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "fae2d730",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'example_data/fake.docx'}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "61953c83",
+   "id": "961a7b1d",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -86,7 +137,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
   }
  },
  "nbformat": 4,
diff --git a/docs/modules/document_loaders/examples/pdf.ipynb b/docs/modules/document_loaders/examples/pdf.ipynb
index 51e27581..d008721d 100644
--- a/docs/modules/document_loaders/examples/pdf.ipynb
+++ b/docs/modules/document_loaders/examples/pdf.ipynb
@@ -139,7 +139,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
    "id": "0cc0cd42",
    "metadata": {},
    "outputs": [],
@@ -149,7 +149,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "id": "082d557c",
    "metadata": {},
    "outputs": [],
@@ -159,14 +159,54 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "id": "5c41106f",
+   "execution_count": null,
+   "id": "df11c953",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "09957371",
+   "metadata": {},
+   "source": [
+    "### Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0fab833b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredPDFLoader(\"example_data/layout-parser-paper.pdf\", mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c3e8ff1b",
    "metadata": {},
    "outputs": [],
    "source": [
     "data = loader.load()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43c23d2d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data[0]"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "21998d18",
@@ -177,7 +217,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 7,
    "id": "2f0cc9ff",
    "metadata": {},
    "outputs": [],
@@ -187,7 +227,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 8,
    "id": "42b531e8",
    "metadata": {},
    "outputs": [],
@@ -197,7 +237,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 9,
    "id": "010d5cdd",
    "metadata": {},
    "outputs": [],
diff --git a/docs/modules/document_loaders/examples/powerpoint.ipynb b/docs/modules/document_loaders/examples/powerpoint.ipynb
index dec8c1d4..6255dfb9 100644
--- a/docs/modules/document_loaders/examples/powerpoint.ipynb
+++ b/docs/modules/document_loaders/examples/powerpoint.ipynb
@@ -61,10 +61,61 @@
     "data"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "525d6b67",
+   "metadata": {},
+   "source": [
+    "## Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "064f9162",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredPowerPointLoader(\"example_data/fake-power-point.pptx\", mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "abefbbdb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "a547c534",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Document(page_content='Adding a Bullet Slide', lookup_str='', metadata={'source': 'example_data/fake-power-point.pptx'}, lookup_index=0)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[0]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "0c55f1cf",
+   "id": "381d4139",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -86,7 +137,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
   }
  },
  "nbformat": 4,
diff --git a/docs/modules/document_loaders/examples/unstructured_file.ipynb b/docs/modules/document_loaders/examples/unstructured_file.ipynb
index bb363903..92f043c4 100644
--- a/docs/modules/document_loaders/examples/unstructured_file.ipynb
+++ b/docs/modules/document_loaders/examples/unstructured_file.ipynb
@@ -12,6 +12,40 @@
   {
    "cell_type": "code",
    "execution_count": 1,
+   "id": "2886982e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # Install package\n",
+    "# !pip install unstructured"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "54d62efd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # Install other dependencies\n",
+    "# # https://github.com/Unstructured-IO/unstructured/blob/main/docs/source/installing.rst\n",
+    "# !brew install libmagic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "af6a64f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import nltk\n",
+    "# nltk.download('punkt')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
    "id": "79d3e549",
    "metadata": {},
    "outputs": [],
@@ -21,7 +55,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
    "id": "2593d1dc",
    "metadata": {},
    "outputs": [],
@@ -31,7 +65,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
    "id": "fe34e941",
    "metadata": {},
    "outputs": [],
@@ -39,10 +73,86 @@
     "docs = loader.load()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "ee449788",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.\\n\\nLast year COVID-19 kept us apart. This year we are finally together again.\\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.\\n\\nWith a duty to one another to the American people to the Constit'"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs[0].page_content[:400]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7874d01d",
+   "metadata": {},
+   "source": [
+    "## Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ff5b616d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredFileLoader(\"../../state_of_the_union.txt\", mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "feca3b6c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "fec5bbac",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
+       " Document(page_content='Last year COVID-19 kept us apart. This year we are finally together again.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
+       " Document(page_content='Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
+       " Document(page_content='With a duty to one another to the American people to the Constitution.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
+       " Document(page_content='And with an unwavering resolve that freedom will always triumph over tyranny.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs[:5]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "24e577e5",
+   "id": "8ca8a648",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -64,7 +174,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
   }
  },
  "nbformat": 4,
diff --git a/langchain/document_loaders/docx.py b/langchain/document_loaders/docx.py
index 0b595ece..8edc10da 100644
--- a/langchain/document_loaders/docx.py
+++ b/langchain/document_loaders/docx.py
@@ -1,29 +1,13 @@
 """Loader that loads Microsoft Word files."""
 from typing import List
 
-from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader
 
 
-class UnstructuredDocxLoader(BaseLoader):
+class UnstructuredDocxLoader(UnstructuredFileLoader):
     """Loader that uses unstructured to load Microsoft Word files."""
 
-    def __init__(self, file_path: str):
-        """Initialize with file path."""
-        try:
-            import unstructured  # noqa:F401
-        except ImportError:
-            raise ValueError(
-                "unstructured package not found, please install it with "
-                "`pip install unstructured`"
-            )
-        self.file_path = file_path
-
-    def load(self) -> List[Document]:
-        """Load file."""
+    def _get_elements(self) -> List:
         from unstructured.partition.docx import partition_docx
 
-        elements = partition_docx(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
-        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        return partition_docx(filename=self.file_path)
diff --git a/langchain/document_loaders/email.py b/langchain/document_loaders/email.py
index ec22601f..2c3ecd88 100644
--- a/langchain/document_loaders/email.py
+++ b/langchain/document_loaders/email.py
@@ -1,29 +1,13 @@
 """Loader that loads email files."""
 from typing import List
 
-from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader
 
 
-class UnstructuredEmailLoader(BaseLoader):
+class UnstructuredEmailLoader(UnstructuredFileLoader):
     """Loader that uses unstructured to load email files."""
 
-    def __init__(self, file_path: str):
-        """Initialize with file path."""
-        try:
-            import unstructured  # noqa:F401
-        except ImportError:
-            raise ValueError(
-                "unstructured package not found, please install it with "
-                "`pip install unstructured`"
-            )
-        self.file_path = file_path
-
-    def load(self) -> List[Document]:
-        """Load file."""
+    def _get_elements(self) -> List:
         from unstructured.partition.email import partition_email
 
-        elements = partition_email(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
-        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        return partition_email(filename=self.file_path)
diff --git a/langchain/document_loaders/html.py b/langchain/document_loaders/html.py
index 5c7afa91..a65edba2 100644
--- a/langchain/document_loaders/html.py
+++ b/langchain/document_loaders/html.py
@@ -1,29 +1,13 @@
-"""Loader that loads PDF files."""
+"""Loader that loads HTML files."""
 from typing import List
 
-from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader
 
 
-class UnstructuredHTMLLoader(BaseLoader):
+class UnstructuredHTMLLoader(UnstructuredFileLoader):
     """Loader that uses unstructured to load HTML files."""
 
-    def __init__(self, file_path: str):
-        """Initialize with file path."""
-        try:
-            import unstructured  # noqa:F401
-        except ImportError:
-            raise ValueError(
-                "unstructured package not found, please install it with "
-                "`pip install unstructured`"
-            )
-        self.file_path = file_path
-
-    def load(self) -> List[Document]:
-        """Load file."""
+    def _get_elements(self) -> List:
         from unstructured.partition.html import partition_html
 
-        elements = partition_html(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
-        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        return partition_html(filename=self.file_path)
diff --git a/langchain/document_loaders/pdf.py b/langchain/document_loaders/pdf.py
index d6a9810c..ca7b7e58 100644
--- a/langchain/document_loaders/pdf.py
+++ b/langchain/document_loaders/pdf.py
@@ -3,30 +3,16 @@ from typing import List
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader
 
 
-class UnstructuredPDFLoader(BaseLoader):
+class UnstructuredPDFLoader(UnstructuredFileLoader):
     """Loader that uses unstructured to load PDF files."""
 
-    def __init__(self, file_path: str):
-        """Initialize with file path."""
-        try:
-            import unstructured  # noqa:F401
-        except ImportError:
-            raise ValueError(
-                "unstructured package not found, please install it with "
-                "`pip install unstructured`"
-            )
-        self.file_path = file_path
-
-    def load(self) -> List[Document]:
-        """Load file."""
+    def _get_elements(self) -> List:
         from unstructured.partition.pdf import partition_pdf
 
-        elements = partition_pdf(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
-        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        return partition_pdf(filename=self.file_path)
 
 
 class PDFMinerLoader(BaseLoader):
diff --git a/langchain/document_loaders/powerpoint.py b/langchain/document_loaders/powerpoint.py
index 40bb9825..d8709b9b 100644
--- a/langchain/document_loaders/powerpoint.py
+++ b/langchain/document_loaders/powerpoint.py
@@ -1,29 +1,13 @@
 """Loader that loads powerpoint files."""
 from typing import List
 
-from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader
 
 
-class UnstructuredPowerPointLoader(BaseLoader):
+class UnstructuredPowerPointLoader(UnstructuredFileLoader):
     """Loader that uses unstructured to load powerpoint files."""
 
-    def __init__(self, file_path: str):
-        """Initialize with file path."""
-        try:
-            import unstructured  # noqa:F401
-        except ImportError:
-            raise ValueError(
-                "unstructured package not found, please install it with "
-                "`pip install unstructured`"
-            )
-        self.file_path = file_path
-
-    def load(self) -> List[Document]:
-        """Load file."""
+    def _get_elements(self) -> List:
         from unstructured.partition.pptx import partition_pptx
 
-        elements = partition_pptx(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
-        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        return partition_pptx(filename=self.file_path)
diff --git a/langchain/document_loaders/unstructured.py b/langchain/document_loaders/unstructured.py
index 8f3b931f..079af736 100644
--- a/langchain/document_loaders/unstructured.py
+++ b/langchain/document_loaders/unstructured.py
@@ -8,7 +8,7 @@ from langchain.document_loaders.base import BaseLoader
 class UnstructuredFileLoader(BaseLoader):
     """Loader that uses unstructured to load files."""
 
-    def __init__(self, file_path: str):
+    def __init__(self, file_path: str, mode: str = "single"):
         """Initialize with file path."""
         try:
             import unstructured  # noqa:F401
@@ -17,13 +17,38 @@ class UnstructuredFileLoader(BaseLoader):
                 "unstructured package not found, please install it with "
                 "`pip install unstructured`"
             )
+        _valid_modes = {"single", "elements"}
+        if mode not in _valid_modes:
+            raise ValueError(
+                f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
+            )
         self.file_path = file_path
+        self.mode = mode
 
-    def load(self) -> List[Document]:
-        """Load file."""
+    def _get_elements(self) -> List:
+        """Partition the file into a list of unstructured elements."""
         from unstructured.partition.auto import partition
 
-        elements = partition(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
+        return partition(filename=self.file_path)
+
+    def load(self) -> List[Document]:
+        """Load file.
+
+        In "elements" mode one Document is returned per element; in "single"
+        mode all elements are joined with blank lines into one Document.
+        """
+        elements = self._get_elements()
         metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        if self.mode == "elements":
+            # Copy the metadata for each Document so that mutating one
+            # element's metadata does not silently change all the others.
+            docs = [
+                Document(page_content=str(el), metadata=dict(metadata))
+                for el in elements
+            ]
+        elif self.mode == "single":
+            text = "\n\n".join([str(el) for el in elements])
+            docs = [Document(page_content=text, metadata=metadata)]
+        else:
+            raise ValueError(f"mode of {self.mode} not supported.")
+        return docs