From 2ec25ddd4cc19223b08bab6aef84dec7807095d7 Mon Sep 17 00:00:00 2001
From: Harrison Chase <hw.chase.17@gmail.com>
Date: Mon, 6 Feb 2023 18:13:46 -0800
Subject: [PATCH] add unstructured examples (#913)

---
 .../document_loaders/examples/html.ipynb      | 94 +++++++++++++++++++
 .../document_loaders/examples/pdf.ipynb       | 73 ++++++++++++++
 .../examples/powerpoint.ipynb                 | 94 +++++++++++++++++++
 .../document_loaders/how_to_guides.rst        |  6 ++
 langchain/document_loaders/__init__.py        |  6 ++
 langchain/document_loaders/html.py            | 29 ++++++
 langchain/document_loaders/pdf.py             | 29 ++++++
 langchain/document_loaders/powerpoint.py      | 29 ++++++
 8 files changed, 360 insertions(+)
 create mode 100644 docs/modules/document_loaders/examples/html.ipynb
 create mode 100644 docs/modules/document_loaders/examples/pdf.ipynb
 create mode 100644 docs/modules/document_loaders/examples/powerpoint.ipynb
 create mode 100644 langchain/document_loaders/html.py
 create mode 100644 langchain/document_loaders/pdf.py
 create mode 100644 langchain/document_loaders/powerpoint.py

diff --git a/docs/modules/document_loaders/examples/html.ipynb b/docs/modules/document_loaders/examples/html.ipynb
new file mode 100644
index 0000000000..2a4988284d
--- /dev/null
+++ b/docs/modules/document_loaders/examples/html.ipynb
@@ -0,0 +1,94 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "2dfc4698",
+   "metadata": {},
+   "source": [
+    "# HTML\n",
+    "\n",
+    "This covers how to load HTML documents into a document format that we can use downstream."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "24b434b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import UnstructuredHTMLLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "00f46fda",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredHTMLLoader(\"example_data/fake-content.html\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "b68a26b3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "34de48fa",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='My First Heading\\n\\nMy first paragraph.', lookup_str='', metadata={'source': 'example_data/fake-content.html'}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79b1bce4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/modules/document_loaders/examples/pdf.ipynb b/docs/modules/document_loaders/examples/pdf.ipynb
new file mode 100644
index 0000000000..b7e589e829
--- /dev/null
+++ b/docs/modules/document_loaders/examples/pdf.ipynb
@@ -0,0 +1,73 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "f70e6118",
+   "metadata": {},
+   "source": [
+    "# PDF\n",
+    "\n",
+    "This covers how to load pdfs into a document format that we can use downstream."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "0cc0cd42",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import UnstructuredPDFLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "082d557c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredPDFLoader(\"example_data/layout-parser-paper.pdf\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "5c41106f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "54fb6b62",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/modules/document_loaders/examples/powerpoint.ipynb b/docs/modules/document_loaders/examples/powerpoint.ipynb
new file mode 100644
index 0000000000..dec8c1d4be
--- /dev/null
+++ b/docs/modules/document_loaders/examples/powerpoint.ipynb
@@ -0,0 +1,94 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "39af9ecd",
+   "metadata": {},
+   "source": [
+    "# PowerPoint\n",
+    "\n",
+    "This covers how to load PowerPoint documents into a document format that we can use downstream."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "721c48aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import UnstructuredPowerPointLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "9d3d0e35",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredPowerPointLoader(\"example_data/fake-power-point.pptx\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "06073f91",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "c9adc5cb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Adding a Bullet Slide\\n\\nFind the bullet slide layout\\n\\nUse _TextFrame.text for first bullet\\n\\nUse _TextFrame.add_paragraph() for subsequent bullets\\n\\nHere is a lot of text!\\n\\nHere is some text in a text box!', lookup_str='', metadata={'source': 'example_data/fake-power-point.pptx'}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0c55f1cf",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/modules/document_loaders/how_to_guides.rst b/docs/modules/document_loaders/how_to_guides.rst
index 6ae963fecf..daa8a352ba 100644
--- a/docs/modules/document_loaders/how_to_guides.rst
+++ b/docs/modules/document_loaders/how_to_guides.rst
@@ -11,6 +11,12 @@ There are a lot of different document loaders that LangChain supports. Below are
 
 `ReadTheDocs <./examples/readthedocs_documentation.html>`_: A walkthrough of how to load data for documentation generated by ReadTheDocs.
 
+`HTML <./examples/html.html>`_: A walkthrough of how to load data from an HTML file.
+
+`PDF <./examples/pdf.html>`_: A walkthrough of how to load data from a PDF file.
+
+`PowerPoint <./examples/powerpoint.html>`_: A walkthrough of how to load data from a PowerPoint file.
+
 .. toctree::
    :maxdepth: 1
    :glob:
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index e2309f0f53..d564f63b27 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -1,7 +1,10 @@
 """All different types of document loaders."""
 
 from langchain.document_loaders.directory import DirectoryLoader
+from langchain.document_loaders.html import UnstructuredHTMLLoader
 from langchain.document_loaders.notion import NotionDirectoryLoader
+from langchain.document_loaders.pdf import UnstructuredPDFLoader
+from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
 from langchain.document_loaders.readthedocs import ReadTheDocsLoader
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
 
@@ -10,4 +13,7 @@ __all__ = [
     "DirectoryLoader",
     "NotionDirectoryLoader",
     "ReadTheDocsLoader",
+    "UnstructuredHTMLLoader",
+    "UnstructuredPowerPointLoader",
+    "UnstructuredPDFLoader",
 ]
diff --git a/langchain/document_loaders/html.py b/langchain/document_loaders/html.py
new file mode 100644
index 0000000000..5c7afa91d0
--- /dev/null
+++ b/langchain/document_loaders/html.py
@@ -0,0 +1,29 @@
+"""Loader that loads HTML files."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class UnstructuredHTMLLoader(BaseLoader):
+    """Loader that uses unstructured to load HTML files."""
+
+    def __init__(self, file_path: str):
+        """Store the file path; raise ValueError if unstructured is not installed."""
+        try:
+            import unstructured  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "unstructured package not found, please install it with "
+                "`pip install unstructured`"
+            )
+        self.file_path = file_path
+
+    def load(self) -> List[Document]:
+        """Partition the HTML file and return its text as a single Document."""
+        from unstructured.partition.html import partition_html
+
+        elements = partition_html(filename=self.file_path)
+        text = "\n\n".join([str(el) for el in elements])
+        metadata = {"source": self.file_path}
+        return [Document(page_content=text, metadata=metadata)]
diff --git a/langchain/document_loaders/pdf.py b/langchain/document_loaders/pdf.py
new file mode 100644
index 0000000000..629e0e0bf1
--- /dev/null
+++ b/langchain/document_loaders/pdf.py
@@ -0,0 +1,29 @@
+"""Loader that loads PDF files."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class UnstructuredPDFLoader(BaseLoader):
+    """Loader that uses unstructured to load PDF files."""
+
+    def __init__(self, file_path: str):
+        """Store the file path; raise ValueError if unstructured is not installed."""
+        try:
+            import unstructured  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "unstructured package not found, please install it with "
+                "`pip install unstructured`"
+            )
+        self.file_path = file_path
+
+    def load(self) -> List[Document]:
+        """Partition the PDF file and return its text as a single Document."""
+        from unstructured.partition.pdf import partition_pdf
+
+        elements = partition_pdf(filename=self.file_path)
+        text = "\n\n".join([str(el) for el in elements])
+        metadata = {"source": self.file_path}
+        return [Document(page_content=text, metadata=metadata)]
diff --git a/langchain/document_loaders/powerpoint.py b/langchain/document_loaders/powerpoint.py
new file mode 100644
index 0000000000..d75f7c600b
--- /dev/null
+++ b/langchain/document_loaders/powerpoint.py
@@ -0,0 +1,29 @@
+"""Loader that loads PowerPoint files."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class UnstructuredPowerPointLoader(BaseLoader):
+    """Loader that uses unstructured to load PowerPoint files."""
+
+    def __init__(self, file_path: str):
+        """Store the file path; raise ValueError if unstructured is not installed."""
+        try:
+            import unstructured  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "unstructured package not found, please install it with "
+                "`pip install unstructured`"
+            )
+        self.file_path = file_path
+
+    def load(self) -> List[Document]:
+        """Partition the PowerPoint file and return its text as a single Document."""
+        from unstructured.partition.pptx import partition_pptx
+
+        elements = partition_pptx(filename=self.file_path)
+        text = "\n\n".join([str(el) for el in elements])
+        metadata = {"source": self.file_path}
+        return [Document(page_content=text, metadata=metadata)]