Harrison/unstructured support (#903)

1 year ago · 53d56d7650
parent 2a68be3e8d
commit 53d56d7650
16 changed files with 555 additions and 0 deletions
--- a/docs/modules/document_loaders.rst
+++ b/docs/modules/document_loaders.rst
@ -0,0 +1,29 @@
+Document Loaders
+==========================
+
+Combining language models with your own text data is a powerful way to differentiate them.
+The first step in doing this is to load the data into "documents" - a fancy way of saying some pieces of text.
+This module is aimed at making this easy.
+
+A primary driver of a lot of this is the `Unstructured <https://github.com/Unstructured-IO/unstructured>`_ python package.
+This package is a great way to transform all types of files - text, powerpoint, images, html, pdf, etc - into text data.
+
+For detailed instructions on how to get set up with Unstructured, see installation guidelines `here <https://github.com/Unstructured-IO/unstructured#coffee-getting-started>`_.
+
+The following sections of documentation are provided:
+
+- `Key Concepts <./document_loaders/key_concepts.html>`_: A conceptual guide going over the various concepts related to loading documents.
+
+- `How-To Guides <./document_loaders/how_to_guides.html>`_: A collection of how-to guides. These highlight different types of loaders.
+
+
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Document Loaders
+   :name: Document Loaders
+   :hidden:
+
+   ./document_loaders/key_concepts.md
+   ./document_loaders/how_to_guides.rst
--- a/docs/modules/document_loaders/examples/directory_loader.ipynb
+++ b/docs/modules/document_loaders/examples/directory_loader.ipynb
@ -0,0 +1,101 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "79f24a6b",
+   "metadata": {},
+   "source": [
+    "# Directory Loader\n",
+    "This covers how to use the DirectoryLoader to load all documents in a directory. Under the hood, this uses the [UnstructuredFileLoader](./unstructured_file.ipynb)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "019d8520",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import DirectoryLoader"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0c76cdc5",
+   "metadata": {},
+   "source": [
+    "We can use the `glob` parameter to control which files to load. Note that here it doesn't load the `.rst` file or the `.ipynb` files."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "891fe56f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = DirectoryLoader('../', glob=\"**/*.md\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "addfe9cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "b042086d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(docs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cbc8256b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/modules/document_loaders/examples/export_format.png
+++ b/docs/modules/document_loaders/examples/export_format.png
--- a/docs/modules/document_loaders/examples/export_notion.png
+++ b/docs/modules/document_loaders/examples/export_notion.png
--- a/docs/modules/document_loaders/examples/notion.ipynb
+++ b/docs/modules/document_loaders/examples/notion.ipynb
@ -0,0 +1,86 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "1dc7df1d",
+   "metadata": {},
+   "source": [
+    "# Notion\n",
+    "This notebook covers how to load documents from a Notion database dump.\n",
+    "\n",
+    "In order to get this notion dump, follow these instructions:\n",
+    "\n",
+    "## 🧑 Instructions for ingesting your own dataset\n",
+    "\n",
+    "Export your dataset from Notion. You can do this by clicking on the three dots in the upper right hand corner and then clicking `Export`.\n",
+    "\n",
+    "<img src=\"export_notion.png\" alt=\"export\" width=\"200\"/>\n",
+    "\n",
+    "When exporting, make sure to select the `Markdown & CSV` format option.\n",
+    "\n",
+    "<img src=\"export_format.png\" alt=\"export-format\" width=\"200\"/>\n",
+    "\n",
+    "This will produce a `.zip` file in your Downloads folder. Move the `.zip` file into this repository.\n",
+    "\n",
+    "Run the following command to unzip the zip file (replace the `Export...` with your own file name as needed).\n",
+    "\n",
+    "```shell\n",
+    "unzip Export-d3adfe0f-3131-4bf3-8987-a52017fc1bae.zip -d Notion_DB\n",
+    "```\n",
+    "\n",
+    "Run the following command to ingest the data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "007c5cbf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import NotionDirectoryLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1caec59",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = NotionDirectoryLoader(\"Notion_DB\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1c30ff7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/modules/document_loaders/examples/readthedocs_documentation.ipynb
+++ b/docs/modules/document_loaders/examples/readthedocs_documentation.ipynb
@ -0,0 +1,78 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "17812129",
+   "metadata": {},
+   "source": [
+    "# ReadTheDocs Documentation\n",
+    "This notebook covers how to load content from html that was generated as part of a Read-The-Docs build.\n",
+    "\n",
+    "For an example of this in the wild, see [here](https://github.com/hwchase17/chat-langchain).\n",
+    "\n",
+    "This assumes that the html has already been scraped into a folder. This can be done by uncommenting and running the following command"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "84696e27",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!wget -r -A.html -P rtdocs https://langchain.readthedocs.io/en/latest/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "92dd950b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import ReadTheDocsLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "494567c3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = ReadTheDocsLoader(\"rtdocs\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e2e6d6f0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/modules/document_loaders/examples/unstructured_file.ipynb
+++ b/docs/modules/document_loaders/examples/unstructured_file.ipynb
@ -0,0 +1,72 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "20deed05",
+   "metadata": {},
+   "source": [
+    "# Unstructured File Loader\n",
+    "This notebook covers how to use Unstructured to load files of many types. Unstructured currently supports loading of text files, powerpoints, html, pdfs, images, and more."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "79d3e549",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import UnstructuredFileLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "2593d1dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredFileLoader(\"../../state_of_the_union.txt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "fe34e941",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24e577e5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/modules/document_loaders/how_to_guides.rst
+++ b/docs/modules/document_loaders/how_to_guides.rst
@ -0,0 +1,19 @@
+How To Guides
+====================================
+
+There are a lot of different document loaders that LangChain supports. Below are how-to guides for working with them:
+
+`File Loader <./examples/unstructured_file.html>`_: A walkthrough of how to use Unstructured to load files of arbitrary types (pdfs, txt, html, etc).
+
+`Directory Loader <./examples/directory_loader.html>`_: A walkthrough of how to use Unstructured to load files from a given directory.
+
+`Notion <./examples/notion.html>`_: A walkthrough of how to load data for an arbitrary Notion DB.
+
+`ReadTheDocs <./examples/readthedocs_documentation.html>`_: A walkthrough of how to load data for documentation generated by ReadTheDocs.
+
+.. toctree::
+   :maxdepth: 1
+   :glob:
+   :hidden:
+
+   examples/*
--- a/docs/modules/document_loaders/key_concepts.md
+++ b/docs/modules/document_loaders/key_concepts.md
@ -0,0 +1,12 @@
+# Key Concepts
+
+## Document
+This class is a container for document information. This contains two parts:
+- `page_content`: The content of the actual page itself.
+- `metadata`: The metadata associated with the document. This can be things like the file path, the url, etc.
+
+## Loader
+This base class is a way to load documents. It exposes a `load` method that returns `Document` objects.
+
+## [Unstructured](https://github.com/Unstructured-IO/unstructured)
+Unstructured is a python package specifically focused on transformations from raw documents to text.
--- a/langchain/document_loaders/init.py
+++ b/langchain/document_loaders/init.py
@ -0,0 +1,13 @@
+"""All different types of document loaders."""
+
+from langchain.document_loaders.directory import DirectoryLoader
+from langchain.document_loaders.notion import NotionDirectoryLoader
+from langchain.document_loaders.readthedocs import ReadTheDocsLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader
+
+# Loader classes exposed by `from langchain.document_loaders import *`;
+# each name is imported from its own submodule above.
+__all__ = [
+    "UnstructuredFileLoader",
+    "DirectoryLoader",
+    "NotionDirectoryLoader",
+    "ReadTheDocsLoader",
+]
--- a/langchain/document_loaders/base.py
+++ b/langchain/document_loaders/base.py
@ -0,0 +1,26 @@
+"""Base loader class."""
+
+from abc import ABC, abstractmethod
+from typing import List, Optional
+
+from langchain.docstore.document import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
+
+
+class BaseLoader(ABC):
+    """Base loader class."""
+
+    @abstractmethod
+    def load(self) -> List[Document]:
+        """Load data into document objects."""
+
+    def load_and_split(
+        self, text_splitter: Optional[TextSplitter] = None
+    ) -> List[Document]:
+        """Load documents and split into chunks."""
+        if text_splitter is None:
+            _text_splitter: TextSplitter = RecursiveCharacterTextSplitter()
+        else:
+            _text_splitter = text_splitter
+        docs = self.load()
+        return _text_splitter.split_documents(docs)
--- a/langchain/document_loaders/directory.py
+++ b/langchain/document_loaders/directory.py
@ -0,0 +1,26 @@
+"""Loading logic for loading documents from a directory."""
+from pathlib import Path
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader
+
+
+class DirectoryLoader(BaseLoader):
+    """Loading logic for loading documents from a directory."""
+
+    def __init__(self, path: str, glob: str = "**/*"):
+        """Initialize with path to directory and how to glob over it."""
+        self.path = path
+        self.glob = glob
+
+    def load(self) -> List[Document]:
+        """Load documents."""
+        p = Path(self.path)
+        docs = []
+        for i in p.glob(self.glob):
+            if i.is_file():
+                sub_docs = UnstructuredFileLoader(str(i)).load()
+                docs.extend(sub_docs)
+        return docs
--- a/langchain/document_loaders/notion.py
+++ b/langchain/document_loaders/notion.py
@ -0,0 +1,25 @@
+"""Loader that loads Notion directory dump."""
+from pathlib import Path
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class NotionDirectoryLoader(BaseLoader):
+    """Loader that loads Notion directory dump."""
+
+    def __init__(self, path: str):
+        """Initialize with path."""
+        self.file_path = path
+
+    def load(self) -> List[Document]:
+        """Load documents."""
+        ps = list(Path(self.file_path).glob("**/*.md"))
+        docs = []
+        for p in ps:
+            with open(p) as f:
+                text = f.read()
+            metadata = {"source": str(p)}
+            docs.append(Document(page_content=text, metadata=metadata))
+        return docs
--- a/langchain/document_loaders/readthedocs.py
+++ b/langchain/document_loaders/readthedocs.py
@ -0,0 +1,33 @@
+"""Loader that loads ReadTheDocs documentation directory dump."""
+from pathlib import Path
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class ReadTheDocsLoader(BaseLoader):
+    """Loader that loads ReadTheDocs documentation directory dump."""
+
+    def __init__(self, path: str):
+        """Initialize path."""
+        self.file_path = path
+
+    def load(self) -> List[Document]:
+        """Load documents."""
+        from bs4 import BeautifulSoup
+
+        def _clean_data(data: str) -> str:
+            soup = BeautifulSoup(data)
+            text = soup.find_all("main", {"id": "main-content"})[0].get_text()
+            return "\n".join([t for t in text.split("\n") if t])
+
+        docs = []
+        for p in Path(self.file_path).rglob("*"):
+            if p.is_dir():
+                continue
+            with open(p) as f:
+                text = _clean_data(f.read())
+            metadata = {"source": str(p)}
+            docs.append(Document(page_content=text, metadata=metadata))
+        return docs
--- a/langchain/document_loaders/unstructured.py
+++ b/langchain/document_loaders/unstructured.py
@ -0,0 +1,29 @@
+"""Loader that uses unstructured to load files."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class UnstructuredFileLoader(BaseLoader):
+    """Loader that uses unstructured to load files."""
+
+    def __init__(self, file_path: str):
+        """Initialize with file path."""
+        try:
+            import unstructured  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "unstructured package not found, please install it with "
+                "`pip install unstructured`"
+            )
+        self.file_path = file_path
+
+    def load(self) -> List[Document]:
+        """Load file."""
+        from unstructured.partition.auto import partition
+
+        elements = partition(filename=self.file_path)
+        text = "\n\n".join([str(el) for el in elements])
+        metadata = {"source": self.file_path}
+        return [Document(page_content=text, metadata=metadata)]
--- a/langchain/text_splitter.py
+++ b/langchain/text_splitter.py
@ -44,6 +44,12 @@ class TextSplitter(ABC):
                documents.append(Document(page_content=chunk, metadata=_metadatas[i]))
        return documents

+    def split_documents(self, documents: List[Document]) -> List[Document]:
+        """Split documents."""
+        texts = [doc.page_content for doc in documents]
+        metadatas = [doc.metadata for doc in documents]
+        return self.create_documents(texts, metadatas)
+
    def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
        text = separator.join(docs)
        text = text.strip()