diff --git a/docs/modules/document_loaders.rst b/docs/modules/document_loaders.rst new file mode 100644 index 00000000..a393563a --- /dev/null +++ b/docs/modules/document_loaders.rst @@ -0,0 +1,29 @@ +Document Loaders +========================== + +Combining language models with your own text data is a powerful way to differentiate them. +The first step in doing this is to load the data into "documents" - a fancy way of saying some pieces of text. +This module is aimed at making this easy. + +A primary driver of a lot of this is the `Unstructured <https://github.com/Unstructured-IO/unstructured>`_ Python package. +This package is a great way to transform all types of files - text, powerpoint, images, html, pdf, etc - into text data. + +For detailed instructions on how to get set up with Unstructured, see installation guidelines `here <https://github.com/Unstructured-IO/unstructured>`_. + +The following sections of documentation are provided: + +- `Key Concepts <./document_loaders/key_concepts.html>`_: A conceptual guide going over the various concepts related to loading documents. + +- `How-To Guides <./document_loaders/how_to_guides.html>`_: A collection of how-to guides. These highlight different types of loaders. + + + + +.. toctree:: + :maxdepth: 1 + :caption: Document Loaders + :name: Document Loaders + :hidden: + + ./document_loaders/key_concepts.md + ./document_loaders/how_to_guides.rst \ No newline at end of file diff --git a/docs/modules/document_loaders/examples/directory_loader.ipynb b/docs/modules/document_loaders/examples/directory_loader.ipynb new file mode 100644 index 00000000..0a268f3d --- /dev/null +++ b/docs/modules/document_loaders/examples/directory_loader.ipynb @@ -0,0 +1,101 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "79f24a6b", + "metadata": {}, + "source": [ + "# Directory Loader\n", + "This covers how to use the DirectoryLoader to load all documents in a directory. Under the hood, this uses the [UnstructuredFileLoader](./unstructured_file.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "019d8520", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import DirectoryLoader" + ] + }, + { + "cell_type": "markdown", + "id": "0c76cdc5", + "metadata": {}, + "source": [ + "We can use the `glob` parameter to control which files to load. Note that here it doesn't load the `.rst` file or the `.ipynb` files."
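Putting the cells below together, a minimal end-to-end sketch of directory loading looks like this (the `../` path and the `**/*.md` glob are simply the values this notebook uses; each matched file is handed to `UnstructuredFileLoader`, so the `unstructured` package needs to be installed):

```python
from langchain.document_loaders import DirectoryLoader

# Recursively match markdown files; the glob is evaluated relative to the
# directory passed to DirectoryLoader.
loader = DirectoryLoader("../", glob="**/*.md")
docs = loader.load()

# Each Document records the file it came from under metadata["source"].
for doc in docs:
    print(doc.metadata["source"], len(doc.page_content))
```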
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "891fe56f", + "metadata": {}, + "outputs": [], + "source": [ + "loader = DirectoryLoader('../', glob=\"**/*.md\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "addfe9cf", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b042086d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(docs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbc8256b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/examples/export_format.png b/docs/modules/document_loaders/examples/export_format.png new file mode 100644 index 00000000..008db3eb Binary files /dev/null and b/docs/modules/document_loaders/examples/export_format.png differ diff --git a/docs/modules/document_loaders/examples/export_notion.png b/docs/modules/document_loaders/examples/export_notion.png new file mode 100644 index 00000000..1c7411c2 Binary files /dev/null and b/docs/modules/document_loaders/examples/export_notion.png differ diff --git a/docs/modules/document_loaders/examples/notion.ipynb b/docs/modules/document_loaders/examples/notion.ipynb new file mode 100644 index 00000000..d4014b97 --- /dev/null +++ b/docs/modules/document_loaders/examples/notion.ipynb @@ -0,0 +1,86 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1dc7df1d", + "metadata": {}, + "source": [ + "# Notion\n", + "This notebook covers how to load documents from a Notion database dump.\n", + "\n", + "In order to get this Notion dump, follow these instructions:\n", + "\n", + "## 🧑 Instructions for ingesting your own dataset\n", + "\n", + "Export your dataset from Notion. You can do this by clicking on the three dots in the upper right hand corner and then clicking `Export`.\n", + "\n", + "![export](export_notion.png)\n", + "\n", + "When exporting, make sure to select the `Markdown & CSV` format option.\n", + "\n", + "![export-format](export_format.png)\n", + "\n", + "This will produce a `.zip` file in your Downloads folder. Move the `.zip` file into this repository.\n", + "\n", + "Run the following command to unzip the zip file (replace the `Export...` with your own file name as needed).\n", + "\n", + "```shell\n", + "unzip Export-d3adfe0f-3131-4bf3-8987-a52017fc1bae.zip -d Notion_DB\n", + "```\n", + "\n", + "Run the following cells to ingest the data."
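As a compact sketch of what those cells do (assuming the export was unzipped into `Notion_DB/` as above), each markdown page in the dump becomes one `Document`, with the originating file path recorded in its metadata:

```python
from langchain.document_loaders import NotionDirectoryLoader

# Point the loader at the unzipped export; it walks the directory for *.md files.
loader = NotionDirectoryLoader("Notion_DB")
docs = loader.load()

print(len(docs))                   # one Document per exported page
print(docs[0].metadata["source"])  # path of the markdown file it came from
```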
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "007c5cbf", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import NotionDirectoryLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1caec59", + "metadata": {}, + "outputs": [], + "source": [ + "loader = NotionDirectoryLoader(\"Notion_DB\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1c30ff7", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/examples/readthedocs_documentation.ipynb b/docs/modules/document_loaders/examples/readthedocs_documentation.ipynb new file mode 100644 index 00000000..5377ee60 --- /dev/null +++ b/docs/modules/document_loaders/examples/readthedocs_documentation.ipynb @@ -0,0 +1,78 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "17812129", + "metadata": {}, + "source": [ + "# ReadTheDocs Documentation\n", + "This notebook covers how to load content from html that was generated as part of a Read-The-Docs build.\n", + "\n", + "For an example of this in the wild, see [here](https://github.com/hwchase17/chat-langchain).\n", + "\n", + "This assumes that the html has already been scraped into a folder. This can be done by uncommenting and running the following command" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84696e27", + "metadata": {}, + "outputs": [], + "source": [ + "#!wget -r -A.html -P rtdocs https://langchain.readthedocs.io/en/latest/" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "92dd950b", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import ReadTheDocsLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "494567c3", + "metadata": {}, + "outputs": [], + "source": [ + "loader = ReadTheDocsLoader(\"rtdocs\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2e6d6f0", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/examples/unstructured_file.ipynb b/docs/modules/document_loaders/examples/unstructured_file.ipynb new file mode 100644 index 00000000..bb363903 --- /dev/null +++ b/docs/modules/document_loaders/examples/unstructured_file.ipynb @@ -0,0 +1,72 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "20deed05", + "metadata": {}, + "source": [ + "# Unstructured File Loader\n", + "This notebook covers how to use Unstructured to load files of many types. Unstructured currently supports loading of text files, powerpoints, html, pdfs, images, and more." 
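Besides the plain `load` call shown in the cells below, every loader here also inherits `load_and_split` from `BaseLoader`, which chunks the loaded text with a `RecursiveCharacterTextSplitter` when no splitter is passed. A rough sketch, using the same `state_of_the_union.txt` file as this notebook:

```python
from langchain.document_loaders import UnstructuredFileLoader

loader = UnstructuredFileLoader("../../state_of_the_union.txt")

docs = loader.load()              # a single Document holding the file's full text
chunks = loader.load_and_split()  # the same text, split into smaller Documents

print(len(docs), len(chunks))
```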
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "79d3e549", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import UnstructuredFileLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2593d1dc", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredFileLoader(\"../../state_of_the_union.txt\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "fe34e941", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24e577e5", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/how_to_guides.rst b/docs/modules/document_loaders/how_to_guides.rst new file mode 100644 index 00000000..6ae963fe --- /dev/null +++ b/docs/modules/document_loaders/how_to_guides.rst @@ -0,0 +1,19 @@ +How To Guides +==================================== + +There are a lot of different document loaders that LangChain supports. Below are how-to guides for working with them. + +`File Loader <./examples/unstructured_file.html>`_: A walkthrough of how to use Unstructured to load files of arbitrary types (pdfs, txt, html, etc). + +`Directory Loader <./examples/directory_loader.html>`_: A walkthrough of how to use Unstructured to load files from a given directory. + +`Notion <./examples/notion.html>`_: A walkthrough of how to load data for an arbitrary Notion DB. + +`ReadTheDocs <./examples/readthedocs_documentation.html>`_: A walkthrough of how to load data for documentation generated by ReadTheDocs. + +.. toctree:: + :maxdepth: 1 + :glob: + :hidden: + + examples/* diff --git a/docs/modules/document_loaders/key_concepts.md b/docs/modules/document_loaders/key_concepts.md new file mode 100644 index 00000000..82a2ebae --- /dev/null +++ b/docs/modules/document_loaders/key_concepts.md @@ -0,0 +1,12 @@ +# Key Concepts + +## Document +This class is a container for document information. It contains two parts: +- `page_content`: The content of the actual page itself. +- `metadata`: The metadata associated with the document. This can be things like the file path, the URL, etc. + +## Loader +This base class is a way to load documents. It exposes a `load` method that returns `Document` objects. + +## [Unstructured](https://github.com/Unstructured-IO/unstructured) +Unstructured is a Python package specifically focused on transformations from raw documents to text.
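To make the `Document` and `Loader` concepts concrete, here is a small illustrative loader (not part of the library) that subclasses `BaseLoader` and returns `Document` objects:

```python
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class SingleTextFileLoader(BaseLoader):
    """Hypothetical example: wrap one plain-text file in a Document."""

    def __init__(self, path: str):
        self.path = path

    def load(self) -> List[Document]:
        # A loader's only required method is load(), which returns Documents
        # carrying page_content plus whatever metadata is useful downstream.
        with open(self.path) as f:
            text = f.read()
        return [Document(page_content=text, metadata={"source": self.path})]
```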
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py new file mode 100644 index 00000000..e2309f0f --- /dev/null +++ b/langchain/document_loaders/__init__.py @@ -0,0 +1,13 @@ +"""All different types of document loaders.""" + +from langchain.document_loaders.directory import DirectoryLoader +from langchain.document_loaders.notion import NotionDirectoryLoader +from langchain.document_loaders.readthedocs import ReadTheDocsLoader +from langchain.document_loaders.unstructured import UnstructuredFileLoader + +__all__ = [ + "UnstructuredFileLoader", + "DirectoryLoader", + "NotionDirectoryLoader", + "ReadTheDocsLoader", +] diff --git a/langchain/document_loaders/base.py b/langchain/document_loaders/base.py new file mode 100644 index 00000000..d5784a74 --- /dev/null +++ b/langchain/document_loaders/base.py @@ -0,0 +1,26 @@ +"""Base loader class.""" + +from abc import ABC, abstractmethod +from typing import List, Optional + +from langchain.docstore.document import Document +from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter + + +class BaseLoader(ABC): + """Base loader class.""" + + @abstractmethod + def load(self) -> List[Document]: + """Load data into document objects.""" + + def load_and_split( + self, text_splitter: Optional[TextSplitter] = None + ) -> List[Document]: + """Load documents and split into chunks.""" + if text_splitter is None: + _text_splitter: TextSplitter = RecursiveCharacterTextSplitter() + else: + _text_splitter = text_splitter + docs = self.load() + return _text_splitter.split_documents(docs) diff --git a/langchain/document_loaders/directory.py b/langchain/document_loaders/directory.py new file mode 100644 index 00000000..f90e6787 --- /dev/null +++ b/langchain/document_loaders/directory.py @@ -0,0 +1,26 @@ +"""Loading logic for loading documents from a directory.""" +from pathlib import Path +from typing import List + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.unstructured import UnstructuredFileLoader + + +class DirectoryLoader(BaseLoader): + """Loading logic for loading documents from a directory.""" + + def __init__(self, path: str, glob: str = "**/*"): + """Initialize with path to directory and how to glob over it.""" + self.path = path + self.glob = glob + + def load(self) -> List[Document]: + """Load documents.""" + p = Path(self.path) + docs = [] + for i in p.glob(self.glob): + if i.is_file(): + sub_docs = UnstructuredFileLoader(str(i)).load() + docs.extend(sub_docs) + return docs diff --git a/langchain/document_loaders/notion.py b/langchain/document_loaders/notion.py new file mode 100644 index 00000000..f5d83bf9 --- /dev/null +++ b/langchain/document_loaders/notion.py @@ -0,0 +1,25 @@ +"""Loader that loads Notion directory dump.""" +from pathlib import Path +from typing import List + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +class NotionDirectoryLoader(BaseLoader): + """Loader that loads Notion directory dump.""" + + def __init__(self, path: str): + """Initialize with path.""" + self.file_path = path + + def load(self) -> List[Document]: + """Load documents.""" + ps = list(Path(self.file_path).glob("**/*.md")) + docs = [] + for p in ps: + with open(p) as f: + text = f.read() + metadata = {"source": str(p)} + docs.append(Document(page_content=text, metadata=metadata)) + return docs diff --git a/langchain/document_loaders/readthedocs.py 
b/langchain/document_loaders/readthedocs.py new file mode 100644 index 00000000..31d6aa06 --- /dev/null +++ b/langchain/document_loaders/readthedocs.py @@ -0,0 +1,33 @@ +"""Loader that loads ReadTheDocs documentation directory dump.""" +from pathlib import Path +from typing import List + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +class ReadTheDocsLoader(BaseLoader): + """Loader that loads ReadTheDocs documentation directory dump.""" + + def __init__(self, path: str): + """Initialize path.""" + self.file_path = path + + def load(self) -> List[Document]: + """Load documents.""" + from bs4 import BeautifulSoup + + def _clean_data(data: str) -> str: + soup = BeautifulSoup(data) + text = soup.find_all("main", {"id": "main-content"})[0].get_text() + return "\n".join([t for t in text.split("\n") if t]) + + docs = [] + for p in Path(self.file_path).rglob("*"): + if p.is_dir(): + continue + with open(p) as f: + text = _clean_data(f.read()) + metadata = {"source": str(p)} + docs.append(Document(page_content=text, metadata=metadata)) + return docs diff --git a/langchain/document_loaders/unstructured.py b/langchain/document_loaders/unstructured.py new file mode 100644 index 00000000..8f3b931f --- /dev/null +++ b/langchain/document_loaders/unstructured.py @@ -0,0 +1,29 @@ +"""Loader that uses unstructured to load files.""" +from typing import List + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +class UnstructuredFileLoader(BaseLoader): + """Loader that uses unstructured to load files.""" + + def __init__(self, file_path: str): + """Initialize with file path.""" + try: + import unstructured # noqa:F401 + except ImportError: + raise ValueError( + "unstructured package not found, please install it with " + "`pip install unstructured`" + ) + self.file_path = file_path + + def load(self) -> List[Document]: + """Load file.""" + from unstructured.partition.auto import partition + + elements = partition(filename=self.file_path) + text = "\n\n".join([str(el) for el in elements]) + metadata = {"source": self.file_path} + return [Document(page_content=text, metadata=metadata)] diff --git a/langchain/text_splitter.py b/langchain/text_splitter.py index 0246e269..71f656e8 100644 --- a/langchain/text_splitter.py +++ b/langchain/text_splitter.py @@ -44,6 +44,12 @@ class TextSplitter(ABC): documents.append(Document(page_content=chunk, metadata=_metadatas[i])) return documents + def split_documents(self, documents: List[Document]) -> List[Document]: + """Split documents.""" + texts = [doc.page_content for doc in documents] + metadatas = [doc.metadata for doc in documents] + return self.create_documents(texts, metadatas) + def _join_docs(self, docs: List[str], separator: str) -> Optional[str]: text = separator.join(docs) text = text.strip()
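Taken together with the new `split_documents` hook on `TextSplitter`, a typical end-to-end flow for any of these loaders looks roughly like this (the `rtdocs` folder matches the ReadTheDocs notebook above, and the splitter settings are illustrative; `split_documents` keeps each document's metadata on the resulting chunks):

```python
from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = ReadTheDocsLoader("rtdocs")  # directory produced by the wget step above
docs = loader.load()                  # one Document per scraped HTML page

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(docs)  # chunks keep the "source" metadata

print(len(docs), len(chunks))
```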