mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Harrison/unstructured support (#903)
This commit is contained in:
parent
2a68be3e8d
commit
53d56d7650
29
docs/modules/document_loaders.rst
Normal file
29
docs/modules/document_loaders.rst
Normal file
@ -0,0 +1,29 @@
|
||||
Document Loaders
|
||||
==========================
|
||||
|
||||
Combining language models with your own text data is a powerful way to differentiate them.
|
||||
The first step in doing this is to load the data into "documents" - a fancy way of saying some pieces of text.
|
||||
This module is aimed at making this easy.
|
||||
|
||||
A primary driver of a lot of this is the `Unstructured <https://github.com/Unstructured-IO/unstructured>`_ python package.
|
||||
This package is a great way to transform all types of files - text, powerpoint, images, html, pdf, etc - into text data.
|
||||
|
||||
For detailed instructions on how to get set up with Unstructured, see installation guidelines `here <https://github.com/Unstructured-IO/unstructured#coffee-getting-started>`_.
|
||||
|
||||
The following sections of documentation are provided:
|
||||
|
||||
- `Key Concepts <./document_loaders/key_concepts.html>`_: A conceptual guide going over the various concepts related to loading documents.
|
||||
|
||||
- `How-To Guides <./document_loaders/how_to_guides.html>`_: A collection of how-to guides. These highlight different types of loaders.
|
||||
|
||||
|
||||
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Document Loaders
|
||||
:name: Document Loaders
|
||||
:hidden:
|
||||
|
||||
./document_loaders/key_concepts.md
|
||||
./document_loaders/how_to_guides.rst
|
101
docs/modules/document_loaders/examples/directory_loader.ipynb
Normal file
101
docs/modules/document_loaders/examples/directory_loader.ipynb
Normal file
@ -0,0 +1,101 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "79f24a6b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Directory Loader\n",
|
||||
"This covers how to use the DirectoryLoader to load all documents in a directory. Under the hood, this uses the [UnstructuredLoader](./unstructured_file.ipynb)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "019d8520",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import DirectoryLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0c76cdc5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can use the `glob` parameter to control which files to load. Note that here it doesn't load the `.rst` file or the `.ipynb` files."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "891fe56f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = DirectoryLoader('../', glob=\"**/*.md\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "addfe9cf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "b042086d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"1"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cbc8256b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
BIN
docs/modules/document_loaders/examples/export_format.png
Normal file
BIN
docs/modules/document_loaders/examples/export_format.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 23 KiB |
BIN
docs/modules/document_loaders/examples/export_notion.png
Normal file
BIN
docs/modules/document_loaders/examples/export_notion.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 590 KiB |
86
docs/modules/document_loaders/examples/notion.ipynb
Normal file
86
docs/modules/document_loaders/examples/notion.ipynb
Normal file
@ -0,0 +1,86 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1dc7df1d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Notion\n",
|
||||
"This notebook covers how to load documents from a Notion database dump.\n",
|
||||
"\n",
|
||||
"In order to get this notion dump, follow these instructions:\n",
|
||||
"\n",
|
||||
"## 🧑 Instructions for ingesting your own dataset\n",
|
||||
"\n",
|
||||
"Export your dataset from Notion. You can do this by clicking on the three dots in the upper right hand corner and then clicking `Export`.\n",
|
||||
"\n",
|
||||
"<img src=\"export_notion.png\" alt=\"export\" width=\"200\"/>\n",
|
||||
"\n",
|
||||
"When exporting, make sure to select the `Markdown & CSV` format option.\n",
|
||||
"\n",
|
||||
"<img src=\"export_format.png\" alt=\"export-format\" width=\"200\"/>\n",
|
||||
"\n",
|
||||
"This will produce a `.zip` file in your Downloads folder. Move the `.zip` file into this repository.\n",
|
||||
"\n",
|
||||
"Run the following command to unzip the zip file (replace the `Export...` with your own file name as needed).\n",
|
||||
"\n",
|
||||
"```shell\n",
|
||||
"unzip Export-d3adfe0f-3131-4bf3-8987-a52017fc1bae.zip -d Notion_DB\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Run the following command to ingest the data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "007c5cbf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import NotionDirectoryLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a1caec59",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = NotionDirectoryLoader(\"Notion_DB\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b1c30ff7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = loader.load()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -0,0 +1,78 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "17812129",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# ReadTheDocs Documentation\n",
|
||||
"This notebook covers how to load content from html that was generated as part of a Read-The-Docs build.\n",
|
||||
"\n",
|
||||
"For an example of this in the wild, see [here](https://github.com/hwchase17/chat-langchain).\n",
|
||||
"\n",
|
||||
"This assumes that the html has already been scraped into a folder. This can be done by uncommenting and running the following command"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "84696e27",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#!wget -r -A.html -P rtdocs https://langchain.readthedocs.io/en/latest/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "92dd950b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import ReadTheDocsLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "494567c3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = ReadTheDocsLoader(\"rtdocs\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e2e6d6f0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = loader.load()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -0,0 +1,72 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "20deed05",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Unstructured File Loader\n",
|
||||
"This notebook covers how to use Unstructured to load files of many types. Unstructured currently supports loading of text files, powerpoints, html, pdfs, images, and more."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "79d3e549",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import UnstructuredFileLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "2593d1dc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = UnstructuredFileLoader(\"../../state_of_the_union.txt\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "fe34e941",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "24e577e5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
19
docs/modules/document_loaders/how_to_guides.rst
Normal file
19
docs/modules/document_loaders/how_to_guides.rst
Normal file
@ -0,0 +1,19 @@
|
||||
How To Guides
|
||||
====================================
|
||||
|
||||
There are a lot of different document loaders that LangChain supports. Below are how-to guides for working with them:
|
||||
|
||||
`File Loader <./examples/unstructured_file.html>`_: A walkthrough of how to use Unstructured to load files of arbitrary types (pdfs, txt, html, etc).
|
||||
|
||||
`Directory Loader <./examples/directory_loader.html>`_: A walkthrough of how to use Unstructured to load files from a given directory.
|
||||
|
||||
`Notion <./examples/notion.html>`_: A walkthrough of how to load data for an arbitrary Notion DB.
|
||||
|
||||
`ReadTheDocs <./examples/readthedocs_documentation.html>`_: A walkthrough of how to load data for documentation generated by ReadTheDocs.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:glob:
|
||||
:hidden:
|
||||
|
||||
examples/*
|
12
docs/modules/document_loaders/key_concepts.md
Normal file
12
docs/modules/document_loaders/key_concepts.md
Normal file
@ -0,0 +1,12 @@
|
||||
# Key Concepts
|
||||
|
||||
## Document
|
||||
This class is a container for document information. This contains two parts:
|
||||
- `page_content`: The content of the actual page itself.
|
||||
- `metadata`: The metadata associated with the document. This can be things like the file path, the url, etc.
|
||||
|
||||
## Loader
|
||||
This base class is a way to load documents. It exposes a `load` method that returns `Document` objects.
|
||||
|
||||
## [Unstructured](https://github.com/Unstructured-IO/unstructured)
|
||||
Unstructured is a python package specifically focused on transformations from raw documents to text.
|
13
langchain/document_loaders/__init__.py
Normal file
13
langchain/document_loaders/__init__.py
Normal file
@ -0,0 +1,13 @@
|
||||
"""All different types of document loaders."""
|
||||
|
||||
from langchain.document_loaders.directory import DirectoryLoader
|
||||
from langchain.document_loaders.notion import NotionDirectoryLoader
|
||||
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
__all__ = [
|
||||
"UnstructuredFileLoader",
|
||||
"DirectoryLoader",
|
||||
"NotionDirectoryLoader",
|
||||
"ReadTheDocsLoader",
|
||||
]
|
26
langchain/document_loaders/base.py
Normal file
26
langchain/document_loaders/base.py
Normal file
@ -0,0 +1,26 @@
|
||||
"""Base loader class."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Optional
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
|
||||
|
||||
|
||||
class BaseLoader(ABC):
    """Abstract interface shared by all document loaders.

    Subclasses implement ``load``; ``load_and_split`` is provided on top of it.
    """

    @abstractmethod
    def load(self) -> List[Document]:
        """Return the loaded data as a list of ``Document`` objects."""

    def load_and_split(
        self, text_splitter: Optional[TextSplitter] = None
    ) -> List[Document]:
        """Load the documents and break them into chunks.

        Args:
            text_splitter: Splitter used to chunk the loaded documents. When
                omitted, a default-constructed ``RecursiveCharacterTextSplitter``
                is used.

        Returns:
            Whatever ``split_documents`` of the chosen splitter returns for the
            documents produced by ``load``.
        """
        # The splitter is resolved before loading, as in the original flow.
        splitter: TextSplitter = (
            RecursiveCharacterTextSplitter() if text_splitter is None else text_splitter
        )
        return splitter.split_documents(self.load())
|
26
langchain/document_loaders/directory.py
Normal file
26
langchain/document_loaders/directory.py
Normal file
@ -0,0 +1,26 @@
|
||||
"""Loading logic for loading documents from a directory."""
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
|
||||
class DirectoryLoader(BaseLoader):
    """Loader that collects documents from the files of a directory."""

    def __init__(self, path: str, glob: str = "**/*"):
        """Store the directory to scan and the glob pattern selecting files.

        Args:
            path: Directory whose contents are loaded.
            glob: Pattern, relative to ``path``, handed to ``Path.glob``.
        """
        self.glob = glob
        self.path = path

    def load(self) -> List[Document]:
        """Load every matching file and return all resulting documents.

        Each regular file matched by the glob pattern is read with
        ``UnstructuredFileLoader``; the per-file results are concatenated in
        the order the files are yielded by ``Path.glob``.
        """
        matches = Path(self.path).glob(self.glob)
        # Directories (and anything else that is not a file) are ignored.
        files = (entry for entry in matches if entry.is_file())
        collected = []
        for file in files:
            collected.extend(UnstructuredFileLoader(str(file)).load())
        return collected
|
25
langchain/document_loaders/notion.py
Normal file
25
langchain/document_loaders/notion.py
Normal file
@ -0,0 +1,25 @@
|
||||
"""Loader that loads Notion directory dump."""
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class NotionDirectoryLoader(BaseLoader):
    """Loader that loads Notion directory dump.

    Every ``*.md`` file found recursively under the dump directory becomes one
    ``Document`` whose ``metadata["source"]`` is the path of that file.
    """

    def __init__(self, path: str, encoding: str = "utf-8"):
        """Initialize with path.

        Args:
            path: Directory containing the unzipped Notion export.
            encoding: Text encoding used to read the Markdown files.
        """
        self.file_path = path
        self.encoding = encoding

    def load(self) -> List[Document]:
        """Load documents.

        Returns:
            One ``Document`` per Markdown file, holding the full file text.

        Raises:
            UnicodeDecodeError: If a file cannot be decoded with
                ``self.encoding``.
        """
        docs = []
        for p in Path(self.file_path).glob("**/*.md"):
            # Read with an explicit encoding: without one, ``open`` falls back
            # to the platform/locale default, so the same export could decode
            # differently (or fail) from one machine to another.
            with open(p, encoding=self.encoding) as f:
                text = f.read()
            docs.append(Document(page_content=text, metadata={"source": str(p)}))
        return docs
|
33
langchain/document_loaders/readthedocs.py
Normal file
33
langchain/document_loaders/readthedocs.py
Normal file
@ -0,0 +1,33 @@
|
||||
"""Loader that loads ReadTheDocs documentation directory dump."""
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class ReadTheDocsLoader(BaseLoader):
    """Loader that loads ReadTheDocs documentation directory dump.

    Every file found recursively under the dump directory is parsed as HTML and
    the text of its ``<main id="main-content">`` element becomes one
    ``Document`` whose ``metadata["source"]`` is the path of that file.
    """

    def __init__(self, path: str, encoding: str = "utf-8"):
        """Initialize path.

        Args:
            path: Directory containing the scraped HTML files.
            encoding: Text encoding used to read the files.
        """
        self.file_path = path
        self.encoding = encoding

    def load(self) -> List[Document]:
        """Load documents.

        Returns:
            One ``Document`` per file. Files without a
            ``<main id="main-content">`` element yield an empty
            ``page_content`` instead of aborting the whole load.

        Raises:
            ImportError: If ``bs4`` is not installed.
            UnicodeDecodeError: If a file cannot be decoded with
                ``self.encoding``.
        """
        from bs4 import BeautifulSoup

        def _clean_data(data: str) -> str:
            """Return the main-content text of *data* without blank lines."""
            # Name the parser explicitly: otherwise bs4 picks whichever parser
            # happens to be installed and warns about the guess.
            soup = BeautifulSoup(data, "html.parser")
            main = soup.find("main", {"id": "main-content"})
            if main is None:
                # Previously ``find_all(...)[0]`` raised IndexError here.
                return ""
            return "\n".join(line for line in main.get_text().split("\n") if line)

        docs = []
        for p in Path(self.file_path).rglob("*"):
            if p.is_dir():
                continue
            # Explicit encoding so the result does not depend on the locale.
            with open(p, encoding=self.encoding) as f:
                text = _clean_data(f.read())
            docs.append(Document(page_content=text, metadata={"source": str(p)}))
        return docs
|
29
langchain/document_loaders/unstructured.py
Normal file
29
langchain/document_loaders/unstructured.py
Normal file
@ -0,0 +1,29 @@
|
||||
"""Loader that uses unstructured to load files."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class UnstructuredFileLoader(BaseLoader):
    """Loader that turns a single file into a document via ``unstructured``."""

    def __init__(self, file_path: str):
        """Remember the file to load, after checking ``unstructured`` imports.

        Raises:
            ValueError: If the ``unstructured`` package cannot be imported.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError:
            message = (
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )
            raise ValueError(message)
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Partition the file and return its text as a one-element list.

        The string forms of the elements returned by ``partition`` are joined
        with blank lines; the file path is recorded under ``"source"``.
        """
        from unstructured.partition.auto import partition

        pieces = [str(element) for element in partition(filename=self.file_path)]
        document = Document(
            page_content="\n\n".join(pieces),
            metadata={"source": self.file_path},
        )
        return [document]
|
@ -44,6 +44,12 @@ class TextSplitter(ABC):
|
||||
documents.append(Document(page_content=chunk, metadata=_metadatas[i]))
|
||||
return documents
|
||||
|
||||
def split_documents(self, documents: List[Document]) -> List[Document]:
    """Split already-loaded documents into chunked documents.

    The page contents and the metadata of ``documents`` are gathered into
    two parallel lists and handed to ``create_documents``.
    """
    return self.create_documents(
        [document.page_content for document in documents],
        [document.metadata for document in documents],
    )
|
||||
|
||||
def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
|
||||
text = separator.join(docs)
|
||||
text = text.strip()
|
||||
|
Loading…
Reference in New Issue
Block a user