mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
add unstructured examples (#913)
This commit is contained in:
parent
31b054f69d
commit
2ec25ddd4c
94
docs/modules/document_loaders/examples/html.ipynb
Normal file
94
docs/modules/document_loaders/examples/html.ipynb
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "2dfc4698",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# HTML\n",
|
||||||
|
"\n",
|
||||||
|
"This covers how to load HTML documents into a document format that we can use downstream."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "24b434b5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import UnstructuredHTMLLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "00f46fda",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = UnstructuredHTMLLoader(\"example_data/fake-content.html\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "b68a26b3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "34de48fa",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='My First Heading\\n\\nMy first paragraph.', lookup_str='', metadata={'source': 'example_data/fake-content.html'}, lookup_index=0)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "79b1bce4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
73
docs/modules/document_loaders/examples/pdf.ipynb
Normal file
73
docs/modules/document_loaders/examples/pdf.ipynb
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "f70e6118",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# PDF\n",
|
||||||
|
"\n",
|
||||||
|
"This covers how to load PDFs into a document format that we can use downstream."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "0cc0cd42",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import UnstructuredPDFLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "082d557c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = UnstructuredPDFLoader(\"example_data/layout-parser-paper.pdf\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "5c41106f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "54fb6b62",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
94
docs/modules/document_loaders/examples/powerpoint.ipynb
Normal file
94
docs/modules/document_loaders/examples/powerpoint.ipynb
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "39af9ecd",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# PowerPoint\n",
|
||||||
|
"\n",
|
||||||
|
"This covers how to load PowerPoint documents into a document format that we can use downstream."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "721c48aa",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import UnstructuredPowerPointLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "9d3d0e35",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = UnstructuredPowerPointLoader(\"example_data/fake-power-point.pptx\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "06073f91",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "c9adc5cb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='Adding a Bullet Slide\\n\\nFind the bullet slide layout\\n\\nUse _TextFrame.text for first bullet\\n\\nUse _TextFrame.add_paragraph() for subsequent bullets\\n\\nHere is a lot of text!\\n\\nHere is some text in a text box!', lookup_str='', metadata={'source': 'example_data/fake-power-point.pptx'}, lookup_index=0)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "0c55f1cf",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -11,6 +11,12 @@ There are a lot of different document loaders that LangChain supports. Below are
|
|||||||
|
|
||||||
`ReadTheDocs <./examples/readthedocs_documentation.html>`_: A walkthrough of how to load data for documentation generated by ReadTheDocs.
|
`ReadTheDocs <./examples/readthedocs_documentation.html>`_: A walkthrough of how to load data for documentation generated by ReadTheDocs.
|
||||||
|
|
||||||
|
`HTML <./examples/html.html>`_: A walkthrough of how to load data from an HTML file.
|
||||||
|
|
||||||
|
`PDF <./examples/pdf.html>`_: A walkthrough of how to load data from a PDF file.
|
||||||
|
|
||||||
|
`PowerPoint <./examples/powerpoint.html>`_: A walkthrough of how to load data from a PowerPoint file.
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
:glob:
|
:glob:
|
||||||
|
@ -1,7 +1,10 @@
|
|||||||
"""All different types of document loaders."""
|
"""All different types of document loaders."""
|
||||||
|
|
||||||
from langchain.document_loaders.directory import DirectoryLoader
|
from langchain.document_loaders.directory import DirectoryLoader
|
||||||
|
from langchain.document_loaders.html import UnstructuredHTMLLoader
|
||||||
from langchain.document_loaders.notion import NotionDirectoryLoader
|
from langchain.document_loaders.notion import NotionDirectoryLoader
|
||||||
|
from langchain.document_loaders.pdf import UnstructuredPDFLoader
|
||||||
|
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
|
||||||
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
||||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
|
||||||
@ -10,4 +13,7 @@ __all__ = [
|
|||||||
"DirectoryLoader",
|
"DirectoryLoader",
|
||||||
"NotionDirectoryLoader",
|
"NotionDirectoryLoader",
|
||||||
"ReadTheDocsLoader",
|
"ReadTheDocsLoader",
|
||||||
|
"UnstructuredHTMLLoader",
|
||||||
|
"UnstructuredPowerPointLoader",
|
||||||
|
"UnstructuredPDFLoader",
|
||||||
]
|
]
|
||||||
|
29
langchain/document_loaders/html.py
Normal file
29
langchain/document_loaders/html.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
"""Loader that loads HTML files."""
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class UnstructuredHTMLLoader(BaseLoader):
    """Document loader for HTML files, backed by the unstructured package."""

    def __init__(self, file_path: str):
        """Store the file path once unstructured is known to import.

        Raises:
            ValueError: If the ``unstructured`` package cannot be imported.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError:
            raise ValueError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )
        else:
            self.file_path = file_path

    def load(self) -> List[Document]:
        """Partition the HTML file and return it as a single document.

        Returns:
            A one-element list; the document's ``page_content`` is the string
            form of each partitioned element separated by ``"\\n\\n"``, and its
            metadata stores the file path under ``"source"``.
        """
        from unstructured.partition.html import partition_html

        pieces = map(str, partition_html(filename=self.file_path))
        return [
            Document(
                page_content="\n\n".join(pieces),
                metadata={"source": self.file_path},
            )
        ]
|
29
langchain/document_loaders/pdf.py
Normal file
29
langchain/document_loaders/pdf.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
"""Loader that loads PDF files."""
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class UnstructuredPDFLoader(BaseLoader):
    """Loader that uses unstructured to load PDF files."""

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Path of the PDF file to load.

        Raises:
            ValueError: If the ``unstructured`` package cannot be imported.
        """
        try:
            # Imported only to fail fast when the dependency is missing.
            import unstructured  # noqa:F401
        except ImportError:
            raise ValueError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Load the PDF file as a single document.

        Returns:
            A one-element list holding a ``Document`` whose ``page_content`` is
            the string form of every partitioned element joined by ``"\\n\\n"``
            and whose metadata records the file path under ``"source"``.
        """
        from unstructured.partition.pdf import partition_pdf

        elements = partition_pdf(filename=self.file_path)
        text = "\n\n".join([str(el) for el in elements])
        metadata = {"source": self.file_path}
        return [Document(page_content=text, metadata=metadata)]
|
29
langchain/document_loaders/powerpoint.py
Normal file
29
langchain/document_loaders/powerpoint.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
"""Loader that loads PowerPoint files."""
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class UnstructuredPowerPointLoader(BaseLoader):
    """Loader that uses unstructured to load PowerPoint files."""

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Path of the PowerPoint file to load.

        Raises:
            ValueError: If the ``unstructured`` package cannot be imported.
        """
        try:
            # Imported only to fail fast when the dependency is missing.
            import unstructured  # noqa:F401
        except ImportError:
            raise ValueError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Load the PowerPoint file as a single document.

        Returns:
            A one-element list holding a ``Document`` whose ``page_content`` is
            the string form of every partitioned element joined by ``"\\n\\n"``
            and whose metadata records the file path under ``"source"``.
        """
        from unstructured.partition.pptx import partition_pptx

        elements = partition_pptx(filename=self.file_path)
        text = "\n\n".join([str(el) for el in elements])
        metadata = {"source": self.file_path}
        return [Document(page_content=text, metadata=metadata)]
|
Loading…
Reference in New Issue
Block a user