From 2ec25ddd4cc19223b08bab6aef84dec7807095d7 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Mon, 6 Feb 2023 18:13:46 -0800 Subject: [PATCH] add unstructured examples (#913) --- .../document_loaders/examples/html.ipynb | 94 +++++++++++++++++++ .../document_loaders/examples/pdf.ipynb | 73 ++++++++++++++ .../examples/powerpoint.ipynb | 94 +++++++++++++++++++ .../document_loaders/how_to_guides.rst | 6 ++ langchain/document_loaders/__init__.py | 6 ++ langchain/document_loaders/html.py | 29 ++++++ langchain/document_loaders/pdf.py | 29 ++++++ langchain/document_loaders/powerpoint.py | 29 ++++++ 8 files changed, 360 insertions(+) create mode 100644 docs/modules/document_loaders/examples/html.ipynb create mode 100644 docs/modules/document_loaders/examples/pdf.ipynb create mode 100644 docs/modules/document_loaders/examples/powerpoint.ipynb create mode 100644 langchain/document_loaders/html.py create mode 100644 langchain/document_loaders/pdf.py create mode 100644 langchain/document_loaders/powerpoint.py diff --git a/docs/modules/document_loaders/examples/html.ipynb b/docs/modules/document_loaders/examples/html.ipynb new file mode 100644 index 0000000000..2a4988284d --- /dev/null +++ b/docs/modules/document_loaders/examples/html.ipynb @@ -0,0 +1,94 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2dfc4698", + "metadata": {}, + "source": [ + "# HTML\n", + "\n", + "This covers how to load HTML documents into a document format that we can use downstream." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "24b434b5", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import UnstructuredHTMLLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "00f46fda", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredHTMLLoader(\"example_data/fake-content.html\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b68a26b3", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "34de48fa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='My First Heading\\n\\nMy first paragraph.', lookup_str='', metadata={'source': 'example_data/fake-content.html'}, lookup_index=0)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79b1bce4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/examples/pdf.ipynb b/docs/modules/document_loaders/examples/pdf.ipynb new file mode 100644 index 0000000000..b7e589e829 --- /dev/null +++ b/docs/modules/document_loaders/examples/pdf.ipynb @@ -0,0 +1,73 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f70e6118", + "metadata": {}, + "source": [ + "# PDF\n", + "\n", + "This covers how to load pdfs into a document format that we can use downstream." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0cc0cd42", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import UnstructuredPDFLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "082d557c", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredPDFLoader(\"example_data/layout-parser-paper.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5c41106f", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54fb6b62", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/examples/powerpoint.ipynb b/docs/modules/document_loaders/examples/powerpoint.ipynb new file mode 100644 index 0000000000..dec8c1d4be --- /dev/null +++ b/docs/modules/document_loaders/examples/powerpoint.ipynb @@ -0,0 +1,94 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "39af9ecd", + "metadata": {}, + "source": [ + "# PowerPoint\n", + "\n", + "This covers how to load PowerPoint documents into a document format that we can use downstream." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "721c48aa", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import UnstructuredPowerPointLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9d3d0e35", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredPowerPointLoader(\"example_data/fake-power-point.pptx\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "06073f91", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c9adc5cb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Adding a Bullet Slide\\n\\nFind the bullet slide layout\\n\\nUse _TextFrame.text for first bullet\\n\\nUse _TextFrame.add_paragraph() for subsequent bullets\\n\\nHere is a lot of text!\\n\\nHere is some text in a text box!', lookup_str='', metadata={'source': 'example_data/fake-power-point.pptx'}, lookup_index=0)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c55f1cf", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/how_to_guides.rst b/docs/modules/document_loaders/how_to_guides.rst index 6ae963fecf..daa8a352ba 100644 --- a/docs/modules/document_loaders/how_to_guides.rst +++ b/docs/modules/document_loaders/how_to_guides.rst 
@@ -11,6 +11,12 @@ There are a lot of different document loaders that LangChain supports. Below are `ReadTheDocs <./examples/readthedocs_documentation.html>`_: A walkthrough of how to load data for documentation generated by ReadTheDocs. +`HTML <./examples/html.html>`_: A walkthrough of how to load data from an HTML file. + +`PDF <./examples/pdf.html>`_: A walkthrough of how to load data from a PDF file. + +`PowerPoint <./examples/powerpoint.html>`_: A walkthrough of how to load data from a PowerPoint file. + .. toctree:: :maxdepth: 1 :glob: diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index e2309f0f53..d564f63b27 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -1,7 +1,10 @@ """All different types of document loaders.""" from langchain.document_loaders.directory import DirectoryLoader +from langchain.document_loaders.html import UnstructuredHTMLLoader from langchain.document_loaders.notion import NotionDirectoryLoader +from langchain.document_loaders.pdf import UnstructuredPDFLoader +from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader from langchain.document_loaders.readthedocs import ReadTheDocsLoader from langchain.document_loaders.unstructured import UnstructuredFileLoader @@ -10,4 +13,7 @@ __all__ = [ "DirectoryLoader", "NotionDirectoryLoader", "ReadTheDocsLoader", + "UnstructuredHTMLLoader", + "UnstructuredPowerPointLoader", + "UnstructuredPDFLoader", ] diff --git a/langchain/document_loaders/html.py b/langchain/document_loaders/html.py new file mode 100644 index 0000000000..5c7afa91d0 --- /dev/null +++ b/langchain/document_loaders/html.py @@ -0,0 +1,29 @@ +"""Loader that loads HTML files.""" +from typing import List + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +class UnstructuredHTMLLoader(BaseLoader): + """Loader that uses unstructured to load HTML files.""" + + def 
__init__(self, file_path: str): + """Initialize with file path.""" + try: + import unstructured # noqa:F401 + except ImportError: + raise ValueError( + "unstructured package not found, please install it with " + "`pip install unstructured`" + ) + self.file_path = file_path + + def load(self) -> List[Document]: + """Load file.""" + from unstructured.partition.html import partition_html + + elements = partition_html(filename=self.file_path) + text = "\n\n".join([str(el) for el in elements]) + metadata = {"source": self.file_path} + return [Document(page_content=text, metadata=metadata)] diff --git a/langchain/document_loaders/pdf.py b/langchain/document_loaders/pdf.py new file mode 100644 index 0000000000..629e0e0bf1 --- /dev/null +++ b/langchain/document_loaders/pdf.py @@ -0,0 +1,29 @@ +"""Loader that loads PDF files.""" +from typing import List + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +class UnstructuredPDFLoader(BaseLoader): + """Loader that uses unstructured to load PDF files.""" + + def __init__(self, file_path: str): + """Initialize with file path.""" + try: + import unstructured # noqa:F401 + except ImportError: + raise ValueError( + "unstructured package not found, please install it with " + "`pip install unstructured`" + ) + self.file_path = file_path + + def load(self) -> List[Document]: + """Load file.""" + from unstructured.partition.pdf import partition_pdf + + elements = partition_pdf(filename=self.file_path) + text = "\n\n".join([str(el) for el in elements]) + metadata = {"source": self.file_path} + return [Document(page_content=text, metadata=metadata)] diff --git a/langchain/document_loaders/powerpoint.py b/langchain/document_loaders/powerpoint.py new file mode 100644 index 0000000000..d75f7c600b --- /dev/null +++ b/langchain/document_loaders/powerpoint.py @@ -0,0 +1,29 @@ +"""Loader that loads PowerPoint files.""" +from typing import List + +from 
langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +class UnstructuredPowerPointLoader(BaseLoader): + """Loader that uses unstructured to load PowerPoint files.""" + + def __init__(self, file_path: str): + """Initialize with file path.""" + try: + import unstructured # noqa:F401 + except ImportError: + raise ValueError( + "unstructured package not found, please install it with " + "`pip install unstructured`" + ) + self.file_path = file_path + + def load(self) -> List[Document]: + """Load file.""" + from unstructured.partition.pptx import partition_pptx + + elements = partition_pptx(filename=self.file_path) + text = "\n\n".join([str(el) for el in elements]) + metadata = {"source": self.file_path} + return [Document(page_content=text, metadata=metadata)]