add unstructured examples (#913)

2024-11-08 07:10:35 +00:00 · 2023-02-06 18:13:46 -08:00 · 2023-02-06 18:13:46 -08:00 · 2ec25ddd4c
commit 2ec25ddd4c
parent 31b054f69d
8 changed files with 360 additions and 0 deletions
--- a/docs/modules/document_loaders/examples/html.ipynb
+++ b/docs/modules/document_loaders/examples/html.ipynb
@ -0,0 +1,94 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2dfc4698",
   "metadata": {},
   "source": [
    "# HTML\n",
    "\n",
    "This covers how to load HTML documents into a document format that we can use downstream."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "24b434b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import UnstructuredHTMLLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "00f46fda",
   "metadata": {},
   "outputs": [],
   "source": [
    "loader = UnstructuredHTMLLoader(\"example_data/fake-content.html\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b68a26b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "34de48fa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='My First Heading\\n\\nMy first paragraph.', lookup_str='', metadata={'source': 'example_data/fake-content.html'}, lookup_index=0)]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79b1bce4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/docs/modules/document_loaders/examples/pdf.ipynb
+++ b/docs/modules/document_loaders/examples/pdf.ipynb
@ -0,0 +1,73 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f70e6118",
   "metadata": {},
   "source": [
    "# PDF\n",
    "\n",
    "This covers how to load pdfs into a document format that we can use downstream."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0cc0cd42",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import UnstructuredPDFLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "082d557c",
   "metadata": {},
   "outputs": [],
   "source": [
    "loader = UnstructuredPDFLoader(\"example_data/layout-parser-paper.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5c41106f",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54fb6b62",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/docs/modules/document_loaders/examples/powerpoint.ipynb
+++ b/docs/modules/document_loaders/examples/powerpoint.ipynb
@ -0,0 +1,94 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "39af9ecd",
   "metadata": {},
   "source": [
    "# PowerPoint\n",
    "\n",
    "This covers how to load PowerPoint documents into a document format that we can use downstream."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "721c48aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import UnstructuredPowerPointLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9d3d0e35",
   "metadata": {},
   "outputs": [],
   "source": [
    "loader = UnstructuredPowerPointLoader(\"example_data/fake-power-point.pptx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "06073f91",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c9adc5cb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='Adding a Bullet Slide\\n\\nFind the bullet slide layout\\n\\nUse _TextFrame.text for first bullet\\n\\nUse _TextFrame.add_paragraph() for subsequent bullets\\n\\nHere is a lot of text!\\n\\nHere is some text in a text box!', lookup_str='', metadata={'source': 'example_data/fake-power-point.pptx'}, lookup_index=0)]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c55f1cf",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/docs/modules/document_loaders/how_to_guides.rst
+++ b/docs/modules/document_loaders/how_to_guides.rst
@ -11,6 +11,12 @@ There are a lot of different document loaders that LangChain supports. Below are
 `ReadTheDocs <./examples/readthedocs_documentation.html>`_: A walkthrough of how to load data for documentation generated by ReadTheDocs.
 `HTML <./examples/html.html>`_: A walkthrough of how to load data from an HTML file.
 `PDF <./examples/pdf.html>`_: A walkthrough of how to load data from a PDF file.
 `PowerPoint <./examples/powerpoint.html>`_: A walkthrough of how to load data from a PowerPoint file.
 .. toctree::
   :maxdepth: 1
   :glob:
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@ -1,7 +1,10 @@
 """All different types of document loaders."""
 from langchain.document_loaders.directory import DirectoryLoader
 from langchain.document_loaders.html import UnstructuredHTMLLoader
 from langchain.document_loaders.notion import NotionDirectoryLoader
 from langchain.document_loaders.pdf import UnstructuredPDFLoader
 from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
 from langchain.document_loaders.readthedocs import ReadTheDocsLoader
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
@ -10,4 +13,7 @@ __all__ = [
    "DirectoryLoader",
    "NotionDirectoryLoader",
    "ReadTheDocsLoader",
    "UnstructuredHTMLLoader",
    "UnstructuredPowerPointLoader",
    "UnstructuredPDFLoader",
 ]
--- a/langchain/document_loaders/html.py
+++ b/langchain/document_loaders/html.py
@ -0,0 +1,29 @
 """Loader that loads HTML files."""
 from typing import List
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 class UnstructuredHTMLLoader(BaseLoader):
    """Loader that uses unstructured to load HTML files.

    The file is partitioned with ``unstructured.partition.html.partition_html``
    and the string forms of the resulting elements are joined into the page
    content of a single Document.
    """

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Path of the HTML file to load.

        Raises:
            ValueError: If the ``unstructured`` package cannot be imported.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError as err:
            # Chain the ImportError so the underlying cause stays visible
            # in the traceback; the raised type remains ValueError.
            raise ValueError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from err
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Load file.

        Returns:
            A one-element list containing a Document whose ``page_content``
            is the elements' text separated by blank lines and whose
            ``metadata`` holds the file path under the ``"source"`` key.
        """
        # Local import: the partition function is only needed at load time.
        from unstructured.partition.html import partition_html

        elements = partition_html(filename=self.file_path)
        text = "\n\n".join([str(el) for el in elements])
        metadata = {"source": self.file_path}
        return [Document(page_content=text, metadata=metadata)]
--- a/langchain/document_loaders/pdf.py
+++ b/langchain/document_loaders/pdf.py
@ -0,0 +1,29 @
 """Loader that loads PDF files."""
 from typing import List
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 class UnstructuredPDFLoader(BaseLoader):
    """Loader that uses unstructured to load PDF files.

    The file is partitioned with ``unstructured.partition.pdf.partition_pdf``
    and the string forms of the resulting elements are joined into the page
    content of a single Document.
    """

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Path of the PDF file to load.

        Raises:
            ValueError: If the ``unstructured`` package cannot be imported.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError as err:
            # Chain the ImportError so the underlying cause stays visible
            # in the traceback; the raised type remains ValueError.
            raise ValueError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from err
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Load file.

        Returns:
            A one-element list containing a Document whose ``page_content``
            is the elements' text separated by blank lines and whose
            ``metadata`` holds the file path under the ``"source"`` key.
        """
        # Local import: the partition function is only needed at load time.
        from unstructured.partition.pdf import partition_pdf

        elements = partition_pdf(filename=self.file_path)
        text = "\n\n".join([str(el) for el in elements])
        metadata = {"source": self.file_path}
        return [Document(page_content=text, metadata=metadata)]
--- a/langchain/document_loaders/powerpoint.py
+++ b/langchain/document_loaders/powerpoint.py
@ -0,0 +1,29 @
 """Loader that loads PowerPoint files."""
 from typing import List
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 class UnstructuredPowerPointLoader(BaseLoader):
    """Loader that uses unstructured to load PowerPoint files.

    The file is partitioned with ``unstructured.partition.pptx.partition_pptx``
    and the string forms of the resulting elements are joined into the page
    content of a single Document.
    """

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Path of the PowerPoint (.pptx) file to load.

        Raises:
            ValueError: If the ``unstructured`` package cannot be imported.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError as err:
            # Chain the ImportError so the underlying cause stays visible
            # in the traceback; the raised type remains ValueError.
            raise ValueError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from err
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Load file.

        Returns:
            A one-element list containing a Document whose ``page_content``
            is the elements' text separated by blank lines and whose
            ``metadata`` holds the file path under the ``"source"`` key.
        """
        # Local import: the partition function is only needed at load time.
        from unstructured.partition.pptx import partition_pptx

        elements = partition_pptx(filename=self.file_path)
        text = "\n\n".join([str(el) for el in elements])
        metadata = {"source": self.file_path}
        return [Document(page_content=text, metadata=metadata)]