feat: document loader for epublications (#2202)

### Summary

Adds a new document loader for processing e-publications. Works with `unstructured>=0.5.4`. You need to have [`pandoc`](https://pandoc.org/installing.html) installed for this loader to work.

### Testing

```python
from langchain.document_loaders import UnstructuredEPubLoader

loader = UnstructuredEPubLoader("winter-sports.epub", mode="elements")
data = loader.load()
data[0]
```
1 year ago · 3dfe1cf60e
parent a4a1ee6b5d
commit 3dfe1cf60e
5 changed files with 154 additions and 5 deletions
--- a/docs/ecosystem/unstructured.md
+++ b/docs/ecosystem/unstructured.md
@ -13,10 +13,11 @@ This page is broken into two parts: installation and setup, and then references
 - Install the Python SDK with `pip install "unstructured[local-inference]"`
 - Install the following system dependencies if they are not already available on your system.
  Depending on what document types you're parsing, you may not need all of these.
-    - `libmagic-dev`
+    - `libmagic-dev` (filetype detection)
-    - `poppler-utils`
+    - `poppler-utils` (images and PDFs)
-    - `tesseract-ocr`
+    - `tesseract-ocr` (images and PDFs)
-    - `libreoffice`
+    - `libreoffice` (MS Office docs)
    - `pandoc` (EPUBs)
 - If you are parsing PDFs using the `"hi_res"` strategy, run the following to install the `detectron2` model, which
  `unstructured` uses for layout detection:
    - `pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"`
--- a/docs/modules/document_loaders/examples/epub.ipynb
+++ b/docs/modules/document_loaders/examples/epub.ipynb
@ -0,0 +1,124 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "39af9ecd",
   "metadata": {},
   "source": [
    "# EPubs\n",
    "\n",
    "This covers how to load `.epub` documents into a document format that we can use downstream. You'll need to install the [`pandoc`](https://pandoc.org/installing.html) package for this loader to work."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "721c48aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import UnstructuredEPubLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9d3d0e35",
   "metadata": {},
   "outputs": [],
   "source": [
    "loader = UnstructuredEPubLoader(\"winter-sports.epub\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "06073f91",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = loader.load()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "525d6b67",
   "metadata": {},
   "source": [
    "## Retain Elements\n",
    "\n",
    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "064f9162",
   "metadata": {},
   "outputs": [],
   "source": [
    "loader = UnstructuredEPubLoader(\"winter-sports.epub\", mode=\"elements\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "abefbbdb",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a547c534",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Document(page_content='The Project Gutenberg eBook of Winter Sports in\\nSwitzerland, by E. F. Benson', lookup_str='', metadata={'source': 'winter-sports.epub', 'page_number': 1, 'category': 'Title'}, lookup_index=0)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "381d4139",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb
@ -311,7 +311,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.8.13"
  }
 },
 "nbformat": 4,
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@ -17,6 +17,7 @@ from langchain.document_loaders.dataframe import DataFrameLoader
 from langchain.document_loaders.directory import DirectoryLoader
 from langchain.document_loaders.duckdb_loader import DuckDBLoader
 from langchain.document_loaders.email import UnstructuredEmailLoader
 from langchain.document_loaders.epub import UnstructuredEPubLoader
 from langchain.document_loaders.evernote import EverNoteLoader
 from langchain.document_loaders.facebook_chat import FacebookChatLoader
 from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
@ -85,6 +86,7 @@ __all__ = [
    "UnstructuredImageLoader",
    "ObsidianLoader",
    "UnstructuredEmailLoader",
    "UnstructuredEPubLoader",
    "UnstructuredMarkdownLoader",
    "RoamLoader",
    "YoutubeLoader",
--- a/langchain/document_loaders/epub.py
+++ b/langchain/document_loaders/epub.py
@ -0,0 +1,22 @@
 """Loader that loads EPub files."""
 from typing import List
 from langchain.document_loaders.unstructured import (
    UnstructuredFileLoader,
    satisfies_min_unstructured_version,
 )
 class UnstructuredEPubLoader(UnstructuredFileLoader):
    """Document loader that partitions EPUB files with ``unstructured``."""

    def _get_elements(self) -> List:
        """Partition the EPUB at ``self.file_path`` into elements.

        Returns:
            Whatever ``unstructured.partition.epub.partition_epub`` returns
            for ``self.file_path``.

        Raises:
            ValueError: If ``satisfies_min_unstructured_version`` reports
                that the installed ``unstructured`` is older than 0.5.4.
        """
        min_unstructured_version = "0.5.4"
        version_ok = satisfies_min_unstructured_version(min_unstructured_version)
        if version_ok:
            # Import only after the version gate has passed, so the
            # unstructured.partition.epub module is never touched on
            # releases that fail the check.
            from unstructured.partition.epub import partition_epub

            return partition_epub(filename=self.file_path)

        raise ValueError(
            "Partitioning epub files is only supported in "
            f"unstructured>={min_unstructured_version}."
        )