feat: document loader for epublications (#2202)

### Summary

Adds a new document loader for processing e-publications. Works with `unstructured>=0.5.4`. You need to have [`pandoc`](https://pandoc.org/installing.html) installed for this loader to work.

### Testing

```python
from langchain.document_loaders import UnstructuredEPubLoader

loader = UnstructuredEPubLoader("winter-sports.epub", mode="elements")
data = loader.load()
data[0]
```
1 year ago · 3dfe1cf60e
parent a4a1ee6b5d
commit 3dfe1cf60e
5 changed files with 154 additions and 5 deletions
--- a/docs/ecosystem/unstructured.md
+++ b/docs/ecosystem/unstructured.md
@@ -13,10 +13,11 @@ This page is broken into two parts: installation and setup, and then references
 - Install the Python SDK with `pip install "unstructured[local-inference]"`
 - Install the following system dependencies if they are not already available on your system.
  Depending on what document types you're parsing, you may not need all of these.
-    - `libmagic-dev`
-    - `poppler-utils`
-    - `tesseract-ocr`
-    - `libreoffice`
+    - `libmagic-dev` (filetype detection)
+    - `poppler-utils` (images and PDFs)
+    - `tesseract-ocr` (images and PDFs)
+    - `libreoffice` (MS Office docs)
+    - `pandoc` (EPUBs)
 - If you are parsing PDFs using the `"hi_res"` strategy, run the following to install the `detectron2` model, which
  `unstructured` uses for layout detection:
    - `pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"`
--- a/docs/modules/document_loaders/examples/epub.ipynb
+++ b/docs/modules/document_loaders/examples/epub.ipynb
@@ -0,0 +1,124 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "39af9ecd",
+   "metadata": {},
+   "source": [
+    "# EPubs\n",
+    "\n",
+    "This covers how to load `.epub` documents into a document format that we can use downstream. You'll need to install the [`pandoc`](https://pandoc.org/installing.html) package for this loader to work."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "721c48aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import UnstructuredEPubLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "9d3d0e35",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredEPubLoader(\"winter-sports.epub\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "06073f91",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "525d6b67",
+   "metadata": {},
+   "source": [
+    "## Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "064f9162",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredEPubLoader(\"winter-sports.epub\", mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "abefbbdb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "a547c534",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Document(page_content='The Project Gutenberg eBook of Winter Sports in\\nSwitzerland, by E. F. Benson', lookup_str='', metadata={'source': 'winter-sports.epub', 'page_number': 1, 'category': 'Title'}, lookup_index=0)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "381d4139",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb
@@ -311,7 +311,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.8.13"
  }
 },
 "nbformat": 4,
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -17,6 +17,7 @@ from langchain.document_loaders.dataframe import DataFrameLoader
 from langchain.document_loaders.directory import DirectoryLoader
 from langchain.document_loaders.duckdb_loader import DuckDBLoader
 from langchain.document_loaders.email import UnstructuredEmailLoader
+from langchain.document_loaders.epub import UnstructuredEPubLoader
 from langchain.document_loaders.evernote import EverNoteLoader
 from langchain.document_loaders.facebook_chat import FacebookChatLoader
 from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
@@ -85,6 +86,7 @@ __all__ = [
    "UnstructuredImageLoader",
    "ObsidianLoader",
    "UnstructuredEmailLoader",
+    "UnstructuredEPubLoader",
    "UnstructuredMarkdownLoader",
    "RoamLoader",
    "YoutubeLoader",
--- a/langchain/document_loaders/epub.py
+++ b/langchain/document_loaders/epub.py
@@ -0,0 +1,22 @@
+"""Loader that loads EPub files."""
+from typing import List
+
+from langchain.document_loaders.unstructured import (
+    UnstructuredFileLoader,
+    satisfies_min_unstructured_version,
+)
+
+
+class UnstructuredEPubLoader(UnstructuredFileLoader):
+    """Loader that uses unstructured to load epub files.
+
+    Subclasses ``UnstructuredFileLoader`` and overrides ``_get_elements`` to
+    partition the file with ``unstructured``'s EPUB partitioner. Requires
+    ``unstructured>=0.5.4``; the accompanying documentation also lists the
+    ``pandoc`` system package as a prerequisite for EPUB parsing.
+    """
+
+    def _get_elements(self) -> List:
+        """Partition the EPUB file at ``self.file_path``.
+
+        Returns:
+            Whatever ``partition_epub(filename=self.file_path)`` returns.
+
+        Raises:
+            ValueError: If ``satisfies_min_unstructured_version`` reports
+                that the installed ``unstructured`` is older than 0.5.4.
+        """
+        min_unstructured_version = "0.5.4"
+        if not satisfies_min_unstructured_version(min_unstructured_version):
+            raise ValueError(
+                "Partitioning epub files is only supported in "
+                f"unstructured>={min_unstructured_version}."
+            )
+        # Deferred import: only attempted once the version check has passed.
+        from unstructured.partition.epub import partition_epub
+
+        return partition_epub(filename=self.file_path)