From 3dfe1cf60e84361dc8d01fce8eb4fab209e43ab5 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 30 Mar 2023 23:45:31 -0400 Subject: [PATCH] feat: document loader for epublications (#2202) ### Summary Adds a new document loader for processing e-publications. Works with `unstructured>=0.5.4`. You need to have [`pandoc`](https://pandoc.org/installing.html) installed for this loader to work. ### Testing ```python from langchain.document_loaders import UnstructuredEPubLoader loader = UnstructuredEPubLoader("winter-sports.epub", mode="elements") data = loader.load() data[0] ``` --- docs/ecosystem/unstructured.md | 9 +- .../document_loaders/examples/epub.ipynb | 124 ++++++++++++++++++ .../examples/unstructured_file.ipynb | 2 +- langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/epub.py | 22 ++++ 5 files changed, 154 insertions(+), 5 deletions(-) create mode 100644 docs/modules/document_loaders/examples/epub.ipynb create mode 100644 langchain/document_loaders/epub.py diff --git a/docs/ecosystem/unstructured.md b/docs/ecosystem/unstructured.md index 6509c618..721c2321 100644 --- a/docs/ecosystem/unstructured.md +++ b/docs/ecosystem/unstructured.md @@ -13,10 +13,11 @@ This page is broken into two parts: installation and setup, and then references - Install the Python SDK with `pip install "unstructured[local-inference]"` - Install the following system dependencies if they are not already available on your system. Depending on what document types you're parsing, you may not need all of these. 
- - `libmagic-dev` - - `poppler-utils` - - `tesseract-ocr` - - `libreoffice` + - `libmagic-dev` (filetype detection) + - `poppler-utils` (images and PDFs) + - `tesseract-ocr` (images and PDFs) + - `libreoffice` (MS Office docs) + - `pandoc` (EPUBs) - If you are parsing PDFs using the `"hi_res"` strategy, run the following to install the `detectron2` model, which `unstructured` uses for layout detection: - `pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"` diff --git a/docs/modules/document_loaders/examples/epub.ipynb b/docs/modules/document_loaders/examples/epub.ipynb new file mode 100644 index 00000000..05b3295f --- /dev/null +++ b/docs/modules/document_loaders/examples/epub.ipynb @@ -0,0 +1,124 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "39af9ecd", + "metadata": {}, + "source": [ + "# EPubs\n", + "\n", + "This covers how to load `.epub` documents into a document format that we can use downstream. You'll need to install the [`pandoc`](https://pandoc.org/installing.html) package for this loader to work." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "721c48aa", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import UnstructuredEPubLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9d3d0e35", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredEPubLoader(\"winter-sports.epub\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "06073f91", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "525d6b67", + "metadata": {}, + "source": [ + "## Retain Elements\n", + "\n", + "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "064f9162", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredEPubLoader(\"winter-sports.epub\", mode=\"elements\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "abefbbdb", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a547c534", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='The Project Gutenberg eBook of Winter Sports in\\nSwitzerland, by E. F. Benson', lookup_str='', metadata={'source': 'winter-sports.epub', 'page_number': 1, 'category': 'Title'}, lookup_index=0)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "381d4139", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb b/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb index fd1da88a..4d954997 100644 --- a/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb +++ b/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb @@ -311,7 +311,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.8.13" } }, "nbformat": 4, diff --git a/langchain/document_loaders/__init__.py 
b/langchain/document_loaders/__init__.py index b2976fe1..2b6a35d4 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -17,6 +17,7 @@ from langchain.document_loaders.dataframe import DataFrameLoader from langchain.document_loaders.directory import DirectoryLoader from langchain.document_loaders.duckdb_loader import DuckDBLoader from langchain.document_loaders.email import UnstructuredEmailLoader +from langchain.document_loaders.epub import UnstructuredEPubLoader from langchain.document_loaders.evernote import EverNoteLoader from langchain.document_loaders.facebook_chat import FacebookChatLoader from langchain.document_loaders.gcs_directory import GCSDirectoryLoader @@ -85,6 +86,7 @@ __all__ = [ "UnstructuredImageLoader", "ObsidianLoader", "UnstructuredEmailLoader", + "UnstructuredEPubLoader", "UnstructuredMarkdownLoader", "RoamLoader", "YoutubeLoader", diff --git a/langchain/document_loaders/epub.py b/langchain/document_loaders/epub.py new file mode 100644 index 00000000..23b8f541 --- /dev/null +++ b/langchain/document_loaders/epub.py @@ -0,0 +1,22 @@ +"""Loader that loads EPub files.""" +from typing import List + +from langchain.document_loaders.unstructured import ( + UnstructuredFileLoader, + satisfies_min_unstructured_version, +) + + +class UnstructuredEPubLoader(UnstructuredFileLoader): + """Loader that uses unstructured to load epub files.""" + + def _get_elements(self) -> List: + min_unstructured_version = "0.5.4" + if not satisfies_min_unstructured_version(min_unstructured_version): + raise ValueError( + "Partitioning epub files is only supported in " + f"unstructured>={min_unstructured_version}." + ) + from unstructured.partition.epub import partition_epub + + return partition_epub(filename=self.file_path)