diff --git a/docs/ecosystem/unstructured.md b/docs/ecosystem/unstructured.md index 6509c618cb..721c2321c6 100644 --- a/docs/ecosystem/unstructured.md +++ b/docs/ecosystem/unstructured.md @@ -13,10 +13,11 @@ This page is broken into two parts: installation and setup, and then references - Install the Python SDK with `pip install "unstructured[local-inference]"` - Install the following system dependencies if they are not already available on your system. Depending on what document types you're parsing, you may not need all of these. - - `libmagic-dev` - - `poppler-utils` - - `tesseract-ocr` - - `libreoffice` + - `libmagic-dev` (filetype detection) + - `poppler-utils` (images and PDFs) + - `tesseract-ocr` (images and PDFs) + - `libreoffice` (MS Office docs) + - `pandoc` (EPUBs) - If you are parsing PDFs using the `"hi_res"` strategy, run the following to install the `detectron2` model, which `unstructured` uses for layout detection: - `pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"` diff --git a/docs/modules/document_loaders/examples/epub.ipynb b/docs/modules/document_loaders/examples/epub.ipynb new file mode 100644 index 0000000000..05b3295f4f --- /dev/null +++ b/docs/modules/document_loaders/examples/epub.ipynb @@ -0,0 +1,124 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "39af9ecd", + "metadata": {}, + "source": [ + "# EPubs\n", + "\n", + "This covers how to load `.epub` documents into a document format that we can use downstream. You'll need to install the [`pandoc`](https://pandoc.org/installing.html) package for this loader to work." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "721c48aa", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import UnstructuredEPubLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9d3d0e35", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredEPubLoader(\"winter-sports.epub\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "06073f91", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "525d6b67", + "metadata": {}, + "source": [ + "## Retain Elements\n", + "\n", + "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "064f9162", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredEPubLoader(\"winter-sports.epub\", mode=\"elements\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "abefbbdb", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a547c534", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='The Project Gutenberg eBook of Winter Sports in\\nSwitzerland, by E. F. 
Benson', lookup_str='', metadata={'source': 'winter-sports.epub', 'page_number': 1, 'category': 'Title'}, lookup_index=0)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "381d4139", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb b/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb index fd1da88a71..4d954997af 100644 --- a/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb +++ b/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb @@ -311,7 +311,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.8.13" } }, "nbformat": 4, diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index b2976fe176..2b6a35d438 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -17,6 +17,7 @@ from langchain.document_loaders.dataframe import DataFrameLoader from langchain.document_loaders.directory import DirectoryLoader from langchain.document_loaders.duckdb_loader import DuckDBLoader from langchain.document_loaders.email import UnstructuredEmailLoader +from langchain.document_loaders.epub import UnstructuredEPubLoader from langchain.document_loaders.evernote import EverNoteLoader from 
langchain.document_loaders.facebook_chat import FacebookChatLoader from langchain.document_loaders.gcs_directory import GCSDirectoryLoader @@ -85,6 +86,7 @@ __all__ = [ "UnstructuredImageLoader", "ObsidianLoader", "UnstructuredEmailLoader", + "UnstructuredEPubLoader", "UnstructuredMarkdownLoader", "RoamLoader", "YoutubeLoader", diff --git a/langchain/document_loaders/epub.py b/langchain/document_loaders/epub.py new file mode 100644 index 0000000000..23b8f54182 --- /dev/null +++ b/langchain/document_loaders/epub.py @@ -0,0 +1,22 @@ +"""Loader that loads EPub files.""" +from typing import List + +from langchain.document_loaders.unstructured import ( + UnstructuredFileLoader, + satisfies_min_unstructured_version, +) + + +class UnstructuredEPubLoader(UnstructuredFileLoader): + """Loader that uses unstructured to load epub files.""" + + def _get_elements(self) -> List: + min_unstructured_version = "0.5.4" + if not satisfies_min_unstructured_version(min_unstructured_version): + raise ValueError( + "Partitioning epub files is only supported in " + f"unstructured>={min_unstructured_version}." + ) + from unstructured.partition.epub import partition_epub + + return partition_epub(filename=self.file_path)