feat: document loader for epublications (#2202)

### Summary

Adds a new document loader for processing e-publications. Works with
`unstructured>=0.5.4`. You need to have
[`pandoc`](https://pandoc.org/installing.html) installed for this loader
to work.

### Testing

```python
from langchain.document_loaders import UnstructuredEPubLoader

loader = UnstructuredEPubLoader("winter-sports.epub", mode="elements")
data = loader.load()
data[0]
```
doc
Matt Robinson 1 year ago committed by GitHub
parent a4a1ee6b5d
commit 3dfe1cf60e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -13,10 +13,11 @@ This page is broken into two parts: installation and setup, and then references
- Install the Python SDK with `pip install "unstructured[local-inference]"` - Install the Python SDK with `pip install "unstructured[local-inference]"`
- Install the following system dependencies if they are not already available on your system. - Install the following system dependencies if they are not already available on your system.
Depending on what document types you're parsing, you may not need all of these. Depending on what document types you're parsing, you may not need all of these.
- `libmagic-dev` - `libmagic-dev` (filetype detection)
- `poppler-utils` - `poppler-utils` (images and PDFs)
- `tesseract-ocr` - `tesseract-ocr` (images and PDFs)
- `libreoffice` - `libreoffice` (MS Office docs)
- `pandoc` (EPUBs)
- If you are parsing PDFs using the `"hi_res"` strategy, run the following to install the `detectron2` model, which - If you are parsing PDFs using the `"hi_res"` strategy, run the following to install the `detectron2` model, which
`unstructured` uses for layout detection: `unstructured` uses for layout detection:
- `pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"` - `pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"`

@ -0,0 +1,124 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "39af9ecd",
"metadata": {},
"source": [
"# EPubs\n",
"\n",
"This covers how to load `.epub` documents into a document format that we can use downstream. You'll need to install the [`pandocs`](https://pandoc.org/installing.html) package for this loader to work."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "721c48aa",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import UnstructuredEPubLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "9d3d0e35",
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredEPubLoader(\"winter-sports.epub\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "06073f91",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "525d6b67",
"metadata": {},
"source": [
"## Retain Elements\n",
"\n",
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "064f9162",
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredEPubLoader(\"winter-sports.epub\", mode=\"elements\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "abefbbdb",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a547c534",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(page_content='The Project Gutenberg eBook of Winter Sports in\\nSwitzerland, by E. F. Benson', lookup_str='', metadata={'source': 'winter-sports.epub', 'page_number': 1, 'category': 'Title'}, lookup_index=0)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "381d4139",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -311,7 +311,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.9.1" "version": "3.8.13"
} }
}, },
"nbformat": 4, "nbformat": 4,

@ -17,6 +17,7 @@ from langchain.document_loaders.dataframe import DataFrameLoader
from langchain.document_loaders.directory import DirectoryLoader from langchain.document_loaders.directory import DirectoryLoader
from langchain.document_loaders.duckdb_loader import DuckDBLoader from langchain.document_loaders.duckdb_loader import DuckDBLoader
from langchain.document_loaders.email import UnstructuredEmailLoader from langchain.document_loaders.email import UnstructuredEmailLoader
from langchain.document_loaders.epub import UnstructuredEPubLoader
from langchain.document_loaders.evernote import EverNoteLoader from langchain.document_loaders.evernote import EverNoteLoader
from langchain.document_loaders.facebook_chat import FacebookChatLoader from langchain.document_loaders.facebook_chat import FacebookChatLoader
from langchain.document_loaders.gcs_directory import GCSDirectoryLoader from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
@ -85,6 +86,7 @@ __all__ = [
"UnstructuredImageLoader", "UnstructuredImageLoader",
"ObsidianLoader", "ObsidianLoader",
"UnstructuredEmailLoader", "UnstructuredEmailLoader",
"UnstructuredEPubLoader",
"UnstructuredMarkdownLoader", "UnstructuredMarkdownLoader",
"RoamLoader", "RoamLoader",
"YoutubeLoader", "YoutubeLoader",

@ -0,0 +1,22 @@
"""Loader that loads EPub files."""
from typing import List
from langchain.document_loaders.unstructured import (
UnstructuredFileLoader,
satisfies_min_unstructured_version,
)
class UnstructuredEPubLoader(UnstructuredFileLoader):
    """Loader that partitions EPUB files using the ``unstructured`` package."""

    def _get_elements(self) -> List:
        """Partition the EPUB at ``self.file_path`` and return the result.

        Returns:
            Whatever ``unstructured.partition.epub.partition_epub`` returns
            for the file.

        Raises:
            ValueError: If ``satisfies_min_unstructured_version`` reports
                that the installed ``unstructured`` does not meet the
                minimum version of 0.5.4.
        """
        required_version = "0.5.4"

        if satisfies_min_unstructured_version(required_version):
            # The epub partitioner is imported only after the version gate
            # has passed, never at module import time.
            from unstructured.partition.epub import partition_epub

            return partition_epub(filename=self.file_path)

        raise ValueError(
            "Partitioning epub files is only supported in "
            f"unstructured>={required_version}."
        )
Loading…
Cancel
Save