mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
feat: document loader for epublications (#2202)
### Summary Adds a new document loader for processing e-publications. Works with `unstructured>=0.5.4`. You need to have [`pandoc`](https://pandoc.org/installing.html) installed for this loader to work. ### Testing ```python from langchain.document_loaders import UnstructuredEPubLoader loader = UnstructuredEPubLoader("winter-sports.epub", mode="elements") data = loader.load() data[0] ```
This commit is contained in:
parent
a4a1ee6b5d
commit
3dfe1cf60e
@ -13,10 +13,11 @@ This page is broken into two parts: installation and setup, and then references
|
||||
- Install the Python SDK with `pip install "unstructured[local-inference]"`
|
||||
- Install the following system dependencies if they are not already available on your system.
|
||||
Depending on what document types you're parsing, you may not need all of these.
|
||||
- `libmagic-dev`
|
||||
- `poppler-utils`
|
||||
- `tesseract-ocr`
|
||||
- `libreoffice`
|
||||
- `libmagic-dev` (filetype detection)
|
||||
- `poppler-utils` (images and PDFs)
|
||||
- `tesseract-ocr`(images and PDFs)
|
||||
- `libreoffice` (MS Office docs)
|
||||
- `pandoc` (EPUBs)
|
||||
- If you are parsing PDFs using the `"hi_res"` strategy, run the following to install the `detectron2` model, which
|
||||
`unstructured` uses for layout detection:
|
||||
- `pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"`
|
||||
|
124
docs/modules/document_loaders/examples/epub.ipynb
Normal file
124
docs/modules/document_loaders/examples/epub.ipynb
Normal file
@ -0,0 +1,124 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "39af9ecd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# EPubs\n",
|
||||
"\n",
|
||||
"This covers how to load `.epub` documents into a document format that we can use downstream. You'll need to install the [`pandocs`](https://pandoc.org/installing.html) package for this loader to work."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "721c48aa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import UnstructuredEPubLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "9d3d0e35",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = UnstructuredEPubLoader(\"winter-sports.epub\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "06073f91",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "525d6b67",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Retain Elements\n",
|
||||
"\n",
|
||||
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "064f9162",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = UnstructuredEPubLoader(\"winter-sports.epub\", mode=\"elements\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "abefbbdb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "a547c534",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(page_content='The Project Gutenberg eBook of Winter Sports in\\nSwitzerland, by E. F. Benson', lookup_str='', metadata={'source': 'winter-sports.epub', 'page_number': 1, 'category': 'Title'}, lookup_index=0)"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "381d4139",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -311,7 +311,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
"version": "3.8.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -17,6 +17,7 @@ from langchain.document_loaders.dataframe import DataFrameLoader
|
||||
from langchain.document_loaders.directory import DirectoryLoader
|
||||
from langchain.document_loaders.duckdb_loader import DuckDBLoader
|
||||
from langchain.document_loaders.email import UnstructuredEmailLoader
|
||||
from langchain.document_loaders.epub import UnstructuredEPubLoader
|
||||
from langchain.document_loaders.evernote import EverNoteLoader
|
||||
from langchain.document_loaders.facebook_chat import FacebookChatLoader
|
||||
from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
|
||||
@ -85,6 +86,7 @@ __all__ = [
|
||||
"UnstructuredImageLoader",
|
||||
"ObsidianLoader",
|
||||
"UnstructuredEmailLoader",
|
||||
"UnstructuredEPubLoader",
|
||||
"UnstructuredMarkdownLoader",
|
||||
"RoamLoader",
|
||||
"YoutubeLoader",
|
||||
|
22
langchain/document_loaders/epub.py
Normal file
22
langchain/document_loaders/epub.py
Normal file
@ -0,0 +1,22 @@
|
||||
"""Loader that loads EPub files."""
|
||||
from typing import List
|
||||
|
||||
from langchain.document_loaders.unstructured import (
|
||||
UnstructuredFileLoader,
|
||||
satisfies_min_unstructured_version,
|
||||
)
|
||||
|
||||
|
||||
class UnstructuredEPubLoader(UnstructuredFileLoader):
|
||||
"""Loader that uses unstructured to load epub files."""
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
min_unstructured_version = "0.5.4"
|
||||
if not satisfies_min_unstructured_version(min_unstructured_version):
|
||||
raise ValueError(
|
||||
"Partitioning epub files is only supported in "
|
||||
f"unstructured>={min_unstructured_version}."
|
||||
)
|
||||
from unstructured.partition.epub import partition_epub
|
||||
|
||||
return partition_epub(filename=self.file_path)
|
Loading…
Reference in New Issue
Block a user