Dev2049/pypdfium2 (#4209)

thanks @jerrytigerxu for the addition!

---------

Co-authored-by: Jere Xu <jtxu2008@gmail.com>
Co-authored-by: jerrytigerxu <jere.tiger.xu@gmail.com>
parallel_dir_loader
Davis Chase 1 year ago committed by GitHub
parent 59204a5033
commit 5ca13cc1f0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -372,6 +372,44 @@
{
"cell_type": "code",
"execution_count": 9,
"id": "483720b5",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "96351714",
"metadata": {},
"source": [
"# Using PyPDFium2"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "003fcc1d",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import PyPDFium2Loader"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "46766e29",
"metadata": {},
"outputs": [],
"source": [
"loader = PyPDFium2Loader(\"example_data/layout-parser-paper.pdf\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "010d5cdd",
"metadata": {},
"outputs": [],
@ -662,7 +700,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.11.3"
}
},
"nbformat": 4,

@ -61,6 +61,7 @@ from langchain.document_loaders.pdf import (
PDFMinerPDFasHTMLLoader,
PyMuPDFLoader,
PyPDFDirectoryLoader,
PyPDFium2Loader,
PyPDFLoader,
UnstructuredPDFLoader,
)
@ -161,6 +162,7 @@ __all__ = [
"PlaywrightURLLoader",
"PyMuPDFLoader",
"PyPDFLoader",
"PyPDFium2Loader",
"PythonLoader",
"ReadTheDocsLoader",
"RoamLoader",

@ -115,6 +115,34 @@ class PyPDFLoader(BasePDFLoader):
]
class PyPDFium2Loader(BasePDFLoader):
    """Load a PDF with pypdfium2, producing one Document per page."""

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Location of the PDF, forwarded unchanged to
                ``BasePDFLoader.__init__``.

        Raises:
            ValueError: If the ``pypdfium2`` package cannot be imported.
        """
        # Fail fast at construction time rather than on the first load() call.
        try:
            import pypdfium2  # noqa:F401
        except ImportError as err:
            # ValueError is kept for backward compatibility with existing
            # callers; the original ImportError is chained for debugging.
            raise ValueError(
                "pypdfium2 package not found, please install it with"
                " `pip install pypdfium2`"
            ) from err
        super().__init__(file_path)

    def load(self) -> List[Document]:
        """Load given path as pages.

        Returns:
            One ``Document`` per PDF page, in page order. Each document's
            metadata holds ``"source"`` (``self.file_path``) and ``"page"``
            (the zero-based page index).
        """
        from contextlib import closing

        import pypdfium2

        docs = []
        with open(self.file_path, "rb") as f:
            # pypdfium2 objects wrap native pdfium handles; close them
            # explicitly instead of relying on garbage collection. The
            # document is closed while the underlying file is still open.
            with closing(pypdfium2.PdfDocument(f)) as pdf_reader:
                for i, page in enumerate(pdf_reader):
                    # Context managers exit in reverse order, so the text
                    # page is released before the page that owns it.
                    with closing(page), closing(page.get_textpage()) as text_page:
                        content = text_page.get_text_range()
                    metadata = {"source": self.file_path, "page": i}
                    docs.append(Document(page_content=content, metadata=metadata))
        return docs
class PyPDFDirectoryLoader(BaseLoader):
"""Loads a directory with PDF files with pypdf and chunks at character level.

@ -1,12 +1,14 @@
from pathlib import Path
from langchain.document_loaders import (
MathpixPDFLoader,
PDFMinerLoader,
PDFMinerPDFasHTMLLoader,
PyMuPDFLoader,
PyPDFium2Loader,
PyPDFLoader,
UnstructuredPDFLoader,
)
from langchain.document_loaders.pdf import MathpixPDFLoader
def test_unstructured_pdf_loader() -> None:
@ -48,6 +50,36 @@ def test_pdfminer_pdf_as_html_loader() -> None:
assert len(docs) == 1
def test_pypdf_loader() -> None:
    """Check PyPDFLoader returns the expected document count per sample PDF."""
    tests_root = Path(__file__).parent.parent
    # (path relative to the tests root, expected number of loaded documents)
    expectations = (
        ("examples/hello.pdf", 1),
        ("examples/layout-parser-paper.pdf", 16),
    )
    for relative_path, expected_count in expectations:
        pdf_path = tests_root / relative_path
        documents = PyPDFLoader(str(pdf_path)).load()
        assert len(documents) == expected_count
def test_pypdfium2_loader() -> None:
    """Check PyPDFium2Loader returns the expected document count per sample PDF."""
    tests_root = Path(__file__).parent.parent
    # (path relative to the tests root, expected number of loaded documents)
    expectations = (
        ("examples/hello.pdf", 1),
        ("examples/layout-parser-paper.pdf", 16),
    )
    for relative_path, expected_count in expectations:
        pdf_path = tests_root / relative_path
        documents = PyPDFium2Loader(str(pdf_path)).load()
        assert len(documents) == expected_count
def test_pymupdf_loader() -> None:
"""Test PyMuPDF loader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf"

Loading…
Cancel
Save