Clean up tests for pdf parsers (#4595)

# Organize tests for pdf parsers

Clean up the tests for the PDF parsers: remove duplicate tests and convert
the rest to unit tests.
This commit is contained in:
Eugene Yurtsev 2023-05-15 14:21:05 -04:00 committed by GitHub
parent 70fd7cda14
commit 09587a3201
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 353 additions and 278 deletions

View File

@ -93,12 +93,20 @@ class PyPDFium2Parser(BaseBlobParser):
"""Lazily parse the blob."""
import pypdfium2
with blob.as_bytes_io() as f:
pdf_reader = pypdfium2.PdfDocument(f)
# pypdfium2 is really finicky with respect to closing things,
# if done incorrectly creates seg faults.
with blob.as_bytes_io() as file_path:
pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
try:
for page_number, page in enumerate(pdf_reader):
content = page.get_textpage().get_text_range()
text_page = page.get_textpage()
content = text_page.get_text_range()
text_page.close()
page.close()
metadata = {"source": blob.source, "page": page_number}
yield Document(page_content=content, metadata=metadata)
finally:
pdf_reader.close()
class PDFPlumberParser(BaseBlobParser):

592
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -83,6 +83,8 @@ docarray = {version="^0.31.0", optional=true}
protobuf = {version="3.19", optional=true}
hnswlib = {version="^0.7.0", optional=true}
lxml = {version = "^4.9.2", optional = true}
pymupdf = {version = "^1.22.3", optional = true}
pypdfium2 = {version = "^4.10.0", optional = true}
[tool.poetry.group.docs.dependencies]
@ -177,6 +179,8 @@ extended_testing = [
"jq",
"pdfminer.six",
"pypdf",
"pymupdf",
"pypdfium2",
"tqdm",
"lxml",
]

View File

@ -7,6 +7,8 @@ from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.pdf import (
PDFMinerParser,
PyMuPDFParser,
PyPDFium2Parser,
PyPDFParser,
)
from tests.data import HELLO_PDF, LAYOUT_PARSER_PAPER_PDF
@ -62,3 +64,16 @@ def test_pdfminer_parser() -> None:
"""Test PDFMiner parser."""
# Does not follow defaults to split by page.
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
@pytest.mark.requires("fitz")  # package is PyMuPDF
def test_pymupdf_loader() -> None:
    """Run the shared parser assertions against the PyMuPDF parser."""
    # The parser is built with no arguments and checked with the helper's
    # default settings.
    parser = PyMuPDFParser()
    _assert_with_parser(parser)
@pytest.mark.requires("pypdfium2")
def test_pypdfium2_parser() -> None:
    """Test PyPDFium2 parser."""
    # Uses the helper's defaults: no splits_by_page override is passed here.
    _assert_with_parser(PyPDFium2Parser())