forked from Archives/langchain
09587a3201
# Organize tests for PDF parsers: clean up tests for PDF parsers, remove duplicate tests, convert to unit tests.
80 lines
2.4 KiB
Python
80 lines
2.4 KiB
Python
"""Tests for the various PDF parsers."""
|
|
from typing import Iterator
|
|
|
|
import pytest
|
|
|
|
from langchain.document_loaders.base import BaseBlobParser
|
|
from langchain.document_loaders.blob_loaders import Blob
|
|
from langchain.document_loaders.parsers.pdf import (
|
|
PDFMinerParser,
|
|
PyMuPDFParser,
|
|
PyPDFium2Parser,
|
|
PyPDFParser,
|
|
)
|
|
from tests.data import HELLO_PDF, LAYOUT_PARSER_PAPER_PDF
|
|
|
|
|
|
def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None:
    """Run the shared sanity checks against a PDF blob parser.

    Two fixture PDFs are parsed lazily with ``parser``: ``HELLO_PDF`` and
    ``LAYOUT_PARSER_PAPER_PDF``. The assertions are deliberately loose because
    the individual parsers differ in whitespace and extracted detail.

    Args:
        parser (BaseBlobParser): The parser under test.
        splits_by_page (bool): Whether the parser, by default, yields one
            document per page (True) or a single document per file (False).
    """
    # --- HELLO_PDF: expected to produce exactly one document ---------------
    hello_iter = parser.lazy_parse(Blob.from_path(HELLO_PDF))
    assert isinstance(hello_iter, Iterator)
    hello_docs = list(hello_iter)
    assert len(hello_docs) == 1
    hello_text = hello_docs[0].page_content
    assert isinstance(hello_text, str)
    # Parsers emit differing amounts of whitespace, so only the prefix is
    # compared rather than the full string.
    assert hello_text.startswith("Hello world!")

    # --- LAYOUT_PARSER_PAPER_PDF: document count depends on the parser -----
    paper_iter = parser.lazy_parse(Blob.from_path(LAYOUT_PARSER_PAPER_PDF))
    assert isinstance(paper_iter, Iterator)
    paper_docs = list(paper_iter)

    expected_doc_count = 16 if splits_by_page else 1
    assert len(paper_docs) == expected_doc_count

    first_doc = paper_docs[0]
    # Imprecise on purpose: each parser yields slightly different content for
    # this page depending on its configuration.
    assert "LayoutParser" in first_doc.page_content

    first_meta = first_doc.metadata
    assert first_meta["source"] == str(LAYOUT_PARSER_PAPER_PDF)

    # The "page" key is only checked for parsers that split by page.
    if splits_by_page:
        assert first_meta["page"] == 0
|
|
|
|
|
|
@pytest.mark.requires("pypdf")
def test_pypdf_parser() -> None:
    """Run the shared parser checks against ``PyPDFParser``."""
    pypdf_parser = PyPDFParser()
    _assert_with_parser(pypdf_parser)
|
|
|
|
|
|
@pytest.mark.requires("pdfminer")
def test_pdfminer_parser() -> None:
    """Run the shared parser checks against ``PDFMinerParser``."""
    # This parser does not split by page by default, so the helper is told to
    # expect a single document for the multi-page fixture.
    pdfminer_parser = PDFMinerParser()
    _assert_with_parser(pdfminer_parser, splits_by_page=False)
|
|
|
|
|
|
@pytest.mark.requires("fitz")  # package is PyMuPDF
def test_pymupdf_loader() -> None:
    """Run the shared parser checks against ``PyMuPDFParser``."""
    pymupdf_parser = PyMuPDFParser()
    _assert_with_parser(pymupdf_parser)
|
|
|
|
|
|
@pytest.mark.requires("pypdfium2")
def test_pypdfium2_parser() -> None:
    """Test PyPDFium2 parser."""
    # Uses the helper's default (splits_by_page=True), so the per-page
    # assertions in _assert_with_parser are applied to this parser.
    _assert_with_parser(PyPDFium2Parser())
|