|
|
@ -7,6 +7,8 @@ from langchain.document_loaders.base import BaseBlobParser
|
|
|
|
from langchain.document_loaders.blob_loaders import Blob
|
|
|
|
from langchain.document_loaders.blob_loaders import Blob
|
|
|
|
from langchain.document_loaders.parsers.pdf import (
|
|
|
|
from langchain.document_loaders.parsers.pdf import (
|
|
|
|
PDFMinerParser,
|
|
|
|
PDFMinerParser,
|
|
|
|
|
|
|
|
PyMuPDFParser,
|
|
|
|
|
|
|
|
PyPDFium2Parser,
|
|
|
|
PyPDFParser,
|
|
|
|
PyPDFParser,
|
|
|
|
)
|
|
|
|
)
|
|
|
|
from tests.data import HELLO_PDF, LAYOUT_PARSER_PAPER_PDF
|
|
|
|
from tests.data import HELLO_PDF, LAYOUT_PARSER_PAPER_PDF
|
|
|
@ -62,3 +64,16 @@ def test_pdfminer_parser() -> None:
|
|
|
|
"""Test PDFMiner parser."""
|
|
|
|
"""Test PDFMiner parser."""
|
|
|
|
# Does not follow defaults to split by page.
|
|
|
|
# Does not follow defaults to split by page.
|
|
|
|
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
|
|
|
|
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.requires("fitz")  # "fitz" is the import name of the PyMuPDF package
def test_pymupdf_loader() -> None:
    """Run the shared parser checks against a default PyMuPDFParser."""
    parser = PyMuPDFParser()
    _assert_with_parser(parser)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.requires("pypdfium2")
def test_pypdfium2_parser() -> None:
    """Run the shared parser checks against a default PyPDFium2Parser."""
    # No splits_by_page override is passed here, so the helper's own
    # default for that argument is what gets exercised.
    parser = PyPDFium2Parser()
    _assert_with_parser(parser)
|
|
|
|