2023-03-04 04:59:28 +00:00
|
|
|
from pathlib import Path
|
2023-08-04 19:55:06 +00:00
|
|
|
from typing import Sequence, Union
|
|
|
|
|
|
|
|
import pytest
|
2023-03-04 04:59:28 +00:00
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
from langchain_community.document_loaders import (
|
2023-08-04 19:55:06 +00:00
|
|
|
AmazonTextractPDFLoader,
|
2023-05-06 00:55:31 +00:00
|
|
|
MathpixPDFLoader,
|
2023-03-04 04:59:28 +00:00
|
|
|
PDFMinerLoader,
|
2023-04-10 00:57:25 +00:00
|
|
|
PDFMinerPDFasHTMLLoader,
|
2023-03-04 04:59:28 +00:00
|
|
|
PyMuPDFLoader,
|
2023-05-06 00:55:31 +00:00
|
|
|
PyPDFium2Loader,
|
|
|
|
PyPDFLoader,
|
2023-03-04 04:59:28 +00:00
|
|
|
UnstructuredPDFLoader,
|
|
|
|
)
|
|
|
|
|
|
|
|
|
2023-06-20 05:31:43 +00:00
|
|
|
def test_unstructured_pdf_loader_elements_mode() -> None:
    """Check that ``mode="elements"`` yields two documents for ``hello.pdf``."""
    pdf_path = Path(__file__).parent.parent / "examples/hello.pdf"
    documents = UnstructuredPDFLoader(str(pdf_path), mode="elements").load()

    assert len(documents) == 2
|
|
|
|
|
|
|
|
|
|
|
|
def test_unstructured_pdf_loader_paged_mode() -> None:
    """Check that ``mode="paged"`` yields 16 documents for the sample paper."""
    pdf_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
    documents = UnstructuredPDFLoader(str(pdf_path), mode="paged").load()

    assert len(documents) == 16
|
|
|
|
|
|
|
|
|
|
|
|
def test_unstructured_pdf_loader_default_mode() -> None:
    """Check that the loader's default mode yields one document for ``hello.pdf``."""
    pdf_path = Path(__file__).parent.parent / "examples/hello.pdf"
    documents = UnstructuredPDFLoader(str(pdf_path)).load()

    assert len(documents) == 1
|
|
|
|
|
|
|
|
|
|
|
|
def test_pdfminer_loader() -> None:
    """Exercise PDFMinerLoader with and without the ``concatenate_pages`` flag."""
    examples_parent = Path(__file__).parent.parent
    hello_pdf = str(examples_parent / "examples/hello.pdf")
    paper_pdf = str(examples_parent / "examples/layout-parser-paper.pdf")

    # With default arguments, each file is expected to load as one document.
    for pdf in (hello_pdf, paper_pdf):
        documents = PDFMinerLoader(pdf).load()
        assert len(documents) == 1

    # Verify that the concatenate_pages parameter works
    documents = PDFMinerLoader(hello_pdf, concatenate_pages=True).load()
    assert len(documents) == 1

    # With concatenation disabled the sample paper loads as 16 documents.
    documents = PDFMinerLoader(paper_pdf, concatenate_pages=False).load()
    assert len(documents) == 16
|
|
|
|
|
2023-03-04 04:59:28 +00:00
|
|
|
|
2023-04-10 00:57:25 +00:00
|
|
|
def test_pdfminer_pdf_as_html_loader() -> None:
    """Check that PDFMinerPDFasHTMLLoader returns one document per example PDF."""
    examples_parent = Path(__file__).parent.parent

    for relative_path in ("examples/hello.pdf", "examples/layout-parser-paper.pdf"):
        pdf_path = examples_parent / relative_path
        documents = PDFMinerPDFasHTMLLoader(str(pdf_path)).load()
        assert len(documents) == 1
|
|
|
|
|
|
|
|
|
2023-05-06 00:55:31 +00:00
|
|
|
def test_pypdf_loader() -> None:
    """Check the number of documents PyPDFLoader returns for each example PDF."""
    examples_parent = Path(__file__).parent.parent
    # Relative example path -> number of documents the loader should return.
    expected_counts = {
        "examples/hello.pdf": 1,
        "examples/layout-parser-paper.pdf": 16,
    }

    for relative_path, expected in expected_counts.items():
        documents = PyPDFLoader(str(examples_parent / relative_path)).load()
        assert len(documents) == expected
|
|
|
|
|
|
|
|
|
|
|
|
def test_pypdfium2_loader() -> None:
    """Check the number of documents PyPDFium2Loader returns per example PDF."""
    examples_parent = Path(__file__).parent.parent
    # (relative example path, number of documents the loader should return)
    cases = (
        ("examples/hello.pdf", 1),
        ("examples/layout-parser-paper.pdf", 16),
    )

    for relative_path, expected in cases:
        documents = PyPDFium2Loader(str(examples_parent / relative_path)).load()
        assert len(documents) == expected
|
|
|
|
|
|
|
|
|
2023-03-04 04:59:28 +00:00
|
|
|
def test_pymupdf_loader() -> None:
    """Exercise PyMuPDFLoader on two local example PDFs and on a remote URL."""
    examples_parent = Path(__file__).parent.parent

    hello_loader = PyMuPDFLoader(str(examples_parent / "examples/hello.pdf"))
    hello_docs = hello_loader.load()
    assert len(hello_docs) == 1

    paper_loader = PyMuPDFLoader(
        str(examples_parent / "examples/layout-parser-paper.pdf")
    )
    paper_docs = paper_loader.load()
    assert len(paper_docs) == 16
    # A loader built from a local path must not record a web path.
    assert paper_loader.web_path is None

    web_path = "https://people.sc.fsu.edu/~jpeterson/hello_world.pdf"
    remote_loader = PyMuPDFLoader(web_path)
    remote_docs = remote_loader.load()
    # The URL is kept as web_path while file_path points somewhere else
    # (presumably a downloaded local copy -- confirm against the loader).
    assert remote_loader.web_path == web_path
    assert remote_loader.file_path != web_path
    assert len(remote_docs) == 1
|
2023-04-29 03:11:22 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_mathpix_loader() -> None:
    """Check that MathpixPDFLoader returns one document per example PDF.

    The page content of each returned document is printed to stdout.
    """
    examples_parent = Path(__file__).parent.parent

    for relative_path in ("examples/hello.pdf", "examples/layout-parser-paper.pdf"):
        pdf_path = examples_parent / relative_path
        documents = MathpixPDFLoader(str(pdf_path)).load()
        assert len(documents) == 1
        print(documents[0].page_content)
|
2023-08-04 19:55:06 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Sample image URL shared by the first five parametrized cases below.
_TEXTRACT_SAMPLE_IMAGE_URL = (
    "https://amazon-textract-public-content.s3.us-east-2.amazonaws.com"
    "/langchain/alejandro_rosalez_sample_1.jpg"
)
# Local single-document example PDF used by two of the parametrized cases.
_TEXTRACT_HELLO_PDF = str(Path(__file__).parent.parent / "examples/hello.pdf")


@pytest.mark.parametrize(
    "file_path, features, docs_length, create_client",
    [
        (_TEXTRACT_SAMPLE_IMAGE_URL, ["FORMS", "TABLES", "LAYOUT"], 1, False),
        (_TEXTRACT_SAMPLE_IMAGE_URL, [], 1, False),
        (_TEXTRACT_SAMPLE_IMAGE_URL, ["TABLES"], 1, False),
        (_TEXTRACT_SAMPLE_IMAGE_URL, ["FORMS"], 1, False),
        (_TEXTRACT_SAMPLE_IMAGE_URL, ["LAYOUT"], 1, False),
        (_TEXTRACT_HELLO_PDF, ["FORMS"], 1, False),
        (_TEXTRACT_HELLO_PDF, [], 1, False),
        (
            "s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf",
            ["FORMS", "TABLES", "LAYOUT"],
            16,
            True,
        ),
    ],
)
@pytest.mark.skip(reason="Requires AWS credentials to run")
def test_amazontextract_loader(
    file_path: str,
    features: Union[Sequence[str], None],
    docs_length: int,
    create_client: bool,
) -> None:
    """Load a document through AmazonTextractPDFLoader and check the count.

    Args:
        file_path: URL, local path, or ``s3://`` URI handed to the loader.
        features: Value passed as the loader's ``textract_features`` argument.
        docs_length: Number of documents ``load()`` is expected to return.
        create_client: When true, a boto3 ``textract`` client for region
            ``us-east-2`` is created and passed to the loader explicitly;
            otherwise the loader is constructed without a client.
    """
    if not create_client:
        loader = AmazonTextractPDFLoader(file_path, textract_features=features)
    else:
        # Imported lazily so boto3 is only needed for cases that ask for it.
        import boto3

        explicit_client = boto3.client("textract", region_name="us-east-2")
        loader = AmazonTextractPDFLoader(
            file_path, textract_features=features, client=explicit_client
        )

    docs = loader.load()
    print(docs)

    assert len(docs) == docs_length
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.skip(reason="Requires AWS credentials to run")
def test_amazontextract_loader_failures() -> None:
    """Check that loading a two-page local PDF raises ``ValueError``."""
    # 2-page PDF local file system
    examples_parent = Path(__file__).parent.parent
    pdf_path = examples_parent / "examples/multi-page-forms-sample-2-page.pdf"
    loader = AmazonTextractPDFLoader(str(pdf_path))

    with pytest.raises(ValueError):
        loader.load()
|