langchain/libs/community/tests/integration_tests/document_loaders/test_pdf.py

from pathlib import Path
from typing import Sequence, Union

import pytest

from langchain_community.document_loaders import (
    AmazonTextractPDFLoader,
    MathpixPDFLoader,
    PDFMinerLoader,
    PDFMinerPDFasHTMLLoader,
    PyMuPDFLoader,
    PyPDFium2Loader,
    PyPDFLoader,
    UnstructuredPDFLoader,
)


def test_unstructured_pdf_loader_elements_mode() -> None:
    """Check that ``mode="elements"`` yields two documents for hello.pdf."""
    base_dir = Path(__file__).parent.parent
    hello_pdf = base_dir / "examples/hello.pdf"

    documents = UnstructuredPDFLoader(str(hello_pdf), mode="elements").load()

    assert len(documents) == 2


def test_unstructured_pdf_loader_paged_mode() -> None:
    """Check that ``mode="paged"`` yields 16 documents for the sample paper."""
    base_dir = Path(__file__).parent.parent
    paper_pdf = base_dir / "examples/layout-parser-paper.pdf"

    documents = UnstructuredPDFLoader(str(paper_pdf), mode="paged").load()

    # Presumably one document per page of the paper.
    assert len(documents) == 16


def test_unstructured_pdf_loader_default_mode() -> None:
    """Check that the loader's default mode yields one document for hello.pdf."""
    base_dir = Path(__file__).parent.parent
    hello_pdf = base_dir / "examples/hello.pdf"

    documents = UnstructuredPDFLoader(str(hello_pdf)).load()

    assert len(documents) == 1


def test_pdfminer_loader() -> None:
    """Exercise PDFMinerLoader with and without ``concatenate_pages``.

    With default arguments both sample PDFs are expected to load as a single
    document. Passing ``concatenate_pages=True`` for hello.pdf must also give
    one document, while ``concatenate_pages=False`` for the layout-parser
    paper must give 16 documents.
    """
    base_dir = Path(__file__).parent.parent
    hello_pdf = str(base_dir / "examples/hello.pdf")
    paper_pdf = str(base_dir / "examples/layout-parser-paper.pdf")

    # Default arguments: a single document per file.
    assert len(PDFMinerLoader(hello_pdf).load()) == 1
    assert len(PDFMinerLoader(paper_pdf).load()) == 1

    # Explicitly requesting concatenation still gives a single document.
    merged_docs = PDFMinerLoader(hello_pdf, concatenate_pages=True).load()
    assert len(merged_docs) == 1

    # Without concatenation the paper comes back as 16 separate documents.
    split_docs = PDFMinerLoader(paper_pdf, concatenate_pages=False).load()
    assert len(split_docs) == 16


def test_pdfminer_pdf_as_html_loader() -> None:
    """Check that PDFMinerPDFasHTMLLoader returns one document per sample PDF."""
    base_dir = Path(__file__).parent.parent
    hello_pdf = str(base_dir / "examples/hello.pdf")
    paper_pdf = str(base_dir / "examples/layout-parser-paper.pdf")

    hello_docs = PDFMinerPDFasHTMLLoader(hello_pdf).load()
    assert len(hello_docs) == 1

    # The multi-page paper is also expected to come back as one document.
    paper_docs = PDFMinerPDFasHTMLLoader(paper_pdf).load()
    assert len(paper_docs) == 1


def test_pypdf_loader() -> None:
    """Check the document counts PyPDFLoader produces for the sample PDFs.

    hello.pdf must load as 1 document and the layout-parser paper as 16.
    """
    base_dir = Path(__file__).parent.parent
    hello_pdf = str(base_dir / "examples/hello.pdf")
    paper_pdf = str(base_dir / "examples/layout-parser-paper.pdf")

    hello_docs = PyPDFLoader(hello_pdf).load()
    assert len(hello_docs) == 1

    paper_docs = PyPDFLoader(paper_pdf).load()
    assert len(paper_docs) == 16


def test_pypdfium2_loader() -> None:
    """Check the document counts PyPDFium2Loader produces for the sample PDFs.

    hello.pdf must load as 1 document and the layout-parser paper as 16.
    """
    base_dir = Path(__file__).parent.parent
    hello_pdf = str(base_dir / "examples/hello.pdf")
    paper_pdf = str(base_dir / "examples/layout-parser-paper.pdf")

    hello_docs = PyPDFium2Loader(hello_pdf).load()
    assert len(hello_docs) == 1

    paper_docs = PyPDFium2Loader(paper_pdf).load()
    assert len(paper_docs) == 16


def test_pymupdf_loader() -> None:
    """Load two local PDFs and one remote PDF with PyMuPDFLoader.

    Local files: hello.pdf must give 1 document, the layout-parser paper 16,
    and a loader built from a local path must report ``web_path`` as None.

    Remote file: a loader built from a URL must keep that URL in ``web_path``,
    expose a ``file_path`` that differs from the URL, and give 1 document.
    """
    base_dir = Path(__file__).parent.parent
    hello_pdf = str(base_dir / "examples/hello.pdf")
    paper_pdf = str(base_dir / "examples/layout-parser-paper.pdf")

    hello_docs = PyMuPDFLoader(hello_pdf).load()
    assert len(hello_docs) == 1

    paper_loader = PyMuPDFLoader(paper_pdf)
    paper_docs = paper_loader.load()
    assert len(paper_docs) == 16
    assert paper_loader.web_path is None

    # This part needs network access to fetch the remote PDF.
    remote_url = "https://people.sc.fsu.edu/~jpeterson/hello_world.pdf"
    remote_loader = PyMuPDFLoader(remote_url)
    remote_docs = remote_loader.load()
    assert remote_loader.web_path == remote_url
    # Presumably file_path points at a local copy of the download.
    assert remote_loader.file_path != remote_url
    assert len(remote_docs) == 1


def test_mathpix_loader() -> None:
    """Check that MathpixPDFLoader returns one document per sample PDF.

    The page content of each loaded document is printed after its check.
    """
    base_dir = Path(__file__).parent.parent
    sample_pdfs = ("examples/hello.pdf", "examples/layout-parser-paper.pdf")

    for relative_path in sample_pdfs:
        pdf_path = base_dir / relative_path
        documents = MathpixPDFLoader(str(pdf_path)).load()

        assert len(documents) == 1
        print(documents[0].page_content)


# Inputs shared by several parametrized Textract cases below.
_TEXTRACT_SAMPLE_IMAGE_URL = (
    "https://amazon-textract-public-content.s3.us-east-2.amazonaws.com"
    "/langchain/alejandro_rosalez_sample_1.jpg"
)
_TEXTRACT_LOCAL_HELLO_PDF = str(Path(__file__).parent.parent / "examples/hello.pdf")


@pytest.mark.parametrize(
    "file_path, features, docs_length, create_client",
    [
        # Remote image over HTTPS with different feature combinations.
        (_TEXTRACT_SAMPLE_IMAGE_URL, ["FORMS", "TABLES", "LAYOUT"], 1, False),
        (_TEXTRACT_SAMPLE_IMAGE_URL, [], 1, False),
        (_TEXTRACT_SAMPLE_IMAGE_URL, ["TABLES"], 1, False),
        (_TEXTRACT_SAMPLE_IMAGE_URL, ["FORMS"], 1, False),
        (_TEXTRACT_SAMPLE_IMAGE_URL, ["LAYOUT"], 1, False),
        # Single-page PDF from the local file system.
        (_TEXTRACT_LOCAL_HELLO_PDF, ["FORMS"], 1, False),
        (_TEXTRACT_LOCAL_HELLO_PDF, [], 1, False),
        # PDF stored in S3, loaded through an explicitly created client.
        (
            "s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf",
            ["FORMS", "TABLES", "LAYOUT"],
            16,
            True,
        ),
    ],
)
@pytest.mark.skip(reason="Requires AWS credentials to run")
def test_amazontextract_loader(
    file_path: str,
    features: Union[Sequence[str], None],
    docs_length: int,
    create_client: bool,
) -> None:
    """Load one source with AmazonTextractPDFLoader and check the document count.

    Args:
        file_path: URL, local path or ``s3://`` URI of the document to load.
        features: Values passed to the loader as ``textract_features``.
        docs_length: Number of documents ``load()`` is expected to return.
        create_client: When true, a boto3 Textract client for ``us-east-2`` is
            created and handed to the loader; otherwise the loader is built
            without an explicit client.
    """
    if not create_client:
        loader = AmazonTextractPDFLoader(file_path, textract_features=features)
    else:
        # Imported lazily so boto3 is only needed for the cases that use it.
        import boto3

        textract_client = boto3.client("textract", region_name="us-east-2")
        loader = AmazonTextractPDFLoader(
            file_path, textract_features=features, client=textract_client
        )

    docs = loader.load()
    print(docs)

    assert len(docs) == docs_length


@pytest.mark.skip(reason="Requires AWS credentials to run")
def test_amazontextract_loader_failures() -> None:
    """Expect ``load()`` to raise ValueError for a local two-page PDF.

    NOTE(review): presumably multi-page documents are only accepted from S3;
    confirm against the loader implementation.
    """
    base_dir = Path(__file__).parent.parent
    # Two-page PDF read from the local file system.
    two_page_pdf = base_dir / "examples/multi-page-forms-sample-2-page.pdf"

    loader = AmazonTextractPDFLoader(str(two_page_pdf))

    with pytest.raises(ValueError):
        loader.load()
Add PyMuPDF PDF loader (#1426) Different PDF libraries have different strengths and weaknesses. PyMuPDF does a good job at extracting the most amount of content from the doc, regardless of the source quality, extremely fast (especially compared to Unstructured). https://pymupdf.readthedocs.io/en/latest/index.html 2023-03-04 04:59:28 +00:00			`from pathlib import Path`
Amazon Textract as document loader (#8661) Description: Adding support for [Amazon Textract](https://aws.amazon.com/textract/) as a PDF document loader --------- Co-authored-by: schadem <45048633+schadem@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-08-04 19:55:06 +00:00			`from typing import Sequence, Union`

			`import pytest`
Add PyMuPDF PDF loader (#1426) Different PDF libraries have different strengths and weaknesses. PyMuPDF does a good job at extracting the most amount of content from the doc, regardless of the source quality, extremely fast (especially compared to Unstructured). https://pymupdf.readthedocs.io/en/latest/index.html 2023-03-04 04:59:28 +00:00
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463) Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv 
langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes 2023-12-11 21:53:30 +00:00			`from langchain_community.document_loaders import (`
Amazon Textract as document loader (#8661) Description: Adding support for [Amazon Textract](https://aws.amazon.com/textract/) as a PDF document loader --------- Co-authored-by: schadem <45048633+schadem@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-08-04 19:55:06 +00:00			`AmazonTextractPDFLoader,`
Dev2049/pypdfium2 (#4209) thanks @jerrytigerxu for the addition! --------- Co-authored-by: Jere Xu <jtxu2008@gmail.com> Co-authored-by: jerrytigerxu <jere.tiger.xu@gmailc.om> 2023-05-06 00:55:31 +00:00			`MathpixPDFLoader,`
Add PyMuPDF PDF loader (#1426) Different PDF libraries have different strengths and weaknesses. PyMuPDF does a good job at extracting the most amount of content from the doc, regardless of the source quality, extremely fast (especially compared to Unstructured). https://pymupdf.readthedocs.io/en/latest/index.html 2023-03-04 04:59:28 +00:00			`PDFMinerLoader,`
Add new loader to load pdf as html content (#2607) Adds a new pdf loader using the existing dependency on PDFMiner. The new loader can be helpful for chunking texts semantically into sections as the output html content can be parsed via `BeautifulSoup` to get more structured and rich information about font size, page numbers, pdf headers/footers, etc. which may not be available otherwise with other pdf loaders 2023-04-10 00:57:25 +00:00			`PDFMinerPDFasHTMLLoader,`
Add PyMuPDF PDF loader (#1426) Different PDF libraries have different strengths and weaknesses. PyMuPDF does a good job at extracting the most amount of content from the doc, regardless of the source quality, extremely fast (especially compared to Unstructured). https://pymupdf.readthedocs.io/en/latest/index.html 2023-03-04 04:59:28 +00:00			`PyMuPDFLoader,`
Dev2049/pypdfium2 (#4209) thanks @jerrytigerxu for the addition! --------- Co-authored-by: Jere Xu <jtxu2008@gmail.com> Co-authored-by: jerrytigerxu <jere.tiger.xu@gmailc.om> 2023-05-06 00:55:31 +00:00			`PyPDFium2Loader,`
			`PyPDFLoader,`
Add PyMuPDF PDF loader (#1426) Different PDF libraries have different strengths and weaknesses. PyMuPDF does a good job at extracting the most amount of content from the doc, regardless of the source quality, extremely fast (especially compared to Unstructured). https://pymupdf.readthedocs.io/en/latest/index.html 2023-03-04 04:59:28 +00:00			`UnstructuredPDFLoader,`
			`)`


Harrison/unstructured page number (#6464) Co-authored-by: Reza Sanaie <reza@sanaie.ca> 2023-06-20 05:31:43 +00:00			`def test_unstructured_pdf_loader_elements_mode() -> None:`
			`"""Test unstructured loader with various modes."""`
			`file_path = Path(__file__).parent.parent / "examples/hello.pdf"`
			`loader = UnstructuredPDFLoader(str(file_path), mode="elements")`
			`docs = loader.load()`

			`assert len(docs) == 2`


			`def test_unstructured_pdf_loader_paged_mode() -> None:`
			`"""Test unstructured loader with various modes."""`
			`file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"`
			`loader = UnstructuredPDFLoader(str(file_path), mode="paged")`
			`docs = loader.load()`

			`assert len(docs) == 16`


			`def test_unstructured_pdf_loader_default_mode() -> None:`
Add PyMuPDF PDF loader (#1426) Different PDF libraries have different strengths and weaknesses. PyMuPDF does a good job at extracting the most amount of content from the doc, regardless of the source quality, extremely fast (especially compared to Unstructured). https://pymupdf.readthedocs.io/en/latest/index.html 2023-03-04 04:59:28 +00:00			`"""Test unstructured loader."""`
			`file_path = Path(__file__).parent.parent / "examples/hello.pdf"`
			`loader = UnstructuredPDFLoader(str(file_path))`
			`docs = loader.load()`

			`assert len(docs) == 1`


			`def test_pdfminer_loader() -> None:`
			`"""Test PDFMiner loader."""`
			`file_path = Path(__file__).parent.parent / "examples/hello.pdf"`
			`loader = PDFMinerLoader(str(file_path))`
			`docs = loader.load()`

			`assert len(docs) == 1`

			`file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"`
			`loader = PDFMinerLoader(str(file_path))`

			`docs = loader.load()`
			`assert len(docs) == 1`

feat: Add page metadata on PDFMinerLoader (#12277) - Description: #12273 's suggestion PR Like other PDFLoader, loading pdf per each page and giving page metadata. - Issue: #12273 - Twitter handle: @blue0_0hope --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com> 2023-11-01 15:25:37 +00:00			`# Verify that concatenating pages parameter works`
			`file_path = Path(__file__).parent.parent / "examples/hello.pdf"`
			`loader = PDFMinerLoader(str(file_path), concatenate_pages=True)`
			`docs = loader.load()`

			`assert len(docs) == 1`

			`file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"`
			`loader = PDFMinerLoader(str(file_path), concatenate_pages=False)`

			`docs = loader.load()`
			`assert len(docs) == 16`

Add PyMuPDF PDF loader (#1426) Different PDF libraries have different strengths and weaknesses. PyMuPDF does a good job at extracting the most amount of content from the doc, regardless of the source quality, extremely fast (especially compared to Unstructured). https://pymupdf.readthedocs.io/en/latest/index.html 2023-03-04 04:59:28 +00:00
Add new loader to load pdf as html content (#2607) Adds a new pdf loader using the existing dependency on PDFMiner. The new loader can be helpful for chunking texts semantically into sections as the output html content can be parsed via `BeautifulSoup` to get more structured and rich information about font size, page numbers, pdf headers/footers, etc. which may not be available otherwise with other pdf loaders 2023-04-10 00:57:25 +00:00			`def test_pdfminer_pdf_as_html_loader() -> None:`
			`"""Test PDFMinerPDFasHTMLLoader."""`
			`file_path = Path(__file__).parent.parent / "examples/hello.pdf"`
			`loader = PDFMinerPDFasHTMLLoader(str(file_path))`
			`docs = loader.load()`

			`assert len(docs) == 1`

			`file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"`
			`loader = PDFMinerPDFasHTMLLoader(str(file_path))`

			`docs = loader.load()`
			`assert len(docs) == 1`


Dev2049/pypdfium2 (#4209) thanks @jerrytigerxu for the addition! --------- Co-authored-by: Jere Xu <jtxu2008@gmail.com> Co-authored-by: jerrytigerxu <jere.tiger.xu@gmailc.om> 2023-05-06 00:55:31 +00:00			`def test_pypdf_loader() -> None:`
			`"""Test PyPDFLoader."""`
			`file_path = Path(__file__).parent.parent / "examples/hello.pdf"`
			`loader = PyPDFLoader(str(file_path))`
			`docs = loader.load()`

			`assert len(docs) == 1`

			`file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"`
			`loader = PyPDFLoader(str(file_path))`

			`docs = loader.load()`
			`assert len(docs) == 16`


			`def test_pypdfium2_loader() -> None:`
			`"""Test PyPDFium2Loader."""`
			`file_path = Path(__file__).parent.parent / "examples/hello.pdf"`
			`loader = PyPDFium2Loader(str(file_path))`
			`docs = loader.load()`

			`assert len(docs) == 1`

			`file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"`
			`loader = PyPDFium2Loader(str(file_path))`

			`docs = loader.load()`
			`assert len(docs) == 16`


Add PyMuPDF PDF loader (#1426) Different PDF libraries have different strengths and weaknesses. PyMuPDF does a good job at extracting the most amount of content from the doc, regardless of the source quality, extremely fast (especially compared to Unstructured). https://pymupdf.readthedocs.io/en/latest/index.html 2023-03-04 04:59:28 +00:00			`def test_pymupdf_loader() -> None:`
			`"""Test PyMuPDF loader."""`
			`file_path = Path(__file__).parent.parent / "examples/hello.pdf"`
			`loader = PyMuPDFLoader(str(file_path))`

			`docs = loader.load()`
			`assert len(docs) == 1`

			`file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"`
			`loader = PyMuPDFLoader(str(file_path))`

			`docs = loader.load()`
			`assert len(docs) == 16`
Harrison/remote paths pdf (#1544) Co-authored-by: Tim Asp <707699+timothyasp@users.noreply.github.com> 2023-03-09 04:53:37 +00:00			`assert loader.web_path is None`

			`web_path = "https://people.sc.fsu.edu/~jpeterson/hello_world.pdf"`
			`loader = PyMuPDFLoader(web_path)`

			`docs = loader.load()`
			`assert loader.web_path == web_path`
			`assert loader.file_path != web_path`
			`assert len(docs) == 1`
Add Mathpix pdf loader (#3727) Inspo https://twitter.com/danielgross/status/1651695062307274754?s=46&t=1zHLap5WG4I_kQPPjfW9fA Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> 2023-04-29 03:11:22 +00:00

			`def test_mathpix_loader() -> None:`
			`file_path = Path(__file__).parent.parent / "examples/hello.pdf"`
			`loader = MathpixPDFLoader(str(file_path))`
			`docs = loader.load()`

			`assert len(docs) == 1`
			`print(docs[0].page_content)`

			`file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"`
			`loader = MathpixPDFLoader(str(file_path))`

			`docs = loader.load()`
			`assert len(docs) == 1`
			`print(docs[0].page_content)`
Amazon Textract as document loader (#8661) Description: Adding support for [Amazon Textract](https://aws.amazon.com/textract/) as a PDF document loader --------- Co-authored-by: schadem <45048633+schadem@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-08-04 19:55:06 +00:00

			`@pytest.mark.parametrize(`
			`"file_path, features, docs_length, create_client",`
			`[`
			`(`
			`(`
			`"https://amazon-textract-public-content.s3.us-east-2.amazonaws.com"`
			`"/langchain/alejandro_rosalez_sample_1.jpg"`
			`),`
Textract linearizer (#12446) Description: Textract PDF Loader generating linearized output, meaning it will replicate the structure of the source document as close as possible based on the features passed into the call (e. g. LAYOUT, FORMS, TABLES). With LAYOUT reading order for multi-column documents or identification of lists and figures is supported and with TABLES it will generate the table structure as well. FORMS will indicate "key: value" with columms. - Issue: the issue fixes #12068 - Dependencies: amazon-textract-textractor is added, which provides the linearization - Tag maintainer: @3coins --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-10-31 01:02:10 +00:00			`["FORMS", "TABLES", "LAYOUT"],`
			`1,`
			`False,`
			`),`
			`(`
			`(`
			`"https://amazon-textract-public-content.s3.us-east-2.amazonaws.com"`
			`"/langchain/alejandro_rosalez_sample_1.jpg"`
			`),`
			`[],`
			`1,`
			`False,`
			`),`
			`(`
			`(`
			`"https://amazon-textract-public-content.s3.us-east-2.amazonaws.com"`
			`"/langchain/alejandro_rosalez_sample_1.jpg"`
			`),`
			`["TABLES"],`
			`1,`
			`False,`
			`),`
			`(`
			`(`
			`"https://amazon-textract-public-content.s3.us-east-2.amazonaws.com"`
			`"/langchain/alejandro_rosalez_sample_1.jpg"`
			`),`
			`["FORMS"],`
			`1,`
			`False,`
			`),`
			`(`
			`(`
			`"https://amazon-textract-public-content.s3.us-east-2.amazonaws.com"`
			`"/langchain/alejandro_rosalez_sample_1.jpg"`
			`),`
			`["LAYOUT"],`
Amazon Textract as document loader (#8661) Description: Adding support for [Amazon Textract](https://aws.amazon.com/textract/) as a PDF document loader --------- Co-authored-by: schadem <45048633+schadem@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-08-04 19:55:06 +00:00			`1,`
			`False,`
			`),`
			`(str(Path(__file__).parent.parent / "examples/hello.pdf"), ["FORMS"], 1, False),`
Textract linearizer (#12446) Description: Textract PDF Loader generating linearized output, meaning it will replicate the structure of the source document as close as possible based on the features passed into the call (e. g. LAYOUT, FORMS, TABLES). With LAYOUT reading order for multi-column documents or identification of lists and figures is supported and with TABLES it will generate the table structure as well. FORMS will indicate "key: value" with columms. - Issue: the issue fixes #12068 - Dependencies: amazon-textract-textractor is added, which provides the linearization - Tag maintainer: @3coins --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-10-31 01:02:10 +00:00			`(str(Path(__file__).parent.parent / "examples/hello.pdf"), [], 1, False),`
Amazon Textract as document loader (#8661) Description: Adding support for [Amazon Textract](https://aws.amazon.com/textract/) as a PDF document loader --------- Co-authored-by: schadem <45048633+schadem@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-08-04 19:55:06 +00:00			`(`
			`"s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf",`
Textract linearizer (#12446) Description: Textract PDF Loader generating linearized output, meaning it will replicate the structure of the source document as close as possible based on the features passed into the call (e. g. LAYOUT, FORMS, TABLES). With LAYOUT reading order for multi-column documents or identification of lists and figures is supported and with TABLES it will generate the table structure as well. FORMS will indicate "key: value" with columms. - Issue: the issue fixes #12068 - Dependencies: amazon-textract-textractor is added, which provides the linearization - Tag maintainer: @3coins --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-10-31 01:02:10 +00:00			`["FORMS", "TABLES", "LAYOUT"],`
Amazon Textract as document loader (#8661) Description: Adding support for [Amazon Textract](https://aws.amazon.com/textract/) as a PDF document loader --------- Co-authored-by: schadem <45048633+schadem@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-08-04 19:55:06 +00:00			`16,`
			`True,`
			`),`
			`],`
			`)`
			`@pytest.mark.skip(reason="Requires AWS credentials to run")`
			`def test_amazontextract_loader(`
			`file_path: str,`
			`features: Union[Sequence[str], None],`
			`docs_length: int,`
			`create_client: bool,`
			`) -> None:`
			`if create_client:`
			`import boto3`

			`textract_client = boto3.client("textract", region_name="us-east-2")`
			`loader = AmazonTextractPDFLoader(`
			`file_path, textract_features=features, client=textract_client`
			`)`
			`else:`
			`loader = AmazonTextractPDFLoader(file_path, textract_features=features)`
			`docs = loader.load()`
Textract linearizer (#12446) Description: Textract PDF Loader generating linearized output, meaning it will replicate the structure of the source document as close as possible based on the features passed into the call (e. g. LAYOUT, FORMS, TABLES). With LAYOUT reading order for multi-column documents or identification of lists and figures is supported and with TABLES it will generate the table structure as well. FORMS will indicate "key: value" with columms. - Issue: the issue fixes #12068 - Dependencies: amazon-textract-textractor is added, which provides the linearization - Tag maintainer: @3coins --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-10-31 01:02:10 +00:00			`print(docs)`
Amazon Textract as document loader (#8661) Description: Adding support for [Amazon Textract](https://aws.amazon.com/textract/) as a PDF document loader --------- Co-authored-by: schadem <45048633+schadem@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-08-04 19:55:06 +00:00
			`assert len(docs) == docs_length`


			`@pytest.mark.skip(reason="Requires AWS credentials to run")`
			`def test_amazontextract_loader_failures() -> None:`
			`# 2-page PDF local file system`
			`two_page_pdf = str(`
			`Path(__file__).parent.parent / "examples/multi-page-forms-sample-2-page.pdf"`
			`)`
			`loader = AmazonTextractPDFLoader(two_page_pdf)`
			`with pytest.raises(ValueError):`
			`loader.load()`