Amazon Textract as document loader (#8661)

Description: Adding support for [Amazon
Textract](https://aws.amazon.com/textract/) as a PDF document loader

---------

Co-authored-by: schadem <45048633+schadem@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
pull/8297/head
Piyush Jain 1 year ago committed by GitHub
parent 82ef1f587d
commit 8374367de2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -108,6 +108,7 @@ from langchain.document_loaders.onedrive_file import OneDriveFileLoader
from langchain.document_loaders.open_city_data import OpenCityDataLoader from langchain.document_loaders.open_city_data import OpenCityDataLoader
from langchain.document_loaders.org_mode import UnstructuredOrgModeLoader from langchain.document_loaders.org_mode import UnstructuredOrgModeLoader
from langchain.document_loaders.pdf import ( from langchain.document_loaders.pdf import (
AmazonTextractPDFLoader,
MathpixPDFLoader, MathpixPDFLoader,
OnlinePDFLoader, OnlinePDFLoader,
PDFMinerLoader, PDFMinerLoader,
@ -330,4 +331,5 @@ __all__ = [
"YoutubeAudioLoader", "YoutubeAudioLoader",
"YoutubeLoader", "YoutubeLoader",
"ConcurrentLoader", "ConcurrentLoader",
"AmazonTextractPDFLoader",
] ]

@ -1,5 +1,6 @@
"""Module contains common parsers for PDFs.""" """Module contains common parsers for PDFs."""
from typing import Any, Iterator, Mapping, Optional, Union from typing import Any, Iterator, Mapping, Optional, Sequence, Union
from urllib.parse import urlparse
from langchain.document_loaders.base import BaseBlobParser from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob from langchain.document_loaders.blob_loaders import Blob
@ -149,3 +150,97 @@ class PDFPlumberParser(BaseBlobParser):
) )
for page in doc.pages for page in doc.pages
] ]
class AmazonTextractPDFParser(BaseBlobParser):
    """Sends PDF files to Amazon Textract and parses them to generate Documents.

    For parsing multi-page PDFs, they have to reside on S3.
    """

    def __init__(
        self,
        textract_features: Optional[Sequence[int]] = None,
        client: Optional[Any] = None,
    ) -> None:
        """Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                should be passed as an int that conforms to the enum
                `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client
        """
        # Keep the try-body minimal: only the import is guarded, so that a
        # ValueError from an invalid feature int is not masked by the
        # ImportError handler.
        try:
            import textractcaller as tc
        except ImportError:
            raise ModuleNotFoundError(
                "Could not import amazon-textract-caller python package. "
                "Please install it with `pip install amazon-textract-caller`."
            )
        self.tc = tc
        if textract_features is not None:
            # Convert raw ints to Textract_Features enum members; an invalid
            # value raises ValueError from the enum itself.
            self.textract_features = [
                tc.Textract_Features(f) for f in textract_features
            ]
        else:
            self.textract_features = []

        if not client:
            try:
                import boto3
            except ImportError:
                raise ModuleNotFoundError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                )
            # Default client uses the standard boto3 credential chain.
            self.boto3_textract_client = boto3.client("textract")
        else:
            self.boto3_textract_client = client

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Iterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers If multi-page document, blob.path
        has to be set to the S3 URI and for single page docs the blob.data is taken
        """
        url_parse_result = urlparse(str(blob.path)) if blob.path else None
        # Either call with S3 path (multi-page) or with bytes (single-page)
        if (
            url_parse_result
            and url_parse_result.scheme == "s3"
            and url_parse_result.netloc
        ):
            textract_response_json = self.tc.call_textract(
                input_document=str(blob.path),
                features=self.textract_features,
                boto3_textract_client=self.boto3_textract_client,
            )
        else:
            # Single-page input is sent inline and forced synchronous.
            textract_response_json = self.tc.call_textract(
                input_document=blob.as_bytes(),
                features=self.textract_features,
                call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC,
                boto3_textract_client=self.boto3_textract_client,
            )

        # Accumulate LINE/WORD text per page; a page-number change in the
        # block stream flushes the current page as one Document.
        current_text = ""
        current_page = 1
        for block in textract_response_json["Blocks"]:
            if "Page" in block and not (int(block["Page"]) == current_page):
                yield Document(
                    page_content=current_text,
                    metadata={"source": blob.source, "page": current_page},
                )
                current_text = ""
                current_page = int(block["Page"])
            if "Text" in block:
                current_text += block["Text"] + " "

        # Flush the final (or only) page.
        yield Document(
            page_content=current_text,
            metadata={"source": blob.source, "page": current_page},
        )

@ -7,7 +7,7 @@ import time
from abc import ABC from abc import ABC
from io import StringIO from io import StringIO
from pathlib import Path from pathlib import Path
from typing import Any, Iterator, List, Mapping, Optional, Union from typing import Any, Iterator, List, Mapping, Optional, Sequence, Union
from urllib.parse import urlparse from urllib.parse import urlparse
import requests import requests
@ -16,6 +16,7 @@ from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.blob_loaders import Blob from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.pdf import ( from langchain.document_loaders.parsers.pdf import (
AmazonTextractPDFParser,
PDFMinerParser, PDFMinerParser,
PDFPlumberParser, PDFPlumberParser,
PyMuPDFParser, PyMuPDFParser,
@ -71,8 +72,14 @@ class BasePDFLoader(BaseLoader, ABC):
if "~" in self.file_path: if "~" in self.file_path:
self.file_path = os.path.expanduser(self.file_path) self.file_path = os.path.expanduser(self.file_path)
# If the file is a web path, download it to a temporary file, and use that # If the file is a web path or S3, download it to a temporary file, and use that
if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path): if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
self.temp_dir = tempfile.TemporaryDirectory()
_, suffix = os.path.splitext(self.file_path)
temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
if self._is_s3_url(self.file_path):
self.web_path = self.file_path
else:
r = requests.get(self.file_path) r = requests.get(self.file_path)
if r.status_code != 200: if r.status_code != 200:
@ -82,8 +89,6 @@ class BasePDFLoader(BaseLoader, ABC):
) )
self.web_path = self.file_path self.web_path = self.file_path
self.temp_dir = tempfile.TemporaryDirectory()
temp_pdf = Path(self.temp_dir.name) / "tmp.pdf"
with open(temp_pdf, mode="wb") as f: with open(temp_pdf, mode="wb") as f:
f.write(r.content) f.write(r.content)
self.file_path = str(temp_pdf) self.file_path = str(temp_pdf)
@ -100,6 +105,17 @@ class BasePDFLoader(BaseLoader, ABC):
parsed = urlparse(url) parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme) return bool(parsed.netloc) and bool(parsed.scheme)
@staticmethod
def _is_s3_url(url: str) -> bool:
"""check if the url is S3"""
try:
result = urlparse(url)
if result.scheme == "s3" and result.netloc:
return True
return False
except ValueError:
return False
@property @property
def source(self) -> str: def source(self) -> str:
return self.web_path if self.web_path is not None else self.file_path return self.web_path if self.web_path is not None else self.file_path
@ -440,3 +456,144 @@ class PDFPlumberLoader(BasePDFLoader):
parser = PDFPlumberParser(text_kwargs=self.text_kwargs) parser = PDFPlumberParser(text_kwargs=self.text_kwargs)
blob = Blob.from_path(self.file_path) blob = Blob.from_path(self.file_path)
return parser.parse(blob) return parser.parse(blob)
class AmazonTextractPDFLoader(BasePDFLoader):
    """Loads a PDF document from local file system, HTTP or S3.

    To authenticate, the AWS client uses the following methods to
    automatically load credentials:
    https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html

    If a specific credential profile should be used, you must pass
    the name of the profile from the ~/.aws/credentials file that is to be used.

    Make sure the credentials / roles used have the required policies to
    access the Amazon Textract service.

    Example:
        .. code-block:: python

            from langchain.document_loaders import AmazonTextractPDFLoader
            loader = AmazonTextractPDFLoader(
                file_path="s3://pdfs/myfile.pdf"
            )
            document = loader.load()
    """

    def __init__(
        self,
        file_path: str,
        textract_features: Optional[Sequence[str]] = None,
        client: Optional[Any] = None,
        credentials_profile_name: Optional[str] = None,
        region_name: Optional[str] = None,
        endpoint_url: Optional[str] = None,
    ) -> None:
        """Initialize the loader.

        Args:
            file_path: A file, url or s3 path for input file
            textract_features: Features to be used for extraction, each feature
                should be passed as a str that conforms to the enum
                `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client (Optional)
            credentials_profile_name: AWS profile name, if not default (Optional)
            region_name: AWS region, eg us-east-1 (Optional)
            endpoint_url: endpoint url for the textract service (Optional)
        """
        super().__init__(file_path)

        try:
            import textractcaller as tc  # noqa: F401
        except ImportError:
            raise ModuleNotFoundError(
                "Could not import amazon-textract-caller python package. "
                "Please install it with `pip install amazon-textract-caller`."
            )

        # Feature names are looked up by enum member name (e.g. "FORMS");
        # an unknown name raises KeyError from the enum.
        if textract_features:
            features = [tc.Textract_Features[x] for x in textract_features]
        else:
            features = []

        # Only build a client here when the caller asked for a specific
        # profile/region/endpoint; otherwise the parser falls back to the
        # default boto3 client (or the one passed in).
        if credentials_profile_name or region_name or endpoint_url:
            try:
                import boto3

                if credentials_profile_name is not None:
                    session = boto3.Session(profile_name=credentials_profile_name)
                else:
                    # use default credentials
                    session = boto3.Session()

                client_params = {}
                if region_name:
                    client_params["region_name"] = region_name
                if endpoint_url:
                    client_params["endpoint_url"] = endpoint_url

                client = session.client("textract", **client_params)
            except ImportError:
                raise ModuleNotFoundError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                )
            except Exception as e:
                raise ValueError(
                    "Could not load credentials to authenticate with AWS client. "
                    "Please check that credentials in the specified "
                    "profile name are valid."
                ) from e
        self.parser = AmazonTextractPDFParser(textract_features=features, client=client)

    def load(self) -> List[Document]:
        """Load given path as pages."""
        return list(self.lazy_load())

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load documents.

        The self.file_path is local, but the blob has to include the S3
        location if the file originated from S3 for multi-page documents.

        Raises:
            ValueError: when the document is multi-page but not stored on S3,
                since Textract requires multi-page documents to reside on S3.
        """
        if self.web_path and self._is_s3_url(self.web_path):
            blob = Blob(path=self.web_path)
        else:
            blob = Blob.from_path(self.file_path)
            if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1:
                # Fixed: the original message used backslash continuations
                # that embedded runs of indentation whitespace.
                raise ValueError(
                    f"the file {blob.path} is a multi-page document, "
                    "but not stored on S3. "
                    "Textract requires multi-page documents to be on S3."
                )

        yield from self.parser.parse(blob)

    @staticmethod
    def _get_number_of_pages(blob: Blob) -> int:
        """Return the page count of the blob based on its mime type.

        PDFs are counted via pypdf, TIFFs via their frame count; PNG/JPEG are
        always a single page. Raises ValueError for unsupported mime types.
        """
        try:
            import pypdf
            from PIL import Image, ImageSequence

        except ImportError:
            # Fixed: error message previously misspelled "Pillow" as "Pilloe".
            raise ModuleNotFoundError(
                "Could not import pypdf or Pillow python package. "
                "Please install it with `pip install pypdf Pillow`."
            )
        if blob.mimetype == "application/pdf":
            with blob.as_bytes_io() as input_pdf_file:
                pdf_reader = pypdf.PdfReader(input_pdf_file)
                return len(pdf_reader.pages)
        elif blob.mimetype == "image/tiff":
            # Fixed: Image.open requires a path or file-like object, not raw
            # bytes — the original passed blob.as_bytes() which raises at
            # runtime. Frames are counted with a generator sum.
            with blob.as_bytes_io() as input_tiff_file:
                img = Image.open(input_tiff_file)
                return sum(1 for _ in ImageSequence.Iterator(img))
        elif blob.mimetype in ["image/png", "image/jpeg"]:
            return 1
        else:
            raise ValueError(f"unsupported mime type: {blob.mimetype}")

@ -338,6 +338,42 @@ files = [
{file = "amadeus-8.1.0.tar.gz", hash = "sha256:df31e7c84383a85ee2dce95b11e7a0774fdf31762229f768519b5cb176bc167d"}, {file = "amadeus-8.1.0.tar.gz", hash = "sha256:df31e7c84383a85ee2dce95b11e7a0774fdf31762229f768519b5cb176bc167d"},
] ]
[[package]]
name = "amazon-textract-caller"
version = "0.0.29"
description = "Amazon Textract Caller tools"
category = "main"
optional = true
python-versions = ">=3.6"
files = [
{file = "amazon-textract-caller-0.0.29.tar.gz", hash = "sha256:53770d82db67d4984a99825a90908a319f8920e64d6d48a45456b18d6ab3771a"},
{file = "amazon_textract_caller-0.0.29-py2.py3-none-any.whl", hash = "sha256:c5898fc7e84eea2564a9ececcf9101778b7533fa58e2c8e6eb1daa48869788fc"},
]
[package.dependencies]
amazon-textract-response-parser = ">=0.1.39"
boto3 = ">=1.26.35"
botocore = "*"
[package.extras]
testing = ["amazon-textract-response-parser", "pytest"]
[[package]]
name = "amazon-textract-response-parser"
version = "1.0.0"
description = "Easily parse JSON returned by Amazon Textract."
category = "main"
optional = true
python-versions = ">=3.8"
files = [
{file = "amazon-textract-response-parser-1.0.0.tar.gz", hash = "sha256:52e94e002b714195d678ea83b99ebc11d68ea716c9371852aed03a10e385dd41"},
{file = "amazon_textract_response_parser-1.0.0-py2.py3-none-any.whl", hash = "sha256:668ffb4604ed365de9c60d6a77ca9190c2614679997edfba0ce7398e2579c574"},
]
[package.dependencies]
boto3 = "*"
marshmallow = ">=3.14,<4"
[[package]] [[package]]
name = "anthropic" name = "anthropic"
version = "0.3.2" version = "0.3.2"
@ -4702,6 +4738,7 @@ optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
files = [ files = [
{file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"}, {file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
{file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
] ]
[[package]] [[package]]
@ -13539,7 +13576,7 @@ clarifai = ["clarifai"]
cohere = ["cohere"] cohere = ["cohere"]
docarray = ["docarray"] docarray = ["docarray"]
embeddings = ["sentence-transformers"] embeddings = ["sentence-transformers"]
extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"] extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
javascript = ["esprima"] javascript = ["esprima"]
llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers", "xinference"] llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers", "xinference"]
openai = ["openai", "tiktoken"] openai = ["openai", "tiktoken"]
@ -13549,4 +13586,4 @@ text-helpers = ["chardet"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = ">=3.8.1,<4.0" python-versions = ">=3.8.1,<4.0"
content-hash = "0708c3b45f59eea36919ff9ff99fa6eddc81bccb654cce183641ef8396ea5290" content-hash = "39305f23d3d69179d247d643631133ac50f5e944d98518c8a56c5f839b8e7a04"

@ -130,6 +130,7 @@ gitpython = {version = "^3.1.32", optional = true}
librosa = {version="^0.10.0.post2", optional = true } librosa = {version="^0.10.0.post2", optional = true }
feedparser = {version = "^6.0.10", optional = true} feedparser = {version = "^6.0.10", optional = true}
newspaper3k = {version = "^0.2.8", optional = true} newspaper3k = {version = "^0.2.8", optional = true}
amazon-textract-caller = {version = "<2", optional = true}
[tool.poetry.group.test.dependencies] [tool.poetry.group.test.dependencies]
# The only dependencies that should be added are # The only dependencies that should be added are
@ -329,6 +330,7 @@ all = [
# Please use new-line on formatting to make it easier to add new packages without # Please use new-line on formatting to make it easier to add new packages without
# merge-conflicts # merge-conflicts
extended_testing = [ extended_testing = [
"amazon-textract-caller",
"beautifulsoup4", "beautifulsoup4",
"bibtexparser", "bibtexparser",
"cassio", "cassio",

@ -1,6 +1,10 @@
from pathlib import Path from pathlib import Path
from typing import Sequence, Union
import pytest
from langchain.document_loaders import ( from langchain.document_loaders import (
AmazonTextractPDFLoader,
MathpixPDFLoader, MathpixPDFLoader,
PDFMinerLoader, PDFMinerLoader,
PDFMinerPDFasHTMLLoader, PDFMinerPDFasHTMLLoader,
@ -136,3 +140,56 @@ def test_mathpix_loader() -> None:
docs = loader.load() docs = loader.load()
assert len(docs) == 1 assert len(docs) == 1
print(docs[0].page_content) print(docs[0].page_content)
# Three scenarios: a single-page JPEG over HTTPS with two Textract features,
# a local single-page PDF with one feature, and a 16-page PDF on S3 that needs
# an explicitly-regioned client (multi-page input must reside on S3).
@pytest.mark.parametrize(
    "file_path, features, docs_length, create_client",
    [
        (
            (
                "https://amazon-textract-public-content.s3.us-east-2.amazonaws.com"
                "/langchain/alejandro_rosalez_sample_1.jpg"
            ),
            ["FORMS", "TABLES"],
            1,
            False,
        ),
        (str(Path(__file__).parent.parent / "examples/hello.pdf"), ["FORMS"], 1, False),
        (
            "s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf",
            None,
            16,
            True,
        ),
    ],
)
@pytest.mark.skip(reason="Requires AWS credentials to run")
def test_amazontextract_loader(
    file_path: str,
    features: Union[Sequence[str], None],
    docs_length: int,
    create_client: bool,
) -> None:
    """Loads each fixture through AmazonTextractPDFLoader and checks that one
    Document per page is returned."""
    if create_client:
        # The S3 fixture lives in us-east-2, so the client must be pinned there.
        import boto3

        textract_client = boto3.client("textract", region_name="us-east-2")
        loader = AmazonTextractPDFLoader(
            file_path, textract_features=features, client=textract_client
        )
    else:
        loader = AmazonTextractPDFLoader(file_path, textract_features=features)
    docs = loader.load()
    assert len(docs) == docs_length
@pytest.mark.skip(reason="Requires AWS credentials to run")
def test_amazontextract_loader_failures() -> None:
    """A local multi-page PDF must be rejected: Textract only accepts
    multi-page input via S3."""
    examples_dir = Path(__file__).parent.parent / "examples"
    two_page_pdf = str(examples_dir / "multi-page-forms-sample-2-page.pdf")
    with pytest.raises(ValueError):
        AmazonTextractPDFLoader(two_page_pdf).load()

Loading…
Cancel
Save