Amazon Textract as document loader (#8661)

Description: Adds support for [Amazon Textract](https://aws.amazon.com/textract/) as a PDF document loader.

Co-authored-by: schadem <45048633+schadem@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
1 year ago · 8374367de2
parent 82ef1f587d
commit 8374367de2
7 changed files with 368 additions and 18 deletions
--- a/libs/langchain/langchain/document_loaders/__init__.py
+++ b/libs/langchain/langchain/document_loaders/__init__.py
@ -108,6 +108,7 @@ from langchain.document_loaders.onedrive_file import OneDriveFileLoader
 from langchain.document_loaders.open_city_data import OpenCityDataLoader
 from langchain.document_loaders.org_mode import UnstructuredOrgModeLoader
 from langchain.document_loaders.pdf import (
+    AmazonTextractPDFLoader,
    MathpixPDFLoader,
    OnlinePDFLoader,
    PDFMinerLoader,
@ -330,4 +331,5 @@ __all__ = [
    "YoutubeAudioLoader",
    "YoutubeLoader",
    "ConcurrentLoader",
+    "AmazonTextractPDFLoader",
 ]
--- a/libs/langchain/langchain/document_loaders/parsers/pdf.py
+++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py
@ -1,5 +1,6 @@
 """Module contains common parsers for PDFs."""
-from typing import Any, Iterator, Mapping, Optional, Union
+from typing import Any, Iterator, Mapping, Optional, Sequence, Union
+from urllib.parse import urlparse

 from langchain.document_loaders.base import BaseBlobParser
 from langchain.document_loaders.blob_loaders import Blob
@ -149,3 +150,97 @@ class PDFPlumberParser(BaseBlobParser):
                )
                for page in doc.pages
            ]
+
+
+class AmazonTextractPDFParser(BaseBlobParser):
+    """Parse documents by sending them to Amazon Textract.
+
+    One Document is produced per page of the Textract response. Multi-page
+    PDFs must be stored on S3 and referenced through an ``s3://`` blob path.
+    """
+
+    def __init__(
+        self,
+        textract_features: Optional[Sequence[int]] = None,
+        client: Optional[Any] = None,
+    ) -> None:
+        """Set up the Textract caller module, the features and the boto3 client.
+
+        Args:
+            textract_features: Features to request from Textract; each entry is
+                converted with ``Textract_Features(...)`` from the
+                `amazon-textract-caller` package. ``None`` means no features.
+            client: boto3 textract client; when omitted, a default one is
+                created with ``boto3.client("textract")``.
+
+        Raises:
+            ModuleNotFoundError: if `amazon-textract-caller` is not installed, or
+                if no client is given and `boto3` is not installed.
+        """
+        try:
+            import textractcaller as tc
+
+            self.tc = tc
+            requested = [] if textract_features is None else textract_features
+            self.textract_features = [tc.Textract_Features(f) for f in requested]
+        except ImportError:
+            raise ModuleNotFoundError(
+                "Could not import amazon-textract-caller python package. "
+                "Please install it with `pip install amazon-textract-caller`."
+            )
+
+        # A caller-supplied client is used as-is; boto3 is only needed otherwise.
+        if client:
+            self.boto3_textract_client = client
+            return
+
+        try:
+            import boto3
+
+            self.boto3_textract_client = boto3.client("textract")
+        except ImportError:
+            raise ModuleNotFoundError(
+                "Could not import boto3 python package. "
+                "Please install it with `pip install boto3`."
+            )
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Yield one Document per page of the Textract result for `blob`.
+
+        When `blob.path` is an ``s3://`` URI with a bucket, the URI itself is
+        handed to Textract (the route required for multi-page documents).
+        Otherwise the blob's bytes are sent with a forced synchronous call.
+        """
+        textract_kwargs = {
+            "features": self.textract_features,
+            "boto3_textract_client": self.boto3_textract_client,
+        }
+
+        on_s3 = False
+        if blob.path:
+            location = urlparse(str(blob.path))
+            on_s3 = location.scheme == "s3" and bool(location.netloc)
+
+        if on_s3:
+            textract_kwargs["input_document"] = str(blob.path)
+        else:
+            textract_kwargs["input_document"] = blob.as_bytes()
+            textract_kwargs["call_mode"] = self.tc.Textract_Call_Mode.FORCE_SYNC
+
+        response = self.tc.call_textract(**textract_kwargs)
+
+        page_number = 1
+        page_text = ""
+        for block in response["Blocks"]:
+            if "Page" in block:
+                block_page = int(block["Page"])
+                if block_page != page_number:
+                    # A new page number closes the page collected so far.
+                    yield Document(
+                        page_content=page_text,
+                        metadata={"source": blob.source, "page": page_number},
+                    )
+                    page_text = ""
+                    page_number = block_page
+            if "Text" in block:
+                page_text += block["Text"] + " "
+
+        # The last page (or the only one) is emitted once the blocks run out.
+        yield Document(
+            page_content=page_text,
+            metadata={"source": blob.source, "page": page_number},
+        )
--- a/libs/langchain/langchain/document_loaders/pdf.py
+++ b/libs/langchain/langchain/document_loaders/pdf.py
@ -7,7 +7,7 @@ import time
 from abc import ABC
 from io import StringIO
 from pathlib import Path
-from typing import Any, Iterator, List, Mapping, Optional, Union
+from typing import Any, Iterator, List, Mapping, Optional, Sequence, Union
 from urllib.parse import urlparse

 import requests
@ -16,6 +16,7 @@ from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.blob_loaders import Blob
 from langchain.document_loaders.parsers.pdf import (
+    AmazonTextractPDFParser,
    PDFMinerParser,
    PDFPlumberParser,
    PyMuPDFParser,
@ -71,8 +72,14 @@ class BasePDFLoader(BaseLoader, ABC):
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

-        # If the file is a web path, download it to a temporary file, and use that
+        # If the file is a web path, download it to a temporary file, and use that.
+        # An S3 path is not downloaded; it is only recorded as the web path.
        if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
+            self.temp_dir = tempfile.TemporaryDirectory()
+            _, suffix = os.path.splitext(self.file_path)
+            temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
+            if self._is_s3_url(self.file_path):
+                self.web_path = self.file_path
+            else:
                r = requests.get(self.file_path)

                if r.status_code != 200:
@ -82,8 +89,6 @@ class BasePDFLoader(BaseLoader, ABC):
                    )

                self.web_path = self.file_path
-            self.temp_dir = tempfile.TemporaryDirectory()
-            temp_pdf = Path(self.temp_dir.name) / "tmp.pdf"
                with open(temp_pdf, mode="wb") as f:
                    f.write(r.content)
                self.file_path = str(temp_pdf)
@ -100,6 +105,17 @@ class BasePDFLoader(BaseLoader, ABC):
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

+    @staticmethod
+    def _is_s3_url(url: str) -> bool:
+        """Return True when `url` uses the ``s3`` scheme and names a bucket.
+
+        A url that `urlparse` rejects with ValueError is reported as not S3.
+        """
+        try:
+            parsed = urlparse(url)
+        except ValueError:
+            return False
+        return parsed.scheme == "s3" and bool(parsed.netloc)
+
    @property
    def source(self) -> str:
        return self.web_path if self.web_path is not None else self.file_path
@ -440,3 +456,144 @@ class PDFPlumberLoader(BasePDFLoader):
        parser = PDFPlumberParser(text_kwargs=self.text_kwargs)
        blob = Blob.from_path(self.file_path)
        return parser.parse(blob)
+
+
+class AmazonTextractPDFLoader(BasePDFLoader):
+    """Load a document from the local file system, HTTP or S3 via Amazon Textract.
+
+    To authenticate, the AWS client uses the following methods to
+    automatically load credentials:
+    https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html
+
+    If a specific credential profile should be used, you must pass
+    the name of the profile from the ~/.aws/credentials file that is to be used.
+
+    Make sure the credentials / roles used have the required policies to
+    access the Amazon Textract service.
+
+    Documents that are not on S3 must be single-page; for those, the supported
+    mime types are PDF, TIFF, PNG and JPEG.
+
+    Example:
+        .. code-block:: python
+
+            from langchain.document_loaders import AmazonTextractPDFLoader
+            loader = AmazonTextractPDFLoader(
+                file_path="s3://pdfs/myfile.pdf"
+            )
+            document = loader.load()
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        textract_features: Optional[Sequence[str]] = None,
+        client: Optional[Any] = None,
+        credentials_profile_name: Optional[str] = None,
+        region_name: Optional[str] = None,
+        endpoint_url: Optional[str] = None,
+    ) -> None:
+        """Initialize the loader.
+
+        Args:
+            file_path: A file, url or s3 path for input file
+            textract_features: Features to be used for extraction, each feature
+                               should be passed as a str that names a member of
+                               the enum `Textract_Features`, see the
+                               `amazon-textract-caller` pkg
+            client: boto3 textract client (Optional). It is replaced by a newly
+                    created client when any of the three options below is given.
+            credentials_profile_name: AWS profile name, if not default (Optional)
+            region_name: AWS region, eg us-east-1 (Optional)
+            endpoint_url: endpoint url for the textract service (Optional)
+
+        Raises:
+            ModuleNotFoundError: if `amazon-textract-caller` is missing, or if
+                `boto3` is missing when a client has to be created here.
+            ValueError: if creating the boto3 session or client fails.
+        """
+        super().__init__(file_path)
+
+        try:
+            import textractcaller as tc  # noqa: F401
+        except ImportError:
+            raise ModuleNotFoundError(
+                "Could not import amazon-textract-caller python package. "
+                "Please install it with `pip install amazon-textract-caller`."
+            )
+        if textract_features:
+            features = [tc.Textract_Features[x] for x in textract_features]
+        else:
+            features = []
+
+        if credentials_profile_name or region_name or endpoint_url:
+            try:
+                import boto3
+
+                if credentials_profile_name is not None:
+                    session = boto3.Session(profile_name=credentials_profile_name)
+                else:
+                    # use default credentials
+                    session = boto3.Session()
+
+                client_params = {}
+                if region_name:
+                    client_params["region_name"] = region_name
+                if endpoint_url:
+                    client_params["endpoint_url"] = endpoint_url
+
+                client = session.client("textract", **client_params)
+
+            except ImportError:
+                raise ModuleNotFoundError(
+                    "Could not import boto3 python package. "
+                    "Please install it with `pip install boto3`."
+                )
+            except Exception as e:
+                raise ValueError(
+                    "Could not load credentials to authenticate with AWS client. "
+                    "Please check that credentials in the specified "
+                    "profile name are valid."
+                ) from e
+        self.parser = AmazonTextractPDFParser(textract_features=features, client=client)
+
+    def load(self) -> List[Document]:
+        """Load given path as pages."""
+        return list(self.lazy_load())
+
+    def lazy_load(
+        self,
+    ) -> Iterator[Document]:
+        """Lazily yield one Document per page.
+
+        Raises:
+            ValueError: if the document has more than one page and is not on S3.
+        """
+        # self.file_path is local, but for a file that originated from S3 the
+        # blob has to carry the S3 location so multi-page documents can be parsed.
+        if self.web_path and self._is_s3_url(self.web_path):
+            blob = Blob(path=self.web_path)
+        else:
+            blob = Blob.from_path(self.file_path)
+            if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1:
+                # Adjacent literals keep source indentation out of the message.
+                raise ValueError(
+                    f"the file {blob.path} is a multi-page document, "
+                    "but not stored on S3. "
+                    "Textract requires multi-page documents to be on S3."
+                )
+
+        # lazy_parse streams pages instead of building the whole list first.
+        yield from self.parser.lazy_parse(blob)
+
+    @staticmethod
+    def _get_number_of_pages(blob: Blob) -> int:
+        """Return the number of pages (or image frames) in `blob`.
+
+        Raises:
+            ModuleNotFoundError: if `pypdf` or `Pillow` is not installed.
+            ValueError: if the blob's mime type is not PDF, TIFF, PNG or JPEG.
+        """
+        try:
+            import pypdf
+            from PIL import Image, ImageSequence
+
+        except ImportError:
+            raise ModuleNotFoundError(
+                "Could not import pypdf or Pillow python package. "
+                "Please install it with `pip install pypdf Pillow`."
+            )
+        if blob.mimetype == "application/pdf":
+            with blob.as_bytes_io() as input_pdf_file:
+                pdf_reader = pypdf.PdfReader(input_pdf_file)
+                return len(pdf_reader.pages)
+        elif blob.mimetype == "image/tiff":
+            # Image.open needs a path or a file object, not raw bytes.
+            with blob.as_bytes_io() as input_tiff_file:
+                img = Image.open(input_tiff_file)
+                return sum(1 for _ in ImageSequence.Iterator(img))
+        elif blob.mimetype in ["image/png", "image/jpeg"]:
+            return 1
+        else:
+            raise ValueError(f"unsupported mime type: {blob.mimetype}")
--- a/libs/langchain/poetry.lock
+++ b/libs/langchain/poetry.lock
@ -338,6 +338,42 @@ files = [
    {file = "amadeus-8.1.0.tar.gz", hash = "sha256:df31e7c84383a85ee2dce95b11e7a0774fdf31762229f768519b5cb176bc167d"},
 ]

+[[package]]
+name = "amazon-textract-caller"
+version = "0.0.29"
+description = "Amazon Textract Caller tools"
+category = "main"
+optional = true
+python-versions = ">=3.6"
+files = [
+    {file = "amazon-textract-caller-0.0.29.tar.gz", hash = "sha256:53770d82db67d4984a99825a90908a319f8920e64d6d48a45456b18d6ab3771a"},
+    {file = "amazon_textract_caller-0.0.29-py2.py3-none-any.whl", hash = "sha256:c5898fc7e84eea2564a9ececcf9101778b7533fa58e2c8e6eb1daa48869788fc"},
+]
+
+[package.dependencies]
+amazon-textract-response-parser = ">=0.1.39"
+boto3 = ">=1.26.35"
+botocore = "*"
+
+[package.extras]
+testing = ["amazon-textract-response-parser", "pytest"]
+
+[[package]]
+name = "amazon-textract-response-parser"
+version = "1.0.0"
+description = "Easily parse JSON returned by Amazon Textract."
+category = "main"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "amazon-textract-response-parser-1.0.0.tar.gz", hash = "sha256:52e94e002b714195d678ea83b99ebc11d68ea716c9371852aed03a10e385dd41"},
+    {file = "amazon_textract_response_parser-1.0.0-py2.py3-none-any.whl", hash = "sha256:668ffb4604ed365de9c60d6a77ca9190c2614679997edfba0ce7398e2579c574"},
+]
+
+[package.dependencies]
+boto3 = "*"
+marshmallow = ">=3.14,<4"
+
 [[package]]
 name = "anthropic"
 version = "0.3.2"
@ -4702,6 +4738,7 @@ optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
 files = [
    {file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
+    {file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
 ]

 [[package]]
@ -13539,7 +13576,7 @@ clarifai = ["clarifai"]
 cohere = ["cohere"]
 docarray = ["docarray"]
 embeddings = ["sentence-transformers"]
-extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
+extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
 javascript = ["esprima"]
 llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers", "xinference"]
 openai = ["openai", "tiktoken"]
@ -13549,4 +13586,4 @@ text-helpers = ["chardet"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "0708c3b45f59eea36919ff9ff99fa6eddc81bccb654cce183641ef8396ea5290"
+content-hash = "39305f23d3d69179d247d643631133ac50f5e944d98518c8a56c5f839b8e7a04"
--- a/libs/langchain/pyproject.toml
+++ b/libs/langchain/pyproject.toml
@ -130,6 +130,7 @@ gitpython = {version = "^3.1.32", optional = true}
 librosa = {version="^0.10.0.post2", optional = true }
 feedparser = {version = "^6.0.10", optional = true}
 newspaper3k = {version = "^0.2.8", optional = true}
+amazon-textract-caller = {version = "<2", optional = true}

 [tool.poetry.group.test.dependencies]
 # The only dependencies that should be added are
@ -329,6 +330,7 @@ all = [
 # Please use new-line on formatting to make it easier to add new packages without
 # merge-conflicts
 extended_testing = [
+ "amazon-textract-caller",
 "beautifulsoup4",
 "bibtexparser",
 "cassio",
--- a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py
+++ b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py
@ -1,6 +1,10 @@
 from pathlib import Path
+from typing import Sequence, Union
+
+import pytest

 from langchain.document_loaders import (
+    AmazonTextractPDFLoader,
    MathpixPDFLoader,
    PDFMinerLoader,
    PDFMinerPDFasHTMLLoader,
@ -136,3 +140,56 @@ def test_mathpix_loader() -> None:
    docs = loader.load()
    assert len(docs) == 1
    print(docs[0].page_content)
+
+
+@pytest.mark.parametrize(
+    "file_path, features, docs_length, create_client",
+    [
+        # Single image fetched over HTTPS, default client.
+        (
+            (
+                "https://amazon-textract-public-content.s3.us-east-2.amazonaws.com"
+                "/langchain/alejandro_rosalez_sample_1.jpg"
+            ),
+            ["FORMS", "TABLES"],
+            1,
+            False,
+        ),
+        # Single-page PDF from the local examples directory, default client.
+        (str(Path(__file__).parent.parent / "examples/hello.pdf"), ["FORMS"], 1, False),
+        # Multi-page PDF on S3, explicitly created client.
+        (
+            "s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf",
+            None,
+            16,
+            True,
+        ),
+    ],
+)
+@pytest.mark.skip(reason="Requires AWS credentials to run")
+def test_amazontextract_loader(
+    file_path: str,
+    features: Union[Sequence[str], None],
+    docs_length: int,
+    create_client: bool,
+) -> None:
+    """Load each sample and check the number of Documents returned."""
+    loader_kwargs: dict = {"textract_features": features}
+    if create_client:
+        import boto3
+
+        loader_kwargs["client"] = boto3.client("textract", region_name="us-east-2")
+
+    loader = AmazonTextractPDFLoader(file_path, **loader_kwargs)
+
+    assert len(loader.load()) == docs_length
+
+@pytest.mark.skip(reason="Requires AWS credentials to run")
+def test_amazontextract_loader_failures() -> None:
+    """A 2-page PDF on the local file system must be rejected with ValueError."""
+    relative_pdf = "examples/multi-page-forms-sample-2-page.pdf"
+    local_pdf_path = Path(__file__).parents[1] / relative_pdf
+    loader = AmazonTextractPDFLoader(str(local_pdf_path))
+    with pytest.raises(ValueError):
+        loader.load()
--- a/libs/langchain/tests/integration_tests/examples/multi-page-forms-sample-2-page.pdf
+++ b/libs/langchain/tests/integration_tests/examples/multi-page-forms-sample-2-page.pdf