Amazon Textract as document loader (#8661)

Description: Adding support for [Amazon
Textract](https://aws.amazon.com/textract/) as a PDF document loader

---------

Co-authored-by: schadem <45048633+schadem@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
pull/8297/head
Piyush Jain 1 year ago committed by GitHub
parent 82ef1f587d
commit 8374367de2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -108,6 +108,7 @@ from langchain.document_loaders.onedrive_file import OneDriveFileLoader
from langchain.document_loaders.open_city_data import OpenCityDataLoader from langchain.document_loaders.open_city_data import OpenCityDataLoader
from langchain.document_loaders.org_mode import UnstructuredOrgModeLoader from langchain.document_loaders.org_mode import UnstructuredOrgModeLoader
from langchain.document_loaders.pdf import ( from langchain.document_loaders.pdf import (
AmazonTextractPDFLoader,
MathpixPDFLoader, MathpixPDFLoader,
OnlinePDFLoader, OnlinePDFLoader,
PDFMinerLoader, PDFMinerLoader,
@ -330,4 +331,5 @@ __all__ = [
"YoutubeAudioLoader", "YoutubeAudioLoader",
"YoutubeLoader", "YoutubeLoader",
"ConcurrentLoader", "ConcurrentLoader",
"AmazonTextractPDFLoader",
] ]

@ -1,5 +1,6 @@
"""Module contains common parsers for PDFs.""" """Module contains common parsers for PDFs."""
from typing import Any, Iterator, Mapping, Optional, Union from typing import Any, Iterator, Mapping, Optional, Sequence, Union
from urllib.parse import urlparse
from langchain.document_loaders.base import BaseBlobParser from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob from langchain.document_loaders.blob_loaders import Blob
@ -149,3 +150,97 @@ class PDFPlumberParser(BaseBlobParser):
) )
for page in doc.pages for page in doc.pages
] ]
class AmazonTextractPDFParser(BaseBlobParser):
    """Sends PDF files to Amazon Textract and parses them to generate Documents.

    For parsing multi-page PDFs, they have to reside on S3.
    """

    def __init__(
        self,
        textract_features: Optional[Sequence[int]] = None,
        client: Optional[Any] = None,
    ) -> None:
        """Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                should be passed as an int that conforms to the enum
                `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client
        """
        # Keep the try-body minimal: only the import is guarded, so that a
        # ValueError from an invalid feature int is not masked by the
        # ImportError handler.
        try:
            import textractcaller as tc
        except ImportError:
            raise ModuleNotFoundError(
                "Could not import amazon-textract-caller python package. "
                "Please install it with `pip install amazon-textract-caller`."
            )
        self.tc = tc
        if textract_features is not None:
            # Convert raw ints to Textract_Features enum members; an invalid
            # value raises ValueError from the enum itself.
            self.textract_features = [
                tc.Textract_Features(f) for f in textract_features
            ]
        else:
            self.textract_features = []

        if not client:
            try:
                import boto3
            except ImportError:
                raise ModuleNotFoundError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                )
            # Default client uses the standard boto3 credential chain.
            self.boto3_textract_client = boto3.client("textract")
        else:
            self.boto3_textract_client = client

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Iterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers If multi-page document, blob.path
        has to be set to the S3 URI and for single page docs the blob.data is taken
        """
        url_parse_result = urlparse(str(blob.path)) if blob.path else None
        # Either call with S3 path (multi-page) or with bytes (single-page)
        if (
            url_parse_result
            and url_parse_result.scheme == "s3"
            and url_parse_result.netloc
        ):
            textract_response_json = self.tc.call_textract(
                input_document=str(blob.path),
                features=self.textract_features,
                boto3_textract_client=self.boto3_textract_client,
            )
        else:
            # Single-page input is sent inline and forced synchronous.
            textract_response_json = self.tc.call_textract(
                input_document=blob.as_bytes(),
                features=self.textract_features,
                call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC,
                boto3_textract_client=self.boto3_textract_client,
            )

        # Accumulate LINE/WORD text per page; a page-number change in the
        # block stream flushes the current page as one Document.
        current_text = ""
        current_page = 1
        for block in textract_response_json["Blocks"]:
            if "Page" in block and not (int(block["Page"]) == current_page):
                yield Document(
                    page_content=current_text,
                    metadata={"source": blob.source, "page": current_page},
                )
                current_text = ""
                current_page = int(block["Page"])
            if "Text" in block:
                current_text += block["Text"] + " "

        # Flush the final (or only) page.
        yield Document(
            page_content=current_text,
            metadata={"source": blob.source, "page": current_page},
        )

@ -7,7 +7,7 @@ import time
from abc import ABC from abc import ABC
from io import StringIO from io import StringIO
from pathlib import Path from pathlib import Path
from typing import Any, Iterator, List, Mapping, Optional, Union from typing import Any, Iterator, List, Mapping, Optional, Sequence, Union
from urllib.parse import urlparse from urllib.parse import urlparse
import requests import requests
@ -16,6 +16,7 @@ from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.blob_loaders import Blob from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.pdf import ( from langchain.document_loaders.parsers.pdf import (
AmazonTextractPDFParser,
PDFMinerParser, PDFMinerParser,
PDFPlumberParser, PDFPlumberParser,
PyMuPDFParser, PyMuPDFParser,
@ -71,8 +72,14 @@ class BasePDFLoader(BaseLoader, ABC):
if "~" in self.file_path: if "~" in self.file_path:
self.file_path = os.path.expanduser(self.file_path) self.file_path = os.path.expanduser(self.file_path)
# If the file is a web path, download it to a temporary file, and use that # If the file is a web path or S3, download it to a temporary file, and use that
if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path): if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
self.temp_dir = tempfile.TemporaryDirectory()
_, suffix = os.path.splitext(self.file_path)
temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
if self._is_s3_url(self.file_path):
self.web_path = self.file_path
else:
r = requests.get(self.file_path) r = requests.get(self.file_path)
if r.status_code != 200: if r.status_code != 200:
@ -82,8 +89,6 @@ class BasePDFLoader(BaseLoader, ABC):
) )
self.web_path = self.file_path self.web_path = self.file_path
self.temp_dir = tempfile.TemporaryDirectory()
temp_pdf = Path(self.temp_dir.name) / "tmp.pdf"
with open(temp_pdf, mode="wb") as f: with open(temp_pdf, mode="wb") as f:
f.write(r.content) f.write(r.content)
self.file_path = str(temp_pdf) self.file_path = str(temp_pdf)
@ -100,6 +105,17 @@ class BasePDFLoader(BaseLoader, ABC):
parsed = urlparse(url) parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme) return bool(parsed.netloc) and bool(parsed.scheme)
@staticmethod
def _is_s3_url(url: str) -> bool:
"""check if the url is S3"""
try:
result = urlparse(url)
if result.scheme == "s3" and result.netloc:
return True
return False
except ValueError:
return False
@property @property
def source(self) -> str: def source(self) -> str:
return self.web_path if self.web_path is not None else self.file_path return self.web_path if self.web_path is not None else self.file_path
@ -440,3 +456,144 @@ class PDFPlumberLoader(BasePDFLoader):
parser = PDFPlumberParser(text_kwargs=self.text_kwargs) parser = PDFPlumberParser(text_kwargs=self.text_kwargs)
blob = Blob.from_path(self.file_path) blob = Blob.from_path(self.file_path)
return parser.parse(blob) return parser.parse(blob)
class AmazonTextractPDFLoader(BasePDFLoader):
    """Loads a PDF document from local file system, HTTP or S3.

    To authenticate, the AWS client uses the following methods to
    automatically load credentials:
    https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html

    If a specific credential profile should be used, you must pass
    the name of the profile from the ~/.aws/credentials file that is to be used.

    Make sure the credentials / roles used have the required policies to
    access the Amazon Textract service.

    Example:
        .. code-block:: python

            from langchain.document_loaders import AmazonTextractPDFLoader
            loader = AmazonTextractPDFLoader(
                file_path="s3://pdfs/myfile.pdf"
            )
            document = loader.load()
    """

    def __init__(
        self,
        file_path: str,
        textract_features: Optional[Sequence[str]] = None,
        client: Optional[Any] = None,
        credentials_profile_name: Optional[str] = None,
        region_name: Optional[str] = None,
        endpoint_url: Optional[str] = None,
    ) -> None:
        """Initialize the loader.

        Args:
            file_path: A file, url or s3 path for input file
            textract_features: Features to be used for extraction, each feature
                should be passed as a str that conforms to the enum
                `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client (Optional)
            credentials_profile_name: AWS profile name, if not default (Optional)
            region_name: AWS region, eg us-east-1 (Optional)
            endpoint_url: endpoint url for the textract service (Optional)
        """
        super().__init__(file_path)

        try:
            import textractcaller as tc  # noqa: F401
        except ImportError:
            raise ModuleNotFoundError(
                "Could not import amazon-textract-caller python package. "
                "Please install it with `pip install amazon-textract-caller`."
            )

        # Feature names are looked up by enum member name (e.g. "FORMS");
        # an unknown name raises KeyError from the enum.
        if textract_features:
            features = [tc.Textract_Features[x] for x in textract_features]
        else:
            features = []

        # Only build a client here when the caller asked for a specific
        # profile/region/endpoint; otherwise the parser falls back to the
        # default boto3 client (or the one passed in).
        if credentials_profile_name or region_name or endpoint_url:
            try:
                import boto3

                if credentials_profile_name is not None:
                    session = boto3.Session(profile_name=credentials_profile_name)
                else:
                    # use default credentials
                    session = boto3.Session()

                client_params = {}
                if region_name:
                    client_params["region_name"] = region_name
                if endpoint_url:
                    client_params["endpoint_url"] = endpoint_url

                client = session.client("textract", **client_params)
            except ImportError:
                raise ModuleNotFoundError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                )
            except Exception as e:
                raise ValueError(
                    "Could not load credentials to authenticate with AWS client. "
                    "Please check that credentials in the specified "
                    "profile name are valid."
                ) from e
        self.parser = AmazonTextractPDFParser(textract_features=features, client=client)

    def load(self) -> List[Document]:
        """Load given path as pages."""
        return list(self.lazy_load())

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load documents.

        The self.file_path is local, but the blob has to include the S3
        location if the file originated from S3 for multi-page documents.

        Raises:
            ValueError: when the document is multi-page but not stored on S3,
                since Textract requires multi-page documents to reside on S3.
        """
        if self.web_path and self._is_s3_url(self.web_path):
            blob = Blob(path=self.web_path)
        else:
            blob = Blob.from_path(self.file_path)
            if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1:
                # Fixed: the original message used backslash continuations
                # that embedded runs of indentation whitespace.
                raise ValueError(
                    f"the file {blob.path} is a multi-page document, "
                    "but not stored on S3. "
                    "Textract requires multi-page documents to be on S3."
                )

        yield from self.parser.parse(blob)

    @staticmethod
    def _get_number_of_pages(blob: Blob) -> int:
        """Return the page count of the blob based on its mime type.

        PDFs are counted via pypdf, TIFFs via their frame count; PNG/JPEG are
        always a single page. Raises ValueError for unsupported mime types.
        """
        try:
            import pypdf
            from PIL import Image, ImageSequence

        except ImportError:
            # Fixed: error message previously misspelled "Pillow" as "Pilloe".
            raise ModuleNotFoundError(
                "Could not import pypdf or Pillow python package. "
                "Please install it with `pip install pypdf Pillow`."
            )
        if blob.mimetype == "application/pdf":
            with blob.as_bytes_io() as input_pdf_file:
                pdf_reader = pypdf.PdfReader(input_pdf_file)
                return len(pdf_reader.pages)
        elif blob.mimetype == "image/tiff":
            # Fixed: Image.open requires a path or file-like object, not raw
            # bytes — the original passed blob.as_bytes() which raises at
            # runtime. Frames are counted with a generator sum.
            with blob.as_bytes_io() as input_tiff_file:
                img = Image.open(input_tiff_file)
                return sum(1 for _ in ImageSequence.Iterator(img))
        elif blob.mimetype in ["image/png", "image/jpeg"]:
            return 1
        else:
            raise ValueError(f"unsupported mime type: {blob.mimetype}")

@ -338,6 +338,42 @@ files = [
{file = "amadeus-8.1.0.tar.gz", hash = "sha256:df31e7c84383a85ee2dce95b11e7a0774fdf31762229f768519b5cb176bc167d"}, {file = "amadeus-8.1.0.tar.gz", hash = "sha256:df31e7c84383a85ee2dce95b11e7a0774fdf31762229f768519b5cb176bc167d"},
] ]
[[package]]
name = "amazon-textract-caller"
version = "0.0.29"
description = "Amazon Textract Caller tools"
category = "main"
optional = true
python-versions = ">=3.6"
files = [
{file = "amazon-textract-caller-0.0.29.tar.gz", hash = "sha256:53770d82db67d4984a99825a90908a319f8920e64d6d48a45456b18d6ab3771a"},
{file = "amazon_textract_caller-0.0.29-py2.py3-none-any.whl", hash = "sha256:c5898fc7e84eea2564a9ececcf9101778b7533fa58e2c8e6eb1daa48869788fc"},
]
[package.dependencies]
amazon-textract-response-parser = ">=0.1.39"
boto3 = ">=1.26.35"
botocore = "*"
[package.extras]
testing = ["amazon-textract-response-parser", "pytest"]
[[package]]
name = "amazon-textract-response-parser"
version = "1.0.0"
description = "Easily parse JSON returned by Amazon Textract."
category = "main"
optional = true
python-versions = ">=3.8"
files = [
{file = "amazon-textract-response-parser-1.0.0.tar.gz", hash = "sha256:52e94e002b714195d678ea83b99ebc11d68ea716c9371852aed03a10e385dd41"},
{file = "amazon_textract_response_parser-1.0.0-py2.py3-none-any.whl", hash = "sha256:668ffb4604ed365de9c60d6a77ca9190c2614679997edfba0ce7398e2579c574"},
]
[package.dependencies]
boto3 = "*"
marshmallow = ">=3.14,<4"
[[package]] [[package]]
name = "anthropic" name = "anthropic"
version = "0.3.2" version = "0.3.2"
@ -4702,6 +4738,7 @@ optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
files = [ files = [
{file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"}, {file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
{file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
] ]
[[package]] [[package]]
@ -13539,7 +13576,7 @@ clarifai = ["clarifai"]
cohere = ["cohere"] cohere = ["cohere"]
docarray = ["docarray"] docarray = ["docarray"]
embeddings = ["sentence-transformers"] embeddings = ["sentence-transformers"]
extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"] extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
javascript = ["esprima"] javascript = ["esprima"]
llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers", "xinference"] llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers", "xinference"]
openai = ["openai", "tiktoken"] openai = ["openai", "tiktoken"]
@ -13549,4 +13586,4 @@ text-helpers = ["chardet"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = ">=3.8.1,<4.0" python-versions = ">=3.8.1,<4.0"
content-hash = "0708c3b45f59eea36919ff9ff99fa6eddc81bccb654cce183641ef8396ea5290" content-hash = "39305f23d3d69179d247d643631133ac50f5e944d98518c8a56c5f839b8e7a04"

@ -130,6 +130,7 @@ gitpython = {version = "^3.1.32", optional = true}
librosa = {version="^0.10.0.post2", optional = true } librosa = {version="^0.10.0.post2", optional = true }
feedparser = {version = "^6.0.10", optional = true} feedparser = {version = "^6.0.10", optional = true}
newspaper3k = {version = "^0.2.8", optional = true} newspaper3k = {version = "^0.2.8", optional = true}
amazon-textract-caller = {version = "<2", optional = true}
[tool.poetry.group.test.dependencies] [tool.poetry.group.test.dependencies]
# The only dependencies that should be added are # The only dependencies that should be added are
@ -329,6 +330,7 @@ all = [
# Please use new-line on formatting to make it easier to add new packages without # Please use new-line on formatting to make it easier to add new packages without
# merge-conflicts # merge-conflicts
extended_testing = [ extended_testing = [
"amazon-textract-caller",
"beautifulsoup4", "beautifulsoup4",
"bibtexparser", "bibtexparser",
"cassio", "cassio",

@ -1,6 +1,10 @@
from pathlib import Path from pathlib import Path
from typing import Sequence, Union
import pytest
from langchain.document_loaders import ( from langchain.document_loaders import (
AmazonTextractPDFLoader,
MathpixPDFLoader, MathpixPDFLoader,
PDFMinerLoader, PDFMinerLoader,
PDFMinerPDFasHTMLLoader, PDFMinerPDFasHTMLLoader,
@ -136,3 +140,56 @@ def test_mathpix_loader() -> None:
docs = loader.load() docs = loader.load()
assert len(docs) == 1 assert len(docs) == 1
print(docs[0].page_content) print(docs[0].page_content)
# Three scenarios: a single-page JPEG over HTTPS with two Textract features,
# a local single-page PDF with one feature, and a 16-page PDF on S3 that needs
# an explicitly-regioned client (multi-page input must reside on S3).
@pytest.mark.parametrize(
    "file_path, features, docs_length, create_client",
    [
        (
            (
                "https://amazon-textract-public-content.s3.us-east-2.amazonaws.com"
                "/langchain/alejandro_rosalez_sample_1.jpg"
            ),
            ["FORMS", "TABLES"],
            1,
            False,
        ),
        (str(Path(__file__).parent.parent / "examples/hello.pdf"), ["FORMS"], 1, False),
        (
            "s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf",
            None,
            16,
            True,
        ),
    ],
)
@pytest.mark.skip(reason="Requires AWS credentials to run")
def test_amazontextract_loader(
    file_path: str,
    features: Union[Sequence[str], None],
    docs_length: int,
    create_client: bool,
) -> None:
    """Loads each fixture through AmazonTextractPDFLoader and checks that one
    Document per page is returned."""
    if create_client:
        # The S3 fixture lives in us-east-2, so the client must be pinned there.
        import boto3

        textract_client = boto3.client("textract", region_name="us-east-2")
        loader = AmazonTextractPDFLoader(
            file_path, textract_features=features, client=textract_client
        )
    else:
        loader = AmazonTextractPDFLoader(file_path, textract_features=features)
    docs = loader.load()
    assert len(docs) == docs_length
@pytest.mark.skip(reason="Requires AWS credentials to run")
def test_amazontextract_loader_failures() -> None:
    """A local multi-page PDF must be rejected: Textract only accepts
    multi-page input via S3."""
    examples_dir = Path(__file__).parent.parent / "examples"
    two_page_pdf = str(examples_dir / "multi-page-forms-sample-2-page.pdf")
    with pytest.raises(ValueError):
        AmazonTextractPDFLoader(two_page_pdf).load()

Loading…
Cancel
Save