mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
fa5d49f2c1
ran ```bash g grep -l "langchain.vectorstores" | xargs -L 1 sed -i '' "s/langchain\.vectorstores/langchain_community.vectorstores/g" g grep -l "langchain.document_loaders" | xargs -L 1 sed -i '' "s/langchain\.document_loaders/langchain_community.document_loaders/g" g grep -l "langchain.chat_loaders" | xargs -L 1 sed -i '' "s/langchain\.chat_loaders/langchain_community.chat_loaders/g" g grep -l "langchain.document_transformers" | xargs -L 1 sed -i '' "s/langchain\.document_transformers/langchain_community.document_transformers/g" g grep -l "langchain\.graphs" | xargs -L 1 sed -i '' "s/langchain\.graphs/langchain_community.graphs/g" g grep -l "langchain\.memory\.chat_message_histories" | xargs -L 1 sed -i '' "s/langchain\.memory\.chat_message_histories/langchain_community.chat_message_histories/g" gco master libs/langchain/tests/unit_tests/*/test_imports.py gco master libs/langchain/tests/unit_tests/**/test_public_api.py ```
582 lines
20 KiB
Python
582 lines
20 KiB
Python
"""Module contains common parsers for PDFs."""
|
|
from __future__ import annotations
|
|
|
|
import warnings
|
|
from typing import (
|
|
TYPE_CHECKING,
|
|
Any,
|
|
Iterable,
|
|
Iterator,
|
|
Mapping,
|
|
Optional,
|
|
Sequence,
|
|
Union,
|
|
)
|
|
from urllib.parse import urlparse
|
|
|
|
import numpy as np
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_community.document_loaders.base import BaseBlobParser
|
|
from langchain_community.document_loaders.blob_loaders import Blob
|
|
|
|
if TYPE_CHECKING:
|
|
import fitz.fitz
|
|
import pdfminer.layout
|
|
import pdfplumber.page
|
|
import pypdf._page
|
|
import pypdfium2._helpers.page
|
|
|
|
|
|
# PDF stream filter names (long form and abbreviation) whose image data the
# parsers below hand to OCR as raw encoded bytes.
_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]

# PDF stream filter names (long form and abbreviation) whose decoded image data
# the parsers below reshape into a (height, width, -1) uint8 array before OCR.
_PDF_FILTER_WITHOUT_LOSS = [
    # LZW
    "LZWDecode",
    "LZW",
    # Flate
    "FlateDecode",
    "Fl",
    # ASCII-85
    "ASCII85Decode",
    "A85",
    # ASCII-hex
    "ASCIIHexDecode",
    "AHx",
    # Run-length
    "RunLengthDecode",
    "RL",
    # CCITT fax
    "CCITTFaxDecode",
    "CCF",
    # JBIG2 (no abbreviation listed)
    "JBIG2Decode",
]
|
|
|
|
|
|
def extract_from_images_with_rapidocr(
    images: Sequence[Union[Iterable[np.ndarray], bytes]],
) -> str:
    """Extract text from images with RapidOCR.

    Each image is passed to a single ``RapidOCR`` instance. For every image
    that produces a non-empty result, element ``[1]`` of each result entry is
    taken as the recognized text and the entries are joined with newlines.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from images. Empty string if ``images`` is empty or no
        image produced a result.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    """
    try:
        from rapidocr_onnxruntime import RapidOCR
    except ImportError as e:
        # Chain the original error so the underlying import failure stays visible.
        raise ImportError(
            "`rapidocr-onnxruntime` package not found, please install it with "
            "`pip install rapidocr-onnxruntime`"
        ) from e

    ocr = RapidOCR()
    per_image_text = []
    for img in images:
        result, _ = ocr(img)
        if result:
            per_image_text.append("\n".join(entry[1] for entry in result))
    # NOTE(review): texts of consecutive images are concatenated without a
    # separator; this is kept as-is because callers may depend on the exact output.
    return "".join(per_image_text)
|
|
|
|
|
|
class PyPDFParser(BaseBlobParser):
    """Load `PDF` using `pypdf`"""

    def __init__(
        self, password: Optional[Union[str, bytes]] = None, extract_images: bool = False
    ):
        """Initialize the parser.

        Args:
            password: Password forwarded to ``pypdf.PdfReader``.
            extract_images: Whether to run OCR on the images of each page and
                append the recognized text to the page content.
        """
        self.password = password
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob, yielding one document per page.

        The ``page`` metadata entry is the zero-based page index.
        """
        import pypdf

        with blob.as_bytes_io() as pdf_file_obj:
            pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
            # Yield page by page instead of materializing every document first,
            # so that parsing really is lazy.
            for page_number, page in enumerate(pdf_reader.pages):
                yield Document(
                    page_content=page.extract_text()
                    + self._extract_images_from_page(page),
                    metadata={"source": blob.source, "page": page_number},
                )

    def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
        """Extract images from page and get the text with RapidOCR.

        Returns an empty string when image extraction is disabled or the page
        resources contain no ``/XObject`` entry.
        """
        if not self.extract_images or "/XObject" not in page["/Resources"].keys():
            return ""

        x_objects = page["/Resources"]["/XObject"].get_object()  # type: ignore
        images = []
        for name in x_objects:
            x_obj = x_objects[name]
            if x_obj["/Subtype"] != "/Image":
                continue

            # Drop the leading "/" of the filter name before the lookup.
            filter_name = x_obj["/Filter"][1:]
            if filter_name in _PDF_FILTER_WITHOUT_LOSS:
                height, width = x_obj["/Height"], x_obj["/Width"]
                images.append(
                    np.frombuffer(x_obj.get_data(), dtype=np.uint8).reshape(
                        height, width, -1
                    )
                )
            elif filter_name in _PDF_FILTER_WITH_LOSS:
                # Passed to OCR as raw bytes.
                images.append(x_obj.get_data())
            else:
                warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
|
|
|
|
|
|
class PDFMinerParser(BaseBlobParser):
    """Parse `PDF` using `PDFMiner`."""

    def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True):
        """Initialize a parser based on PDFMiner.

        Args:
            extract_images: Whether to extract images from PDF.
            concatenate_pages: If True, concatenate all PDF pages into a single
                document. Otherwise, return one document per page. Only read
                when ``extract_images`` is False; with image extraction enabled
                one document per page is always produced.
        """
        self.extract_images = extract_images
        self.concatenate_pages = concatenate_pages

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob.

        Per-page documents carry the zero-based page index as a string in the
        ``page`` metadata entry.
        """

        if not self.extract_images:
            from pdfminer.high_level import extract_text

            with blob.as_bytes_io() as pdf_file_obj:
                if self.concatenate_pages:
                    text = extract_text(pdf_file_obj)
                    metadata = {"source": blob.source}
                    yield Document(page_content=text, metadata=metadata)
                else:
                    from pdfminer.pdfpage import PDFPage

                    pages = PDFPage.get_pages(pdf_file_obj)
                    for i, _ in enumerate(pages):
                        text = extract_text(pdf_file_obj, page_numbers=[i])
                        metadata = {"source": blob.source, "page": str(i)}
                        yield Document(page_content=text, metadata=metadata)
        else:
            import io

            from pdfminer.converter import PDFPageAggregator, TextConverter
            from pdfminer.layout import LAParams
            from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
            from pdfminer.pdfpage import PDFPage

            text_io = io.StringIO()
            with blob.as_bytes_io() as pdf_file_obj:
                pages = PDFPage.get_pages(pdf_file_obj)
                rsrcmgr = PDFResourceManager()
                # Each page is interpreted twice: once into the text buffer and
                # once into a layout tree that is searched for images.
                device_for_text = TextConverter(rsrcmgr, text_io, laparams=LAParams())
                device_for_image = PDFPageAggregator(rsrcmgr, laparams=LAParams())
                interpreter_for_text = PDFPageInterpreter(rsrcmgr, device_for_text)
                interpreter_for_image = PDFPageInterpreter(rsrcmgr, device_for_image)
                for i, page in enumerate(pages):
                    interpreter_for_text.process_page(page)
                    interpreter_for_image.process_page(page)
                    content = text_io.getvalue() + self._extract_images_from_page(
                        device_for_image.get_result()
                    )
                    # Reset the shared buffer so the next page starts empty.
                    text_io.truncate(0)
                    text_io.seek(0)
                    metadata = {"source": blob.source, "page": str(i)}
                    yield Document(page_content=content, metadata=metadata)

    def _extract_images_from_page(self, page: pdfminer.layout.LTPage) -> str:
        """Extract images from page and get the text with RapidOCR."""
        # Import the names directly: a bare `import pdfminer` does not guarantee
        # that the `pdfminer.layout` submodule is loaded.
        from pdfminer.layout import LTContainer, LTImage

        def iter_images(layout_object: Any) -> Iterator[Any]:
            """Yield every image found in ``layout_object``, depth first."""
            if isinstance(layout_object, LTImage):
                yield layout_object
            elif isinstance(layout_object, LTContainer):
                # Visit all children, not only the first one, so images that
                # follow other elements inside a container are found as well.
                for child in layout_object:
                    yield from iter_images(child)

        images = []

        for img in iter_images(page):
            filter_name = img.stream["Filter"].name
            if filter_name in _PDF_FILTER_WITHOUT_LOSS:
                images.append(
                    np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape(
                        img.stream["Height"], img.stream["Width"], -1
                    )
                )
            elif filter_name in _PDF_FILTER_WITH_LOSS:
                # Passed to OCR as raw bytes.
                images.append(img.stream.get_data())
            else:
                warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
|
|
|
|
|
|
class PyMuPDFParser(BaseBlobParser):
    """Parse `PDF` using `PyMuPDF`."""

    def __init__(
        self,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        extract_images: bool = False,
    ) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``fitz.Page.get_text()``.
            extract_images: Whether to run OCR on the images of each page and
                append the recognized text to the page content.
        """
        self.text_kwargs = text_kwargs or {}
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob, yielding one document per page.

        Every document's metadata holds ``source``, ``file_path``, ``page`` and
        ``total_pages``, followed by the ``str``/``int`` entries of the PDF
        metadata (which take precedence on a key clash).
        """
        import fitz

        with blob.as_bytes_io() as file_path:
            if blob.data is None:
                doc = fitz.open(file_path)
            else:
                doc = fitz.open(stream=file_path, filetype="pdf")

            # These do not change from page to page, so compute them once.
            doc_metadata = {
                key: value
                for key, value in doc.metadata.items()
                if isinstance(value, (str, int))
            }
            total_pages = len(doc)

            # Yield page by page instead of materializing every document first.
            for page in doc:
                yield Document(
                    page_content=page.get_text(**self.text_kwargs)
                    + self._extract_images_from_page(doc, page),
                    metadata={
                        "source": blob.source,
                        "file_path": blob.source,
                        "page": page.number,
                        "total_pages": total_pages,
                        **doc_metadata,
                    },
                )

    def _extract_images_from_page(
        self, doc: fitz.fitz.Document, page: fitz.fitz.Page
    ) -> str:
        """Extract images from page and get the text with RapidOCR.

        Returns an empty string when image extraction is disabled.
        """
        if not self.extract_images:
            return ""
        import fitz

        imgs = []
        for img in page.get_images():
            # The first element of each image entry is used as the xref.
            xref = img[0]
            pix = fitz.Pixmap(doc, xref)
            imgs.append(
                np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                    pix.height, pix.width, -1
                )
            )
        return extract_from_images_with_rapidocr(imgs)
|
|
|
|
|
|
class PyPDFium2Parser(BaseBlobParser):
    """Parse `PDF` with `PyPDFium2`."""

    def __init__(self, extract_images: bool = False) -> None:
        """Initialize the parser.

        Args:
            extract_images: Whether to run OCR on the images of each page and
                append the recognized text to the page content.

        Raises:
            ImportError: If the `pypdfium2` package is not installed.
        """
        try:
            import pypdfium2  # noqa:F401
        except ImportError:
            raise ImportError(
                "pypdfium2 package not found, please install it with"
                " `pip install pypdfium2`"
            )
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob, yielding one document per page."""
        import pypdfium2

        # pypdfium2 is really finicky with respect to closing things,
        # if done incorrectly creates seg faults.
        with blob.as_bytes_io() as file_path:
            pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
            try:
                for page_number, page in enumerate(pdf_reader):
                    # The text page is closed before the page's images are read.
                    text_page = page.get_textpage()
                    page_text = text_page.get_text_range()
                    text_page.close()

                    image_text = self._extract_images_from_page(page)
                    page.close()

                    yield Document(
                        page_content=page_text + "\n" + image_text,
                        metadata={"source": blob.source, "page": page_number},
                    )
            finally:
                pdf_reader.close()

    def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> str:
        """Extract images from page and get the text with RapidOCR.

        Returns an empty string when image extraction is disabled.
        """
        if not self.extract_images:
            return ""

        import pypdfium2.raw as pdfium_c

        # Collect all image objects first, then convert each one to an array.
        image_objects = list(page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,)))
        bitmaps = [obj.get_bitmap().to_numpy() for obj in image_objects]
        return extract_from_images_with_rapidocr(bitmaps)
|
|
|
|
|
|
class PDFPlumberParser(BaseBlobParser):
    """Parse `PDF` with `PDFPlumber`."""

    def __init__(
        self,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        dedupe: bool = False,
        extract_images: bool = False,
    ) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
            dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
            extract_images: Whether to run OCR on the images of each page and
                append the recognized text to the page content.
        """
        self.text_kwargs = text_kwargs or {}
        self.dedupe = dedupe
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob, yielding one document per page.

        Every document's metadata holds ``source``, ``file_path``, the
        zero-based ``page`` index and ``total_pages``, followed by the
        ``str``/``int`` entries of the PDF metadata (which take precedence on a
        key clash).
        """
        import pdfplumber

        with blob.as_bytes_io() as file_path:
            # Use the document as a context manager so it is closed once all
            # pages have been produced (or the generator is closed early).
            with pdfplumber.open(file_path) as doc:
                # These do not change from page to page, so compute them once.
                doc_metadata = {
                    key: value
                    for key, value in doc.metadata.items()
                    if isinstance(value, (str, int))
                }
                total_pages = len(doc.pages)

                # Yield page by page instead of materializing every document.
                for page in doc.pages:
                    yield Document(
                        page_content=self._process_page_content(page)
                        + "\n"
                        + self._extract_images_from_page(page),
                        metadata={
                            "source": blob.source,
                            "file_path": blob.source,
                            # pdfplumber page numbers are shifted down by one
                            # here to produce a zero-based index.
                            "page": page.page_number - 1,
                            "total_pages": total_pages,
                            **doc_metadata,
                        },
                    )

    def _process_page_content(self, page: pdfplumber.page.Page) -> str:
        """Process the page content based on dedupe."""
        if self.dedupe:
            return page.dedupe_chars().extract_text(**self.text_kwargs)
        return page.extract_text(**self.text_kwargs)

    def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
        """Extract images from page and get the text with RapidOCR.

        Returns an empty string when image extraction is disabled.
        """
        if not self.extract_images:
            return ""

        images = []
        for img in page.images:
            stream = img["stream"]
            filter_name = stream["Filter"].name
            if filter_name in _PDF_FILTER_WITHOUT_LOSS:
                images.append(
                    np.frombuffer(stream.get_data(), dtype=np.uint8).reshape(
                        stream["Height"], stream["Width"], -1
                    )
                )
            elif filter_name in _PDF_FILTER_WITH_LOSS:
                # Passed to OCR as raw bytes.
                images.append(stream.get_data())
            else:
                warnings.warn("Unknown PDF Filter!")

        return extract_from_images_with_rapidocr(images)
|
|
|
|
|
|
class AmazonTextractPDFParser(BaseBlobParser):
    """Send `PDF` files to `Amazon Textract` and parse them.

    For parsing multi-page PDFs, they have to reside on S3.

    The AmazonTextractPDFLoader calls the
    [Amazon Textract Service](https://aws.amazon.com/textract/)
    to convert PDFs into a Document structure.
    Single and multi-page documents are supported with up to 3000 pages
    and 512 MB of size.

    For the call to be successful an AWS account is required,
    similar to the
    [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
    requirements.

    Besides the AWS configuration, it is very similar to the other PDF
    loaders, while also supporting JPEG, PNG and TIFF and non-native
    PDF formats.

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    loader=AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg")
    documents = loader.load()
    ```

    One feature is the linearization of the output.
    When using the features LAYOUT, FORMS or TABLES together with Textract,

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    # you can mix and match each of the features
    loader=AmazonTextractPDFLoader(
        "example_data/alejandro_rosalez_sample-small.jpeg",
        textract_features=["TABLES", "LAYOUT"])
    documents = loader.load()
    ```

    it will generate output that formats the text in reading order and
    tries to output the information in a tabular structure or
    output the key/value pairs with a colon (key: value).
    This helps most LLMs to achieve better accuracy when
    processing these texts.

    """

    def __init__(
        self,
        textract_features: Optional[Sequence[int]] = None,
        client: Optional[Any] = None,
    ) -> None:
        """Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                               should be passed as an int that conforms to the enum
                               `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client. If falsy, a new client is created
                    with ``boto3.client("textract")``.

        Raises:
            ImportError: If `amazon-textract-caller` or
                `amazon-textract-textractor` cannot be imported, or if no client
                is given and `boto3` cannot be imported.
        """

        # Keep the try body to the imports only, so that nothing else is guarded
        # by the ImportError handler.
        try:
            import textractcaller as tc
            import textractor.entities.document as textractor
        except ImportError as e:
            raise ImportError(
                "Could not import amazon-textract-caller or "
                "amazon-textract-textractor python package. Please install it "
                "with `pip install amazon-textract-caller` & "
                "`pip install amazon-textract-textractor`."
            ) from e

        self.tc = tc
        self.textractor = textractor

        if textract_features is not None:
            self.textract_features = [
                tc.Textract_Features(f) for f in textract_features
            ]
        else:
            self.textract_features = []

        if client:
            self.boto3_textract_client = client
        else:
            try:
                import boto3
            except ImportError as e:
                raise ImportError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                ) from e
            self.boto3_textract_client = boto3.client("textract")

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Iterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers.

        If multi-page document, blob.path has to be set to the S3 URI and for
        single page docs the blob.data is taken.

        The ``page`` metadata entry of each document is one-based.
        """

        url_parse_result = urlparse(str(blob.path)) if blob.path else None
        # Either call with S3 path (multi-page) or with bytes (single-page)
        if (
            url_parse_result
            and url_parse_result.scheme == "s3"
            and url_parse_result.netloc
        ):
            textract_response_json = self.tc.call_textract(
                input_document=str(blob.path),
                features=self.textract_features,
                boto3_textract_client=self.boto3_textract_client,
            )
        else:
            textract_response_json = self.tc.call_textract(
                input_document=blob.as_bytes(),
                features=self.textract_features,
                call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC,
                boto3_textract_client=self.boto3_textract_client,
            )

        document = self.textractor.Document.open(textract_response_json)

        linearizer_config = self.textractor.TextLinearizationConfig(
            hide_figure_layout=True,
            title_prefix="# ",
            section_header_prefix="## ",
            list_element_prefix="*",
        )
        for idx, page in enumerate(document.pages):
            yield Document(
                page_content=page.get_text(config=linearizer_config),
                metadata={"source": blob.source, "page": idx + 1},
            )
|
|
|
|
|
|
class DocumentIntelligenceParser(BaseBlobParser):
    """Loads a PDF with Azure Document Intelligence
    (formerly Form Recognizer) and chunks at character level."""

    def __init__(self, client: Any, model: str):
        """Initialize the parser and emit a deprecation warning.

        Args:
            client: Client object whose ``begin_analyze_document`` method is
                called with the model and the file contents.
            model: Model identifier passed to ``begin_analyze_document``.
        """
        # The separating space before "and" was missing, which glued the two
        # dotted paths together in the emitted message.
        warnings.warn(
            "langchain_community.document_loaders.parsers.pdf.DocumentIntelligenceParser"
            " and langchain_community.document_loaders.pdf.DocumentIntelligenceLoader"
            " are deprecated. Please upgrade to "
            "langchain_community.document_loaders.DocumentIntelligenceLoader "
            "for any file parsing purpose using Azure Document Intelligence "
            "service."
        )
        self.client = client
        self.model = model

    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:
        """Yield one document per page of an analysis result.

        The page content is the space-joined ``content`` of the page's lines;
        the ``page`` metadata entry is the result page's ``page_number``.
        """
        for p in result.pages:
            content = " ".join(line.content for line in p.lines)

            yield Document(
                page_content=content,
                metadata={
                    "source": blob.source,
                    "page": p.page_number,
                },
            )

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

        with blob.as_bytes_io() as file_obj:
            poller = self.client.begin_analyze_document(self.model, file_obj)
            # Wait for the analysis to finish before generating documents.
            result = poller.result()

            yield from self._generate_docs(blob, result)
|