From 6d82503eb1bd2e6ec7bba66e1855df109406b53f Mon Sep 17 00:00:00 2001
From: Lars von Wedel
Date: Sun, 3 Sep 2023 23:25:39 +0200
Subject: [PATCH] Add parser and loader for Azure Document Intelligence service. (#10136)

Hi,

this PR contains a loader / parser for Azure Document Intelligence, an ML-based service that ingests arbitrary PDFs and images, even if scanned. The loader generates one Document per page of the original file.

This is my first contribution to LangChain. Unfortunately I could not find the correct place for test cases. Happy to add one if you can point me to the location, but as this is a cloud-based service, a test would require network access and credentials - so it might be of limited help.

Dependencies: the needed dependency was already part of pyproject.toml, no change.

Twitter: feel free to mention @LarsAC on the announcement
---
 .../azure_document_intelligence.ipynb         | 138 ++++++++++++++++++
 .../langchain/document_loaders/parsers/pdf.py |  33 +++++
 .../langchain/document_loaders/pdf.py         |  49 +++++++
 3 files changed, 220 insertions(+)
 create mode 100644 docs/extras/integrations/document_loaders/azure_document_intelligence.ipynb

diff --git a/docs/extras/integrations/document_loaders/azure_document_intelligence.ipynb b/docs/extras/integrations/document_loaders/azure_document_intelligence.ipynb
new file mode 100644
index 0000000000..b28acaebe0
--- /dev/null
+++ b/docs/extras/integrations/document_loaders/azure_document_intelligence.ipynb
@@ -0,0 +1,138 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Azure Document Intelligence"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Azure Document Intelligence (formerly known as Azure Form Recognizer) is a machine-learning-based\n",
+    "service that extracts text (including handwriting), tables, and key-value pairs from\n",
+    "scanned documents or images.\n",
+    "\n",
+    "The current implementation of the loader ingests content page by page and turns each page into a LangChain document.\n",
+    "\n",
+    "Document Intelligence supports PDF, JPEG, PNG, BMP, and TIFF.\n",
+    "\n",
+    "Further documentation is available at https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/?view=doc-intel-3.1.0.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install langchain azure-ai-formrecognizer -q"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Example 1"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The first example uses a local file, which is sent to Azure Document Intelligence.\n",
+    "\n",
+    "First, an instance of a DocumentAnalysisClient is created with the endpoint and key for the Azure service."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.ai.formrecognizer import DocumentAnalysisClient\n",
+    "from azure.core.credentials import AzureKeyCredential\n",
+    "\n",
+    "document_analysis_client = DocumentAnalysisClient(\n",
+    "    endpoint=\"<endpoint>\", credential=AzureKeyCredential(\"<key>\")\n",
+    ")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With the initialized document analysis client, we can proceed to create an instance of the DocumentIntelligenceLoader:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders.pdf import DocumentIntelligenceLoader\n",
+    "\n",
+    "loader = DocumentIntelligenceLoader(\n",
+    "    \"<filename>\",\n",
+    "    client=document_analysis_client,\n",
+    "    model=\"<model>\",  # e.g. prebuilt-document\n",
+    ")\n",
+    "\n",
+    "documents = loader.load()"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The output contains each page of the source document as a LangChain document:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='...', metadata={'source': '...', 'page': 1})]"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "documents"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.9.5"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py
index 00d8d9adea..07681a6767 100644
--- a/libs/langchain/langchain/document_loaders/parsers/pdf.py
+++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py
@@ -244,3 +244,36 @@ class AmazonTextractPDFParser(BaseBlobParser):
                 page_content=current_text,
                 metadata={"source": blob.source, "page": current_page},
             )
+
+
+class DocumentIntelligenceParser(BaseBlobParser):
+    """Loads a PDF with Azure Document Intelligence
+    (formerly Form Recognizer) and chunks it into pages."""
+
+    def __init__(self, client: Any, model: str):
+        self.client = client
+        self.model = model
+
+    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:
+        for p in result.pages:
+            content = " ".join([line.content for line in p.lines])
+
+            d = Document(
+                page_content=content,
+                metadata={
+                    "source": blob.source,
+                    "page": p.page_number,
+                },
+            )
+            yield d
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse the blob."""
+
+        with blob.as_bytes_io() as file_obj:
+            poller = self.client.begin_analyze_document(self.model, file_obj)
+            result = poller.result()
+
+            docs = self._generate_docs(blob, result)
+
+            yield from docs
diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py
index 301af6953e..d907494d45 100644
--- a/libs/langchain/langchain/document_loaders/pdf.py
+++ b/libs/langchain/langchain/document_loaders/pdf.py
@@ -16,6 +16,7 @@ from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.blob_loaders import Blob
 from langchain.document_loaders.parsers.pdf import (
     AmazonTextractPDFParser,
+    DocumentIntelligenceParser,
     PDFMinerParser,
     PDFPlumberParser,
     PyMuPDFParser,
@@ -597,3 +598,51 @@ class AmazonTextractPDFLoader(BasePDFLoader):
             return 1
         else:
             raise ValueError(f"unsupported mime type: {blob.mimetype}")
+
+
+class DocumentIntelligenceLoader(BasePDFLoader):
+    """Loads a PDF with Azure Document Intelligence."""
+
+    def __init__(
+        self, file_path: str, client: Any, model: str = "prebuilt-document"
+    ) -> None:
+        """
+        Initialize the object for file processing with Azure Document Intelligence
+        (formerly Form Recognizer).
+
+        This constructor initializes a DocumentIntelligenceParser object to be used
+        for parsing files using the Azure Document Intelligence API. The load method
+        generates a Document node, including metadata (source blob and page number),
+        for each page.
+
+        Parameters:
+        -----------
+        file_path : str
+            The path to the file that needs to be parsed.
+        client : Any
+            A DocumentAnalysisClient to perform the analysis of the blob.
+        model : str
+            The model name or ID to be used for form recognition in Azure.
+
+        Examples:
+        ---------
+        >>> obj = DocumentIntelligenceLoader(
+        ...     file_path="path/to/file",
+        ...     client=client,
+        ...     model="prebuilt-document"
+        ... )
+        """
+
+        self.parser = DocumentIntelligenceParser(client=client, model=model)
+        super().__init__(file_path)
+
+    def load(self) -> List[Document]:
+        """Load the given path as pages."""
+        return list(self.lazy_load())
+
+    def lazy_load(
+        self,
+    ) -> Iterator[Document]:
+        """Lazily load the given path as pages."""
+        blob = Blob.from_path(self.file_path)
+        yield from self.parser.parse(blob)
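On the open question about tests above: because the DocumentAnalysisClient is injected, the service round trip can be stubbed, so a unit test needs neither network access nor credentials. A minimal sketch using unittest.mock, faking only the result attributes that _generate_docs reads (pages, lines, content, page_number); the file name and data below are made up:

    from unittest.mock import MagicMock

    from langchain.document_loaders.blob_loaders import Blob
    from langchain.document_loaders.parsers.pdf import DocumentIntelligenceParser

    # Fake a one-page analysis result with a single recognized line,
    # mirroring the shape of the Azure SDK result object.
    line = MagicMock()
    line.content = "Hello world"
    page = MagicMock(lines=[line], page_number=1)
    result = MagicMock(pages=[page])

    # The stubbed client returns the fake result from begin_analyze_document().result().
    client = MagicMock()
    client.begin_analyze_document.return_value.result.return_value = result

    parser = DocumentIntelligenceParser(client=client, model="prebuilt-document")
    blob = Blob.from_data(b"%PDF-1.4", path="sample.pdf")

    docs = list(parser.lazy_parse(blob))
    assert docs[0].page_content == "Hello world"
    assert docs[0].metadata == {"source": "sample.pdf", "page": 1}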