from typing import Any, Iterator, Optional

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

class AzureAIDocumentIntelligenceParser(BaseBlobParser):
    """Loads a PDF with Azure Document Intelligence
    (formerly Form Recognizer)."""

    def __init__(
        self,
        api_endpoint: str,
        api_key: str,
        api_version: Optional[str] = None,
        api_model: str = "prebuilt-layout",
        mode: str = "markdown",
    ):
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.core.credentials import AzureKeyCredential

        kwargs = {}
        if api_version is not None:
            kwargs["api_version"] = api_version
        self.client = DocumentIntelligenceClient(
            endpoint=api_endpoint,
            credential=AzureKeyCredential(api_key),
            headers={"x-ms-useragent": "langchain-parser/1.0.0"},
            **kwargs,
        )
        self.api_model = api_model
        self.mode = mode
        assert self.mode in ["single", "page", "object", "markdown"]
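
    # Mode summary: "single" and "markdown" yield one Document containing the
    # full content; "page" yields one Document per page; "object" yields one
    # Document per paragraph and per table.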

    def _generate_docs_page(self, result: Any) -> Iterator[Document]:
        """Yield one Document per page, joining the content of its lines."""
        for p in result.pages:
            content = " ".join([line.content for line in p.lines])

            d = Document(
                page_content=content,
                metadata={
                    "page": p.page_number,
                },
            )
            yield d

    def _generate_docs_single(self, result: Any) -> Iterator[Document]:
        """Yield a single Document containing the full analysis content."""
        yield Document(page_content=result.content, metadata={})

    def _generate_docs_object(self, result: Any) -> Iterator[Document]:
        """Yield one Document per paragraph and per table in the result."""
        # record relationship between page id and span offset
        page_offset = []
        for page in result.pages:
            # assume that spans only contain 1 element, to double check
            page_offset.append(page.spans[0]["offset"])
        # NOTE: page_offset is collected but not currently used below.

        # paragraph
        # warning: paragraph content is overlapping with table content
        for para in result.paragraphs:
            yield Document(
                page_content=para.content,
                metadata={
                    "role": para.role,
                    "page": para.bounding_regions[0].page_number,
                    "bounding_box": para.bounding_regions[0].polygon,
                    "type": "paragraph",
                },
            )

        # table
        for table in result.tables:
            yield Document(
                # cells is a list of cell objects, not plain text, so
                # serialize it for Document's string page_content
                page_content=str(table.cells),
                metadata={
                    "footnote": table.footnotes,
                    "caption": table.caption,
                    "page": table.bounding_regions[0].page_number,
                    "bounding_box": table.bounding_regions[0].polygon,
                    "row_count": table.row_count,
                    "column_count": table.column_count,
                    "type": "table",
                },
            )

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

        with blob.as_bytes_io() as file_obj:
            poller = self.client.begin_analyze_document(
                self.api_model,
                file_obj,
                content_type="application/octet-stream",
                output_content_format="markdown" if self.mode == "markdown" else "text",
            )
            result = poller.result()

            if self.mode in ["single", "markdown"]:
                yield from self._generate_docs_single(result)
            elif self.mode == "page":
                yield from self._generate_docs_page(result)
            else:
                yield from self._generate_docs_object(result)

    def parse_url(self, url: str) -> Iterator[Document]:
        """Parse a document directly from a URL."""
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest

        poller = self.client.begin_analyze_document(
            self.api_model,
            AnalyzeDocumentRequest(url_source=url),
            # content_type="application/octet-stream",
            output_content_format="markdown" if self.mode == "markdown" else "text",
        )
        result = poller.result()

        if self.mode in ["single", "markdown"]:
            yield from self._generate_docs_single(result)
        elif self.mode == "page":
            yield from self._generate_docs_page(result)
        else:
            yield from self._generate_docs_object(result)
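

# A minimal usage sketch, assuming an existing Azure Document Intelligence
# resource; the endpoint, key, and file path below are placeholders. In
# LangChain this parser is typically driven through
# AzureAIDocumentIntelligenceLoader rather than instantiated directly.
if __name__ == "__main__":
    parser = AzureAIDocumentIntelligenceParser(
        api_endpoint="https://<resource>.cognitiveservices.azure.com/",
        api_key="<api-key>",
        mode="markdown",
    )
    for doc in parser.lazy_parse(Blob.from_path("example.pdf")):
        print(doc.page_content[:200])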