mirror of
https://github.com/hwchase17/langchain
synced 2024-11-18 09:25:54 +00:00
2460f977c5
- **Description:** Add DocumentIntelligenceLoader & DocumentIntelligenceParser implementations using the latest Azure Document Intelligence SDK, with markdown support. The core logic resides in DocumentIntelligenceParser; DocumentIntelligenceLoader is a thin wrapper around the parser. The parser takes api_endpoint and api_key and creates a DocumentIntelligenceClient for the user. 4 parsing modes are supported: 1. Markdown (default) 2. Single 3. Page 4. Object. Unit tests (UT) and the notebook are also updated accordingly. - **Dependencies:** Azure Document Intelligence SDK: azure-ai-documentintelligence [azure-sdk-for-python/sdk/documentintelligence/azure-ai-documentintelligence at 7c42462ac662522a6fd21b17d2a20f4cd40d0356 · Azure/azure-sdk-for-python (github.com)](https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2FAzure%2Fazure-sdk-for-python%2Ftree%2F7c42462ac662522a6fd21b17d2a20f4cd40d0356%2Fsdk%2Fdocumentintelligence%2Fazure-ai-documentintelligence&data=05%7C01%7CZifei.Qian%40microsoft.com%7C298225aa3e31468a863108dbf07374ff%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C638368150928704292%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=oE0Sl4HERnMKdbkV9KgBV46Z2xytcQAShdTWf7ZNl%2Bs%3D&reserved=0). --------- Co-authored-by: Erick Friis <erick@langchain.dev>
90 lines
3.1 KiB
Python
90 lines
3.1 KiB
Python
from typing import Iterator, List, Optional
|
|
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_community.document_loaders.base import BaseLoader
|
|
from langchain_community.document_loaders.blob_loaders import Blob
|
|
from langchain_community.document_loaders.parsers import (
|
|
AzureAIDocumentIntelligenceParser,
|
|
)
|
|
|
|
|
|
class AzureAIDocumentIntelligenceLoader(BaseLoader):
    """Loads a document with Azure Document Intelligence.

    The document is identified either by a local ``file_path`` or by a
    ``url_path``. All parsing is delegated to an
    ``AzureAIDocumentIntelligenceParser`` built in the constructor.
    """

    def __init__(
        self,
        api_endpoint: str,
        api_key: str,
        file_path: Optional[str] = None,
        url_path: Optional[str] = None,
        api_version: Optional[str] = None,
        api_model: str = "prebuilt-layout",
        mode: str = "markdown",
    ) -> None:
        """
        Initialize the object for file processing with Azure Document Intelligence
        (formerly Form Recognizer).

        This constructor initializes a AzureAIDocumentIntelligenceParser object to be
        used for parsing files using the Azure Document Intelligence API. The load
        method generates Documents whose content representations are determined by the
        mode parameter.

        Parameters:
        -----------
        api_endpoint: str
            The API endpoint to use for DocumentIntelligenceClient construction.
        api_key: str
            The API key to use for DocumentIntelligenceClient construction.
        file_path : Optional[str]
            The path to the file that needs to be loaded.
            Either file_path or url_path must be specified. If both are given,
            file_path takes precedence and url_path is ignored when loading.
        url_path : Optional[str]
            The URL to the file that needs to be loaded.
            Either file_path or url_path must be specified.
        api_version: Optional[str]
            The API version for DocumentIntelligenceClient. Setting None to use
            the default value from SDK.
        api_model: str
            The model name or ID to be used for form recognition in Azure.
        mode: str
            The parsing mode, forwarded unchanged to the parser. Defaults to
            "markdown". The set of accepted values is defined by
            AzureAIDocumentIntelligenceParser.

        Raises:
        -------
        AssertionError
            If neither file_path nor url_path is provided.

        Examples:
        ---------
        >>> obj = AzureAIDocumentIntelligenceLoader(
        ...     file_path="path/to/file",
        ...     api_endpoint="https://endpoint.azure.com",
        ...     api_key="APIKEY",
        ...     api_version="2023-10-31-preview",
        ...     api_model="prebuilt-document"
        ... )
        """

        # Raise explicitly instead of using an ``assert`` statement: asserts are
        # removed when Python runs with -O, which would let an unusable loader
        # be constructed. The exception type and message are kept unchanged.
        if file_path is None and url_path is None:
            raise AssertionError("file_path or url_path must be provided")

        self.file_path = file_path
        self.url_path = url_path

        self.parser = AzureAIDocumentIntelligenceParser(
            api_endpoint=api_endpoint,
            api_key=api_key,
            api_version=api_version,
            api_model=api_model,
            mode=mode,
        )

    def load(self) -> List[Document]:
        """Load given path as pages.

        Eagerly consumes ``lazy_load`` and returns all Documents as a list.
        """
        return list(self.lazy_load())

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load given path as pages.

        Yields the Documents produced by the parser. A local ``file_path`` is
        wrapped in a ``Blob`` and parsed; otherwise ``url_path`` is handed to
        the parser's ``parse_url``.
        """
        if self.file_path is not None:
            blob = Blob.from_path(self.file_path)
            yield from self.parser.parse(blob)
        else:
            # The constructor guarantees url_path is set when file_path is None.
            yield from self.parser.parse_url(self.url_path)