mirror of
https://github.com/hwchase17/langchain
synced 2024-11-10 01:10:59 +00:00
9fb09c1c30
**Description**: The "page" mode in the `AzureAIDocumentIntelligenceParser` is not accessible due to an incorrect comparison (the mode string is compared against a list). The mode argument can only be a string (see also the assertion in `__init__`: `assert self.mode in ["single", "page", "object", "markdown"]`), so the check `elif self.mode == ["page"]:` always fails. As a result, the "object" mode is effectively used when the "page" mode is selected, which may lead to errors. The docstring of the `AzureAIDocumentIntelligenceLoader` also omitted the `mode` parameter altogether, so I added it. **Issue**: I could not find a related issue (this class is only 3 weeks old anyway). **Dependencies**: This PR does not introduce or affect dependencies. The current demo notebook and examples are not affected because they all use the default markdown mode.
93 lines
3.2 KiB
Python
93 lines
3.2 KiB
Python
from typing import Iterator, List, Optional
|
|
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_community.document_loaders.base import BaseLoader
|
|
from langchain_community.document_loaders.blob_loaders import Blob
|
|
from langchain_community.document_loaders.parsers import (
|
|
AzureAIDocumentIntelligenceParser,
|
|
)
|
|
|
|
|
|
class AzureAIDocumentIntelligenceLoader(BaseLoader):
    """Load a PDF with Azure Document Intelligence."""

    def __init__(
        self,
        api_endpoint: str,
        api_key: str,
        file_path: Optional[str] = None,
        url_path: Optional[str] = None,
        api_version: Optional[str] = None,
        api_model: str = "prebuilt-layout",
        mode: str = "markdown",
    ) -> None:
        """
        Initialize the object for file processing with Azure Document Intelligence
        (formerly Form Recognizer).

        This constructor initializes an AzureAIDocumentIntelligenceParser object to
        be used for parsing files using the Azure Document Intelligence API. The
        load method generates Documents whose content representations are
        determined by the mode parameter.

        Parameters:
        -----------
        api_endpoint: str
            The API endpoint to use for DocumentIntelligenceClient construction.
        api_key: str
            The API key to use for DocumentIntelligenceClient construction.
        file_path : Optional[str]
            The path to the file that needs to be loaded.
            Either file_path or url_path must be specified. If both are given,
            file_path is the one that gets loaded.
        url_path : Optional[str]
            The URL to the file that needs to be loaded.
            Either file_path or url_path must be specified.
        api_version: Optional[str]
            The API version for DocumentIntelligenceClient. Setting None to use
            the default value from SDK.
        api_model: str
            The model name or ID to be used for form recognition in Azure.
            Defaults to "prebuilt-layout".
        mode: str
            The type of content representation of the generated Documents.
            Defaults to "markdown". The value is forwarded unchanged to the
            parser; presumably one of "single", "page", "object" or "markdown"
            (confirm against AzureAIDocumentIntelligenceParser).

        Raises:
        -------
        AssertionError
            If neither file_path nor url_path is provided.

        Examples:
        ---------
        >>> obj = AzureAIDocumentIntelligenceLoader(
        ...     file_path="path/to/file",
        ...     api_endpoint="https://endpoint.azure.com",
        ...     api_key="APIKEY",
        ...     api_version="2023-10-31-preview",
        ...     api_model="prebuilt-document",
        ...     mode="markdown"
        ... )
        """

        # Raise explicitly instead of using an ``assert`` statement so the check
        # still runs under ``python -O``. The exception type and message are the
        # ones the assert produced, so existing callers see the same failure.
        if file_path is None and url_path is None:
            raise AssertionError("file_path or url_path must be provided")
        self.file_path = file_path
        self.url_path = url_path

        self.parser = AzureAIDocumentIntelligenceParser(
            api_endpoint=api_endpoint,
            api_key=api_key,
            api_version=api_version,
            api_model=api_model,
            mode=mode,
        )

    def load(self) -> List[Document]:
        """Load the configured source and return all Documents as a list."""
        return list(self.lazy_load())

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily yield the Documents the parser produces for the source.

        file_path takes precedence: when it is set, the file is wrapped in a
        Blob and handed to the parser; otherwise url_path is passed to the
        parser's parse_url method.
        """
        if self.file_path is not None:
            blob = Blob.from_path(self.file_path)
            yield from self.parser.parse(blob)
        else:
            yield from self.parser.parse_url(self.url_path)
|