You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py

397 lines
13 KiB
Python

import io
import json
import os
import warnings
from typing import Dict, Iterator, List, Literal, Optional, Union
import fitz # type: ignore
import requests
from fitz import Document as fitzDocument
from langchain_core.document_loaders import BaseBlobParser, Blob
from langchain_core.documents import Document
# Endpoint that the parser POSTs documents to for layout analysis.
LAYOUT_ANALYSIS_URL = "https://api.upstage.ai/v1/document-ai/layout-analysis"
# Default number of PDF pages bundled into a single request.
DEFAULT_NUMBER_OF_PAGE = 10
# String values accepted for the parser's `output_type` option.
OutputType = Literal["text", "html"]
# Values accepted for the parser's `split` option.
SplitType = Literal["none", "element", "page"]
def validate_api_key(api_key: str) -> None:
    """
    Ensure that an API key has been supplied.

    Args:
        api_key (str): The API key to check.

    Raises:
        ValueError: If the API key is falsy (empty string or None).
    """
    if api_key:
        return
    raise ValueError("API Key is required for Upstage Document Loader")
def validate_file_path(file_path: str) -> None:
    """
    Ensure that something exists on disk at the given path.

    Args:
        file_path (str): The path to check.

    Raises:
        FileNotFoundError: If nothing exists at the given path.
    """
    if os.path.exists(file_path):
        return
    raise FileNotFoundError(f"File not found: {file_path}")
def parse_output(data: dict, output_type: Union[OutputType, dict]) -> str:
    """
    Pick the representation of an element requested by ``output_type``.

    Args:
        data (dict): A single element; read by key ("category", "text",
            "html", or whatever field name a mapping points at).
        output_type (Union[OutputType, dict]): Either "text" / "html", or a
            mapping from element category to the field name to return for
            that category. Categories absent from the mapping use "text".

    Returns:
        str: The selected field of the element.

    Raises:
        ValueError: If the output type is neither a mapping nor one of the
            supported strings.
    """
    if isinstance(output_type, dict):
        category = data["category"]
        # Unmapped categories fall back to the plain-text field.
        field = output_type[category] if category in output_type else "text"
        return data[field]
    if isinstance(output_type, str) and output_type in ("text", "html"):
        return data[output_type]
    raise ValueError(f"Invalid output type: {output_type}")
def get_from_param_or_env(
    key: str,
    param: Optional[str] = None,
    env_key: Optional[str] = None,
    default: Optional[str] = None,
) -> str:
    """Resolve a setting from an explicit value, the environment, or a default.

    Precedence: ``param`` (when not None), then a non-empty environment
    variable named ``env_key``, then ``default`` (when not None).

    Raises:
        ValueError: If none of the three sources provides a value.
    """
    if param is not None:
        return param

    # An unset or empty environment variable counts as "not provided".
    env_value = os.environ.get(env_key) if env_key else None
    if env_value:
        return env_value

    if default is not None:
        return default

    raise ValueError(
        f"Did not find {key}, please add an environment variable"
        f" `{env_key}` which contains it, or pass"
        f" `{key}` as a named parameter."
    )
class UpstageLayoutAnalysisParser(BaseBlobParser):
    """Upstage Layout Analysis Parser.

    To use, you should have the environment variable `UPSTAGE_API_KEY`
    set with your API key or pass it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_upstage import UpstageLayoutAnalysisParser

            loader = UpstageLayoutAnalysisParser(split="page", output_type="text")
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        output_type: Union[OutputType, dict] = "html",
        split: SplitType = "none",
        use_ocr: bool = False,
        exclude: Optional[list] = None,
    ):
        """
        Initializes the parser.

        Args:
            api_key (str, optional): The API key for accessing the Upstage API.
                                     Defaults to None, in which case it will be
                                     fetched from the environment variable
                                     `UPSTAGE_API_KEY`.
            output_type (Union[OutputType, dict], optional): The type of output to be
                                                             generated by the parser.
                                                             Defaults to "html".
            split (SplitType, optional): The type of splitting to be applied.
                                         Defaults to "none" (no splitting).
            use_ocr (bool, optional): Extract text from images in the document.
                                      Sent to the API as the `ocr` option.
                                      Defaults to False.
            exclude (list, optional): Element categories to drop from the output.
                                      Defaults to None (all included).

        Raises:
            ValueError: If no API key can be resolved, or the resolved key is
                        empty.
        """
        if deprecated_key := os.environ.get("UPSTAGE_DOCUMENT_AI_API_KEY"):
            warnings.warn(
                "UPSTAGE_DOCUMENT_AI_API_KEY is deprecated. "
                "Please use UPSTAGE_API_KEY instead."
            )

        # The deprecated variable is still honoured, but only as a last resort.
        self.api_key = get_from_param_or_env(
            "UPSTAGE_API_KEY", api_key, "UPSTAGE_API_KEY", deprecated_key
        )
        self.output_type = output_type
        self.split = split
        self.use_ocr = use_ocr
        # Copy so that instances never share (or mutate) the caller's list.
        self.exclude = list(exclude) if exclude is not None else []

        validate_api_key(self.api_key)

    def _get_response(self, files: Dict) -> List:
        """
        Sends a POST request to the API endpoint with the provided files and
        returns the elements of the response that are not excluded.

        Args:
            files (dict): A dictionary containing the files to be sent in the request.

        Returns:
            List: The entries of the response's "elements" field whose
                  "category" is not listed in `self.exclude`.

        Raises:
            ValueError: If the request fails or the response is not valid JSON.
        """
        headers = {"Authorization": f"Bearer {self.api_key}"}
        options = {"ocr": self.use_ocr}

        try:
            response = requests.post(
                LAYOUT_ANALYSIS_URL, headers=headers, files=files, data=options
            )
            response.raise_for_status()
            result = response.json().get("elements", [])
        except requests.RequestException as req_err:
            # Chain the cause so the underlying HTTP/connection error stays visible.
            raise ValueError(f"Failed to send request: {req_err}") from req_err
        except json.JSONDecodeError as json_err:
            raise ValueError(
                f"Failed to decode JSON response: {json_err}"
            ) from json_err

        return [
            element for element in result if element["category"] not in self.exclude
        ]

    def _split_and_request(
        self,
        full_docs: fitzDocument,
        start_page: int,
        num_pages: int = DEFAULT_NUMBER_OF_PAGE,
    ) -> List:
        """
        Copies a range of pages out of the full PDF into a temporary PDF and
        sends that chunk to the server.

        Args:
            full_docs (fitzDocument): The full document to take pages from.
            start_page (int): Index of the first page of the chunk.
            num_pages (int, optional): Number of pages requested for the chunk.
                                       Defaults to DEFAULT_NUMBER_OF_PAGE.

        Returns:
            List: The elements returned by the server for this chunk.
        """
        with fitz.open() as chunk_pdf:
            chunk_pdf.insert_pdf(
                full_docs,
                from_page=start_page,
                # `to_page` is treated as an inclusive index, hence the -1.
                to_page=start_page + num_pages - 1,
            )
            pdf_bytes = chunk_pdf.write()

        with io.BytesIO(pdf_bytes) as f:
            response = self._get_response({"document": f})

        return response

    def _element_document(self, elements: Dict) -> Document:
        """
        Converts a single element into a Document object.

        Args:
            elements (Dict): The element to convert. Its "page", "id",
                             "bounding_box" and "category" entries are copied
                             into the metadata.

        Returns:
            Document: The document built from the element.
        """
        return Document(
            page_content=(parse_output(elements, self.output_type)),
            metadata={
                "page": elements["page"],
                "id": elements["id"],
                "type": self.output_type,
                "split": self.split,
                "bbox": elements["bounding_box"],
                "category": elements["category"],
            },
        )

    def _page_document(self, elements: List) -> List[Document]:
        """
        Combines elements with the same page number into a single Document object.

        Args:
            elements (List): A list of elements containing page numbers.

        Returns:
            List[Document]: A list of Document objects, ordered by page number,
                            each representing a page with its content and
                            metadata.
        """
        # Group in one pass; element order within a page is preserved.
        elements_by_page: Dict = {}
        for element in elements:
            elements_by_page.setdefault(element["page"], []).append(element)

        _docs = []
        for page in sorted(elements_by_page):
            page_content = " ".join(
                parse_output(element, self.output_type)
                for element in elements_by_page[page]
            )
            _docs.append(
                Document(
                    page_content=page_content,
                    metadata={
                        "page": page,
                        "type": self.output_type,
                        "split": self.split,
                    },
                )
            )

        return _docs

    def _iter_element_batches(
        self, blob: Blob, full_docs: fitzDocument, num_pages: int
    ) -> Iterator[List]:
        """
        Yields the elements of the document, one list per API request.

        PDFs are sent in chunks of `num_pages` pages; any other file is sent
        whole, in a single request, straight from `blob.path`.

        Args:
            blob (Blob): The blob being parsed (used for non-PDF files).
            full_docs (fitzDocument): The opened document.
            num_pages (int): Number of pages per request for PDFs.

        Yields:
            List: The elements returned for one request.

        Raises:
            ValueError: If a non-PDF blob has no path.
        """
        if full_docs.is_pdf:
            for start_page in range(0, full_docs.page_count, num_pages):
                yield self._split_and_request(full_docs, start_page, num_pages)
            return

        if not blob.path:
            raise ValueError("Blob path is required for non-PDF files.")

        with open(blob.path, "rb") as f:
            elements = self._get_response({"document": f})
        yield elements

    def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
        """
        Lazily parses a document and yields Document objects based on the specified
        split type.

        Args:
            blob (Blob): The input document blob to parse.
            is_batch (bool, optional): Whether to parse the document in batches of
                                       DEFAULT_NUMBER_OF_PAGE pages.
                                       Defaults to False (single page parsing).
                                       Ignored when split is "none", which
                                       always uses batches.

        Yields:
            Document: The parsed document object.

        Raises:
            ValueError: If an invalid split type is provided.
        """
        if self.split not in ("none", "element", "page"):
            raise ValueError(f"Invalid split type: {self.split}")

        # Without splitting there is no reason to go page by page, so the
        # "none" mode always requests full-size chunks.
        if self.split == "none" or is_batch:
            num_pages = DEFAULT_NUMBER_OF_PAGE
        else:
            num_pages = 1

        # The context manager closes the document once the generator is
        # exhausted, closed, or garbage collected.
        with fitz.open(blob.path) as full_docs:
            batches = self._iter_element_batches(blob, full_docs, num_pages)

            if self.split == "none":
                result = "".join(
                    parse_output(element, self.output_type)
                    for elements in batches
                    for element in elements
                )
                yield Document(
                    page_content=result,
                    metadata={
                        "total_pages": full_docs.page_count,
                        "type": self.output_type,
                        "split": self.split,
                    },
                )
            elif self.split == "element":
                for elements in batches:
                    for element in elements:
                        yield self._element_document(element)
            else:  # self.split == "page"
                for elements in batches:
                    yield from self._page_document(elements)