mirror of https://github.com/hwchase17/langchain
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
397 lines
13 KiB
Python
397 lines
13 KiB
Python
import io
|
|
import json
|
|
import os
|
|
import warnings
|
|
from typing import Dict, Iterator, List, Literal, Optional, Union
|
|
|
|
import fitz # type: ignore
|
|
import requests
|
|
from fitz import Document as fitzDocument
|
|
from langchain_core.document_loaders import BaseBlobParser, Blob
|
|
from langchain_core.documents import Document
|
|
|
|
# HTTP endpoint that UpstageLayoutAnalysisParser._get_response POSTs documents to.
LAYOUT_ANALYSIS_URL = "https://api.upstage.ai/v1/document-ai/layout-analysis"
|
|
|
|
# Default number of PDF pages bundled into a single request when a document is
# split into chunks before being sent to the API.
DEFAULT_NUMBER_OF_PAGE = 10
|
|
|
|
# String values accepted for the parser's `output_type` option; each one is also
# the key read from an element by `parse_output`.
OutputType = Literal["text", "html"]
|
|
# Values accepted for the parser's `split` option: one document for the whole
# file ("none"), one per element ("element"), or one per page ("page").
SplitType = Literal["none", "element", "page"]
|
|
|
|
|
|
def validate_api_key(api_key: str) -> None:
    """
    Ensure that an API key was supplied.

    Args:
        api_key (str): The API key to check.

    Raises:
        ValueError: If the API key is falsy (for example ``""`` or ``None``).

    Returns:
        None
    """
    if api_key:
        return
    raise ValueError("API Key is required for Upstage Document Loader")
|
|
|
|
|
|
def validate_file_path(file_path: str) -> None:
    """
    Check that something exists on disk at the given path.

    Args:
        file_path (str): The path to check.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
    """
    path_exists = os.path.exists(file_path)
    if path_exists:
        return
    raise FileNotFoundError(f"File not found: {file_path}")
|
|
|
|
|
|
def parse_output(data: dict, output_type: Union[OutputType, dict]) -> str:
    """
    Extract the content of a single element in the requested representation.

    Args:
        data (dict): The element to read from.
        output_type (Union[OutputType, dict]): Either the name of the
            representation to return ("text" or "html"), or a dict mapping an
            element category to the key of ``data`` that should be returned for
            that category. Categories missing from the dict fall back to "text".

    Returns:
        str: The selected content of the element.

    Raises:
        ValueError: If the output type is not a dict, "text" or "html".
    """
    if isinstance(output_type, dict):
        # Per-category mapping: category name -> key of `data` to return.
        category = data["category"]
        field = output_type[category] if category in output_type else "text"
        return data[field]

    if isinstance(output_type, str) and output_type in ("text", "html"):
        return data[output_type]

    raise ValueError(f"Invalid output type: {output_type}")
|
|
|
|
|
|
def get_from_param_or_env(
    key: str,
    param: Optional[str] = None,
    env_key: Optional[str] = None,
    default: Optional[str] = None,
) -> str:
    """Resolve a setting from an explicit argument, the environment, or a default.

    The sources are consulted in that order and the first usable one wins.

    Args:
        key (str): Name of the setting, used only in the error message.
        param (Optional[str]): Explicit value; returned whenever it is not None.
        env_key (Optional[str]): Environment variable to consult; it is used
            only when it is set to a non-empty value.
        default (Optional[str]): Fallback returned whenever it is not None.

    Returns:
        str: The resolved value.

    Raises:
        ValueError: If none of the sources provides a value.
    """
    if param is not None:
        return param

    env_value = os.environ.get(env_key) if env_key else None
    if env_value:
        return env_value

    if default is not None:
        return default

    raise ValueError(
        f"Did not find {key}, please add an environment variable"
        f" `{env_key}` which contains it, or pass"
        f" `{key}` as a named parameter."
    )
|
|
|
|
|
|
class UpstageLayoutAnalysisParser(BaseBlobParser):
    """Upstage Layout Analysis Parser.

    To use, you should have the environment variable `UPSTAGE_API_KEY`
    set with your API key or pass it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_upstage import UpstageLayoutAnalysisParser

            loader = UpstageLayoutAnalysisParser(split="page", output_type="text")
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        output_type: Union[OutputType, dict] = "html",
        split: SplitType = "none",
        use_ocr: bool = False,
        exclude: Optional[list] = None,
    ):
        """
        Initializes the parser and resolves the API key.

        Args:
            api_key (str, optional): The API key for accessing the Upstage API.
                                     Defaults to None, in which case it will be
                                     fetched from the environment variable
                                     `UPSTAGE_API_KEY` (falling back to the
                                     deprecated `UPSTAGE_DOCUMENT_AI_API_KEY`).
            output_type (Union[OutputType, dict], optional): The type of output to be
                                                             generated by the parser.
                                                             Defaults to "html".
            split (SplitType, optional): The type of splitting to be applied.
                                         Defaults to "none" (no splitting).
            use_ocr (bool, optional): Extract text from images in the document.
                                      Defaults to False. (Use text info in PDF file)
            exclude (list, optional): Element categories to drop from the output.
                                      Defaults to None (all included).

        Raises:
            ValueError: If no API key can be resolved, or the resolved key is empty.
        """
        deprecated_key = os.environ.get("UPSTAGE_DOCUMENT_AI_API_KEY")
        if deprecated_key:
            warnings.warn(
                "UPSTAGE_DOCUMENT_AI_API_KEY is deprecated. "
                "Please use UPSTAGE_API_KEY instead."
            )
        self.api_key = get_from_param_or_env(
            "UPSTAGE_API_KEY", api_key, "UPSTAGE_API_KEY", deprecated_key
        )

        self.output_type = output_type
        self.split = split
        self.use_ocr = use_ocr
        # A fresh list per instance: a literal `[]` default would be shared by
        # every parser constructed without an explicit `exclude`.
        self.exclude = exclude if exclude is not None else []

        validate_api_key(self.api_key)

    def _get_response(self, files: Dict) -> List:
        """
        Sends a POST request to the API endpoint with the provided files and
        returns the layout elements of the response.

        Args:
            files (dict): A dictionary containing the files to be sent in the request.

        Returns:
            list: The entries of the response's "elements" field whose category
                  is not listed in `self.exclude` (empty if the field is absent).

        Raises:
            ValueError: If the request fails or the response is not valid JSON.
        """
        headers = {"Authorization": f"Bearer {self.api_key}"}
        options = {"ocr": self.use_ocr}

        try:
            response = requests.post(
                LAYOUT_ANALYSIS_URL, headers=headers, files=files, data=options
            )
            response.raise_for_status()
            result = response.json().get("elements", [])
        except json.JSONDecodeError as json_err:
            # Checked before RequestException: recent `requests` releases raise a
            # JSON error that derives from both classes, and a decode failure
            # should not be reported as a transport failure.
            raise ValueError(
                f"Failed to decode JSON response: {json_err}"
            ) from json_err
        except requests.RequestException as req_err:
            raise ValueError(f"Failed to send request: {req_err}") from req_err

        return [element for element in result if element["category"] not in self.exclude]

    def _split_and_request(
        self,
        full_docs: fitzDocument,
        start_page: int,
        num_pages: int = DEFAULT_NUMBER_OF_PAGE,
    ) -> List:
        """
        Copies a range of pages of the full pdf document into a new pdf and sends
        that partial document to the server.

        Args:
            full_docs (fitzDocument): The full document to take the pages from.
            start_page (int): The zero-based index of the first page to send.
            num_pages (int, optional): The number of pages to include in the
                                       request.
                                       Defaults to DEFAULT_NUMBER_OF_PAGE.

        Returns:
            list: The layout elements returned for the partial document.
        """
        with fitz.open() as chunk_pdf:
            # NOTE(review): for the final chunk `to_page` can point past the last
            # page; presumably PyMuPDF clamps it - confirm.
            chunk_pdf.insert_pdf(
                full_docs,
                from_page=start_page,
                to_page=start_page + num_pages - 1,
            )
            pdf_bytes = chunk_pdf.write()

        with io.BytesIO(pdf_bytes) as f:
            response = self._get_response({"document": f})

        return response

    def _element_document(self, elements: Dict) -> Document:
        """
        Converts a single layout element into a Document object.

        Args:
            elements (dict): The element to convert.

        Returns:
            Document: The element's content in the configured output type, with
                      its page, id, bounding box and category as metadata.
        """
        return Document(
            page_content=(parse_output(elements, self.output_type)),
            metadata={
                "page": elements["page"],
                "id": elements["id"],
                "type": self.output_type,
                "split": self.split,
                "bbox": elements["bounding_box"],
                "category": elements["category"],
            },
        )

    def _page_document(self, elements: List) -> List[Document]:
        """
        Combines elements with the same page number into a single Document object.

        Args:
            elements (List): A list of elements containing page numbers.

        Returns:
            List[Document]: A list of Document objects ordered by page number,
                            each representing a page with its content and
                            metadata.
        """
        # Group in a single pass; elements keep their relative order per page.
        elements_by_page: Dict = {}
        for element in elements:
            elements_by_page.setdefault(element["page"], []).append(element)

        _docs = []
        for page in sorted(elements_by_page):
            page_content = " ".join(
                parse_output(element, self.output_type)
                for element in elements_by_page[page]
            )
            _docs.append(
                Document(
                    page_content=page_content,
                    metadata={
                        "page": page,
                        "type": self.output_type,
                        "split": self.split,
                    },
                )
            )

        return _docs

    def _iter_element_batches(
        self, blob: Blob, full_docs: fitzDocument, num_pages: int
    ) -> Iterator[List]:
        """
        Yields the layout elements of the document, one list per API request.

        PDFs are sent in chunks of `num_pages` pages; any other file type is
        uploaded as a whole in a single request.

        Args:
            blob (Blob): The blob being parsed (its path is used for non-PDFs).
            full_docs (fitzDocument): The opened document.
            num_pages (int): The number of PDF pages per request.

        Yields:
            list: The elements returned by one request.

        Raises:
            ValueError: If a non-PDF blob has no path.
        """
        if full_docs.is_pdf:
            for start_page in range(0, full_docs.page_count, num_pages):
                yield self._split_and_request(full_docs, start_page, num_pages)
            return

        if not blob.path:
            raise ValueError("Blob path is required for non-PDF files.")

        with open(blob.path, "rb") as f:
            elements = self._get_response({"document": f})
        # Yield only after the upload handle is closed so it is not kept open
        # while the consumer processes the elements.
        yield elements

    def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
        """
        Lazily parses a document and yields Document objects based on the specified
        split type.

        Args:
            blob (Blob): The input document blob to parse.
            is_batch (bool, optional): Whether to send DEFAULT_NUMBER_OF_PAGE pages
                                       per request instead of one.
                                       Defaults to False (single page parsing).
                                       Ignored when split is "none", which always
                                       uses DEFAULT_NUMBER_OF_PAGE.

        Yields:
            Document: The parsed document object.

        Raises:
            ValueError: If an invalid split type is provided.
        """
        if self.split not in ("none", "element", "page"):
            raise ValueError(f"Invalid split type: {self.split}")

        if is_batch or self.split == "none":
            num_pages = DEFAULT_NUMBER_OF_PAGE
        else:
            num_pages = 1

        # NOTE(review): blob.path is handed to fitz as-is; when it is None this
        # presumably opens an empty document - confirm whether in-memory blobs
        # are meant to be supported.
        # The context manager closes the document once parsing finishes or the
        # generator is closed.
        with fitz.open(blob.path) as full_docs:
            number_of_pages = full_docs.page_count
            batches = self._iter_element_batches(blob, full_docs, num_pages)

            if self.split == "none":
                result = "".join(
                    parse_output(element, self.output_type)
                    for elements in batches
                    for element in elements
                )
                yield Document(
                    page_content=result,
                    metadata={
                        "total_pages": number_of_pages,
                        "type": self.output_type,
                        "split": self.split,
                    },
                )

            elif self.split == "element":
                for elements in batches:
                    for element in elements:
                        yield self._element_document(element)

            else:  # self.split == "page"
                for elements in batches:
                    yield from self._page_document(elements)
|