You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/partners/upstage/langchain_upstage/layout_analysis.py

249 lines
8.7 KiB
Python

import os
import warnings
from pathlib import Path
from typing import Any, Dict, Iterator, List, Literal, Optional, Union
from langchain_core.document_loaders import BaseLoader, Blob
from langchain_core.documents import Document
from .layout_analysis_parsers import UpstageLayoutAnalysisParser
DEFAULT_PAGE_BATCH_SIZE = 10
OutputType = Literal["text", "html"]
SplitType = Literal["none", "element", "page"]
def validate_api_key(api_key: str) -> None:
"""
Validates the provided API key.
Args:
api_key (str): The API key to be validated.
Raises:
ValueError: If the API key is empty or None.
Returns:
None
"""
if not api_key:
raise ValueError("API Key is required for Upstage Document Loader")
def validate_file_path(file_path: Union[str, Path, List[str], List[Path]]) -> None:
"""
Validates if a file exists at the given file path.
Args:
file_path (Union[str, Path, List[str], List[Path]): The file path(s) to be
validated.
Raises:
FileNotFoundError: If the file or any of the files in the list do not exist.
"""
if isinstance(file_path, list):
for path in file_path:
validate_file_path(path)
return
if not os.path.exists(file_path):
raise FileNotFoundError(f"File not found: {file_path}")
def get_from_param_or_env(
key: str,
param: Optional[str] = None,
env_key: Optional[str] = None,
default: Optional[str] = None,
) -> str:
"""Get a value from a param or an environment variable."""
if param is not None:
return param
elif env_key and env_key in os.environ and os.environ[env_key]:
return os.environ[env_key]
elif default is not None:
return default
else:
raise ValueError(
f"Did not find {key}, please add an environment variable"
f" `{env_key}` which contains it, or pass"
f" `{key}` as a named parameter."
)
class UpstageLayoutAnalysisLoader(BaseLoader):
"""Upstage Layout Analysis.
To use, you should have the environment variable `UPSTAGE_API_KEY`
set with your API key or pass it as a named parameter to the constructor.
Example:
.. code-block:: python
from langchain_upstage import UpstageLayoutAnalysis
file_path = "/PATH/TO/YOUR/FILE.pdf"
loader = UpstageLayoutAnalysis(
file_path, split="page", output_type="text"
)
"""
def __init__(
self,
file_path: Union[str, Path, List[str], List[Path]],
output_type: Union[OutputType, dict] = "html",
split: SplitType = "none",
api_key: Optional[str] = None,
use_ocr: bool = False,
exclude: list = ["header", "footer"],
):
"""
Initializes an instance of the Upstage document loader.
Args:
file_path (Union[str, Path, List[str], List[Path]): The path to the document
to be loaded.
output_type (Union[OutputType, dict], optional): The type of output to be
generated by the parser.
Defaults to "html".
split (SplitType, optional): The type of splitting to be applied.
Defaults to "none" (no splitting).
api_key (str, optional): The API key for accessing the Upstage API.
Defaults to None, in which case it will be
fetched from the environment variable
`UPSTAGE_API_KEY`.
use_ocr (bool, optional): Extract text from images in the document.
Defaults to False. (Use text info in PDF file)
exclude (list, optional): Exclude specific elements from
the output.
Defaults to ["header", "footer"].
"""
self.file_path = file_path
self.output_type = output_type
self.split = split
if deprecated_key := os.environ.get("UPSTAGE_DOCUMENT_AI_API_KEY"):
warnings.warn(
"UPSTAGE_DOCUMENT_AI_API_KEY is deprecated."
"Please use UPSTAGE_API_KEY instead."
)
self.api_key = get_from_param_or_env(
"UPSTAGE_API_KEY", api_key, "UPSTAGE_API_KEY", deprecated_key
)
self.use_ocr = use_ocr
self.exclude = exclude
validate_file_path(self.file_path)
validate_api_key(self.api_key)
def load(self) -> List[Document]:
"""
Loads and parses the document using the UpstageLayoutAnalysisParser.
Returns:
A list of Document objects representing the parsed layout analysis.
"""
if isinstance(self.file_path, list):
result = []
for file_path in self.file_path:
blob = Blob.from_path(file_path)
parser = UpstageLayoutAnalysisParser(
self.api_key,
split=self.split,
output_type=self.output_type,
use_ocr=self.use_ocr,
exclude=self.exclude,
)
result.extend(list(parser.lazy_parse(blob, is_batch=True)))
return result
else:
blob = Blob.from_path(self.file_path)
parser = UpstageLayoutAnalysisParser(
self.api_key,
split=self.split,
output_type=self.output_type,
use_ocr=self.use_ocr,
exclude=self.exclude,
)
return list(parser.lazy_parse(blob, is_batch=True))
def lazy_load(self) -> Iterator[Document]:
"""
Lazily loads and parses the document using the UpstageLayoutAnalysisParser.
Returns:
An iterator of Document objects representing the parsed layout analysis.
"""
if isinstance(self.file_path, list):
for file_path in self.file_path:
blob = Blob.from_path(file_path)
parser = UpstageLayoutAnalysisParser(
self.api_key,
split=self.split,
output_type=self.output_type,
use_ocr=self.use_ocr,
exclude=self.exclude,
)
yield from parser.lazy_parse(blob, is_batch=True)
else:
blob = Blob.from_path(self.file_path)
parser = UpstageLayoutAnalysisParser(
self.api_key,
split=self.split,
output_type=self.output_type,
use_ocr=self.use_ocr,
exclude=self.exclude,
)
yield from parser.lazy_parse(blob)
def merge_and_split(
self, documents: List[Document], splitter: Optional[object] = None
) -> List[Document]:
"""
Merges the page content and metadata of multiple documents into a single
document, or splits the documents using a custom splitter.
Args:
documents (list): A list of Document objects to be merged and split.
splitter (object, optional): An optional splitter object that implements the
`split_documents` method. If provided, the documents will be split using
this splitter. Defaults to None, in which case the documents are merged.
Returns:
list: A list of Document objects. If no splitter is provided, a single
Document object is returned with the merged content and combined metadata.
If a splitter is provided, the documents are split and a list of Document
objects is returned.
Raises:
AssertionError: If a splitter is provided but it does not implement the
`split_documents` method.
"""
if splitter is None:
merged_content = " ".join([doc.page_content for doc in documents])
metadatas: Dict[str, Any] = dict()
for _meta in [doc.metadata for doc in documents]:
for key, value in _meta.items():
if key in metadatas:
metadatas[key].append(value)
else:
metadatas[key] = [value]
return [Document(page_content=merged_content, metadata=metadatas)]
else:
assert hasattr(
splitter, "split_documents"
), "splitter must implement split_documents method"
return splitter.split_documents(documents)