# langchain/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py

import io
import json
import os
import warnings
from typing import Dict, Iterator, List, Literal, Optional, Union

import fitz  # type: ignore
import requests
from fitz import Document as fitzDocument
from langchain_core.document_loaders import BaseBlobParser, Blob
from langchain_core.documents import Document

# HTTP endpoint that UpstageLayoutAnalysisParser._get_response POSTs documents to.
LAYOUT_ANALYSIS_URL = "https://api.upstage.ai/v1/document-ai/layout-analysis"

# Number of PDF pages bundled into a single request when batching is used
# (also the default chunk size of _split_and_request).
DEFAULT_NUMBER_OF_PAGE = 10

# Element field used as document content; see parse_output().
OutputType = Literal["text", "html"]
# How parsed elements are grouped into Document objects; see lazy_parse().
SplitType = Literal["none", "element", "page"]


def validate_api_key(api_key: str) -> None:
    """
    Ensure that an API key has actually been supplied.

    Args:
        api_key (str): The API key to check.

    Raises:
        ValueError: If the key is falsy (``None`` or an empty string).

    Returns:
        None
    """
    if api_key:
        return

    raise ValueError("API Key is required for Upstage Document Loader")


def validate_file_path(file_path: str) -> None:
    """
    Check that something exists on disk at ``file_path``.

    Args:
        file_path (str): The path to check.

    Raises:
        FileNotFoundError: If nothing exists at the given path.
    """
    path_is_present = os.path.exists(file_path)
    if path_is_present:
        return

    raise FileNotFoundError(f"File not found: {file_path}")


def parse_output(data: dict, output_type: Union[OutputType, dict]) -> str:
    """
    Pick the content of a single element according to ``output_type``.

    Args:
        data (dict): The element to read from. Its "category" entry is consulted
                     when ``output_type`` is a dict.
        output_type (Union[OutputType, dict]): Either "text" / "html", or a
                                               mapping from element category to
                                               the name of the field to return
                                               for that category.

    Returns:
        str: The selected field of ``data``. With a mapping, categories that
             are not listed fall back to the "text" field.

    Raises:
        ValueError: If ``output_type`` is neither a dict nor one of the
                    supported strings.
    """
    # Per-category mapping: look up the field name, defaulting to "text".
    if isinstance(output_type, dict):
        field_name = output_type.get(data["category"], "text")
        return data[field_name]

    # Plain string: the output type doubles as the field name.
    if isinstance(output_type, str) and output_type in ("text", "html"):
        return data[output_type]

    raise ValueError(f"Invalid output type: {output_type}")


def get_from_param_or_env(
    key: str,
    param: Optional[str] = None,
    env_key: Optional[str] = None,
    default: Optional[str] = None,
) -> str:
    """Resolve a setting from an explicit param, the environment, or a default.

    Sources are tried in that order: ``param`` when it is not None, then a
    non-empty environment variable named ``env_key``, then ``default`` when it
    is not None. If none of them applies, a ValueError naming ``key`` and
    ``env_key`` is raised.
    """
    if param is not None:
        return param

    if env_key:
        env_value = os.environ.get(env_key)
        # An unset or empty variable does not count as a value.
        if env_value:
            return env_value

    if default is not None:
        return default

    raise ValueError(
        f"Did not find {key}, please add an environment variable"
        f" `{env_key}` which contains it, or pass"
        f"  `{key}` as a named parameter."
    )


class UpstageLayoutAnalysisParser(BaseBlobParser):
    """Upstage Layout Analysis Parser.

    To use, you should have the environment variable `UPSTAGE_API_KEY`
    set with your API key or pass it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_upstage import UpstageLayoutAnalysisParser

            loader = UpstageLayoutAnalysisParser(split="page", output_type="text")
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        output_type: Union[OutputType, dict] = "html",
        split: SplitType = "none",
        use_ocr: bool = False,
        exclude: Optional[list] = None,
    ):
        """
        Initializes an instance of the Upstage class.

        Args:
            api_key (str, optional): The API key for accessing the Upstage API.
                                     Defaults to None, in which case it will be
                                     fetched from the environment variable
                                     `UPSTAGE_API_KEY` (falling back to the
                                     deprecated `UPSTAGE_DOCUMENT_AI_API_KEY`).
            output_type (Union[OutputType, dict], optional): The type of output to be
                                                             generated by the parser.
                                                             Defaults to "html".
            split (SplitType, optional): The type of splitting to be applied.
                                         Defaults to "none" (no splitting).
            use_ocr (bool, optional): Extract text from images in the document.
                                      Defaults to False. (Use text info in PDF file)
            exclude (list, optional): Element categories to drop from the output.
                                      Defaults to None (all included).

        Raises:
            ValueError: If no API key can be resolved, or the resolved key is empty.
        """
        if deprecated_key := os.environ.get("UPSTAGE_DOCUMENT_AI_API_KEY"):
            warnings.warn(
                "UPSTAGE_DOCUMENT_AI_API_KEY is deprecated. "
                "Please use UPSTAGE_API_KEY instead."
            )
        self.api_key = get_from_param_or_env(
            "UPSTAGE_API_KEY", api_key, "UPSTAGE_API_KEY", deprecated_key
        )

        self.output_type = output_type
        self.split = split
        self.use_ocr = use_ocr
        # A fresh list per instance: a literal `[]` default would be one list
        # object shared by every parser created without `exclude`.
        self.exclude = [] if exclude is None else exclude

        validate_api_key(self.api_key)

    def _get_response(self, files: Dict) -> List:
        """
        Sends a POST request to the API endpoint with the provided files and
        returns the elements of the response.

        Args:
            files (dict): A dictionary containing the files to be sent in the request.

        Returns:
            List: The entries of the response's "elements" field whose "category"
                  is not listed in `self.exclude` (empty if the field is absent).

        Raises:
            ValueError: If the request fails, the server answers with an error
                        status, or the response body is not valid JSON.
        """
        headers = {"Authorization": f"Bearer {self.api_key}"}
        options = {"ocr": self.use_ocr}

        try:
            response = requests.post(
                LAYOUT_ANALYSIS_URL, headers=headers, files=files, data=options
            )
            response.raise_for_status()
            payload = response.json()
        except json.JSONDecodeError as json_err:
            # Checked before RequestException so a decode failure keeps its
            # specific message. NOTE(review): in recent `requests` releases the
            # JSON error also appears to subclass RequestException - confirm.
            raise ValueError(
                f"Failed to decode JSON response: {json_err}"
            ) from json_err
        except requests.RequestException as req_err:
            raise ValueError(f"Failed to send request: {req_err}") from req_err

        return [
            element
            for element in payload.get("elements", [])
            if element["category"] not in self.exclude
        ]

    def _split_and_request(
        self,
        full_docs: fitzDocument,
        start_page: int,
        num_pages: int = DEFAULT_NUMBER_OF_PAGE,
    ) -> List:
        """
        Copies a run of pages out of the full pdf document into a temporary pdf
        and sends that partial document to the server.

        Args:
            full_docs (fitzDocument): The opened document to take pages from.
            start_page (int): Zero-based index of the first page to send.
            num_pages (int, optional): The maximum number of pages to send in
                                       this request.
                                       Defaults to DEFAULT_NUMBER_OF_PAGE.

        Returns:
            List: The elements returned by the server for these pages.
        """
        # The last chunk may be shorter than num_pages; never point past the end.
        last_page = min(start_page + num_pages, full_docs.page_count) - 1

        with fitz.open() as chunk_pdf:
            chunk_pdf.insert_pdf(
                full_docs,
                from_page=start_page,
                to_page=last_page,
            )
            pdf_bytes = chunk_pdf.write()

        with io.BytesIO(pdf_bytes) as f:
            response = self._get_response({"document": f})

        return response

    def _element_document(self, elements: Dict) -> Document:
        """
        Converts a single element into a Document object.

        Args:
            elements (Dict): The element to convert. Its "page", "id",
                             "bounding_box" and "category" entries are copied
                             into the metadata.

        Returns:
            Document: The element's content (selected by `self.output_type`)
                      together with its metadata.
        """
        return Document(
            page_content=(parse_output(elements, self.output_type)),
            metadata={
                "page": elements["page"],
                "id": elements["id"],
                "type": self.output_type,
                "split": self.split,
                "bbox": elements["bounding_box"],
                "category": elements["category"],
            },
        )

    def _page_document(self, elements: List) -> List[Document]:
        """
        Combines elements with the same page number into a single Document object.

        Args:
            elements (List): A list of elements containing page numbers.

        Returns:
            List[Document]: One Document per page, ordered by page number. The
                            content of a page is the content of its elements,
                            in their original order, joined by single spaces.
        """
        # Single pass grouping; dicts keep insertion order, so the elements of
        # a page stay in the order the server returned them.
        elements_by_page: Dict = {}
        for element in elements:
            elements_by_page.setdefault(element["page"], []).append(element)

        _docs = []
        for page in sorted(elements_by_page):
            page_content = " ".join(
                parse_output(element, self.output_type)
                for element in elements_by_page[page]
            )
            _docs.append(
                Document(
                    page_content=page_content,
                    metadata={
                        "page": page,
                        "type": self.output_type,
                        "split": self.split,
                    },
                )
            )

        return _docs

    def _iter_element_chunks(
        self, blob: Blob, full_docs: fitzDocument, num_pages: int
    ) -> Iterator[List]:
        """
        Lazily yields the server's elements, one list per request made.

        Args:
            blob (Blob): The input blob; its path is used to upload non-PDF files.
            full_docs (fitzDocument): The blob opened as a fitz document.
            num_pages (int): Number of pages to send per request for PDF input.

        Yields:
            List: The elements of one request. PDF input produces one request
                  per `num_pages` pages; any other input is uploaded as a whole
                  in a single request.

        Raises:
            ValueError: If the input is not a PDF and the blob has no path.
        """
        if full_docs.is_pdf:
            for start_page in range(0, full_docs.page_count, num_pages):
                yield self._split_and_request(full_docs, start_page, num_pages)
            return

        if not blob.path:
            raise ValueError("Blob path is required for non-PDF files.")

        with open(blob.path, "rb") as f:
            elements = self._get_response({"document": f})

        yield elements

    def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
        """
        Lazily parses a document and yields Document objects based on the specified
        split type.

        Args:
            blob (Blob): The input document blob to parse.
            is_batch (bool, optional): Whether to send PDF pages to the server in
                                       batches of DEFAULT_NUMBER_OF_PAGE.
                                       Defaults to False (one page per request).
                                       With split "none", PDF input is always
                                       sent in batches.

        Yields:
            Document: The parsed document object. With split "none" a single
                      Document for the whole input, with "element" one per
                      element, with "page" one per page.

        Raises:
            ValueError: If an invalid split type is provided, if a non-PDF blob
                        has no path, or if a request to the server fails.

        """
        num_pages = DEFAULT_NUMBER_OF_PAGE if is_batch else 1

        # The context manager closes the document once iteration finishes or
        # the generator is closed early.
        with fitz.open(blob.path) as full_docs:
            number_of_pages = full_docs.page_count

            if self.split == "none":
                # The whole document ends up in one Document anyway, so always
                # use the larger chunk size to keep the request count down.
                chunks = self._iter_element_chunks(
                    blob, full_docs, DEFAULT_NUMBER_OF_PAGE
                )
                result = "".join(
                    parse_output(element, self.output_type)
                    for elements in chunks
                    for element in elements
                )

                yield Document(
                    page_content=result,
                    metadata={
                        "total_pages": number_of_pages,
                        "type": self.output_type,
                        "split": self.split,
                    },
                )

            elif self.split == "element":
                chunks = self._iter_element_chunks(blob, full_docs, num_pages)
                for elements in chunks:
                    for element in elements:
                        yield self._element_document(element)

            elif self.split == "page":
                chunks = self._iter_element_chunks(blob, full_docs, num_pages)
                for elements in chunks:
                    yield from self._page_document(elements)

            else:
                raise ValueError(f"Invalid split type: {self.split}")