diff --git a/libs/partners/upstage/langchain_upstage/layout_analysis.py b/libs/partners/upstage/langchain_upstage/layout_analysis.py
index 40dd74b344..fc68f4d139 100644
--- a/libs/partners/upstage/langchain_upstage/layout_analysis.py
+++ b/libs/partners/upstage/langchain_upstage/layout_analysis.py
@@ -1,7 +1,7 @@
 import os
 import warnings
 from pathlib import Path
-from typing import Iterator, List, Literal, Optional, Union
+from typing import Any, Dict, Iterator, List, Literal, Optional, Union
 
 from langchain_core.document_loaders import BaseLoader, Blob
 from langchain_core.documents import Document
@@ -204,3 +204,43 @@ class UpstageLayoutAnalysisLoader(BaseLoader):
                     exclude=self.exclude,
                 )
                 yield from parser.lazy_parse(blob)
+
+    def merge_and_split(
+        self, documents: List[Document], splitter: Optional[object] = None
+    ) -> List[Document]:
+        """Merge documents into one, or split them with a custom splitter.
+
+        Args:
+            documents: The Document objects to merge or split.
+            splitter: Optional object exposing a callable ``split_documents``
+                attribute. When given, ``documents`` is passed to it and its
+                result is returned. Defaults to None, in which case the
+                documents are merged.
+
+        Returns:
+            Without a splitter, a one-element list whose Document joins every
+            ``page_content`` with a single space and maps each metadata key to
+            the list of values seen for that key, in document order. With a
+            splitter, whatever ``splitter.split_documents(documents)`` returns.
+
+        Raises:
+            AssertionError: If a splitter is provided but it does not implement
+                the ``split_documents`` method.
+        """
+        if splitter is None:
+            merged_content = " ".join(doc.page_content for doc in documents)
+
+            # Every key maps to a list, even when only one document carries it.
+            metadatas: Dict[str, List[Any]] = {}
+            for doc in documents:
+                for key, value in doc.metadata.items():
+                    metadatas.setdefault(key, []).append(value)
+
+            return [Document(page_content=merged_content, metadata=metadatas)]
+
+        # Explicit raise rather than `assert`: asserts vanish under `python -O`.
+        split_documents = getattr(splitter, "split_documents", None)
+        if not callable(split_documents):
+            raise AssertionError("splitter must implement split_documents method")
+
+        return split_documents(documents)
diff --git a/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py b/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py
index 7979db1275..5a4056dfe6 100644
--- a/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py
+++ b/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py
@@ -246,6 +246,8 @@ class UpstageLayoutAnalysisParser(BaseBlobParser):
                 "id": elements["id"],
                 "type": self.output_type,
                 "split": self.split,
+                "bbox": elements["bounding_box"],
+                "category": elements["category"],
             },
         )
 