upstage[minor]: add merge_and_split function for document loader (#21603)

- Introduce the `merge_and_split` method on `UpstageLayoutAnalysisLoader`.
- `merge_and_split` takes a list of documents and an optional splitter as inputs.
- When a splitter is supplied, the documents are divided using the splitter's own `split_documents` method.
- If the splitter is `None` (the default), the documents are merged into a single document without splitting.
2024-11-16 06:13:16 +00:00 · 2024-05-13 23:55:19 +09:00 · 2024-05-13 23:55:19 +09:00 · 480c02bf55
commit 480c02bf55
parent 500569da48
2 changed files with 45 additions and 1 deletions
--- a/libs/partners/upstage/langchain_upstage/layout_analysis.py
+++ b/libs/partners/upstage/langchain_upstage/layout_analysis.py
@ -1,7 +1,7 @@
 import os
 import warnings
 from pathlib import Path
-from typing import Iterator, List, Literal, Optional, Union
+from typing import Any, Dict, Iterator, List, Literal, Optional, Union

 from langchain_core.document_loaders import BaseLoader, Blob
 from langchain_core.documents import Document
@ -204,3 +204,45 @@ class UpstageLayoutAnalysisLoader(BaseLoader):
                exclude=self.exclude,
            )
            yield from parser.lazy_parse(blob)
+
+    def merge_and_split(
+        self, documents: List[Document], splitter: Optional[object] = None
+    ) -> List[Document]:
+        """
+        Merges the page content and metadata of multiple documents into a single
+        document, or splits the documents using a custom splitter.
+
+        Args:
+            documents (list): A list of Document objects to be merged and split.
+            splitter (object, optional): An optional splitter object that implements the
+                `split_documents` method. If provided, the documents will be split using
+                this splitter. Defaults to None, in which case the documents are merged.
+
+        Returns:
+            list: A list of Document objects. If no splitter is provided, a single
+            Document object is returned with the merged content and combined metadata.
+            If a splitter is provided, the documents are split and a list of Document
+            objects is returned.
+
+        Raises:
+            AssertionError: If a splitter is provided but it does not implement the
+            `split_documents` method.
+        """
+        if splitter is None:
+            merged_content = " ".join([doc.page_content for doc in documents])
+
+            metadatas: Dict[str, Any] = dict()
+            for _meta in [doc.metadata for doc in documents]:
+                for key, value in _meta.items():
+                    if key in metadatas:
+                        metadatas[key].append(value)
+                    else:
+                        metadatas[key] = [value]
+
+            return [Document(page_content=merged_content, metadata=metadatas)]
+        else:
+            assert hasattr(
+                splitter, "split_documents"
+            ), "splitter must implement split_documents method"
+
+            return splitter.split_documents(documents)
--- a/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py
+++ b/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py
@ -246,6 +246,8 @@ class UpstageLayoutAnalysisParser(BaseBlobParser):
                "id": elements["id"],
                "type": self.output_type,
                "split": self.split,
+                "bbox": elements["bounding_box"],
+                "category": elements["category"],
            },
        )