upstage[minor]: add merge_and_split function for document loader (#21603)

- Introduce the `merge_and_split` function in the
`UpstageLayoutAnalysisLoader`.
- The `merge_and_split` function takes a list of documents and a
splitter as inputs.
- This function merges all documents and then divides them using the
`split_documents` method, which is a method provided by the splitter
itself.
- If the provided splitter is `None` (which is the default setting), the
function will simply merge the documents without splitting them.
This commit is contained in:
junkeon 2024-05-13 23:55:19 +09:00 committed by GitHub
parent 500569da48
commit 480c02bf55
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 45 additions and 1 deletions

View File

@ -1,7 +1,7 @@
import os
import warnings
from pathlib import Path
from typing import Iterator, List, Literal, Optional, Union
from typing import Any, Dict, Iterator, List, Literal, Optional, Union
from langchain_core.document_loaders import BaseLoader, Blob
from langchain_core.documents import Document
@ -204,3 +204,45 @@ class UpstageLayoutAnalysisLoader(BaseLoader):
exclude=self.exclude,
)
yield from parser.lazy_parse(blob)
def merge_and_split(
    self, documents: List[Document], splitter: Optional[object] = None
) -> List[Document]:
    """Merge documents into one, or delegate splitting to a custom splitter.

    Without a splitter, the page contents of all documents are joined with a
    single space and their metadata is combined key by key: every key maps to
    a list holding the values found for that key, in document order.

    With a splitter, the documents are passed unchanged to the splitter's
    `split_documents` method and its result is returned as-is.

    Args:
        documents: The Document objects to merge or split.
        splitter: Optional object implementing a `split_documents` method.
            Defaults to None, in which case the documents are merged.

    Returns:
        A list of Document objects. If no splitter is provided, the list
        holds a single Document with the merged content and combined
        metadata. Otherwise it is whatever `splitter.split_documents`
        returns.

    Raises:
        AssertionError: If a splitter is provided but it does not implement
            the `split_documents` method.
    """
    if splitter is not None:
        # Raise explicitly instead of using an `assert` statement: asserts are
        # removed when Python runs with -O, which would let an invalid
        # splitter through and fail later with a less helpful error.
        if not hasattr(splitter, "split_documents"):
            raise AssertionError(
                "splitter must implement split_documents method"
            )
        return splitter.split_documents(documents)

    merged_content = " ".join(doc.page_content for doc in documents)

    # Collect every value seen for a key so no document's metadata is lost.
    metadatas: Dict[str, Any] = {}
    for doc in documents:
        for key, value in doc.metadata.items():
            metadatas.setdefault(key, []).append(value)

    return [Document(page_content=merged_content, metadata=metadatas)]

View File

@ -246,6 +246,8 @@ class UpstageLayoutAnalysisParser(BaseBlobParser):
"id": elements["id"],
"type": self.output_type,
"split": self.split,
"bbox": elements["bounding_box"],
"category": elements["category"],
},
)