mirror of
https://github.com/hwchase17/langchain
synced 2024-11-16 06:13:16 +00:00
upstage[minor]: add merge_and_split function for document loader (#21603)
- Introduce the `merge_and_split` function in the `UpstageLayoutAnalysisLoader`.
- The `merge_and_split` function takes a list of documents and an optional splitter as inputs.
- If a splitter is provided, the documents are divided using its `split_documents` method, which the splitter itself must implement; the documents are passed to it as-is, without being merged first.
- If the provided splitter is `None` (the default), the function simply merges the documents into a single document without splitting them.
This commit is contained in:
parent
500569da48
commit
480c02bf55
@ -1,7 +1,7 @@
|
||||
import os
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Iterator, List, Literal, Optional, Union
|
||||
from typing import Any, Dict, Iterator, List, Literal, Optional, Union
|
||||
|
||||
from langchain_core.document_loaders import BaseLoader, Blob
|
||||
from langchain_core.documents import Document
|
||||
@ -204,3 +204,45 @@ class UpstageLayoutAnalysisLoader(BaseLoader):
|
||||
exclude=self.exclude,
|
||||
)
|
||||
yield from parser.lazy_parse(blob)
|
||||
|
||||
def merge_and_split(
    self, documents: List[Document], splitter: Optional[object] = None
) -> List[Document]:
    """Merge documents into one, or split them with a custom splitter.

    Without a splitter, the page contents of all documents are joined with
    a single space and their metadata is combined key by key: every key maps
    to the list of values it had across the input documents, in input order.

    With a splitter, the documents are handed to its `split_documents`
    method unchanged (they are not merged first) and its result is returned.

    Args:
        documents: The Document objects to be merged or split.
        splitter: An optional object that implements a `split_documents`
            method. Defaults to None, in which case the documents are merged.

    Returns:
        A list of Document objects. If no splitter is provided, the list
        holds a single Document with the merged content and combined
        metadata. Otherwise it is whatever `splitter.split_documents`
        returns.

    Raises:
        AssertionError: If a splitter is provided but it does not implement
            the `split_documents` method.
    """
    if splitter is not None:
        # Raise explicitly instead of using an `assert` statement: asserts are
        # stripped under `python -O`, which would turn the documented
        # AssertionError into an AttributeError on the call below.
        if not hasattr(splitter, "split_documents"):
            raise AssertionError("splitter must implement split_documents method")

        return splitter.split_documents(documents)

    merged_content = " ".join(doc.page_content for doc in documents)

    # Collect every value seen for a key so no document's metadata is lost.
    metadatas: Dict[str, Any] = {}
    for doc in documents:
        for key, value in doc.metadata.items():
            metadatas.setdefault(key, []).append(value)

    return [Document(page_content=merged_content, metadata=metadatas)]
|
||||
|
@ -246,6 +246,8 @@ class UpstageLayoutAnalysisParser(BaseBlobParser):
|
||||
"id": elements["id"],
|
||||
"type": self.output_type,
|
||||
"split": self.split,
|
||||
"bbox": elements["bounding_box"],
|
||||
"category": elements["category"],
|
||||
},
|
||||
)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user