From 955cf186d207e72e98309864f946524bfffb10ee Mon Sep 17 00:00:00 2001 From: Rahul Triptahi Date: Sat, 27 Apr 2024 06:20:57 +0530 Subject: [PATCH] community[patch]: Ingest source, owner and full_path if present in Document's metadata. (#20949) Description: The PebbloSafeLoader should first check the Document's metadata for full_path (falling back to source), owner and size before falling back to its own logic. In addition, get_full_path now resolves a path only if it exists on disk; otherwise the path is returned as given. Dependencies: None Documentation: NA. Signed-off-by: Rahul Tripathi Co-authored-by: Rahul Tripathi --- .../document_loaders/pebblo.py | 17 ++++++++++------- .../langchain_community/utilities/pebblo.py | 4 +++- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py index 9dd466e496..a0cc2eb411 100644 --- a/libs/community/langchain_community/document_loaders/pebblo.py +++ b/libs/community/langchain_community/document_loaders/pebblo.py @@ -157,16 +157,19 @@ class PebbloSafeLoader(BaseLoader): doc_content = [doc.dict() for doc in loaded_docs] docs = [] for doc in doc_content: - doc_authorized_identities = doc.get("metadata", {}).get( - "authorized_identities", [] - ) + doc_metadata = doc.get("metadata", {}) + doc_authorized_identities = doc_metadata.get("authorized_identities", []) doc_source_path = get_full_path( - doc.get("metadata", {}).get("source", self.source_path) + doc_metadata.get( + "full_path", doc_metadata.get("source", self.source_path) + ) + ) + doc_source_owner = doc_metadata.get( + "owner", PebbloSafeLoader.get_file_owner_from_path(doc_source_path) ) - doc_source_owner = PebbloSafeLoader.get_file_owner_from_path( - doc_source_path + doc_source_size = doc_metadata.get( + "size", self.get_source_size(doc_source_path) ) - doc_source_size = self.get_source_size(doc_source_path) page_content = str(doc.get("page_content")) page_content_size = self.calculate_content_size(page_content) self.source_aggregate_size += page_content_size diff --git 
a/libs/community/langchain_community/utilities/pebblo.py b/libs/community/langchain_community/utilities/pebblo.py index df799c7fe0..d94ba80985 100644 --- a/libs/community/langchain_community/utilities/pebblo.py +++ b/libs/community/langchain_community/utilities/pebblo.py @@ -169,7 +169,9 @@ def get_full_path(path: str) -> str: or (path in ["unknown", "-", "in-memory"]) ): return path - full_path = pathlib.Path(path).resolve() + full_path = pathlib.Path(path) + if full_path.exists(): + full_path = full_path.resolve() return str(full_path)