community[patch]: Ingest source, owner and full_path if present in Document's metadata. (#20949)

Description: PebbloSafeLoader should first look for `owner`, `full_path`
and `size` in the Document's metadata, and fall back to its own logic for
deriving them only when they are not present.
Dependencies: None
Documentation: NA.

Signed-off-by: Rahul Tripathi <rauhl.psit.ec@gmail.com>
Co-authored-by: Rahul Tripathi <rauhl.psit.ec@gmail.com>
pull/20961/head
Rahul Triptahi 3 weeks ago committed by GitHub
parent 790ea75cf7
commit 955cf186d2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@@ -157,16 +157,19 @@ class PebbloSafeLoader(BaseLoader):
 doc_content = [doc.dict() for doc in loaded_docs]
 docs = []
 for doc in doc_content:
-    doc_authorized_identities = doc.get("metadata", {}).get(
-        "authorized_identities", []
-    )
+    doc_metadata = doc.get("metadata", {})
+    doc_authorized_identities = doc_metadata.get("authorized_identities", [])
     doc_source_path = get_full_path(
-        doc.get("metadata", {}).get("source", self.source_path)
+        doc_metadata.get(
+            "full_path", doc_metadata.get("source", self.source_path)
+        )
     )
+    doc_source_owner = doc_metadata.get(
+        "owner", PebbloSafeLoader.get_file_owner_from_path(doc_source_path)
+    )
-    doc_source_owner = PebbloSafeLoader.get_file_owner_from_path(
-        doc_source_path
+    doc_source_size = doc_metadata.get(
+        "size", self.get_source_size(doc_source_path)
     )
-    doc_source_size = self.get_source_size(doc_source_path)
     page_content = str(doc.get("page_content"))
     page_content_size = self.calculate_content_size(page_content)
     self.source_aggregate_size += page_content_size

@@ -169,7 +169,9 @@ def get_full_path(path: str) -> str:
         or (path in ["unknown", "-", "in-memory"])
     ):
         return path
-    full_path = pathlib.Path(path).resolve()
+    full_path = pathlib.Path(path)
+    if full_path.exists():
+        full_path = full_path.resolve()
     return str(full_path)

Loading…
Cancel
Save