[Community][minor]: Updating source path and file path for SharePoint loader in PebbloSafeLoader (#25592)

- **Description:** Updates the source path and file path that
`PebbloSafeLoader` reports for SharePoint apps during loading (see the usage
sketch below).
- **Issue:** NA
- **Dependencies:** NA
- **Tests:** NA
- **Docs:** NA
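
For context, a minimal, hypothetical usage sketch of wrapping a SharePoint loader with `PebbloSafeLoader`; the library ID, app name, owner, and description are placeholders, not values from this change:

```python
from langchain_community.document_loaders import PebbloSafeLoader, SharePointLoader

# Hypothetical setup: wrap a SharePointLoader so Pebblo can classify the
# documents it loads. All identifiers below are placeholders; SharePoint/O365
# authentication setup is omitted.
loader = PebbloSafeLoader(
    SharePointLoader(document_library_id="YOUR-DOCUMENT-LIBRARY-ID"),
    name="sharepoint-demo-app",  # app name reported to Pebblo
    owner="Jane Doe",  # owner shown in Pebblo reports
    description="SharePoint documents loaded for semantic search",
)
docs = loader.load()
```

With this change, the paths reported to Pebblo for SharePoint loads are derived from each document's `source` metadata (and the first document's `source_full_url`) rather than from a pre-existing `full_path`.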

---------

Co-authored-by: dristy.cd <dristy@clouddefense.io>
Authored by Dristy Srivastava on 2024-08-26 18:08:40 +05:30, committed by GitHub
parent 745d1c2b8d
commit fbb4761199
2 changed files with 33 additions and 12 deletions


@@ -252,11 +252,16 @@ class PebbloSafeLoader(BaseLoader):
         """Add Pebblo specific metadata to documents."""
         for doc in self.docs_with_id:
             doc_metadata = doc.metadata
-            doc_metadata["full_path"] = get_full_path(
-                doc_metadata.get(
-                    "full_path", doc_metadata.get("source", self.source_path)
+            if self.loader.__class__.__name__ == "SharePointLoader":
+                doc_metadata["full_path"] = get_full_path(
+                    doc_metadata.get("source", self.source_path)
+                )
+            else:
+                doc_metadata["full_path"] = get_full_path(
+                    doc_metadata.get(
+                        "full_path", doc_metadata.get("source", self.source_path)
+                    )
                 )
-            )
             doc_metadata["pb_checksum"] = classified_docs.get(doc.pb_id, {}).get(
                 "pb_checksum", None
             )
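
In plain terms, the hunk above changes how `full_path` is resolved: when the wrapped loader is a `SharePointLoader`, the document's `source` metadata (falling back to the loader's `source_path`) is used directly, while all other loaders keep preferring an existing `full_path`. Below is a sketch of that rule as a standalone helper; `resolve_full_path` is illustrative only, and importing `get_full_path` from `langchain_community.utilities.pebblo` is an assumption about where that helper lives:

```python
from langchain_community.utilities.pebblo import get_full_path  # assumed location


def resolve_full_path(loader_name: str, metadata: dict, source_path: str) -> str:
    """Illustrative mirror of the metadata resolution in the hunk above."""
    if loader_name == "SharePointLoader":
        # SharePoint documents: trust the per-document "source" value.
        candidate = metadata.get("source", source_path)
    else:
        # Other loaders: prefer an explicit "full_path", then fall back to "source".
        candidate = metadata.get("full_path", metadata.get("source", source_path))
    return get_full_path(candidate)
```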


@@ -488,7 +488,7 @@ class PebbloLoaderAPIWrapper(BaseModel):
         source_owner = get_file_owner_from_path(source_path)
         # Prepare docs for classification
         docs, source_aggregate_size = self.prepare_docs_for_classification(
-            docs_with_id, source_path
+            docs_with_id, source_path, loader_details
         )
         # Build payload for classification
         payload = self.build_classification_payload(
@@ -659,7 +659,9 @@ class PebbloLoaderAPIWrapper(BaseModel):
 
     @staticmethod
     def prepare_docs_for_classification(
-        docs_with_id: List[IndexedDocument], source_path: str
+        docs_with_id: List[IndexedDocument],
+        source_path: str,
+        loader_details: dict,
     ) -> Tuple[List[dict], int]:
         """
         Prepare documents for classification.
@@ -667,22 +669,30 @@ class PebbloLoaderAPIWrapper(BaseModel):
         Args:
             docs_with_id (List[IndexedDocument]): List of documents to be classified.
             source_path (str): Source path of the documents.
+            loader_details (dict): Contains loader info.
 
         Returns:
-            Tuple[List[dict], int]: Documents and the aggregate size of the source.
+            Tuple[List[dict], int]: Documents and the aggregate size
+                of the source.
         """
         docs = []
         source_aggregate_size = 0
         doc_content = [doc.dict() for doc in docs_with_id]
+        source_path_update = False
         for doc in doc_content:
             doc_metadata = doc.get("metadata", {})
             doc_authorized_identities = doc_metadata.get("authorized_identities", [])
-            doc_source_path = get_full_path(
-                doc_metadata.get(
-                    "full_path",
-                    doc_metadata.get("source", source_path),
+            if loader_details["loader"] == "SharePointLoader":
+                doc_source_path = get_full_path(
+                    doc_metadata.get("source", loader_details["source_path"])
+                )
+            else:
+                doc_source_path = get_full_path(
+                    doc_metadata.get(
+                        "full_path",
+                        doc_metadata.get("source", source_path),
+                    )
                 )
-            )
             doc_source_owner = doc_metadata.get(
                 "owner", get_file_owner_from_path(doc_source_path)
             )
@@ -710,6 +720,12 @@ class PebbloLoaderAPIWrapper(BaseModel):
                     ),
                 }
             )
+            if (
+                loader_details["loader"] == "SharePointLoader"
+                and not source_path_update
+            ):
+                loader_details["source_path"] = doc_metadata.get("source_full_url")
+                source_path_update = True
         return docs, source_aggregate_size
 
     @staticmethod
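
Note on the new `source_path_update` flag: during a SharePoint load, the first document's `source_full_url` metadata is promoted to `loader_details["source_path"]` exactly once, and later documents leave it untouched. A minimal sketch of that one-shot update pattern, with illustrative data only:

```python
# Illustrative values, not taken from a real SharePoint load.
loader_details = {"loader": "SharePointLoader", "source_path": "placeholder"}
doc_metadatas = [
    {"source_full_url": "https://example.sharepoint.com/sites/docs/a.docx"},
    {"source_full_url": "https://example.sharepoint.com/sites/docs/b.docx"},
]

source_path_update = False
for doc_metadata in doc_metadatas:
    # Promote the first document's full URL to the loader-level source path,
    # then stop so later documents cannot overwrite it.
    if loader_details["loader"] == "SharePointLoader" and not source_path_update:
        loader_details["source_path"] = doc_metadata.get("source_full_url")
        source_path_update = True

print(loader_details["source_path"])  # .../a.docx, the first document wins
```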