From 020cc1cf3e68a85c26f787ad7ab7605e72be0b1b Mon Sep 17 00:00:00 2001
From: Dristy Srivastava <58721149+dristysrivastava@users.noreply.github.com>
Date: Fri, 19 Jul 2024 23:22:54 +0530
Subject: [PATCH] Community[minor]: Added checksum in while send data to
 pebblo-cloud (#23968)

- **Description:**
            - Updated checksum in doc metadata
- Sending checksum and removing actual content, while sending data to
`pebblo-cloud` if `classifier-location `is `pebblo-cloud` in
`/loader/doc` API
            - Adding `pb_id` i.e. pebblo id to doc metadata
            - Refactoring as needed.
- Sending `content-checksum` and removing actual content, while sending
data to `pebblo-cloud` if `classifier-location `is `pebblo-cloud` in
`prmopt` API
- **Issue:** NA
- **Dependencies:** NA
- **Tests:** Updated
- **Docs** NA

---------

Co-authored-by: dristy.cd <dristy@clouddefense.io>
---
 .../chains/pebblo_retrieval/base.py           |  40 ++++---
 .../chains/pebblo_retrieval/models.py         |   1 +
 .../document_loaders/pebblo.py                | 106 +++++++++---------
 .../langchain_community/utilities/pebblo.py   |   2 +-
 .../document_loaders/test_pebblo.py           |  18 ++-
 5 files changed, 95 insertions(+), 72 deletions(-)

diff --git a/libs/community/langchain_community/chains/pebblo_retrieval/base.py b/libs/community/langchain_community/chains/pebblo_retrieval/base.py
index 97c939b4fc..02d3553c4b 100644
--- a/libs/community/langchain_community/chains/pebblo_retrieval/base.py
+++ b/libs/community/langchain_community/chains/pebblo_retrieval/base.py
@@ -124,6 +124,11 @@ class PebbloRetrievalQA(Chain):
                     ),
                     "doc": doc.page_content,
                     "vector_db": self.retriever.vectorstore.__class__.__name__,
+                    **(
+                        {"pb_checksum": doc.metadata.get("pb_checksum")}
+                        if doc.metadata.get("pb_checksum")
+                        else {}
+                    ),
                 }
                 for doc in docs
                 if isinstance(doc, Document)
@@ -457,25 +462,24 @@ class PebbloRetrievalQA(Chain):
         if self.api_key:
             if self.classifier_location == "local":
                 if pebblo_resp:
-                    payload["response"] = (
-                        json.loads(pebblo_resp.text)
-                        .get("retrieval_data", {})
-                        .get("response", {})
-                    )
-                    payload["context"] = (
-                        json.loads(pebblo_resp.text)
-                        .get("retrieval_data", {})
-                        .get("context", [])
-                    )
-                    payload["prompt"] = (
-                        json.loads(pebblo_resp.text)
-                        .get("retrieval_data", {})
-                        .get("prompt", {})
-                    )
+                    resp = json.loads(pebblo_resp.text)
+                    if resp:
+                        payload["response"].update(
+                            resp.get("retrieval_data", {}).get("response", {})
+                        )
+                        payload["response"].pop("data")
+                        payload["prompt"].update(
+                            resp.get("retrieval_data", {}).get("prompt", {})
+                        )
+                        payload["prompt"].pop("data")
+                        context = payload["context"]
+                        for context_data in context:
+                            context_data.pop("doc")
+                        payload["context"] = context
                 else:
-                    payload["response"] = None
-                    payload["context"] = None
-                    payload["prompt"] = None
+                    payload["response"] = {}
+                    payload["prompt"] = {}
+                    payload["context"] = []
             headers.update({"x-api-key": self.api_key})
             pebblo_cloud_url = f"{PEBBLO_CLOUD_URL}{PROMPT_URL}"
             try:
diff --git a/libs/community/langchain_community/chains/pebblo_retrieval/models.py b/libs/community/langchain_community/chains/pebblo_retrieval/models.py
index 3b7f94d44c..13a54537b9 100644
--- a/libs/community/langchain_community/chains/pebblo_retrieval/models.py
+++ b/libs/community/langchain_community/chains/pebblo_retrieval/models.py
@@ -129,6 +129,7 @@ class Context(BaseModel):
     retrieved_from: Optional[str]
     doc: Optional[str]
     vector_db: str
+    pb_checksum: Optional[str]
 
 
 class Prompt(BaseModel):
diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py
index a695582e2f..2e31b370cb 100644
--- a/libs/community/langchain_community/document_loaders/pebblo.py
+++ b/libs/community/langchain_community/document_loaders/pebblo.py
@@ -5,7 +5,7 @@ import logging
 import os
 import uuid
 from http import HTTPStatus
-from typing import Any, Dict, Iterator, List, Optional, Union
+from typing import Any, Dict, Iterator, List, Optional
 
 import requests  # type: ignore
 from langchain_core.documents import Document
@@ -61,7 +61,7 @@ class PebbloSafeLoader(BaseLoader):
         self.source_path = get_loader_full_path(self.loader)
         self.source_owner = PebbloSafeLoader.get_file_owner_from_path(self.source_path)
         self.docs: List[Document] = []
-        self.docs_with_id: Union[List[IndexedDocument], List[Document], List] = []
+        self.docs_with_id: List[IndexedDocument] = []
         loader_name = str(type(self.loader)).split(".")[-1].split("'")[0]
         self.source_type = get_loader_type(loader_name)
         self.source_path_size = self.get_source_size(self.source_path)
@@ -89,17 +89,13 @@ class PebbloSafeLoader(BaseLoader):
             list: Documents fetched from load method of the wrapped `loader`.
         """
         self.docs = self.loader.load()
-        # Add pebblo-specific metadata to docs
-        self._add_pebblo_specific_metadata()
-        if not self.load_semantic:
-            self._classify_doc(self.docs, loading_end=True)
-            return self.docs
         self.docs_with_id = self._index_docs()
-        classified_docs = self._classify_doc(self.docs_with_id, loading_end=True)
-        self.docs_with_id = self._add_semantic_to_docs(
-            self.docs_with_id, classified_docs
-        )
-        self.docs = self._unindex_docs(self.docs_with_id)  # type: ignore
+        classified_docs = self._classify_doc(loading_end=True)
+        self._add_pebblo_specific_metadata(classified_docs)
+        if self.load_semantic:
+            self.docs = self._add_semantic_to_docs(classified_docs)
+        else:
+            self.docs = self._unindex_docs()  # type: ignore
         return self.docs
 
     def lazy_load(self) -> Iterator[Document]:
@@ -125,19 +121,14 @@ class PebbloSafeLoader(BaseLoader):
                 self.docs = []
                 break
             self.docs = list((doc,))
-            # Add pebblo-specific metadata to docs
-            self._add_pebblo_specific_metadata()
-            if not self.load_semantic:
-                self._classify_doc(self.docs, loading_end=True)
-                yield self.docs[0]
+            self.docs_with_id = self._index_docs()
+            classified_doc = self._classify_doc()
+            self._add_pebblo_specific_metadata(classified_doc)
+            if self.load_semantic:
+                self.docs = self._add_semantic_to_docs(classified_doc)
             else:
-                self.docs_with_id = self._index_docs()
-                classified_doc = self._classify_doc(self.docs)
-                self.docs_with_id = self._add_semantic_to_docs(
-                    self.docs_with_id, classified_doc
-                )
-                self.docs = self._unindex_docs(self.docs_with_id)  # type: ignore
-                yield self.docs[0]
+                self.docs = self._unindex_docs()
+            yield self.docs[0]
 
     @classmethod
     def set_discover_sent(cls) -> None:
@@ -147,13 +138,12 @@ class PebbloSafeLoader(BaseLoader):
     def set_loader_sent(cls) -> None:
         cls._loader_sent = True
 
-    def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list:
+    def _classify_doc(self, loading_end: bool = False) -> dict:
         """Send documents fetched from loader to pebblo-server. Then send
         classified documents to Daxa cloud(If api_key is present). Internal method.
 
         Args:
 
-            loaded_docs (list): List of documents fetched from loader's load operation.
             loading_end (bool, optional): Flag indicating the halt of data
                                           loading by loader. Defaults to False.
         """
@@ -163,9 +153,8 @@ class PebbloSafeLoader(BaseLoader):
         }
         if loading_end is True:
             PebbloSafeLoader.set_loader_sent()
-        doc_content = [doc.dict() for doc in loaded_docs]
+        doc_content = [doc.dict() for doc in self.docs_with_id]
         docs = []
-        classified_docs = []
         for doc in doc_content:
             doc_metadata = doc.get("metadata", {})
             doc_authorized_identities = doc_metadata.get("authorized_identities", [])
@@ -183,12 +172,12 @@ class PebbloSafeLoader(BaseLoader):
             page_content = str(doc.get("page_content"))
             page_content_size = self.calculate_content_size(page_content)
             self.source_aggregate_size += page_content_size
-            doc_id = doc.get("id", None) or 0
+            doc_id = doc.get("pb_id", None) or 0
             docs.append(
                 {
                     "doc": page_content,
                     "source_path": doc_source_path,
-                    "id": doc_id,
+                    "pb_id": doc_id,
                     "last_modified": doc.get("metadata", {}).get("last_modified"),
                     "file_owner": doc_source_owner,
                     **(
@@ -221,6 +210,7 @@ class PebbloSafeLoader(BaseLoader):
                     self.source_aggregate_size
                 )
         payload = Doc(**payload).dict(exclude_unset=True)
+        classified_docs = {}
         # Raw payload to be sent to classifier
         if self.classifier_location == "local":
             load_doc_url = f"{self.classifier_url}{LOADER_DOC_URL}"
@@ -228,7 +218,10 @@ class PebbloSafeLoader(BaseLoader):
                 pebblo_resp = requests.post(
                     load_doc_url, headers=headers, json=payload, timeout=300
                 )
-                classified_docs = json.loads(pebblo_resp.text).get("docs", None)
+
+                # Updating the structure of pebblo response docs for efficient searching
+                for classified_doc in json.loads(pebblo_resp.text).get("docs", []):
+                    classified_docs.update({classified_doc["pb_id"]: classified_doc})
                 if pebblo_resp.status_code not in [
                     HTTPStatus.OK,
                     HTTPStatus.BAD_GATEWAY,
@@ -257,7 +250,21 @@ class PebbloSafeLoader(BaseLoader):
 
         if self.api_key:
             if self.classifier_location == "local":
-                payload["docs"] = classified_docs
+                docs = payload["docs"]
+                for doc_data in docs:
+                    classified_data = classified_docs.get(doc_data["pb_id"], {})
+                    doc_data.update(
+                        {
+                            "pb_checksum": classified_data.get("pb_checksum", None),
+                            "loader_source_path": classified_data.get(
+                                "loader_source_path", None
+                            ),
+                            "entities": classified_data.get("entities", {}),
+                            "topics": classified_data.get("topics", {}),
+                        }
+                    )
+                    doc_data.pop("doc")
+
             headers.update({"x-api-key": self.api_key})
             pebblo_cloud_url = f"{PEBBLO_CLOUD_URL}{LOADER_DOC_URL}"
             try:
@@ -453,33 +460,29 @@ class PebbloSafeLoader(BaseLoader):
             List[IndexedDocument]: A list of IndexedDocument objects with unique IDs.
         """
         docs_with_id = [
-            IndexedDocument(id=hex(i)[2:], **doc.dict())
+            IndexedDocument(pb_id=str(i), **doc.dict())
             for i, doc in enumerate(self.docs)
         ]
         return docs_with_id
 
-    def _add_semantic_to_docs(
-        self, docs_with_id: List[IndexedDocument], classified_docs: List[dict]
-    ) -> List[Document]:
+    def _add_semantic_to_docs(self, classified_docs: Dict) -> List[Document]:
         """
         Adds semantic metadata to the given list of documents.
 
         Args:
-            docs_with_id (List[IndexedDocument]): A list of IndexedDocument objects
-                containing the documents with their IDs.
-            classified_docs (List[dict]): A list of dictionaries containing the
-                classified documents.
+            classified_docs (Dict): A dictionary of dictionaries containing the
+                classified documents with pb_id as key.
 
         Returns:
             List[Document]: A list of Document objects with added semantic metadata.
         """
         indexed_docs = {
-            doc.id: Document(page_content=doc.page_content, metadata=doc.metadata)
-            for doc in docs_with_id
+            doc.pb_id: Document(page_content=doc.page_content, metadata=doc.metadata)
+            for doc in self.docs_with_id
         }
 
-        for classified_doc in classified_docs:
-            doc_id = classified_doc.get("id")
+        for classified_doc in classified_docs.values():
+            doc_id = classified_doc.get("pb_id")
             if doc_id in indexed_docs:
                 self._add_semantic_to_doc(indexed_docs[doc_id], classified_doc)
 
@@ -487,19 +490,16 @@ class PebbloSafeLoader(BaseLoader):
 
         return semantic_metadata_docs
 
-    def _unindex_docs(self, docs_with_id: List[IndexedDocument]) -> List[Document]:
+    def _unindex_docs(self) -> List[Document]:
         """
         Converts a list of IndexedDocument objects to a list of Document objects.
 
-        Args:
-            docs_with_id (List[IndexedDocument]): A list of IndexedDocument objects.
-
         Returns:
             List[Document]: A list of Document objects.
         """
         docs = [
             Document(page_content=doc.page_content, metadata=doc.metadata)
-            for i, doc in enumerate(docs_with_id)
+            for i, doc in enumerate(self.docs_with_id)
         ]
         return docs
 
@@ -522,12 +522,16 @@ class PebbloSafeLoader(BaseLoader):
         )
         return doc
 
-    def _add_pebblo_specific_metadata(self) -> None:
+    def _add_pebblo_specific_metadata(self, classified_docs: dict) -> None:
         """Add Pebblo specific metadata to documents."""
-        for doc in self.docs:
+        for doc in self.docs_with_id:
             doc_metadata = doc.metadata
             doc_metadata["full_path"] = get_full_path(
                 doc_metadata.get(
                     "full_path", doc_metadata.get("source", self.source_path)
                 )
             )
+            doc_metadata["pb_id"] = doc.pb_id
+            doc_metadata["pb_checksum"] = classified_docs.get(doc.pb_id, {}).get(
+                "pb_checksum", None
+            )
diff --git a/libs/community/langchain_community/utilities/pebblo.py b/libs/community/langchain_community/utilities/pebblo.py
index a9bbb47241..568efae7b6 100644
--- a/libs/community/langchain_community/utilities/pebblo.py
+++ b/libs/community/langchain_community/utilities/pebblo.py
@@ -66,7 +66,7 @@ logger = logging.getLogger(__name__)
 class IndexedDocument(Document):
     """Pebblo Indexed Document."""
 
-    id: str
+    pb_id: str
     """Unique ID of the document."""
 
 
diff --git a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
index 1cee8a849d..d0a71faae7 100644
--- a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
@@ -65,12 +65,26 @@ def test_csv_loader_load_valid_data(mocker: MockerFixture) -> None:
     full_file_path = os.path.abspath(file_path)
     expected_docs = [
         Document(
+            metadata={
+                "source": full_file_path,
+                "row": 0,
+                "full_path": full_file_path,
+                "pb_id": "0",
+                # For UT as here we are not calculating checksum
+                "pb_checksum": None,
+            },
             page_content="column1: value1\ncolumn2: value2\ncolumn3: value3",
-            metadata={"source": file_path, "row": 0, "full_path": full_file_path},
         ),
         Document(
+            metadata={
+                "source": full_file_path,
+                "row": 1,
+                "full_path": full_file_path,
+                "pb_id": "1",
+                # For UT as here we are not calculating checksum
+                "pb_checksum": None,
+            },
             page_content="column1: value4\ncolumn2: value5\ncolumn3: value6",
-            metadata={"source": file_path, "row": 1, "full_path": full_file_path},
         ),
     ]