Community[minor]: Added checksum while sending data to pebblo-cloud (#23968)

- **Description:**
    - Updated checksum in doc metadata.
    - Sending checksum and removing actual content while sending data to
      `pebblo-cloud`, if `classifier-location` is `pebblo-cloud`, in the
      `/loader/doc` API.
    - Adding `pb_id` (i.e. pebblo id) to doc metadata.
    - Refactoring as needed.
    - Sending `content-checksum` and removing actual content while sending
      data to `pebblo-cloud`, if `classifier-location` is `pebblo-cloud`, in
      the `prompt` API.
- **Issue:** NA
- **Dependencies:** NA
- **Tests:** Updated
- **Docs:** NA
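
As a sketch of the idea described above: before the payload leaves for `pebblo-cloud`, the raw content is dropped and only identifiers such as `pb_id` and `pb_checksum` travel. This is a minimal illustration, not code from this PR; `scrub_for_cloud` is a hypothetical helper, and any field beyond `pb_id`, `doc`, and `pb_checksum` is illustrative.

```python
from typing import Any, Dict, List


def scrub_for_cloud(docs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Drop raw content; let the checksum identify each document."""
    scrubbed = []
    for doc in docs:
        doc = dict(doc)        # keep the caller's copy intact
        doc.pop("doc", None)   # "doc" holds the actual page content
        scrubbed.append(doc)   # pb_id / pb_checksum still identify it
    return scrubbed


docs = [{"pb_id": "0", "doc": "raw page content", "pb_checksum": "a1b2c3"}]
print(scrub_for_cloud(docs))  # [{'pb_id': '0', 'pb_checksum': 'a1b2c3'}]
```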

---------

Co-authored-by: dristy.cd <dristy@clouddefense.io>
Dristy Srivastava 2024-07-19 23:22:54 +05:30 committed by GitHub
parent 9aae8ef416
commit 020cc1cf3e
5 changed files with 95 additions and 72 deletions

View File

@@ -124,6 +124,11 @@ class PebbloRetrievalQA(Chain):
                 ),
                 "doc": doc.page_content,
                 "vector_db": self.retriever.vectorstore.__class__.__name__,
+                **(
+                    {"pb_checksum": doc.metadata.get("pb_checksum")}
+                    if doc.metadata.get("pb_checksum")
+                    else {}
+                ),
             }
             for doc in docs
             if isinstance(doc, Document)
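
The `**( ... if ... else {})` construct above is plain Python: it splats a one-key dict into the literal only when the metadata actually carries a checksum. A standalone sketch of the same idiom:

```python
# Conditional dict-splat: the key appears only when the value is truthy.
metadata = {"pb_checksum": "a1b2c3"}

context_entry = {
    "doc": "page content",
    **(
        {"pb_checksum": metadata.get("pb_checksum")}
        if metadata.get("pb_checksum")
        else {}
    ),
}
print(context_entry)  # {'doc': 'page content', 'pb_checksum': 'a1b2c3'}
# With metadata = {}, context_entry would contain only "doc".
```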
@@ -457,25 +462,24 @@ class PebbloRetrievalQA(Chain):
         if self.api_key:
             if self.classifier_location == "local":
                 if pebblo_resp:
-                    payload["response"] = (
-                        json.loads(pebblo_resp.text)
-                        .get("retrieval_data", {})
-                        .get("response", {})
-                    )
-                    payload["context"] = (
-                        json.loads(pebblo_resp.text)
-                        .get("retrieval_data", {})
-                        .get("context", [])
-                    )
-                    payload["prompt"] = (
-                        json.loads(pebblo_resp.text)
-                        .get("retrieval_data", {})
-                        .get("prompt", {})
-                    )
+                    resp = json.loads(pebblo_resp.text)
+                    if resp:
+                        payload["response"].update(
+                            resp.get("retrieval_data", {}).get("response", {})
+                        )
+                        payload["response"].pop("data")
+                        payload["prompt"].update(
+                            resp.get("retrieval_data", {}).get("prompt", {})
+                        )
+                        payload["prompt"].pop("data")
+                        context = payload["context"]
+                        for context_data in context:
+                            context_data.pop("doc")
+                        payload["context"] = context
             else:
-                payload["response"] = None
-                payload["context"] = None
-                payload["prompt"] = None
+                payload["response"] = {}
+                payload["prompt"] = {}
+                payload["context"] = []
             headers.update({"x-api-key": self.api_key})
             pebblo_cloud_url = f"{PEBBLO_CLOUD_URL}{PROMPT_URL}"
             try:
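
Net effect of the new branch: classification results are merged into the payload while the raw text leaves it, with `data` popped from `response` and `prompt`, and `doc` popped from every context entry. A runnable sketch with a made-up classifier response (only the popped keys `data` and `doc` come from the diff; the other fields are illustrative):

```python
resp = {
    "retrieval_data": {
        "response": {"data": "raw answer", "entities": {}},
        "prompt": {"data": "raw prompt", "entities": {}},
    }
}
payload = {
    "response": {"data": "raw answer"},
    "prompt": {"data": "raw prompt"},
    "context": [{"doc": "raw chunk", "pb_checksum": "a1b2c3"}],
}

payload["response"].update(resp["retrieval_data"]["response"])
payload["response"].pop("data")  # classification stays, raw text goes
payload["prompt"].update(resp["retrieval_data"]["prompt"])
payload["prompt"].pop("data")
for context_data in payload["context"]:
    context_data.pop("doc")      # pb_checksum identifies the chunk instead

print(payload)
# {'response': {'entities': {}}, 'prompt': {'entities': {}},
#  'context': [{'pb_checksum': 'a1b2c3'}]}
```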

View File

@@ -129,6 +129,7 @@ class Context(BaseModel):
     retrieved_from: Optional[str]
     doc: Optional[str]
     vector_db: str
+    pb_checksum: Optional[str]
 
 
 class Prompt(BaseModel):

View File

@@ -5,7 +5,7 @@ import logging
 import os
 import uuid
 from http import HTTPStatus
-from typing import Any, Dict, Iterator, List, Optional, Union
+from typing import Any, Dict, Iterator, List, Optional
 
 import requests  # type: ignore
 from langchain_core.documents import Document
@@ -61,7 +61,7 @@ class PebbloSafeLoader(BaseLoader):
         self.source_path = get_loader_full_path(self.loader)
         self.source_owner = PebbloSafeLoader.get_file_owner_from_path(self.source_path)
         self.docs: List[Document] = []
-        self.docs_with_id: Union[List[IndexedDocument], List[Document], List] = []
+        self.docs_with_id: List[IndexedDocument] = []
         loader_name = str(type(self.loader)).split(".")[-1].split("'")[0]
         self.source_type = get_loader_type(loader_name)
         self.source_path_size = self.get_source_size(self.source_path)
@@ -89,17 +89,13 @@ class PebbloSafeLoader(BaseLoader):
             list: Documents fetched from load method of the wrapped `loader`.
         """
         self.docs = self.loader.load()
-        # Add pebblo-specific metadata to docs
-        self._add_pebblo_specific_metadata()
-        if not self.load_semantic:
-            self._classify_doc(self.docs, loading_end=True)
-            return self.docs
         self.docs_with_id = self._index_docs()
-        classified_docs = self._classify_doc(self.docs_with_id, loading_end=True)
-        self.docs_with_id = self._add_semantic_to_docs(
-            self.docs_with_id, classified_docs
-        )
-        self.docs = self._unindex_docs(self.docs_with_id)  # type: ignore
+        classified_docs = self._classify_doc(loading_end=True)
+        self._add_pebblo_specific_metadata(classified_docs)
+        if self.load_semantic:
+            self.docs = self._add_semantic_to_docs(classified_docs)
+        else:
+            self.docs = self._unindex_docs()  # type: ignore
         return self.docs
 
     def lazy_load(self) -> Iterator[Document]:
@@ -125,19 +121,14 @@ class PebbloSafeLoader(BaseLoader):
                     self.docs = []
                     break
                 self.docs = list((doc,))
-                # Add pebblo-specific metadata to docs
-                self._add_pebblo_specific_metadata()
-                if not self.load_semantic:
-                    self._classify_doc(self.docs, loading_end=True)
-                    yield self.docs[0]
+                self.docs_with_id = self._index_docs()
+                classified_doc = self._classify_doc()
+                self._add_pebblo_specific_metadata(classified_doc)
+                if self.load_semantic:
+                    self.docs = self._add_semantic_to_docs(classified_doc)
                 else:
-                    self.docs_with_id = self._index_docs()
-                    classified_doc = self._classify_doc(self.docs)
-                    self.docs_with_id = self._add_semantic_to_docs(
-                        self.docs_with_id, classified_doc
-                    )
-                    self.docs = self._unindex_docs(self.docs_with_id)  # type: ignore
-                    yield self.docs[0]
+                    self.docs = self._unindex_docs()
+                yield self.docs[0]
 
     @classmethod
     def set_discover_sent(cls) -> None:
@@ -147,13 +138,12 @@ class PebbloSafeLoader(BaseLoader):
     def set_loader_sent(cls) -> None:
         cls._loader_sent = True
 
-    def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list:
+    def _classify_doc(self, loading_end: bool = False) -> dict:
         """Send documents fetched from loader to pebblo-server. Then send
         classified documents to Daxa cloud(If api_key is present). Internal method.
 
         Args:
-            loaded_docs (list): List of documents fetched from loader's load operation.
             loading_end (bool, optional): Flag indicating the halt of data
                 loading by loader. Defaults to False.
         """
@@ -163,9 +153,8 @@ class PebbloSafeLoader(BaseLoader):
         }
         if loading_end is True:
             PebbloSafeLoader.set_loader_sent()
-        doc_content = [doc.dict() for doc in loaded_docs]
+        doc_content = [doc.dict() for doc in self.docs_with_id]
         docs = []
-        classified_docs = []
         for doc in doc_content:
             doc_metadata = doc.get("metadata", {})
             doc_authorized_identities = doc_metadata.get("authorized_identities", [])
@@ -183,12 +172,12 @@ class PebbloSafeLoader(BaseLoader):
             page_content = str(doc.get("page_content"))
             page_content_size = self.calculate_content_size(page_content)
             self.source_aggregate_size += page_content_size
-            doc_id = doc.get("id", None) or 0
+            doc_id = doc.get("pb_id", None) or 0
             docs.append(
                 {
                     "doc": page_content,
                     "source_path": doc_source_path,
-                    "id": doc_id,
+                    "pb_id": doc_id,
                     "last_modified": doc.get("metadata", {}).get("last_modified"),
                     "file_owner": doc_source_owner,
                     **(
@@ -221,6 +210,7 @@ class PebbloSafeLoader(BaseLoader):
                 self.source_aggregate_size
             )
         payload = Doc(**payload).dict(exclude_unset=True)
+        classified_docs = {}
         # Raw payload to be sent to classifier
         if self.classifier_location == "local":
             load_doc_url = f"{self.classifier_url}{LOADER_DOC_URL}"
@@ -228,7 +218,10 @@ class PebbloSafeLoader(BaseLoader):
                 pebblo_resp = requests.post(
                     load_doc_url, headers=headers, json=payload, timeout=300
                 )
-                classified_docs = json.loads(pebblo_resp.text).get("docs", None)
+                # Updating the structure of pebblo response docs for efficient searching
+                for classified_doc in json.loads(pebblo_resp.text).get("docs", []):
+                    classified_docs.update({classified_doc["pb_id"]: classified_doc})
                 if pebblo_resp.status_code not in [
                     HTTPStatus.OK,
                     HTTPStatus.BAD_GATEWAY,
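
The loop above replaces the old list assignment so each classifier result is addressable by `pb_id`: the later per-document join in the cloud hand-off becomes a constant-time dict lookup instead of a list scan. In isolation (classifier output below is made up):

```python
# List-to-dict restructuring of classifier results, keyed by pb_id.
response_docs = [
    {"pb_id": "0", "pb_checksum": "a1b2c3", "entities": {}, "topics": {}},
    {"pb_id": "1", "pb_checksum": "d4e5f6", "entities": {}, "topics": {}},
]

classified_docs = {}
for classified_doc in response_docs:
    classified_docs.update({classified_doc["pb_id"]: classified_doc})

assert classified_docs["1"]["pb_checksum"] == "d4e5f6"  # O(1) by pb_id
```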
@@ -257,7 +250,21 @@ class PebbloSafeLoader(BaseLoader):
         if self.api_key:
             if self.classifier_location == "local":
-                payload["docs"] = classified_docs
+                docs = payload["docs"]
+                for doc_data in docs:
+                    classified_data = classified_docs.get(doc_data["pb_id"], {})
+                    doc_data.update(
+                        {
+                            "pb_checksum": classified_data.get("pb_checksum", None),
+                            "loader_source_path": classified_data.get(
+                                "loader_source_path", None
+                            ),
+                            "entities": classified_data.get("entities", {}),
+                            "topics": classified_data.get("topics", {}),
+                        }
+                    )
+                    doc_data.pop("doc")
             headers.update({"x-api-key": self.api_key})
             pebblo_cloud_url = f"{PEBBLO_CLOUD_URL}{LOADER_DOC_URL}"
             try:
@@ -453,33 +460,29 @@ class PebbloSafeLoader(BaseLoader):
             List[IndexedDocument]: A list of IndexedDocument objects with unique IDs.
         """
         docs_with_id = [
-            IndexedDocument(id=hex(i)[2:], **doc.dict())
+            IndexedDocument(pb_id=str(i), **doc.dict())
             for i, doc in enumerate(self.docs)
         ]
         return docs_with_id
 
-    def _add_semantic_to_docs(
-        self, docs_with_id: List[IndexedDocument], classified_docs: List[dict]
-    ) -> List[Document]:
+    def _add_semantic_to_docs(self, classified_docs: Dict) -> List[Document]:
         """
         Adds semantic metadata to the given list of documents.
 
         Args:
-            docs_with_id (List[IndexedDocument]): A list of IndexedDocument objects
-                containing the documents with their IDs.
-            classified_docs (List[dict]): A list of dictionaries containing the
-                classified documents.
+            classified_docs (Dict): A dictionary of dictionaries containing the
+                classified documents with pb_id as key.
 
         Returns:
             List[Document]: A list of Document objects with added semantic metadata.
         """
         indexed_docs = {
-            doc.id: Document(page_content=doc.page_content, metadata=doc.metadata)
-            for doc in docs_with_id
+            doc.pb_id: Document(page_content=doc.page_content, metadata=doc.metadata)
+            for doc in self.docs_with_id
         }
 
-        for classified_doc in classified_docs:
-            doc_id = classified_doc.get("id")
+        for classified_doc in classified_docs.values():
+            doc_id = classified_doc.get("pb_id")
             if doc_id in indexed_docs:
                 self._add_semantic_to_doc(indexed_docs[doc_id], classified_doc)
@@ -487,19 +490,16 @@ class PebbloSafeLoader(BaseLoader):
         return semantic_metadata_docs
 
-    def _unindex_docs(self, docs_with_id: List[IndexedDocument]) -> List[Document]:
+    def _unindex_docs(self) -> List[Document]:
         """
         Converts a list of IndexedDocument objects to a list of Document objects.
 
-        Args:
-            docs_with_id (List[IndexedDocument]): A list of IndexedDocument objects.
-
         Returns:
             List[Document]: A list of Document objects.
         """
         docs = [
             Document(page_content=doc.page_content, metadata=doc.metadata)
-            for i, doc in enumerate(docs_with_id)
+            for i, doc in enumerate(self.docs_with_id)
         ]
         return docs
@@ -522,12 +522,16 @@ class PebbloSafeLoader(BaseLoader):
             )
         return doc
 
-    def _add_pebblo_specific_metadata(self) -> None:
+    def _add_pebblo_specific_metadata(self, classified_docs: dict) -> None:
         """Add Pebblo specific metadata to documents."""
-        for doc in self.docs:
+        for doc in self.docs_with_id:
             doc_metadata = doc.metadata
             doc_metadata["full_path"] = get_full_path(
                 doc_metadata.get(
                     "full_path", doc_metadata.get("source", self.source_path)
                 )
             )
+            doc_metadata["pb_id"] = doc.pb_id
+            doc_metadata["pb_checksum"] = classified_docs.get(doc.pb_id, {}).get(
+                "pb_checksum", None
+            )

View File

@@ -66,7 +66,7 @@ logger = logging.getLogger(__name__)
 class IndexedDocument(Document):
     """Pebblo Indexed Document."""
 
-    id: str
+    pb_id: str
     """Unique ID of the document."""

View File

@@ -65,12 +65,26 @@ def test_csv_loader_load_valid_data(mocker: MockerFixture) -> None:
     full_file_path = os.path.abspath(file_path)
     expected_docs = [
         Document(
+            metadata={
+                "source": full_file_path,
+                "row": 0,
+                "full_path": full_file_path,
+                "pb_id": "0",
+                # For UT as here we are not calculating checksum
+                "pb_checksum": None,
+            },
             page_content="column1: value1\ncolumn2: value2\ncolumn3: value3",
-            metadata={"source": file_path, "row": 0, "full_path": full_file_path},
         ),
         Document(
+            metadata={
+                "source": full_file_path,
+                "row": 1,
+                "full_path": full_file_path,
+                "pb_id": "1",
+                # For UT as here we are not calculating checksum
+                "pb_checksum": None,
+            },
            page_content="column1: value4\ncolumn2: value5\ncolumn3: value6",
-            metadata={"source": file_path, "row": 1, "full_path": full_file_path},
         ),
     ]