From cb31c3611f6dadb9895011b3a3c2aa6f183d58de Mon Sep 17 00:00:00 2001 From: Yash Date: Wed, 8 May 2024 18:30:58 -0500 Subject: [PATCH] Ndb enterprise (#21233) Description: Adds NeuralDBClientVectorStore to the langchain, which is our enterprise client. --------- Co-authored-by: kartikTAI <129414343+kartikTAI@users.noreply.github.com> Co-authored-by: Kartik Sarangmath --- .../vectorstores/__init__.py | 3 + .../vectorstores/thirdai_neuraldb.py | 160 +++++++++++++++++- .../unit_tests/vectorstores/test_imports.py | 1 + .../vectorstores/test_public_api.py | 1 + .../langchain/vectorstores/__init__.py | 3 + .../vectorstores/test_public_api.py | 1 + 6 files changed, 168 insertions(+), 1 deletion(-) diff --git a/libs/community/langchain_community/vectorstores/__init__.py b/libs/community/langchain_community/vectorstores/__init__.py index fe04af06db..1348490f0f 100644 --- a/libs/community/langchain_community/vectorstores/__init__.py +++ b/libs/community/langchain_community/vectorstores/__init__.py @@ -236,6 +236,7 @@ if TYPE_CHECKING: TencentVectorDB, ) from langchain_community.vectorstores.thirdai_neuraldb import ( + NeuralDBClientVectorStore, NeuralDBVectorStore, ) from langchain_community.vectorstores.tidb_vector import ( @@ -345,6 +346,7 @@ __all__ = [ "MyScale", "MyScaleSettings", "Neo4jVector", + "NeuralDBClientVectorStore", "NeuralDBVectorStore", "OracleVS", "OpenSearchVectorSearch", @@ -441,6 +443,7 @@ _module_lookup = { "MyScale": "langchain_community.vectorstores.myscale", "MyScaleSettings": "langchain_community.vectorstores.myscale", "Neo4jVector": "langchain_community.vectorstores.neo4j_vector", + "NeuralDBClientVectorStore": "langchain_community.vectorstores.thirdai_neuraldb", # noqa: E501 "NeuralDBVectorStore": "langchain_community.vectorstores.thirdai_neuraldb", "OpenSearchVectorSearch": "langchain_community.vectorstores.opensearch_vector_search", # noqa: E501 "OracleVS": "langchain_community.vectorstores.oraclevs", diff --git 
a/libs/community/langchain_community/vectorstores/thirdai_neuraldb.py b/libs/community/langchain_community/vectorstores/thirdai_neuraldb.py index fa8c110474..f7a80232d9 100644 --- a/libs/community/langchain_community/vectorstores/thirdai_neuraldb.py +++ b/libs/community/langchain_community/vectorstores/thirdai_neuraldb.py @@ -166,7 +166,7 @@ class NeuralDBVectorStore(VectorStore): offset = self.db._savable_state.documents.get_source_by_id(source_id)[1] return [str(offset + i) for i in range(len(texts))] # type: ignore[arg-type] - @root_validator() + @root_validator(allow_reuse=True) def validate_environments(cls, values: Dict) -> Dict: """Validate ThirdAI environment variables.""" values["thirdai_key"] = convert_to_secret_str( @@ -314,3 +314,161 @@ class NeuralDBVectorStore(VectorStore): path: path on disk to save the NeuralDB instance to. """ self.db.save(path) + + +class NeuralDBClientVectorStore(VectorStore): + """Vectorstore that uses ThirdAI's NeuralDB Enterprise Python Client for NeuralDBs. + + To use, you should have the ``thirdai[neural_db]`` python package installed. + + Example: + .. 
code-block:: python + + from langchain_community.vectorstores import NeuralDBClientVectorStore + from thirdai.neural_db import ModelBazaar, NeuralDBClient + + bazaar = ModelBazaar(base_url="http://{NEURAL_DB_ENTERPRISE_IP}/api/") + bazaar.log_in(email="user@thirdai.com", password="1234") + + ndb_client = NeuralDBClient( + deployment_identifier="user/model-0:user/deployment-0", + base_url="http://{NEURAL_DB_ENTERPRISE_IP}/api/", + bazaar=bazaar + ) + vectorstore = NeuralDBClientVectorStore(db=ndb_client) + retriever = vectorstore.as_retriever(search_kwargs={'k':5}) + + """ + + def __init__(self, db: Any) -> None: + self.db = db + + db: Any = None #: :meta private: + """NeuralDB Client instance""" + + class Config: + """Configuration for this pydantic object.""" + + extra = Extra.forbid + underscore_attrs_are_private = True + + def similarity_search( + self, query: str, k: int = 10, **kwargs: Any + ) -> List[Document]: + """Retrieve {k} contexts for a given query + + Args: + query: Query to submit to the model + k: The max number of context results to retrieve. Defaults to 10. + """ + try: + references = self.db.search(query=query, top_k=k, **kwargs)["references"] + return [ + Document( + page_content=ref["text"], + metadata={ + "id": ref["id"], + "source": ref["source"], + "metadata": ref["metadata"], + "score": ref["score"], + "context": ref["context"], + }, + ) + for ref in references + ] + except Exception as e: + raise ValueError(f"Error while retrieving documents: {e}") from e + + def insert(self, documents: List[Dict[str, Any]]): # type: ignore[no-untyped-def] + """ + Inserts documents into the VectorStore and returns the corresponding Sources. + + Args: + documents (List[Dict[str, Any]]): A list of dictionaries that + represent documents to be inserted to the VectorStores. 
+ The document dictionaries must be in the following format: + {"document_type": "DOCUMENT_TYPE", **kwargs} where "DOCUMENT_TYPE" + is one of the following: + "PDF", "CSV", "DOCX", "URL", "SentenceLevelPDF", "SentenceLevelDOCX", + "Unstructured", "InMemoryText". + The kwargs for each document type are shown below: + + class PDF(Document): + document_type: Literal["PDF"] + path: str + metadata: Optional[dict[str, Any]] = None + on_disk: bool = False + version: str = "v1" + chunk_size: int = 100 + stride: int = 40 + emphasize_first_words: int = 0 + ignore_header_footer: bool = True + ignore_nonstandard_orientation: bool = True + + class CSV(Document): + document_type: Literal["CSV"] + path: str + id_column: Optional[str] = None + strong_columns: Optional[List[str]] = None + weak_columns: Optional[List[str]] = None + reference_columns: Optional[List[str]] = None + save_extra_info: bool = True + metadata: Optional[dict[str, Any]] = None + has_offset: bool = False + on_disk: bool = False + + class DOCX(Document): + document_type: Literal["DOCX"] + path: str + metadata: Optional[dict[str, Any]] = None + on_disk: bool = False + + class URL(Document): + document_type: Literal["URL"] + url: str + save_extra_info: bool = True + title_is_strong: bool = False + metadata: Optional[dict[str, Any]] = None + on_disk: bool = False + + class SentenceLevelPDF(Document): + document_type: Literal["SentenceLevelPDF"] + path: str + metadata: Optional[dict[str, Any]] = None + on_disk: bool = False + + class SentenceLevelDOCX(Document): + document_type: Literal["SentenceLevelDOCX"] + path: str + metadata: Optional[dict[str, Any]] = None + on_disk: bool = False + + class Unstructured(Document): + document_type: Literal["Unstructured"] + path: str + save_extra_info: bool = True + metadata: Optional[dict[str, Any]] = None + on_disk: bool = False + + class InMemoryText(Document): + document_type: Literal["InMemoryText"] + name: str + texts: list[str] + metadatas: Optional[list[dict[str, Any]]] 
= None + global_metadata: Optional[dict[str, Any]] = None + on_disk: bool = False + + For Document types with the arg "path", ensure that + the path exists on your local machine. + """ + return self.db.insert(documents) + + def remove_documents(self, source_ids: List[str]): # type: ignore[no-untyped-def] + """ + Deletes documents from the VectorStore using source ids. + + Args: + source_ids (List[str]): A list of source ids to delete from the VectorStore. + """ + + self.db.delete(source_ids) diff --git a/libs/community/tests/unit_tests/vectorstores/test_imports.py b/libs/community/tests/unit_tests/vectorstores/test_imports.py index 6ce51c1669..ba3cbdf8f3 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_imports.py +++ b/libs/community/tests/unit_tests/vectorstores/test_imports.py @@ -58,6 +58,7 @@ EXPECTED_ALL = [ "MyScale", "MyScaleSettings", "Neo4jVector", + "NeuralDBClientVectorStore", "NeuralDBVectorStore", "OpenSearchVectorSearch", "OracleVS", diff --git a/libs/community/tests/unit_tests/vectorstores/test_public_api.py b/libs/community/tests/unit_tests/vectorstores/test_public_api.py index 0c2dbe6673..958f7a85f1 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_public_api.py +++ b/libs/community/tests/unit_tests/vectorstores/test_public_api.py @@ -93,6 +93,7 @@ _EXPECTED = [ "AzureCosmosDBVectorSearch", "VectorStore", "Yellowbrick", + "NeuralDBClientVectorStore", "NeuralDBVectorStore", "CouchbaseVectorStore", ] diff --git a/libs/langchain/langchain/vectorstores/__init__.py b/libs/langchain/langchain/vectorstores/__init__.py index 48792306dc..9666487383 100644 --- a/libs/langchain/langchain/vectorstores/__init__.py +++ b/libs/langchain/langchain/vectorstores/__init__.py @@ -66,6 +66,7 @@ if TYPE_CHECKING: MyScale, MyScaleSettings, Neo4jVector, + NeuralDBClientVectorStore, NeuralDBVectorStore, OpenSearchVectorSearch, PGEmbedding, @@ -142,6 +143,7 @@ DEPRECATED_LOOKUP = { "MyScale": "langchain_community.vectorstores", "MyScaleSettings": 
"langchain_community.vectorstores", "Neo4jVector": "langchain_community.vectorstores", + "NeuralDBClientVectorStore": "langchain_community.vectorstores", "NeuralDBVectorStore": "langchain_community.vectorstores", "NEuralDBVectorStore": "langchain_community.vectorstores", "OpenSearchVectorSearch": "langchain_community.vectorstores", @@ -224,6 +226,7 @@ __all__ = [ "MyScale", "MyScaleSettings", "Neo4jVector", + "NeuralDBClientVectorStore", "NeuralDBVectorStore", "OpenSearchVectorSearch", "PGEmbedding", diff --git a/libs/langchain/tests/unit_tests/vectorstores/test_public_api.py b/libs/langchain/tests/unit_tests/vectorstores/test_public_api.py index eead8e9cf2..dcccd2ff4b 100644 --- a/libs/langchain/tests/unit_tests/vectorstores/test_public_api.py +++ b/libs/langchain/tests/unit_tests/vectorstores/test_public_api.py @@ -42,6 +42,7 @@ _EXPECTED = [ "MyScale", "MyScaleSettings", "Neo4jVector", + "NeuralDBClientVectorStore", "NeuralDBVectorStore", "OpenSearchVectorSearch", "PGEmbedding",