Ndb enterprise (#21233)

Description: Adds NeuralDBClientVectorStore, the vector store for our enterprise client, to langchain. --------- Co-authored-by: kartikTAI <129414343+kartikTAI@users.noreply.github.com> Co-authored-by: Kartik Sarangmath <kartik@thirdai.com>
5 months ago · cb31c3611f
parent 74044e44a5
commit cb31c3611f
6 changed files with 168 additions and 1 deletions
--- a/libs/community/langchain_community/vectorstores/__init__.py
+++ b/libs/community/langchain_community/vectorstores/__init__.py
@ -236,6 +236,7 @@ if TYPE_CHECKING:
        TencentVectorDB,
    )
    from langchain_community.vectorstores.thirdai_neuraldb import (
        NeuralDBClientVectorStore,
        NeuralDBVectorStore,
    )
    from langchain_community.vectorstores.tidb_vector import (
@ -345,6 +346,7 @@ __all__ = [
    "MyScale",
    "MyScaleSettings",
    "Neo4jVector",
    "NeuralDBClientVectorStore",
    "NeuralDBVectorStore",
    "OracleVS",
    "OpenSearchVectorSearch",
@ -441,6 +443,7 @@ _module_lookup = {
    "MyScale": "langchain_community.vectorstores.myscale",
    "MyScaleSettings": "langchain_community.vectorstores.myscale",
    "Neo4jVector": "langchain_community.vectorstores.neo4j_vector",
    "NeuralDBClientVectorStore": "langchain_community.vectorstores.thirdai_neuraldb",  # noqa: E501
    "NeuralDBVectorStore": "langchain_community.vectorstores.thirdai_neuraldb",
    "OpenSearchVectorSearch": "langchain_community.vectorstores.opensearch_vector_search",  # noqa: E501
    "OracleVS": "langchain_community.vectorstores.oraclevs",
--- a/libs/community/langchain_community/vectorstores/thirdai_neuraldb.py
+++ b/libs/community/langchain_community/vectorstores/thirdai_neuraldb.py
@ -166,7 +166,7 @@ class NeuralDBVectorStore(VectorStore):
        offset = self.db._savable_state.documents.get_source_by_id(source_id)[1]
        return [str(offset + i) for i in range(len(texts))]  # type: ignore[arg-type]
-    @root_validator()
+    @root_validator(allow_reuse=True)
    def validate_environments(cls, values: Dict) -> Dict:
        """Validate ThirdAI environment variables."""
        values["thirdai_key"] = convert_to_secret_str(
@ -314,3 +314,161 @@ class NeuralDBVectorStore(VectorStore):
            path: path on disk to save the NeuralDB instance to.
        """
        self.db.save(path)
 class NeuralDBClientVectorStore(VectorStore):
    """Vectorstore that uses ThirdAI's NeuralDB Enterprise Python Client for NeuralDBs.
    To use, you should have the ``thirdai[neural_db]`` python package installed.
    Example:
        .. code-block:: python
            from langchain_community.vectorstores import NeuralDBClientVectorStore
            from thirdai.neural_db import ModelBazaar, NeuralDBClient
            bazaar = ModelBazaar(base_url="http://{NEURAL_DB_ENTERPRISE_IP}/api/")
            bazaar.log_in(email="user@thirdai.com", password="1234")
            ndb_client = NeuralDBClient(
                deployment_identifier="user/model-0:user/deployment-0",
                base_url="http://{NEURAL_DB_ENTERPRISE_IP}/api/",
                bazaar=bazaar
            )
            vectorstore = NeuralDBClientVectorStore(db=ndb_client)
            retriever = vectorstore.as_retriever(search_kwargs={'k':5})
    """
    def __init__(self, db: Any) -> None:
        """Wrap an existing NeuralDB Enterprise client.

        Args:
            db: Client object that this store delegates to. The methods of
                this class call ``db.search``, ``db.insert`` and
                ``db.delete`` on it.
        """
        self.db = db
    db: Any = None  #: :meta private:
    """NeuralDB Client instance"""
    class Config:
        """Configuration for this pydantic object."""
        # NOTE(review): the enclosing class defines its own __init__ and the
        # only base visible here is VectorStore -- confirm the base is a
        # pydantic model, otherwise these settings are never consulted.
        # Presumably pydantic v1 semantics: undeclared fields are rejected.
        extra = Extra.forbid
        # Presumably pydantic v1 semantics: underscore-prefixed attributes
        # are treated as private rather than as model fields.
        underscore_attrs_are_private = True
    def similarity_search(
        self, query: str, k: int = 10, **kwargs: Any
    ) -> List[Document]:
        """Retrieve up to ``k`` contexts for a given query.

        Args:
            query: Query to submit to the model.
            k: The max number of context results to retrieve. Defaults to 10.
            **kwargs: Extra keyword arguments forwarded unchanged to the
                client's ``search`` call.

        Returns:
            One ``Document`` per reference returned by the client.
            ``page_content`` is the reference text; ``metadata`` carries the
            reference's ``id``, ``source``, ``metadata``, ``score`` and
            ``context`` fields.

        Raises:
            ValueError: If the search call fails or a reference lacks one of
                the fields read here; the original exception is chained.
        """
        try:
            references = self.db.search(query=query, top_k=k, **kwargs)["references"]
            return [
                Document(
                    page_content=ref["text"],
                    metadata={
                        "id": ref["id"],
                        "source": ref["source"],
                        "metadata": ref["metadata"],
                        # Read the reference's own score; this entry used to
                        # duplicate ref["source"], hiding the score entirely.
                        "score": ref["score"],
                        "context": ref["context"],
                    },
                )
                for ref in references
            ]
        except Exception as e:
            raise ValueError(f"Error while retrieving documents: {e}") from e
    def insert(self, documents: List[Dict[str, Any]]):  # type: ignore[no-untyped-def]
        """Insert documents into the VectorStore and return their Sources.

        The list is handed to the client's ``insert`` call as-is, and that
        call's result is returned.

        Args:
            documents (List[Dict[str, Any]]): Dictionaries describing the
                documents to insert. Each one has the form
                ``{"document_type": "DOCUMENT_TYPE", **kwargs}``, where
                ``"DOCUMENT_TYPE"`` is one of ``"PDF"``, ``"CSV"``,
                ``"DOCX"``, ``"URL"``, ``"SentenceLevelPDF"``,
                ``"SentenceLevelDOCX"``, ``"Unstructured"`` or
                ``"InMemoryText"``.

        The keyword arguments accepted by each document type are:

            class PDF(Document):
                document_type: Literal["PDF"]
                path: str
                metadata: Optional[dict[str, Any]] = None
                on_disk: bool = False
                version: str = "v1"
                chunk_size: int = 100
                stride: int = 40
                emphasize_first_words: int = 0
                ignore_header_footer: bool = True
                ignore_nonstandard_orientation: bool = True

            class CSV(Document):
                document_type: Literal["CSV"]
                path: str
                id_column: Optional[str] = None
                strong_columns: Optional[List[str]] = None
                weak_columns: Optional[List[str]] = None
                reference_columns: Optional[List[str]] = None
                save_extra_info: bool = True
                metadata: Optional[dict[str, Any]] = None
                has_offset: bool = False
                on_disk: bool = False

            class DOCX(Document):
                document_type: Literal["DOCX"]
                path: str
                metadata: Optional[dict[str, Any]] = None
                on_disk: bool = False

            class URL(Document):
                document_type: Literal["URL"]
                url: str
                save_extra_info: bool = True
                title_is_strong: bool = False
                metadata: Optional[dict[str, Any]] = None
                on_disk: bool = False

            class SentenceLevelPDF(Document):
                document_type: Literal["SentenceLevelPDF"]
                path: str
                metadata: Optional[dict[str, Any]] = None
                on_disk: bool = False

            class SentenceLevelDOCX(Document):
                document_type: Literal["SentenceLevelDOCX"]
                path: str
                metadata: Optional[dict[str, Any]] = None
                on_disk: bool = False

            class Unstructured(Document):
                document_type: Literal["Unstructured"]
                path: str
                save_extra_info: bool = True
                metadata: Optional[dict[str, Any]] = None
                on_disk: bool = False

            class InMemoryText(Document):
                document_type: Literal["InMemoryText"]
                name: str
                texts: list[str]
                metadatas: Optional[list[dict[str, Any]]] = None
                global_metadata: Optional[dict[str, Any]] = None
                on_disk: bool = False

        For document types that take a ``path`` argument, make sure the path
        exists on your local machine.
        """
        client = self.db
        return client.insert(documents)
    def remove_documents(self, source_ids: List[str]):  # type: ignore[no-untyped-def]
        """Delete documents from the VectorStore by source id.

        The ids are passed to the client's ``delete`` call; whatever that
        call returns is discarded.

        Args:
            source_ids (List[str]): Source ids of the documents to delete
                from the VectorStore.
        """
        client = self.db
        client.delete(source_ids)
--- a/libs/community/tests/unit_tests/vectorstores/test_imports.py
+++ b/libs/community/tests/unit_tests/vectorstores/test_imports.py
@ -58,6 +58,7 @@ EXPECTED_ALL = [
    "MyScale",
    "MyScaleSettings",
    "Neo4jVector",
    "NeuralDBClientVectorStore",
    "NeuralDBVectorStore",
    "OpenSearchVectorSearch",
    "OracleVS",
--- a/libs/community/tests/unit_tests/vectorstores/test_public_api.py
+++ b/libs/community/tests/unit_tests/vectorstores/test_public_api.py
@ -93,6 +93,7 @@ _EXPECTED = [
    "AzureCosmosDBVectorSearch",
    "VectorStore",
    "Yellowbrick",
    "NeuralDBClientVectorStore",
    "NeuralDBVectorStore",
    "CouchbaseVectorStore",
 ]
--- a/libs/langchain/langchain/vectorstores/__init__.py
+++ b/libs/langchain/langchain/vectorstores/__init__.py
@ -66,6 +66,7 @@ if TYPE_CHECKING:
        MyScale,
        MyScaleSettings,
        Neo4jVector,
        NeuralDBClientVectorStore,
        NeuralDBVectorStore,
        OpenSearchVectorSearch,
        PGEmbedding,
@ -142,6 +143,7 @@ DEPRECATED_LOOKUP = {
    "MyScale": "langchain_community.vectorstores",
    "MyScaleSettings": "langchain_community.vectorstores",
    "Neo4jVector": "langchain_community.vectorstores",
    "NeuralDBClientVectorStore": "langchain_community.vectorstores",
    "NeuralDBVectorStore": "langchain_community.vectorstores",
    "NEuralDBVectorStore": "langchain_community.vectorstores",
    "OpenSearchVectorSearch": "langchain_community.vectorstores",
@ -224,6 +226,7 @@ __all__ = [
    "MyScale",
    "MyScaleSettings",
    "Neo4jVector",
    "NeuralDBClientVectorStore",
    "NeuralDBVectorStore",
    "OpenSearchVectorSearch",
    "PGEmbedding",
--- a/libs/langchain/tests/unit_tests/vectorstores/test_public_api.py
+++ b/libs/langchain/tests/unit_tests/vectorstores/test_public_api.py
@ -42,6 +42,7 @@ _EXPECTED = [
    "MyScale",
    "MyScaleSettings",
    "Neo4jVector",
    "NeuralDBClientVectorStore",
    "NeuralDBVectorStore",
    "OpenSearchVectorSearch",
    "PGEmbedding",