From ca24dc2d5f4cae30be3b22d8787c1970d7de7e20 Mon Sep 17 00:00:00 2001 From: ljeagle Date: Thu, 22 Jun 2023 14:15:18 +0800 Subject: [PATCH] Upgrade the version of AwaDB and add some new interfaces (#6565) 1. upgrade the version of AwaDB 2. add some new interfaces 3. fix bug of packing page content error @dev2049 please review, thanks! --------- Co-authored-by: vincent --- langchain/vectorstores/awadb.py | 106 +++++++++++++++++++++++++------- poetry.lock | 16 ++--- 2 files changed, 92 insertions(+), 30 deletions(-) diff --git a/langchain/vectorstores/awadb.py b/langchain/vectorstores/awadb.py index 01d14e31..59a4be8a 100644 --- a/langchain/vectorstores/awadb.py +++ b/langchain/vectorstores/awadb.py @@ -2,6 +2,7 @@ from __future__ import annotations import logging +import uuid from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type from langchain.docstore.document import Document @@ -48,10 +49,15 @@ class AwaDB(VectorStore): else: self.awadb_client = awadb.Client() - self.awadb_client.Create(table_name) - self.embedding_model = embedding_model + if table_name == self._DEFAULT_TABLE_NAME: + table_name += "_" + table_name += str(uuid.uuid4()).split("-")[-1] - self.added_doc_count = 0 + self.awadb_client.Create(table_name) + self.table2embeddings: dict[str, Embeddings] = {} + if embedding_model is not None: + self.table2embeddings[table_name] = embedding_model + self.using_table_name = table_name def add_texts( self, @@ -74,16 +80,23 @@ class AwaDB(VectorStore): raise ValueError("AwaDB client is None!!!") embeddings = None - if self.embedding_model is not None: - embeddings = self.embedding_model.embed_documents(list(texts)) + if self.using_table_name in self.table2embeddings: + embeddings = self.table2embeddings[self.using_table_name].embed_documents( + list(texts) + ) return self.awadb_client.AddTexts( - "text", "text_embedding", texts, embeddings, metadatas, is_duplicate_texts + "embedding_text", + "text_embedding", + texts, + embeddings, + metadatas, + is_duplicate_texts, ) def load_local( self, - table_name: str = _DEFAULT_TABLE_NAME, + table_name: str, **kwargs: Any, ) -> bool: if self.awadb_client is None: @@ -102,8 +115,8 @@ class AwaDB(VectorStore): raise ValueError("AwaDB client is None!!!") embedding = None - if self.embedding_model is not None: - embedding = self.embedding_model.embed_query(query) + if self.using_table_name in self.table2embeddings: + embedding = self.table2embeddings[self.using_table_name].embed_query(query) else: from awadb import llm_embedding @@ -127,21 +140,16 @@ class AwaDB(VectorStore): raise ValueError("AwaDB client is None!!!") embedding = None - if self.embedding_model is not None: - embedding = self.embedding_model.embed_query(query) + if self.using_table_name in self.table2embeddings: + embedding = self.table2embeddings[self.using_table_name].embed_query(query) else: from awadb import llm_embedding llm = llm_embedding.LLMEmbedding() embedding = llm.Embedding(query) - # show_results = self.awadb_client.Search(embedding, k) - results: List[Tuple[Document, float]] = [] - # if show_results.__len__() == 0: - # return results - scores: List[float] = [] retrieval_docs = self.similarity_search_by_vector(embedding, k, scores) @@ -173,8 +181,8 @@ class AwaDB(VectorStore): raise ValueError("AwaDB client is None!!!") embedding = None - if self.embedding_model is not None: - embedding = self.embedding_model.embed_query(query) + if self.using_table_name in self.table2embeddings: + embedding = self.table2embeddings[self.using_table_name].embed_query(query) show_results = self.awadb_client.Search(embedding, k) @@ -234,12 +242,15 @@ class AwaDB(VectorStore): meta_data = {} for item_key in item_detail: if ( - item_key == "Field@0" and self.embedding_model is not None + item_key == "Field@0" + and self.using_table_name in self.table2embeddings ): # text for the document content = item_detail[item_key] - elif self.embedding_model is None and item_key == "embedding_text": + elif item_key == "embedding_text": content = item_detail[item_key] - elif item_key == "Field@1": # embedding field for the document + elif ( + item_key == "Field@1" or item_key == "text_embedding" + ): # embedding field for the document continue elif item_key == "score": # L2 distance if scores is not None: @@ -250,6 +261,57 @@ class AwaDB(VectorStore): results.append(Document(page_content=content, metadata=meta_data)) return results + def create_table( + self, + table_name: str, + **kwargs: Any, + ) -> bool: + """Create a new table.""" + + if self.awadb_client is None: + return False + + ret = self.awadb_client.Create(table_name) + + if ret: + self.using_table_name = table_name + return ret + + def use( + self, + table_name: str, + **kwargs: Any, + ) -> bool: + """Use the specified table. Don't know the tables, please invoke list_tables.""" + + if self.awadb_client is None: + return False + + ret = self.awadb_client.Use(table_name) + if ret: + self.using_table_name = table_name + + return ret + + def list_tables( + self, + **kwargs: Any, + ) -> List[str]: + """List all the tables created by the client.""" + + if self.awadb_client is None: + return [] + + return self.awadb_client.ListAllTables() + + def get_current_table( + self, + **kwargs: Any, + ) -> str: + """Get the current table.""" + + return self.using_table_name + @classmethod def from_texts( cls: Type[AwaDB], @@ -300,7 +362,7 @@ class AwaDB(VectorStore): Args: documents (List[Document]): List of documents to add to the vectorstore. embedding (Optional[Embeddings]): Embedding function. Defaults to None. - table_name (str): Name of the collection to create. + table_name (str): Name of the table to create. logging_and_data_dir (Optional[str]): Directory to persist the table. client (Optional[awadb.Client]): AwaDB client diff --git a/poetry.lock b/poetry.lock index 8de0b4ce..f1402bd4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -572,19 +572,19 @@ test = ["coverage (>=5,<6)", "pytest (>=6,<7)"] [[package]] name = "awadb" -version = "0.3.3" +version = "0.3.5" description = "The AI Native database for embedding vectors" category = "main" optional = true python-versions = ">=3.7" files = [ - {file = "awadb-0.3.3-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:daebc108103c8cace41dfb3235fcfdda28ea48e6cd6548b6072f7ad49b64274b"}, - {file = "awadb-0.3.3-cp311-cp311-macosx_10_13_universal2.whl", hash = "sha256:2bb3ca2f943448060b1bba4395dd99e2218d7f2149507a8fdfa7a3fd4cfe97ec"}, - {file = "awadb-0.3.3-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7b99662af9f7b58e217661a70c295e40605900552bec6d8e9553d90dbf19c5c1"}, - {file = "awadb-0.3.3-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:94be44e587f28fa26b2cade0b6f4c04689f50cb0c07183db5ee50e48fe2e9ae3"}, - {file = "awadb-0.3.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:314929dc3a8d25c0f234a2b86c920543050f4eb298a6f68bd2c97c9fe3fb6224"}, - {file = "awadb-0.3.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:8bfccff1c7373899153427d93d96a97ae5371e8a6f09ff4dcbd28fb9f3f63ff4"}, - {file = "awadb-0.3.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:810021a90b873f668d8ab63e2c2747b2b2835bf0ae25f4223b6c94f06faffea4"}, + {file = "awadb-0.3.5-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:9addae6f0cfd57cdb3e8915778772c51fc1c41f5cacebad1322c5bfe30c95516"}, + {file = "awadb-0.3.5-cp311-cp311-macosx_10_13_universal2.whl", hash = "sha256:1b06099c4baf906829e4550f3cf0da602aba44465c89ede5889943619c0b49ce"}, + {file = "awadb-0.3.5-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:dedbb62496aadb70894fa965922625ff3003397445f38d5da7f4092e17f93725"}, + {file = "awadb-0.3.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:0f7eafcbe5311cc7b976faca368bc666a70f22ebd7f0039b5c5f791f2909377e"}, + {file = "awadb-0.3.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:950ddb5c483ea1ce24550d32c79e4c40c10dfc73cafc6f6faa8b14f22271dce5"}, + {file = "awadb-0.3.5-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:98b5d5a7a0d11253e23bec48295dc45356eead2338eac0d4f73a3755060992db"}, + {file = "awadb-0.3.5-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:1c14d9014e7e0ccf8eaecc540e0c88893fcdd3a1438f71c110b9ec80b565dae6"}, ] [package.extras]