From c5bce4a465912d6127e0cbe2c6e59f4ab8b9583b Mon Sep 17 00:00:00 2001 From: ljeagle Date: Mon, 12 Jun 2023 10:35:03 +0800 Subject: [PATCH] add from_documents interface in awadb vector store (#6023) added new interface from_documents in awadb vector store @dev2049 --------- Co-authored-by: vincent --- langchain/vectorstores/awadb.py | 71 ++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 10 deletions(-) diff --git a/langchain/vectorstores/awadb.py b/langchain/vectorstores/awadb.py index 9c7d8a38..1de28595 100644 --- a/langchain/vectorstores/awadb.py +++ b/langchain/vectorstores/awadb.py @@ -49,8 +49,7 @@ class AwaDB(VectorStore): self.awadb_client = awadb.Client() self.awadb_client.Create(table_name) - if embedding_model is not None: - self.embedding_model = embedding_model + self.embedding_model = embedding_model self.added_doc_count = 0 @@ -121,6 +120,11 @@ class AwaDB(VectorStore): embedding = None if self.embedding_model is not None: embedding = self.embedding_model.embed_query(query) + else: + from awadb import llm_embedding + + llm = llm_embedding.LLMEmbedding() + embedding = llm.Embedding(query) return self.similarity_search_by_vector(embedding, k) @@ -141,13 +145,18 @@ class AwaDB(VectorStore): embedding = None if self.embedding_model is not None: embedding = self.embedding_model.embed_query(query) + else: + from awadb import llm_embedding - show_results = self.awadb_client.Search(embedding, k) + llm = llm_embedding.LLMEmbedding() + embedding = llm.Embedding(query) + + # show_results = self.awadb_client.Search(embedding, k) results: List[Tuple[Document, float]] = [] - if show_results.__len__() == 0: - return results + # if show_results.__len__() == 0: + # return results scores: List[float] = [] retrieval_docs = self.similarity_search_by_vector(embedding, k, scores) @@ -159,7 +168,7 @@ class AwaDB(VectorStore): L2_Norm = pow(L2_Norm, 0.5) doc_no = 0 for doc in retrieval_docs: - doc_tuple = (doc, 1 - scores[doc_no] / L2_Norm) + doc_tuple = (doc, 1 - (scores[doc_no] / L2_Norm)) results.append(doc_tuple) doc_no = doc_no + 1 @@ -208,7 +217,7 @@ class AwaDB(VectorStore): def similarity_search_by_vector( self, - embedding: List[float], + embedding: Optional[List[float]] = None, k: int = DEFAULT_TOPN, scores: Optional[list] = None, **kwargs: Any, @@ -226,10 +235,13 @@ class AwaDB(VectorStore): if self.awadb_client is None: raise ValueError("AwaDB client is None!!!") - show_results = self.awadb_client.Search(embedding, k) - results: List[Document] = [] + if embedding is None: + return results + + show_results = self.awadb_client.Search(embedding, k) + if show_results.__len__() == 0: return results @@ -237,7 +249,11 @@ class AwaDB(VectorStore): content = "" meta_data = {} for item_key in item_detail: - if item_key == "Field@0": # text for the document + if ( + item_key == "Field@0" and self.embedding_model is not None + ): # text for the document + content = item_detail[item_key] + elif self.embedding_model is None and item_key == "embedding_text": content = item_detail[item_key] elif item_key == "Field@1": # embedding field for the document continue @@ -282,3 +298,38 @@ class AwaDB(VectorStore): ) awadb_client.add_texts(texts=texts, metadatas=metadatas) return awadb_client + + @classmethod + def from_documents( + cls: Type[AwaDB], + documents: List[Document], + embedding: Optional[Embeddings] = None, + table_name: str = _DEFAULT_TABLE_NAME, + logging_and_data_dir: Optional[str] = None, + client: Optional[awadb.Client] = None, + **kwargs: Any, + ) -> AwaDB: + """Create an AwaDB vectorstore from a list of documents. + + If a logging_and_data_dir specified, the table will be persisted there. + + Args: + documents (List[Document]): List of documents to add to the vectorstore. + embedding (Optional[Embeddings]): Embedding function. Defaults to None. + table_name (str): Name of the collection to create. + logging_and_data_dir (Optional[str]): Directory to persist the table. + client (Optional[awadb.Client]): AwaDB client + + Returns: + AwaDB: AwaDB vectorstore. + """ + texts = [doc.page_content for doc in documents] + metadatas = [doc.metadata for doc in documents] + return cls.from_texts( + texts=texts, + embedding=embedding, + metadatas=metadatas, + table_name=table_name, + logging_and_data_dir=logging_and_data_dir, + client=client, + )