From ad324a39aee345f47a57c858feee3189cab827eb Mon Sep 17 00:00:00 2001 From: ljeagle Date: Sat, 17 Jun 2023 07:50:01 +0800 Subject: [PATCH] Improve the performance of add_texts interface and upgrade the AwaDB from 0.3.2 to 0.3.3 (#6316) 1. Changed the implementation of add_texts interface for the AwaDB vector store in order to improve the performance 2. Upgrade the AwaDB from 0.3.2 to 0.3.3 --------- Co-authored-by: vincent --- .../vectorstores/integrations/awadb.ipynb | 63 +++++++++---------- langchain/vectorstores/awadb.py | 28 ++------- poetry.lock | 20 +++--- pyproject.toml | 2 +- 4 files changed, 47 insertions(+), 66 deletions(-) diff --git a/docs/extras/modules/data_connection/vectorstores/integrations/awadb.ipynb b/docs/extras/modules/data_connection/vectorstores/integrations/awadb.ipynb index 9e153c75..aedfc8fe 100644 --- a/docs/extras/modules/data_connection/vectorstores/integrations/awadb.ipynb +++ b/docs/extras/modules/data_connection/vectorstores/integrations/awadb.ipynb @@ -39,7 +39,7 @@ "metadata": {}, "outputs": [], "source": [ - "loader = TextLoader(\"../../../state_of_the_union.txt\")\n", + "loader = TextLoader('../../../state_of_the_union.txt')\n", "documents = loader.load()\n", "text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)\n", "docs = text_splitter.split_documents(documents)" @@ -59,22 +59,22 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "62b7a4c5", - "metadata": {}, - "outputs": [], + "execution_count": 4, + "id": "4b172de8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" + ] + } + ], "source": [ "print(docs[0].page_content)" ] }, - { - "cell_type": "markdown", - "id": "a9b4be48", - "metadata": {}, - "source": [ - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence." - ] - }, { "cell_type": "markdown", "id": "87fec6b5", @@ -103,22 +103,22 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "f0045583", - "metadata": {}, - "outputs": [], + "execution_count": 4, + "id": "93cd0b7a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(Document(page_content='And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt'}), 0.561813814013747)\n" + ] + } + ], "source": [ "print(docs[0])" ] }, - { - "cell_type": "markdown", - "id": "8c2da99d", - "metadata": {}, - "source": [ - "(Document(page_content='And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt'}), 0.561813814013747)" - ] - }, { "cell_type": "markdown", "id": "0b49fb59", @@ -153,19 +153,16 @@ "outputs": [], "source": [ "awadb_client = awadb.Client()\n", - "ret = awadb_client.Load(\"langchain_awadb\")\n", - "if ret:\n", - " print(\"awadb load table success\")\n", + "ret = awadb_client.Load('langchain_awadb')\n", + "if ret : print('awadb load table success')\n", "else:\n", - " print(\"awadb load table failed\")" + " print('awadb load table failed')" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "5ae9a9dd", + "cell_type": "raw", + "id": "aba255c2", "metadata": {}, - "outputs": [], "source": [ "awadb load table success" ] diff --git a/langchain/vectorstores/awadb.py b/langchain/vectorstores/awadb.py index 1de28595..01d14e31 100644 --- a/langchain/vectorstores/awadb.py +++ b/langchain/vectorstores/awadb.py @@ -57,13 +57,15 @@ class AwaDB(VectorStore): self, texts: Iterable[str], metadatas: Optional[List[dict]] = None, + is_duplicate_texts: Optional[bool] = None, **kwargs: Any, ) -> List[str]: """Run more texts through the embeddings and add to the vectorstore. Args: texts: Iterable of strings to add to the vectorstore. metadatas: Optional list of metadatas associated with the texts. - kwargs: vectorstore specific parameters + is_duplicate_texts: Optional whether to duplicate texts. + kwargs: vectorstore specific parameters. Returns: List of ids from adding the texts into the vectorstore. @@ -74,28 +76,10 @@ class AwaDB(VectorStore): embeddings = None if self.embedding_model is not None: embeddings = self.embedding_model.embed_documents(list(texts)) - added_results: List[str] = [] - doc_no = 0 - for text in texts: - doc: List[Any] = [] - if embeddings is not None: - doc.append(text) - doc.append(embeddings[doc_no]) - else: - dict_tmp = {} - dict_tmp["embedding_text"] = text - doc.append(dict_tmp) - - if metadatas is not None: - if doc_no < metadatas.__len__(): - doc.append(metadatas[doc_no]) - self.awadb_client.Add(doc) - added_results.append(str(self.added_doc_count)) - doc_no = doc_no + 1 - self.added_doc_count = self.added_doc_count + 1 - - return added_results + return self.awadb_client.AddTexts( + "text", "text_embedding", texts, embeddings, metadatas, is_duplicate_texts + ) def load_local( self, diff --git a/poetry.lock b/poetry.lock index 92094baa..5652b4f3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -572,19 +572,19 @@ test = ["coverage (>=5,<6)", "pytest (>=6,<7)"] [[package]] name = "awadb" -version = "0.3.2" +version = "0.3.3" description = "The AI Native database for embedding vectors" category = "main" optional = true -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "awadb-0.3.2-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:f3ce3b066198782fa413f452c56001c58ebec71a1e1dca0eee68f73321ba15a9"}, - {file = "awadb-0.3.2-cp311-cp311-macosx_10_13_universal2.whl", hash = "sha256:c96b5e263c32b2563b1fa027035bdcf50540808ad303071cc1aed3471c3c39b7"}, - {file = "awadb-0.3.2-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:3e43b5a74753261857d0b146543a4620e00938833181259f138f07457fa84812"}, - {file = "awadb-0.3.2-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:6330b4d18a814c1562113b3b7897db629c2ac9b5818236ead0fc5f3445b6b7fb"}, - {file = "awadb-0.3.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:82b4e61cc905339868a9f833d0988098f56411b42e0f8dd571aec7c8d6a3f1fa"}, - {file = "awadb-0.3.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:5efaa93d69c467f16ec4f65ed250ec26015781826c0d059c8a54613a5d3e2c3e"}, - {file = "awadb-0.3.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:7be0811550d72f49018e4790d290cf521f92ffa84d65ef1073e621f225d142ec"}, + {file = "awadb-0.3.3-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:daebc108103c8cace41dfb3235fcfdda28ea48e6cd6548b6072f7ad49b64274b"}, + {file = "awadb-0.3.3-cp311-cp311-macosx_10_13_universal2.whl", hash = "sha256:2bb3ca2f943448060b1bba4395dd99e2218d7f2149507a8fdfa7a3fd4cfe97ec"}, + {file = "awadb-0.3.3-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7b99662af9f7b58e217661a70c295e40605900552bec6d8e9553d90dbf19c5c1"}, + {file = "awadb-0.3.3-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:94be44e587f28fa26b2cade0b6f4c04689f50cb0c07183db5ee50e48fe2e9ae3"}, + {file = "awadb-0.3.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:314929dc3a8d25c0f234a2b86c920543050f4eb298a6f68bd2c97c9fe3fb6224"}, + {file = "awadb-0.3.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:8bfccff1c7373899153427d93d96a97ae5371e8a6f09ff4dcbd28fb9f3f63ff4"}, + {file = "awadb-0.3.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:810021a90b873f668d8ab63e2c2747b2b2835bf0ae25f4223b6c94f06faffea4"}, ] [package.extras] @@ -11486,4 +11486,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "b4a782d8223ccc19b2dfb777978c3ad636b11a79cc58a5c45e4dcdb0fe5e29c1" +content-hash = "dd54bb9201b260b734ceebad2c6629b4b1b3bf224731de092782678219f32120" diff --git a/pyproject.toml b/pyproject.toml index 7ec28933..088e0ec2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,7 +106,7 @@ pyspark = {version = "^3.4.0", optional = true} tigrisdb = {version = "^1.0.0b6", optional = true} nebula3-python = {version = "^3.4.0", optional = true} langchainplus-sdk = ">=0.0.9" -awadb = {version = "^0.3.2", optional = true} +awadb = {version = "^0.3.3", optional = true} azure-search-documents = {version = "11.4.0a20230509004", source = "azure-sdk-dev", optional = true} [tool.poetry.group.docs.dependencies]