From cbaea8d63b98b3c1a8cb3778c9bceec28e258896 Mon Sep 17 00:00:00 2001 From: Keras Conv3d <35223092+tianhanwen@users.noreply.github.com> Date: Thu, 24 Aug 2023 07:38:31 +0800 Subject: [PATCH] tair fix distance_type error, and add hybrid search (#9531) - fix: distance_type error, - feature: Tair add hybrid search --------- Co-authored-by: thw Co-authored-by: Bagatur --- .../integrations/vectorstores/tair.ipynb | 79 ++++++++++++++++--- libs/langchain/langchain/vectorstores/tair.py | 40 +++++++--- 2 files changed, 99 insertions(+), 20 deletions(-) diff --git a/docs/extras/integrations/vectorstores/tair.ipynb b/docs/extras/integrations/vectorstores/tair.ipynb index e3e7b024d8..0b55b9f561 100644 --- a/docs/extras/integrations/vectorstores/tair.ipynb +++ b/docs/extras/integrations/vectorstores/tair.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -27,13 +27,30 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "RuntimeError", + "evalue": "Error loading ../../../state_of_the_union.txt", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m/opt/homebrew/lib/python3.10/site-packages/langchain/document_loaders/text.py:40\u001b[0m, in \u001b[0;36mTextLoader.load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 40\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39;49m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfile_path, encoding\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mencoding) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m 41\u001b[0m text \u001b[39m=\u001b[39m f\u001b[39m.\u001b[39mread()\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../../../state_of_the_union.txt'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[30], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mlangchain\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdocument_loaders\u001b[39;00m \u001b[39mimport\u001b[39;00m TextLoader\n\u001b[1;32m 3\u001b[0m loader \u001b[39m=\u001b[39m TextLoader(\u001b[39m\"\u001b[39m\u001b[39m../../../state_of_the_union.txt\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m----> 4\u001b[0m documents \u001b[39m=\u001b[39m loader\u001b[39m.\u001b[39;49mload()\n\u001b[1;32m 5\u001b[0m text_splitter \u001b[39m=\u001b[39m CharacterTextSplitter(chunk_size\u001b[39m=\u001b[39m\u001b[39m1000\u001b[39m, chunk_overlap\u001b[39m=\u001b[39m\u001b[39m0\u001b[39m)\n\u001b[1;32m 6\u001b[0m docs \u001b[39m=\u001b[39m text_splitter\u001b[39m.\u001b[39msplit_documents(documents)\n", + "File \u001b[0;32m/opt/homebrew/lib/python3.10/site-packages/langchain/document_loaders/text.py:56\u001b[0m, in \u001b[0;36mTextLoader.load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mError loading \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfile_path\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m 55\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m---> 56\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mError loading \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfile_path\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m 58\u001b[0m metadata \u001b[39m=\u001b[39m {\u001b[39m\"\u001b[39m\u001b[39msource\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfile_path}\n\u001b[1;32m 59\u001b[0m \u001b[39mreturn\u001b[39;00m [Document(page_content\u001b[39m=\u001b[39mtext, metadata\u001b[39m=\u001b[39mmetadata)]\n", + "\u001b[0;31mRuntimeError\u001b[0m: Error loading ../../../state_of_the_union.txt" + ] + } + ], "source": [ "from langchain.document_loaders import TextLoader\n", "\n", - "loader = TextLoader(\"../../../state_of_the_union.txt\")\n", + "loader = TextLoader(\"../../../extras/modules/state_of_the_union.txt\")\n", "documents = loader.load()\n", "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", "docs = text_splitter.split_documents(documents)\n", @@ -57,16 +74,28 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'docs' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 6\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[39m# drop first if index already exists\u001b[39;00m\n\u001b[1;32m 4\u001b[0m Tair\u001b[39m.\u001b[39mdrop_index(tair_url\u001b[39m=\u001b[39mtair_url)\n\u001b[0;32m----> 6\u001b[0m vector_store \u001b[39m=\u001b[39m Tair\u001b[39m.\u001b[39mfrom_documents(docs, embeddings, tair_url\u001b[39m=\u001b[39mtair_url)\n", + "\u001b[0;31mNameError\u001b[0m: name 'docs' is not defined" + ] + } + ], "source": [ "tair_url = \"redis://localhost:6379\"\n", "\n", "# drop first if index already exists\n", "Tair.drop_index(tair_url=tair_url)\n", "\n", - "vector_store = Tair.from_documents(docs, embeddings, tair_url=tair_url)" + "vector_store = Tair.from_documents(docs, embeddings, tair_url=tair_url)\n" ] }, { @@ -98,12 +127,44 @@ "docs[0]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Tair Hybrid Search Index build" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# drop first if index already exists\n", + "Tair.drop_index(tair_url=tair_url)\n", + "\n", + "vector_store = Tair.from_documents(docs, embeddings, tair_url=tair_url, index_params={\"lexical_algorithm\":\"bm25\"})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Tair Hybrid Search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "# hybrid_ratio: 0.5 hybrid search, 0.9999 vector search, 0.0001 text search\n", + "kwargs = {\"TEXT\" : query, \"hybrid_ratio\" : 0.5}\n", + "docs = vector_store.similarity_search(query, **kwargs)\n", + "docs[0]" + ] } ], "metadata": { @@ -122,7 +183,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.8" } }, "nbformat": 4, diff --git a/libs/langchain/langchain/vectorstores/tair.py b/libs/langchain/langchain/vectorstores/tair.py index 0bda5c6816..26108da592 100644 --- a/libs/langchain/langchain/vectorstores/tair.py +++ b/libs/langchain/langchain/vectorstores/tair.py @@ -86,6 +86,10 @@ class Tair(VectorStore): """Add texts data to an existing index.""" ids = [] keys = kwargs.get("keys", None) + use_hybrid_search = False + index = self.client.tvs_get_index(self.index_name) + if index is not None and index.get("lexical_algorithm") == "bm25": + use_hybrid_search = True # Write data to tair pipeline = self.client.pipeline(transaction=False) embeddings = self.embedding_function.embed_documents(list(texts)) @@ -93,16 +97,30 @@ class Tair(VectorStore): # Use provided key otherwise use default key key = keys[i] if keys else _uuid_key() metadata = metadatas[i] if metadatas else {} - pipeline.tvs_hset( - self.index_name, - key, - embeddings[i], - False, - **{ - self.content_key: text, - self.metadata_key: json.dumps(metadata), - }, - ) + if use_hybrid_search: + # tair use TEXT attr hybrid search + pipeline.tvs_hset( + self.index_name, + key, + embeddings[i], + False, + **{ + "TEXT": text, + self.content_key: text, + self.metadata_key: json.dumps(metadata), + }, + ) + else: + pipeline.tvs_hset( + self.index_name, + key, + embeddings[i], + False, + **{ + self.content_key: text, + self.metadata_key: json.dumps(metadata), + }, + ) ids.append(key) pipeline.execute() return ids @@ -166,7 +184,7 @@ class Tair(VectorStore): distance_type = tairvector.DistanceMetric.InnerProduct if "distance_type" in kwargs: - distance_type = kwargs.pop("distance_typ") + distance_type = kwargs.pop("distance_type") index_type = tairvector.IndexType.HNSW if "index_type" in kwargs: index_type = kwargs.pop("index_type")