Tair: fix distance_type error and add hybrid search (#9531)

- fix: distance_type error (the misspelled kwarg key raised KeyError)
- feature: add hybrid search to Tair

---------

Co-authored-by: thw <hanwen.thw@alibaba-inc.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Keras Conv3d, 2023-08-24 07:38:31 +08:00, committed by GitHub
parent cd81e8a8f2
commit cbaea8d63b
2 changed files with 99 additions and 20 deletions
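
For orientation: the distance_type fix below is a one-key typo in tair.py, where the guard tested for "distance_type" but the pop used a misspelled key, so actually passing the argument raised KeyError. A minimal sketch of the call the fix enables; the embedding backend, the local Tair URL, and the "L2" metric value are stand-in assumptions, not part of this commit:

    from langchain.embeddings import OpenAIEmbeddings  # assumed embedding backend
    from langchain.vectorstores import Tair

    # With the fix, distance_type is popped under its correct key instead of
    # raising KeyError on the misspelled "distance_typ".
    vector_store = Tair.from_texts(
        ["hello tair"],
        OpenAIEmbeddings(),
        tair_url="redis://localhost:6379",  # assumed local Tair instance
        distance_type="L2",  # assumed valid alternative to the InnerProduct default
    )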

File 1 of 2: the Tair integration notebook

@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
@@ -27,13 +27,30 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 30,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "RuntimeError",
+ "evalue": "Error loading ../../../state_of_the_union.txt",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
+ "File \u001b[0;32m/opt/homebrew/lib/python3.10/site-packages/langchain/document_loaders/text.py:40\u001b[0m, in \u001b[0;36mTextLoader.load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 40\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39;49m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfile_path, encoding\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mencoding) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m 41\u001b[0m text \u001b[39m=\u001b[39m f\u001b[39m.\u001b[39mread()\n",
+ "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../../../state_of_the_union.txt'",
+ "\nThe above exception was the direct cause of the following exception:\n",
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[30], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mlangchain\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdocument_loaders\u001b[39;00m \u001b[39mimport\u001b[39;00m TextLoader\n\u001b[1;32m 3\u001b[0m loader \u001b[39m=\u001b[39m TextLoader(\u001b[39m\"\u001b[39m\u001b[39m../../../state_of_the_union.txt\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m----> 4\u001b[0m documents \u001b[39m=\u001b[39m loader\u001b[39m.\u001b[39;49mload()\n\u001b[1;32m 5\u001b[0m text_splitter \u001b[39m=\u001b[39m CharacterTextSplitter(chunk_size\u001b[39m=\u001b[39m\u001b[39m1000\u001b[39m, chunk_overlap\u001b[39m=\u001b[39m\u001b[39m0\u001b[39m)\n\u001b[1;32m 6\u001b[0m docs \u001b[39m=\u001b[39m text_splitter\u001b[39m.\u001b[39msplit_documents(documents)\n",
+ "File \u001b[0;32m/opt/homebrew/lib/python3.10/site-packages/langchain/document_loaders/text.py:56\u001b[0m, in \u001b[0;36mTextLoader.load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mError loading \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfile_path\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m 55\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m---> 56\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mError loading \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfile_path\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m 58\u001b[0m metadata \u001b[39m=\u001b[39m {\u001b[39m\"\u001b[39m\u001b[39msource\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfile_path}\n\u001b[1;32m 59\u001b[0m \u001b[39mreturn\u001b[39;00m [Document(page_content\u001b[39m=\u001b[39mtext, metadata\u001b[39m=\u001b[39mmetadata)]\n",
+ "\u001b[0;31mRuntimeError\u001b[0m: Error loading ../../../state_of_the_union.txt"
+ ]
+ }
+ ],
"source": [
"from langchain.document_loaders import TextLoader\n",
"\n",
- "loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
+ "loader = TextLoader(\"../../../extras/modules/state_of_the_union.txt\")\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)\n",
@@ -57,16 +74,28 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 6,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "NameError",
+ "evalue": "name 'docs' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[6], line 6\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[39m# drop first if index already exists\u001b[39;00m\n\u001b[1;32m 4\u001b[0m Tair\u001b[39m.\u001b[39mdrop_index(tair_url\u001b[39m=\u001b[39mtair_url)\n\u001b[0;32m----> 6\u001b[0m vector_store \u001b[39m=\u001b[39m Tair\u001b[39m.\u001b[39mfrom_documents(docs, embeddings, tair_url\u001b[39m=\u001b[39mtair_url)\n",
+ "\u001b[0;31mNameError\u001b[0m: name 'docs' is not defined"
+ ]
+ }
+ ],
"source": [
"tair_url = \"redis://localhost:6379\"\n",
"\n",
"# drop first if index already exists\n",
"Tair.drop_index(tair_url=tair_url)\n",
"\n",
- "vector_store = Tair.from_documents(docs, embeddings, tair_url=tair_url)"
+ "vector_store = Tair.from_documents(docs, embeddings, tair_url=tair_url)\n"
]
},
{
@@ -98,12 +127,44 @@
"docs[0]"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Build a Tair hybrid search index"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "# drop first if index already exists\n",
+ "Tair.drop_index(tair_url=tair_url)\n",
+ "\n",
+ "vector_store = Tair.from_documents(docs, embeddings, tair_url=tair_url, index_params={\"lexical_algorithm\": \"bm25\"})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Tair hybrid search"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "query = \"What did the president say about Ketanji Brown Jackson\"\n",
+ "# hybrid_ratio: 0.5 = balanced hybrid search; 0.9999 = almost pure vector search; 0.0001 = almost pure text search\n",
+ "kwargs = {\"TEXT\": query, \"hybrid_ratio\": 0.5}\n",
+ "docs = vector_store.similarity_search(query, **kwargs)\n",
+ "docs[0]"
+ ]
}
],
"metadata": {
@@ -122,7 +183,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.6"
+ "version": "3.10.8"
}
},
"nbformat": 4,

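Per the hybrid_ratio comment in the cell above, the ratio blends the two scores: values near 1 weight the vector distance, values near 0 weight the BM25 text score, and 0.5 weights them equally. A sketch of a text-leaning query against the same vector_store; the 0.2 ratio is an assumed intermediate value, since only 0.5, 0.9999, and 0.0001 appear in the notebook:

    query = "What did the president say about Ketanji Brown Jackson"
    # Assumption: ratios between 0.0001 and 0.9999 are accepted.
    docs = vector_store.similarity_search(query, TEXT=query, hybrid_ratio=0.2)
    print(docs[0].page_content)
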
File 2 of 2: the Tair vector store module (class Tair)

@@ -86,6 +86,10 @@ class Tair(VectorStore):
        """Add texts data to an existing index."""
        ids = []
        keys = kwargs.get("keys", None)
+       use_hybrid_search = False
+       index = self.client.tvs_get_index(self.index_name)
+       if index is not None and index.get("lexical_algorithm") == "bm25":
+           use_hybrid_search = True
        # Write data to tair
        pipeline = self.client.pipeline(transaction=False)
        embeddings = self.embedding_function.embed_documents(list(texts))
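
The four added lines above make the hybrid decision per add_texts call by reading the index parameters back from Tair. The same check in isolation, as a standalone sketch; it assumes, as the code above does, that the client's tvs_get_index returns the index parameters as a dict, or None when the index does not exist:

    def uses_hybrid_search(client, index_name: str) -> bool:
        # An index created with index_params={"lexical_algorithm": "bm25"}
        # stores that parameter; its presence marks the index as hybrid.
        index = client.tvs_get_index(index_name)
        return index is not None and index.get("lexical_algorithm") == "bm25"
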
@@ -93,16 +97,30 @@ class Tair(VectorStore):
            # Use provided key otherwise use default key
            key = keys[i] if keys else _uuid_key()
            metadata = metadatas[i] if metadatas else {}
-           pipeline.tvs_hset(
-               self.index_name,
-               key,
-               embeddings[i],
-               False,
-               **{
-                   self.content_key: text,
-                   self.metadata_key: json.dumps(metadata),
-               },
-           )
+           if use_hybrid_search:
+               # Hybrid indexes also store the raw text in the TEXT attribute for BM25 scoring.
+               pipeline.tvs_hset(
+                   self.index_name,
+                   key,
+                   embeddings[i],
+                   False,
+                   **{
+                       "TEXT": text,
+                       self.content_key: text,
+                       self.metadata_key: json.dumps(metadata),
+                   },
+               )
+           else:
+               pipeline.tvs_hset(
+                   self.index_name,
+                   key,
+                   embeddings[i],
+                   False,
+                   **{
+                       self.content_key: text,
+                       self.metadata_key: json.dumps(metadata),
+                   },
+               )
            ids.append(key)
        pipeline.execute()
        return ids
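
The two tvs_hset branches above differ only in the extra TEXT attribute, which carries the raw text that Tair scores lexically for hybrid search. For illustration only, a branch-free sketch with the same behavior (not the committed code):

    # Build the attribute dict once; add TEXT only when the index is hybrid.
    attrs = {
        self.content_key: text,
        self.metadata_key: json.dumps(metadata),
    }
    if use_hybrid_search:
        attrs["TEXT"] = text  # the field BM25 lexical scoring runs over
    pipeline.tvs_hset(self.index_name, key, embeddings[i], False, **attrs)
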
@@ -166,7 +184,7 @@ class Tair(VectorStore):
        distance_type = tairvector.DistanceMetric.InnerProduct
        if "distance_type" in kwargs:
-           distance_type = kwargs.pop("distance_typ")
+           distance_type = kwargs.pop("distance_type")
        index_type = tairvector.IndexType.HNSW
        if "index_type" in kwargs:
            index_type = kwargs.pop("index_type")
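
For the record, the bug the last hunk fixes: dict.pop without a default raises when the key is absent, and the old line popped a key that could never be present because the guard checked the correctly spelled one. A two-line standalone reproduction:

    kwargs = {"distance_type": "L2"}
    if "distance_type" in kwargs:
        kwargs.pop("distance_typ")  # raises KeyError: 'distance_typ' (misspelled key)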