Tair: fix distance_type error and add hybrid search (#9531)

- fix: distance_type error (the misspelled kwarg key raised KeyError)
- feature: add hybrid search to Tair

---------

Co-authored-by: thw <hanwen.thw@alibaba-inc.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Keras Conv3d, 2023-08-24 07:38:31 +08:00, committed by GitHub
parent cd81e8a8f2
commit cbaea8d63b
2 changed files with 99 additions and 20 deletions
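
For orientation: the distance_type fix below is a one-key typo in tair.py, where the guard tested for "distance_type" but the pop used a misspelled key, so actually passing the argument raised KeyError. A minimal sketch of the call the fix enables; the embedding backend, the local Tair URL, and the "L2" metric value are stand-in assumptions, not part of this commit:

    from langchain.embeddings import OpenAIEmbeddings  # assumed embedding backend
    from langchain.vectorstores import Tair

    # With the fix, distance_type is popped under its correct key instead of
    # raising KeyError on the misspelled "distance_typ".
    vector_store = Tair.from_texts(
        ["hello tair"],
        OpenAIEmbeddings(),
        tair_url="redis://localhost:6379",  # assumed local Tair instance
        distance_type="L2",  # assumed valid alternative to the InnerProduct default
    )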

File 1 of 2: the Tair integration notebook

@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
@@ -27,13 +27,30 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 30,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "RuntimeError",
+ "evalue": "Error loading ../../../state_of_the_union.txt",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
+ "File \u001b[0;32m/opt/homebrew/lib/python3.10/site-packages/langchain/document_loaders/text.py:40\u001b[0m, in \u001b[0;36mTextLoader.load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 40\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39;49m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfile_path, encoding\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mencoding) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m 41\u001b[0m text \u001b[39m=\u001b[39m f\u001b[39m.\u001b[39mread()\n",
+ "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../../../state_of_the_union.txt'",
+ "\nThe above exception was the direct cause of the following exception:\n",
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[30], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mlangchain\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdocument_loaders\u001b[39;00m \u001b[39mimport\u001b[39;00m TextLoader\n\u001b[1;32m 3\u001b[0m loader \u001b[39m=\u001b[39m TextLoader(\u001b[39m\"\u001b[39m\u001b[39m../../../state_of_the_union.txt\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m----> 4\u001b[0m documents \u001b[39m=\u001b[39m loader\u001b[39m.\u001b[39;49mload()\n\u001b[1;32m 5\u001b[0m text_splitter \u001b[39m=\u001b[39m CharacterTextSplitter(chunk_size\u001b[39m=\u001b[39m\u001b[39m1000\u001b[39m, chunk_overlap\u001b[39m=\u001b[39m\u001b[39m0\u001b[39m)\n\u001b[1;32m 6\u001b[0m docs \u001b[39m=\u001b[39m text_splitter\u001b[39m.\u001b[39msplit_documents(documents)\n",
+ "File \u001b[0;32m/opt/homebrew/lib/python3.10/site-packages/langchain/document_loaders/text.py:56\u001b[0m, in \u001b[0;36mTextLoader.load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mError loading \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfile_path\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m 55\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m---> 56\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mError loading \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfile_path\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m 58\u001b[0m metadata \u001b[39m=\u001b[39m {\u001b[39m\"\u001b[39m\u001b[39msource\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfile_path}\n\u001b[1;32m 59\u001b[0m \u001b[39mreturn\u001b[39;00m [Document(page_content\u001b[39m=\u001b[39mtext, metadata\u001b[39m=\u001b[39mmetadata)]\n",
+ "\u001b[0;31mRuntimeError\u001b[0m: Error loading ../../../state_of_the_union.txt"
+ ]
+ }
+ ],
"source": [
"from langchain.document_loaders import TextLoader\n",
"\n",
- "loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
+ "loader = TextLoader(\"../../../extras/modules/state_of_the_union.txt\")\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)\n",
@@ -57,16 +74,28 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 6,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "NameError",
+ "evalue": "name 'docs' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[6], line 6\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[39m# drop first if index already exists\u001b[39;00m\n\u001b[1;32m 4\u001b[0m Tair\u001b[39m.\u001b[39mdrop_index(tair_url\u001b[39m=\u001b[39mtair_url)\n\u001b[0;32m----> 6\u001b[0m vector_store \u001b[39m=\u001b[39m Tair\u001b[39m.\u001b[39mfrom_documents(docs, embeddings, tair_url\u001b[39m=\u001b[39mtair_url)\n",
+ "\u001b[0;31mNameError\u001b[0m: name 'docs' is not defined"
+ ]
+ }
+ ],
"source": [
"tair_url = \"redis://localhost:6379\"\n",
"\n",
"# drop first if index already exists\n",
"Tair.drop_index(tair_url=tair_url)\n",
"\n",
- "vector_store = Tair.from_documents(docs, embeddings, tair_url=tair_url)"
+ "vector_store = Tair.from_documents(docs, embeddings, tair_url=tair_url)\n"
]
},
{
@@ -98,12 +127,44 @@
"docs[0]"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Build a Tair hybrid search index"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "# drop first if index already exists\n",
+ "Tair.drop_index(tair_url=tair_url)\n",
+ "\n",
+ "vector_store = Tair.from_documents(docs, embeddings, tair_url=tair_url, index_params={\"lexical_algorithm\": \"bm25\"})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Tair hybrid search"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "query = \"What did the president say about Ketanji Brown Jackson\"\n",
+ "# hybrid_ratio: 0.5 = balanced hybrid search; 0.9999 = almost pure vector search; 0.0001 = almost pure text search\n",
+ "kwargs = {\"TEXT\": query, \"hybrid_ratio\": 0.5}\n",
+ "docs = vector_store.similarity_search(query, **kwargs)\n",
+ "docs[0]"
+ ]
}
],
"metadata": {
@@ -122,7 +183,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.6"
+ "version": "3.10.8"
}
},
"nbformat": 4,

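Per the hybrid_ratio comment in the cell above, the ratio blends the two scores: values near 1 weight the vector distance, values near 0 weight the BM25 text score, and 0.5 weights them equally. A sketch of a text-leaning query against the same vector_store; the 0.2 ratio is an assumed intermediate value, since only 0.5, 0.9999, and 0.0001 appear in the notebook:

    query = "What did the president say about Ketanji Brown Jackson"
    # Assumption: ratios between 0.0001 and 0.9999 are accepted.
    docs = vector_store.similarity_search(query, TEXT=query, hybrid_ratio=0.2)
    print(docs[0].page_content)
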
File 2 of 2: the Tair vector store module (class Tair)

@@ -86,6 +86,10 @@ class Tair(VectorStore):
        """Add texts data to an existing index."""
        ids = []
        keys = kwargs.get("keys", None)
+       use_hybrid_search = False
+       index = self.client.tvs_get_index(self.index_name)
+       if index is not None and index.get("lexical_algorithm") == "bm25":
+           use_hybrid_search = True
        # Write data to tair
        pipeline = self.client.pipeline(transaction=False)
        embeddings = self.embedding_function.embed_documents(list(texts))
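
The four added lines above make the hybrid decision per add_texts call by reading the index parameters back from Tair. The same check in isolation, as a standalone sketch; it assumes, as the code above does, that the client's tvs_get_index returns the index parameters as a dict, or None when the index does not exist:

    def uses_hybrid_search(client, index_name: str) -> bool:
        # An index created with index_params={"lexical_algorithm": "bm25"}
        # stores that parameter; its presence marks the index as hybrid.
        index = client.tvs_get_index(index_name)
        return index is not None and index.get("lexical_algorithm") == "bm25"
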
@@ -93,16 +97,30 @@ class Tair(VectorStore):
            # Use provided key otherwise use default key
            key = keys[i] if keys else _uuid_key()
            metadata = metadatas[i] if metadatas else {}
-           pipeline.tvs_hset(
-               self.index_name,
-               key,
-               embeddings[i],
-               False,
-               **{
-                   self.content_key: text,
-                   self.metadata_key: json.dumps(metadata),
-               },
-           )
+           if use_hybrid_search:
+               # Hybrid indexes also store the raw text in the TEXT attribute for BM25 scoring.
+               pipeline.tvs_hset(
+                   self.index_name,
+                   key,
+                   embeddings[i],
+                   False,
+                   **{
+                       "TEXT": text,
+                       self.content_key: text,
+                       self.metadata_key: json.dumps(metadata),
+                   },
+               )
+           else:
+               pipeline.tvs_hset(
+                   self.index_name,
+                   key,
+                   embeddings[i],
+                   False,
+                   **{
+                       self.content_key: text,
+                       self.metadata_key: json.dumps(metadata),
+                   },
+               )
            ids.append(key)
        pipeline.execute()
        return ids
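
The two tvs_hset branches above differ only in the extra TEXT attribute, which carries the raw text that Tair scores lexically for hybrid search. For illustration only, a branch-free sketch with the same behavior (not the committed code):

    # Build the attribute dict once; add TEXT only when the index is hybrid.
    attrs = {
        self.content_key: text,
        self.metadata_key: json.dumps(metadata),
    }
    if use_hybrid_search:
        attrs["TEXT"] = text  # the field BM25 lexical scoring runs over
    pipeline.tvs_hset(self.index_name, key, embeddings[i], False, **attrs)
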
@@ -166,7 +184,7 @@ class Tair(VectorStore):
        distance_type = tairvector.DistanceMetric.InnerProduct
        if "distance_type" in kwargs:
-           distance_type = kwargs.pop("distance_typ")
+           distance_type = kwargs.pop("distance_type")
        index_type = tairvector.IndexType.HNSW
        if "index_type" in kwargs:
            index_type = kwargs.pop("index_type")
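
For the record, the bug the last hunk fixes: dict.pop without a default raises when the key is absent, and the old line popped a key that could never be present because the guard checked the correctly spelled one. A two-line standalone reproduction:

    kwargs = {"distance_type": "L2"}
    if "distance_type" in kwargs:
        kwargs.pop("distance_typ")  # raises KeyError: 'distance_typ' (misspelled key)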