Mirror of https://github.com/hwchase17/langchain, synced 2024-11-04 06:00:26 +00:00
tair fix distance_type error, and add hybrid search (#9531)

- fix: distance_type error
- feature: Tair add hybrid search

Co-authored-by: thw <hanwen.thw@alibaba-inc.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
cd81e8a8f2
commit
cbaea8d63b
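
Taken together, the two changes let a caller build a BM25-backed index and run hybrid (vector plus lexical) queries. A minimal end-to-end sketch based on the updated notebook cells in the diff below; the FakeEmbeddings stand-in and the local file path are assumptions for illustration, not part of this commit:

# Sketch of the hybrid search flow this PR enables (assumptions noted above).
from langchain.document_loaders import TextLoader
from langchain.embeddings.fake import FakeEmbeddings  # stand-in embeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Tair

loader = TextLoader("state_of_the_union.txt")  # any local text file
documents = loader.load()
docs = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_documents(documents)
embeddings = FakeEmbeddings(size=128)

tair_url = "redis://localhost:6379"
Tair.drop_index(tair_url=tair_url)  # drop first if index already exists

# Passing a lexical algorithm in index_params is what turns on hybrid search.
vector_store = Tair.from_documents(
    docs,
    embeddings,
    tair_url=tair_url,
    index_params={"lexical_algorithm": "bm25"},
)

query = "What did the president say about Ketanji Brown Jackson"
# hybrid_ratio: 0.5 hybrid search, 0.9999 vector search, 0.0001 text search
results = vector_store.similarity_search(query, **{"TEXT": query, "hybrid_ratio": 0.5})
print(results[0])
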
@@ -16,7 +16,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -27,13 +27,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 30,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "RuntimeError",
+     "evalue": "Error loading ../../../state_of_the_union.txt",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
+      "File \u001b[0;32m/opt/homebrew/lib/python3.10/site-packages/langchain/document_loaders/text.py:40\u001b[0m, in \u001b[0;36mTextLoader.load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 40\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39;49m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfile_path, encoding\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mencoding) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m 41\u001b[0m text \u001b[39m=\u001b[39m f\u001b[39m.\u001b[39mread()\n",
+      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../../../state_of_the_union.txt'",
+      "\nThe above exception was the direct cause of the following exception:\n",
+      "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[30], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mlangchain\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdocument_loaders\u001b[39;00m \u001b[39mimport\u001b[39;00m TextLoader\n\u001b[1;32m 3\u001b[0m loader \u001b[39m=\u001b[39m TextLoader(\u001b[39m\"\u001b[39m\u001b[39m../../../state_of_the_union.txt\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m----> 4\u001b[0m documents \u001b[39m=\u001b[39m loader\u001b[39m.\u001b[39;49mload()\n\u001b[1;32m 5\u001b[0m text_splitter \u001b[39m=\u001b[39m CharacterTextSplitter(chunk_size\u001b[39m=\u001b[39m\u001b[39m1000\u001b[39m, chunk_overlap\u001b[39m=\u001b[39m\u001b[39m0\u001b[39m)\n\u001b[1;32m 6\u001b[0m docs \u001b[39m=\u001b[39m text_splitter\u001b[39m.\u001b[39msplit_documents(documents)\n",
+      "File \u001b[0;32m/opt/homebrew/lib/python3.10/site-packages/langchain/document_loaders/text.py:56\u001b[0m, in \u001b[0;36mTextLoader.load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mError loading \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfile_path\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m 55\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m---> 56\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mError loading \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfile_path\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m 58\u001b[0m metadata \u001b[39m=\u001b[39m {\u001b[39m\"\u001b[39m\u001b[39msource\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfile_path}\n\u001b[1;32m 59\u001b[0m \u001b[39mreturn\u001b[39;00m [Document(page_content\u001b[39m=\u001b[39mtext, metadata\u001b[39m=\u001b[39mmetadata)]\n",
+      "\u001b[0;31mRuntimeError\u001b[0m: Error loading ../../../state_of_the_union.txt"
+     ]
+    }
+   ],
    "source": [
     "from langchain.document_loaders import TextLoader\n",
     "\n",
-    "loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
+    "loader = TextLoader(\"../../../extras/modules/state_of_the_union.txt\")\n",
     "documents = loader.load()\n",
     "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
     "docs = text_splitter.split_documents(documents)\n",
@@ -57,16 +74,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'docs' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[6], line 6\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[39m# drop first if index already exists\u001b[39;00m\n\u001b[1;32m 4\u001b[0m Tair\u001b[39m.\u001b[39mdrop_index(tair_url\u001b[39m=\u001b[39mtair_url)\n\u001b[0;32m----> 6\u001b[0m vector_store \u001b[39m=\u001b[39m Tair\u001b[39m.\u001b[39mfrom_documents(docs, embeddings, tair_url\u001b[39m=\u001b[39mtair_url)\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'docs' is not defined"
+     ]
+    }
+   ],
    "source": [
     "tair_url = \"redis://localhost:6379\"\n",
     "\n",
     "# drop first if index already exists\n",
     "Tair.drop_index(tair_url=tair_url)\n",
     "\n",
-    "vector_store = Tair.from_documents(docs, embeddings, tair_url=tair_url)"
+    "vector_store = Tair.from_documents(docs, embeddings, tair_url=tair_url)\n"
    ]
   },
   {
@@ -98,12 +127,44 @@
     "docs[0]"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Tair Hybrid Search Index build"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# drop first if index already exists\n",
+    "Tair.drop_index(tair_url=tair_url)\n",
+    "\n",
+    "vector_store = Tair.from_documents(docs, embeddings, tair_url=tair_url, index_params={\"lexical_algorithm\":\"bm25\"})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Tair Hybrid Search"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"What did the president say about Ketanji Brown Jackson\"\n",
+    "# hybrid_ratio: 0.5 hybrid search, 0.9999 vector search, 0.0001 text search\n",
+    "kwargs = {\"TEXT\" : query, \"hybrid_ratio\" : 0.5}\n",
+    "docs = vector_store.similarity_search(query, **kwargs)\n",
+    "docs[0]"
+   ]
+  }
  ],
  "metadata": {
@@ -122,7 +183,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.8"
   }
  },
  "nbformat": 4,
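
The notebook's hybrid_ratio comment is the only documentation of the blend parameter in this change: 0.5 mixes the two retrieval signals, while values at either extreme effectively select a single backend. A hedged illustration, reusing the vector_store and query from the notebook cells above:

# hybrid_ratio close to 1.0 -> effectively pure vector search
vector_heavy = vector_store.similarity_search(query, **{"TEXT": query, "hybrid_ratio": 0.9999})

# hybrid_ratio close to 0.0 -> effectively pure BM25 text search
text_heavy = vector_store.similarity_search(query, **{"TEXT": query, "hybrid_ratio": 0.0001})

# 0.5 -> balanced hybrid search
balanced = vector_store.similarity_search(query, **{"TEXT": query, "hybrid_ratio": 0.5})
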
@@ -86,6 +86,10 @@ class Tair(VectorStore):
         """Add texts data to an existing index."""
         ids = []
         keys = kwargs.get("keys", None)
+        use_hybrid_search = False
+        index = self.client.tvs_get_index(self.index_name)
+        if index is not None and index.get("lexical_algorithm") == "bm25":
+            use_hybrid_search = True
         # Write data to tair
         pipeline = self.client.pipeline(transaction=False)
         embeddings = self.embedding_function.embed_documents(list(texts))
@@ -93,16 +97,30 @@ class Tair(VectorStore):
             # Use provided key otherwise use default key
             key = keys[i] if keys else _uuid_key()
             metadata = metadatas[i] if metadatas else {}
-            pipeline.tvs_hset(
-                self.index_name,
-                key,
-                embeddings[i],
-                False,
-                **{
-                    self.content_key: text,
-                    self.metadata_key: json.dumps(metadata),
-                },
-            )
+            if use_hybrid_search:
+                # tair use TEXT attr hybrid search
+                pipeline.tvs_hset(
+                    self.index_name,
+                    key,
+                    embeddings[i],
+                    False,
+                    **{
+                        "TEXT": text,
+                        self.content_key: text,
+                        self.metadata_key: json.dumps(metadata),
+                    },
+                )
+            else:
+                pipeline.tvs_hset(
+                    self.index_name,
+                    key,
+                    embeddings[i],
+                    False,
+                    **{
+                        self.content_key: text,
+                        self.metadata_key: json.dumps(metadata),
+                    },
+                )
             ids.append(key)
         pipeline.execute()
         return ids
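
A note on the add_texts change above: the two tvs_hset branches differ only in the extra "TEXT" attribute written for BM25 indexes, so the conditional could be collapsed by building the attribute dict first. A possible refactor sketch, not what this commit ships:

# Build the attributes once; add "TEXT" only when the index uses BM25.
attrs = {
    self.content_key: text,
    self.metadata_key: json.dumps(metadata),
}
if use_hybrid_search:
    attrs["TEXT"] = text  # Tair reads the TEXT attribute for hybrid search
pipeline.tvs_hset(self.index_name, key, embeddings[i], False, **attrs)
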
@@ -166,7 +184,7 @@

         distance_type = tairvector.DistanceMetric.InnerProduct
         if "distance_type" in kwargs:
-            distance_type = kwargs.pop("distance_typ")
+            distance_type = kwargs.pop("distance_type")
         index_type = tairvector.IndexType.HNSW
         if "index_type" in kwargs:
             index_type = kwargs.pop("index_type")
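
The one-character fix above matters because kwargs.pop("distance_typ") has no default: any caller that actually passed distance_type hit a KeyError before the override could take effect. With the fix, the metric override works as intended. A hedged example, assuming from_documents forwards kwargs to index creation as the hunk suggests, and assuming the tair client (already imported here as tairvector) exposes DistanceMetric.L2:

# Override the default InnerProduct metric when creating the index.
vector_store = Tair.from_documents(
    docs,
    embeddings,
    tair_url=tair_url,
    distance_type=tairvector.DistanceMetric.L2,  # assumed member, see note
)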