add clustered vearch in langchain (#10771)

---------

Co-authored-by: zhanghexian1 <zhanghexian1@jd.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
pull/10815/head^2
zhanghexian 1 year ago committed by GitHub
parent f505320a73
commit 0abe996409
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -2,52 +2,27 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/export/anaconda3/envs/langchainGLM6B/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
"/export/anaconda3/envs/vearch_cluster_langchain/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"INFO 2023-08-28 18:26:07,485-1d: \n",
"loading model config\n",
"llm device: cuda\n",
"embedding device: cuda\n",
"dir: /data/zhx/zhx/langchain-ChatGLM_new\n",
"flagging username: e2fc35b8e87c4de18d692e951a5f7c46\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading checkpoint shards: 100%|██████████| 7/7 [00:06<00:00, 1.01it/s]\n"
"Loading checkpoint shards: 100%|██████████| 7/7 [00:07<00:00, 1.01s/it]\n"
]
}
],
"source": [
"\n",
"import os, sys, torch\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel\n",
"from langchain.llms import HuggingFacePipeline\nfrom langchain.chains import ConversationChain\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain.vectorstores.vearch import VearchDb\n",
"from langchain.document_loaders import TextLoader\n",
"from langchain.prompts import PromptTemplate\n",
"from langchain.chains import RetrievalQA\n",
"from langchain.embeddings.huggingface import HuggingFaceEmbeddings\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from transformers import AutoModel, AutoTokenizer\n",
"from langchain.vectorstores.vearch import Vearch\n",
"\n",
"# your local model path\n",
"# replace with your local model path\n",
"model_path =\"/data/zhx/zhx/langchain-ChatGLM_new/chatglm2-6b\" \n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n",
@ -56,7 +31,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@ -67,7 +42,7 @@
"ChatGLM:你好👋!我是人工智能助手 ChatGLM2-6B很高兴见到你欢迎问我任何问题。\n",
"\n",
"Human: 你知道凌波微步吗,你知道都有谁学会了吗?\n",
"ChatGLM:凌波微步是一种步伐,最早出自于《倚天屠龙记》。在小说中,灭绝师太曾因与练习凌波微步的杨过的恩怨纠葛,而留下了一部经书,内容是记载凌波微步的起源和作用。后来,凌波微步便成为杨过和小龙女的感情象征。在现实生活中,凌波微步是一句口号,是清华大学学生社团“模型社”的社训。\n",
"ChatGLM:凌波微步是一种步伐,最早出自《倚天屠龙记》。在电视剧《人民的名义》中,侯亮平也学会了凌波微步。\n",
"\n"
]
}
@ -83,16 +58,14 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO 2023-08-28 18:27:36,037-1d: Load pretrained SentenceTransformer: /data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese\n",
"WARNING 2023-08-28 18:27:36,038-1d: No sentence-transformers model found with name /data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese. Creating a new one with MEAN pooling.\n",
"INFO 2023-08-28 18:27:38,936-1d: Use pytorch device: cuda\n"
"No sentence-transformers model found with name /data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese. Creating a new one with MEAN pooling.\n"
]
}
],
@ -103,60 +76,45 @@
"documents = loader.load()\n",
"\n",
"# split text into sentences and embedding the sentences\n",
"text_splitter = RecursiveCharacterTextSplitter(\n",
" chunk_size=500, chunk_overlap=100)\n",
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n",
"texts = text_splitter.split_documents(documents)\n",
"\n",
"#your model path\n",
"#replace with your model path\n",
"embedding_path = '/data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese'\n",
"embeddings = HuggingFaceEmbeddings(model_name=embedding_path)\n",
"\n"
"embeddings = HuggingFaceEmbeddings(model_name=embedding_path)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Batches: 100%|██████████| 1/1 [00:00<00:00, 4.56it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"['7aae36236f784105a0004d8ff3c7c3ad', '7e495d4e5962497db2080e84d52e75ed', '9a640124fc324a8abb0eaa31acb638b7']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
"docids ['18ce6747dca04a2c833e60e8dfd83c04', 'aafacb0e46574b378a9f433877ab06a8', '9776bccfdd8643a8b219ccee0596f370']\n",
"***************after is cluster res*****************\n",
"docids ['1841638988191686991', '-4519586577642625749', '5028230008472292907']\n"
]
}
],
"source": [
"#first add your document into vearch vectorstore\n",
"vearch_db = VearchDb.from_documents(texts,embeddings,table_name=\"your_table_name\",metadata_path=\"/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/your_table_name\")"
"vearch_standalone = Vearch.from_documents(\n",
" texts,embeddings,path_or_url=\"/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/localdb_new_test\",table_name=\"localdb_new_test\",flag=0)\n",
"\n",
"print(\"***************after is cluster res*****************\")\n",
"\n",
"vearch_cluster = Vearch.from_documents(\n",
" texts,embeddings,path_or_url=\"http://test-vearch-langchain-router.vectorbase.svc.ht1.n.jd.local\",db_name=\"vearch_cluster_langchian\",table_name=\"tobenumone\",flag=1)\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Batches: 100%|██████████| 1/1 [00:00<00:00, 22.49it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
@ -194,28 +152,76 @@
"段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\n",
"卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。\n",
"\n",
"********ChatGLM:凌波微步是一种轻功身法,属于逍遥派独门轻功。它以《易经》中的六十四卦为基础,按照特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。凌波微步精妙异常,可以让人内力相助,自身内力颇为深厚之后再练。《天龙八部》第五回中有描述。\n",
"********ChatGLM:凌波微步是一门极上乘的轻功,源于《易经》八八六十四卦。使用者按照特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。这门轻功精妙异常,可以使人内力大为提升,但需在练成“北冥神功”后才能真正掌握。凌波微步在金庸先生的《天龙八部》中得到了充分的描写。\n",
"\n",
"***************************after is cluster res******************************\n",
"####################第1段相关文档####################\n",
"\n",
"午饭过后,段誉又练“凌波微步”,走一步,吸一口气,走第二步时将气呼出,六十四卦走完,四肢全无麻痹之感,料想呼吸顺畅,便无害处。第二次再走时连走两步吸一口气,再走两步始行呼出。这“凌波微步”是以动功修习内功,脚步踏遍六十四卦一个周天,内息自然而然地也转了一个周天。因此他每走一遍,内力便有一分进益。\n",
"\n",
"这般练了几天,“凌波微步”已走得颇为纯熟,不须再数呼吸,纵然疾行,气息也已无所窒滞。心意既畅,跨步时渐渐想到《洛神赋》中那些与“凌波微步”有关的句子:“仿佛兮若轻云之蔽月,飘飘兮若流风之回雪”,“竦轻躯以鹤立,若将飞而未翔”,“体迅飞凫,飘忽若神”,“动无常则,若危若安。进止难期,若往若还”。\n",
"\n",
"\n",
"\n",
"百度简介\n",
"\n",
"凌波微步是「逍遥派」独门轻功身法,精妙异常。\n",
"\n",
"凌波微步乃是一门极上乘的轻功,所以列于卷轴之末,以易经八八六十四卦为基础,使用者按特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。此步法精妙异常,原是要待人练成「北冥神功」,吸人内力,自身内力已【颇为深厚】之后再练。\n",
"\n",
"####################第2段相关文档####################\n",
"\n",
"《天龙八部》第五回 微步縠纹生\n",
"\n",
"卷轴中此外诸种经脉修习之法甚多,皆是取人内力的法门,段誉虽自语宽解,总觉习之有违本性,单是贪多务得,便非好事,当下暂不理会。\n",
"\n",
"卷到卷轴末端,又见到了“凌波微步”那四字,登时便想起《洛神赋》中那些句子来:“凌波微步,罗袜生尘……转眄流精,光润玉颜。含辞未吐,气若幽兰。华容婀娜,令我忘餐。”曹子建那些千古名句,在脑海中缓缓流过:“秾纤得衷,修短合度,肩若削成,腰如约素。延颈秀项,皓质呈露。芳泽无加,铅华弗御。云髻峨峨,修眉连娟。丹唇外朗,皓齿内鲜。明眸善睐,靥辅承权。瑰姿艳逸,仪静体闲。柔情绰态,媚于语言……”这些句子用在木婉清身上,“这话倒也有理”;但如用之于神仙姊姊,只怕更为适合。想到神仙姊姊的姿容体态,“皎若太阳升朝霞,灼若芙蓉出绿波”,但觉依她吩咐行事,实为人生至乐,心想:“我先来练这‘凌波微步’,此乃逃命之妙法,非害人之手段也,练之有百利而无一害。”\n",
"\n",
"####################第3段相关文档####################\n",
"\n",
"《天龙八部》第二回 玉壁月华明\n",
"\n",
"再展帛卷,长卷上源源皆是裸女画像,或立或卧,或现前胸,或见后背。人像的面容都是一般,但或喜或愁,或含情凝眸,或轻嗔薄怒,神情各异。一共有三十六幅图像,每幅像上均有颜色细线,注明穴道部位及练功法诀。\n",
"\n",
"帛卷尽处题着“凌波微步”四字,其后绘的是无数足印,注明“妇妹”、“无妄”等等字样,尽是《易经》中的方位。段誉前几日还正全心全意地钻研《易经》,一见到这些名称,登时精神大振,便似遇到故交良友一般。只见足印密密麻麻,不知有几千百个,自一个足印至另一个足印均有绿线贯串,线上绘有箭头,最后写着一行字道:“步法神妙,保身避敌,待积内力,再取敌命。”\n",
"\n",
"段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\n",
"卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。\n",
"\n",
"********ChatGLM:凌波微步是一门极上乘的轻功,源于《易经》中的六十四卦。使用者按照特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。这门轻功精妙异常,可以使人内力增进,但需要谨慎练习,避免伤害他人。凌波微步在逍遥派中尤为流行,但并非所有逍遥派弟子都会凌波微步。\n",
"\n"
]
}
],
"source": [
"\n",
"res=vearch_db.similarity_search(query, 3)\n",
"query = \"你知道凌波微步吗,你知道都有谁会凌波微步?\"\n",
"for idx,tmp in enumerate(res): \n",
"vearch_standalone_res=vearch_standalone.similarity_search(query, 3)\n",
"for idx,tmp in enumerate(vearch_standalone_res): \n",
" print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n",
"\n",
"# combine your local knowledge and query \n",
"context = \"\".join([tmp.page_content for tmp in res])\n",
"context = \"\".join([tmp.page_content for tmp in vearch_standalone_res])\n",
"new_query = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context} \\n 回答用户这个问题:{query}\\n\\n\"\n",
"response, history = model.chat(tokenizer, new_query, history=[])\n",
"print(f\"********ChatGLM:{response}\\n\")\n"
"print(f\"********ChatGLM:{response}\\n\")\n",
"\n",
"print(\"***************************after is cluster res******************************\")\n",
"\n",
"query_c = \"你知道凌波微步吗,你知道都有谁会凌波微步?\"\n",
"cluster_res=vearch_cluster.similarity_search(query_c, 3)\n",
"for idx,tmp in enumerate(cluster_res): \n",
" print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n",
"\n",
"# combine your local knowledge and query \n",
"context_c = \"\".join([tmp.page_content for tmp in cluster_res])\n",
"new_query_c = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context_c} \\n 回答用户这个问题:{query_c}\\n\\n\"\n",
"response_c, history_c = model.chat(tokenizer, new_query_c, history=[])\n",
"print(f\"********ChatGLM:{response_c}\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@ -223,42 +229,20 @@
"output_type": "stream",
"text": [
"Human: 你知道vearch是什么吗?\n",
"ChatGLM:是的,我知道 Vearch。Vearch 是一种矩阵分解 technique用于将矩阵分解为若干个不可约矩阵的乘积。它是由 Linus Torvalds 开发的,旨在提高 Linux 内核中矩阵操作的性能。\n",
"ChatGLM:是的,我知道 Vearch。Vearch 是一种用于计算机械系统极化子的工具它可以用于模拟和优化电路的性能。它是一个基于Matlab的电路仿真软件可以用于设计和分析各种类型的电路包括交流电路和直流电路。\n",
"\n",
"Vearch 可以通过使用特殊的操作来对矩阵进行操作,从而避免了使用昂贵的矩阵操作库。它也被广泛用于其他操作系统中,如 FreeBSD 和 Solaris。\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Batches: 100%|██████████| 1/1 [00:00<00:00, 31.59it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"['04bc84fff5074b7b8990441e92e6df07', 'e221906153bb4e03bc7095dadea144de', '126034ba51934093920d8732860f340b']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
"docids ['eee5e7468434427eb49829374c1e8220', '2776754da8fc4bb58d3e482006010716', '9223acd6d89d4c2c84ff42677ac0d47c']\n",
"*****************after is cluster res********************\n",
"docids ['-4311783201092343475', '-2899734009733762895', '1342026762029067927']\n"
]
},
{
"data": {
"text/plain": [
"['04bc84fff5074b7b8990441e92e6df07',\n",
" 'e221906153bb4e03bc7095dadea144de',\n",
" '126034ba51934093920d8732860f340b']"
"['-4311783201092343475', '-2899734009733762895', '1342026762029067927']"
]
},
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -268,27 +252,24 @@
"response, history = model.chat(tokenizer, query, history=history)\n",
"print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n",
"\n",
"\n",
"vearch_info = [\"Vearch 是一款存储大语言模型数据的向量数据库用于存储和快速搜索模型embedding后的向量可用于基于个人知识库的大模型应用\",\n",
" \"Vearch 支持OpenAI, Llama, ChatGLM等模型以及LangChain库\",\n",
" \"vearch 是基于C语言,go语言开发的并提供python接口可以直接通过pip安装\"]\n",
"vearch_source=[{'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},{'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},{'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'}]\n",
"vearch_db.add_texts(vearch_info,vearch_source)\n",
"vearch_standalone.add_texts(vearch_info,vearch_source)\n",
"\n",
"print(\"*****************after is cluster res********************\")\n",
"\n",
"vearch_cluster.add_texts(vearch_info,vearch_source)\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Batches: 100%|██████████| 1/1 [00:00<00:00, 25.57it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
@ -305,56 +286,107 @@
"\n",
"vearch 是基于C语言,go语言开发的并提供python接口可以直接通过pip安装\n",
"\n",
"***************ChatGLM:是的Varch是一个向量数据库旨在存储和快速搜索模型embedding后的向量。它支持OpenAI、Llama和ChatGLM等模型并可以直接通过pip安装。Varch是一个基于C语言和Go语言开发的项目并提供了Python接口。\n",
"***************ChatGLM:是的Varch是一个向量数据库旨在存储和快速搜索模型embedding后的向量。它支持OpenAI、ChatGLM等模型并可直接通过pip安装。\n",
"\n",
"***************after is cluster res******************\n",
"####################第1段相关文档####################\n",
"\n",
"Vearch 是一款存储大语言模型数据的向量数据库用于存储和快速搜索模型embedding后的向量可用于基于个人知识库的大模型应用\n",
"\n",
"####################第2段相关文档####################\n",
"\n",
"Vearch 支持OpenAI, Llama, ChatGLM等模型以及LangChain库\n",
"\n",
"####################第3段相关文档####################\n",
"\n",
"vearch 是基于C语言,go语言开发的并提供python接口可以直接通过pip安装\n",
"\n",
"***************ChatGLM:是的Varch是一个向量数据库旨在存储和快速搜索模型embedding后的向量。它支持OpenAIChatGLM等模型并可用于基于个人知识库的大模型应用。Varch基于C语言和Go语言开发并提供Python接口可以通过pip安装。\n",
"\n"
]
}
],
"source": [
"query3 = \"你知道vearch是什么吗?\"\n",
"res1 = vearch_db.similarity_search(query3, 3)\n",
"res1 = vearch_standalone.similarity_search(query3, 3)\n",
"for idx,tmp in enumerate(res1): \n",
" print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n",
"\n",
"context1 = \"\".join([tmp.page_content for tmp in res1])\n",
"new_query1 = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context1} \\n 回答用户这个问题:{query3}\\n\\n\"\n",
"response, history = model.chat(tokenizer, new_query1, history=[])\n",
"print(f\"***************ChatGLM:{response}\\n\")\n",
"\n",
"print(f\"***************ChatGLM:{response}\\n\")"
"print(\"***************after is cluster res******************\")\n",
"\n",
"query3_c = \"你知道vearch是什么吗?\"\n",
"res1_c = vearch_standalone.similarity_search(query3_c, 3)\n",
"for idx,tmp in enumerate(res1_c): \n",
" print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n",
"\n",
"context1_C = \"\".join([tmp.page_content for tmp in res1_c])\n",
"new_query1_c = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context1_C} \\n 回答用户这个问题:{query3_c}\\n\\n\"\n",
"response_c, history_c = model.chat(tokenizer, new_query1_c, history=[])\n",
"\n",
"print(f\"***************ChatGLM:{response_c}\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"delete docid True\n",
"delete vearch standalone docid True\n",
"Human: 你知道vearch是什么吗?\n",
"ChatGLM:Vearch是一种用于处理向量的库,可以轻松地将向量转换为矩阵,并提供许多有用的函数和算法,以操作向量。 Vearch支持许多常见的向量操作,例如加法、减法、乘法、除法、矩阵乘法、求和、统计和归一化等。 Vearch还提供了一些高级功能,例如L2正则化、协方差矩阵、稀疏矩阵和奇异值分解等。\n",
"\n",
"delete vearch cluster docid True\n",
"Human: 你知道vearch是什么吗?\n",
"ChatGLM:Vearch是一种高分子化合物,也称为聚合物、高分子材料或合成材料。它是由重复单元组成的大型聚合物,通常由一些重复单元组成,这些单元在聚合过程中结合在一起形成一个连续的高分子链。\n",
"ChatGLM:Vearch是一种用于处理向量数据的函数,可以应用于多种不同的编程语言和数据结构中。\n",
"\n",
"Vearch最初是作为Java中一个名为“vearch”的包而出现的,它的目的是提供一种高效的向量数据结构。它支持向量的多态性,可以轻松地实现不同类型的向量之间的转换,同时还支持向量的压缩和反向操作等操作。\n",
"\n",
"后来,Vearch被广泛应用于其他编程语言中,如Python、Ruby、JavaScript等。在Python中,它被称为“vectorize”,在Ruby中,它被称为“Vector”。\n",
"\n",
"Vearch具有许多独特的性质,例如高强度、高刚性、耐磨、耐腐蚀、耐高温等。它们通常用于制造各种应用,例如塑料制品、橡胶、纤维、建筑材料等。\n",
"Vearch的主要优点是它的向量操作具有多态性,可以应用于不同类型的向量数据,同时还支持高效的向量操作和反向操作,因此可以提高程序的性能。\n",
"\n",
"after delete docid to query again: {}\n",
"get existed docid {'7aae36236f784105a0004d8ff3c7c3ad': Document(page_content='《天龙八部》第二回 玉壁月华明\\n\\n再展帛卷长卷上源源皆是裸女画像或立或卧或现前胸或见后背。人像的面容都是一般但或喜或愁或含情凝眸或轻嗔薄怒神情各异。一共有三十六幅图像每幅像上均有颜色细线注明穴道部位及练功法诀。\\n\\n帛卷尽处题着“凌波微步”四字其后绘的是无数足印注明“妇妹”、“无妄”等等字样尽是《易经》中的方位。段誉前几日还正全心全意地钻研《易经》一见到这些名称登时精神大振便似遇到故交良友一般。只见足印密密麻麻不知有几千百个自一个足印至另一个足印均有绿线贯串线上绘有箭头最后写着一行字道“步法神妙保身避敌待积内力再取敌命。”\\n\\n段誉心道“神仙姊姊所遗的步法必定精妙之极遇到强敌时脱身逃走那就很好再取敌命也就不必了。”\\n卷好帛卷对之作了两个揖珍而重之地揣入怀中转身对那玉像道“神仙姊姊你吩咐我朝午晚三次练功段誉不敢有违。今后我对人加倍客气别人不会来打我我自然也不会去吸他内力。你这套凌波微步我更要用心练熟眼见不对立刻溜之大吉就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节却想也不敢去想。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), '7e495d4e5962497db2080e84d52e75ed': Document(page_content='《天龙八部》第五回 微步縠纹生\\n\\n卷轴中此外诸种经脉修习之法甚多皆是取人内力的法门段誉虽自语宽解总觉习之有违本性单是贪多务得便非好事当下暂不理会。\\n\\n卷到卷轴末端又见到了“凌波微步”那四字登时便想起《洛神赋》中那些句子来“凌波微步罗袜生尘……转眄流精光润玉颜。含辞未吐气若幽兰。华容婀娜令我忘餐。”曹子建那些千古名句在脑海中缓缓流过“秾纤得衷修短合度肩若削成腰如约素。延颈秀项皓质呈露。芳泽无加铅华弗御。云髻峨峨修眉连娟。丹唇外朗皓齿内鲜。明眸善睐靥辅承权。瑰姿艳逸仪静体闲。柔情绰态媚于语言……”这些句子用在木婉清身上“这话倒也有理”但如用之于神仙姊姊只怕更为适合。想到神仙姊姊的姿容体态“皎若太阳升朝霞灼若芙蓉出绿波”但觉依她吩咐行事实为人生至乐心想“我先来练这凌波微步此乃逃命之妙法非害人之手段也练之有百利而无一害。”', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'})}\n"
"get existed docid {'18ce6747dca04a2c833e60e8dfd83c04': Document(page_content='《天龙八部》第二回 玉壁月华明\\n\\n再展帛卷长卷上源源皆是裸女画像或立或卧或现前胸或见后背。人像的面容都是一般但或喜或愁或含情凝眸或轻嗔薄怒神情各异。一共有三十六幅图像每幅像上均有颜色细线注明穴道部位及练功法诀。\\n\\n帛卷尽处题着“凌波微步”四字其后绘的是无数足印注明“妇妹”、“无妄”等等字样尽是《易经》中的方位。段誉前几日还正全心全意地钻研《易经》一见到这些名称登时精神大振便似遇到故交良友一般。只见足印密密麻麻不知有几千百个自一个足印至另一个足印均有绿线贯串线上绘有箭头最后写着一行字道“步法神妙保身避敌待积内力再取敌命。”\\n\\n段誉心道“神仙姊姊所遗的步法必定精妙之极遇到强敌时脱身逃走那就很好再取敌命也就不必了。”\\n卷好帛卷对之作了两个揖珍而重之地揣入怀中转身对那玉像道“神仙姊姊你吩咐我朝午晚三次练功段誉不敢有违。今后我对人加倍客气别人不会来打我我自然也不会去吸他内力。你这套凌波微步我更要用心练熟眼见不对立刻溜之大吉就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节却想也不敢去想。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), 'aafacb0e46574b378a9f433877ab06a8': Document(page_content='《天龙八部》第五回 微步縠纹生\\n\\n卷轴中此外诸种经脉修习之法甚多皆是取人内力的法门段誉虽自语宽解总觉习之有违本性单是贪多务得便非好事当下暂不理会。\\n\\n卷到卷轴末端又见到了“凌波微步”那四字登时便想起《洛神赋》中那些句子来“凌波微步罗袜生尘……转眄流精光润玉颜。含辞未吐气若幽兰。华容婀娜令我忘餐。”曹子建那些千古名句在脑海中缓缓流过“秾纤得衷修短合度肩若削成腰如约素。延颈秀项皓质呈露。芳泽无加铅华弗御。云髻峨峨修眉连娟。丹唇外朗皓齿内鲜。明眸善睐靥辅承权。瑰姿艳逸仪静体闲。柔情绰态媚于语言……”这些句子用在木婉清身上“这话倒也有理”但如用之于神仙姊姊只怕更为适合。想到神仙姊姊的姿容体态“皎若太阳升朝霞灼若芙蓉出绿波”但觉依她吩咐行事实为人生至乐心想“我先来练这凌波微步此乃逃命之妙法非害人之手段也练之有百利而无一害。”', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), '9776bccfdd8643a8b219ccee0596f370': Document(page_content='午饭过后,段誉又练“凌波微步”,走一步,吸一口气,走第二步时将气呼出,六十四卦走完,四肢全无麻痹之感,料想呼吸顺畅,便无害处。第二次再走时连走两步吸一口气,再走两步始行呼出。这“凌波微步”是以动功修习内功,脚步踏遍六十四卦一个周天,内息自然而然地也转了一个周天。因此他每走一遍,内力便有一分进益。\\n\\n这般练了几天“凌波微步”已走得颇为纯熟不须再数呼吸纵然疾行气息也已无所窒滞。心意既畅跨步时渐渐想到《洛神赋》中那些与“凌波微步”有关的句子“仿佛兮若轻云之蔽月飘飘兮若流风之回雪”“竦轻躯以鹤立若将飞而未翔”“体迅飞凫飘忽若神”“动无常则若危若安。进止难期若往若还”。\\n\\n\\n\\n百度简介\\n\\n凌波微步是「逍遥派」独门轻功身法精妙异常。\\n\\n凌波微步乃是一门极上乘的轻功所以列于卷轴之末以易经八八六十四卦为基础使用者按特定顺序踏着卦象方位行进从第一步到最后一步正好行走一个大圈。此步法精妙异常原是要待人练成「北冥神功」吸人内力自身内力已【颇为深厚】之后再练。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'})}\n",
"after delete docid to query again: {}\n",
"get existed docid {'1841638988191686991': Document(page_content='《天龙八部》第二回 玉壁月华明\\n\\n再展帛卷长卷上源源皆是裸女画像或立或卧或现前胸或见后背。人像的面容都是一般但或喜或愁或含情凝眸或轻嗔薄怒神情各异。一共有三十六幅图像每幅像上均有颜色细线注明穴道部位及练功法诀。\\n\\n帛卷尽处题着“凌波微步”四字其后绘的是无数足印注明“妇妹”、“无妄”等等字样尽是《易经》中的方位。段誉前几日还正全心全意地钻研《易经》一见到这些名称登时精神大振便似遇到故交良友一般。只见足印密密麻麻不知有几千百个自一个足印至另一个足印均有绿线贯串线上绘有箭头最后写着一行字道“步法神妙保身避敌待积内力再取敌命。”\\n\\n段誉心道“神仙姊姊所遗的步法必定精妙之极遇到强敌时脱身逃走那就很好再取敌命也就不必了。”\\n卷好帛卷对之作了两个揖珍而重之地揣入怀中转身对那玉像道“神仙姊姊你吩咐我朝午晚三次练功段誉不敢有违。今后我对人加倍客气别人不会来打我我自然也不会去吸他内力。你这套凌波微步我更要用心练熟眼见不对立刻溜之大吉就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节却想也不敢去想。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), '-4519586577642625749': Document(page_content='《天龙八部》第五回 微步縠纹生\\n\\n卷轴中此外诸种经脉修习之法甚多皆是取人内力的法门段誉虽自语宽解总觉习之有违本性单是贪多务得便非好事当下暂不理会。\\n\\n卷到卷轴末端又见到了“凌波微步”那四字登时便想起《洛神赋》中那些句子来“凌波微步罗袜生尘……转眄流精光润玉颜。含辞未吐气若幽兰。华容婀娜令我忘餐。”曹子建那些千古名句在脑海中缓缓流过“秾纤得衷修短合度肩若削成腰如约素。延颈秀项皓质呈露。芳泽无加铅华弗御。云髻峨峨修眉连娟。丹唇外朗皓齿内鲜。明眸善睐靥辅承权。瑰姿艳逸仪静体闲。柔情绰态媚于语言……”这些句子用在木婉清身上“这话倒也有理”但如用之于神仙姊姊只怕更为适合。想到神仙姊姊的姿容体态“皎若太阳升朝霞灼若芙蓉出绿波”但觉依她吩咐行事实为人生至乐心想“我先来练这凌波微步此乃逃命之妙法非害人之手段也练之有百利而无一害。”', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), '5028230008472292907': Document(page_content='午饭过后,段誉又练“凌波微步”,走一步,吸一口气,走第二步时将气呼出,六十四卦走完,四肢全无麻痹之感,料想呼吸顺畅,便无害处。第二次再走时连走两步吸一口气,再走两步始行呼出。这“凌波微步”是以动功修习内功,脚步踏遍六十四卦一个周天,内息自然而然地也转了一个周天。因此他每走一遍,内力便有一分进益。\\n\\n这般练了几天“凌波微步”已走得颇为纯熟不须再数呼吸纵然疾行气息也已无所窒滞。心意既畅跨步时渐渐想到《洛神赋》中那些与“凌波微步”有关的句子“仿佛兮若轻云之蔽月飘飘兮若流风之回雪”“竦轻躯以鹤立若将飞而未翔”“体迅飞凫飘忽若神”“动无常则若危若安。进止难期若往若还”。\\n\\n\\n\\n百度简介\\n\\n凌波微步是「逍遥派」独门轻功身法精妙异常。\\n\\n凌波微步乃是一门极上乘的轻功所以列于卷轴之末以易经八八六十四卦为基础使用者按特定顺序踏着卦象方位行进从第一步到最后一步正好行走一个大圈。此步法精妙异常原是要待人练成「北冥神功」吸人内力自身内力已【颇为深厚】之后再练。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'})}\n"
]
}
],
"source": [
"##delete and get function need to maintain docids \n",
"##your docid\n",
"res_d=vearch_db.delete(['04bc84fff5074b7b8990441e92e6df07', 'e221906153bb4e03bc7095dadea144de', '126034ba51934093920d8732860f340b'])\n",
"print(\"delete docid\",res_d)\n",
"\n",
"res_d=vearch_standalone.delete(['eee5e7468434427eb49829374c1e8220', '2776754da8fc4bb58d3e482006010716', '9223acd6d89d4c2c84ff42677ac0d47c'])\n",
"print(\"delete vearch standalone docid\",res_d)\n",
"query = \"你知道vearch是什么吗?\"\n",
"response, history = model.chat(tokenizer, query, history=[])\n",
"print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n",
"get_id_doc=vearch_db.get(['04bc84fff5074b7b8990441e92e6df07'])\n",
"print(\"after delete docid to query again:\",get_id_doc)\n",
"get_delet_doc=vearch_db.get(['7aae36236f784105a0004d8ff3c7c3ad', '7e495d4e5962497db2080e84d52e75ed'])\n",
"print(\"get existed docid\",get_delet_doc)"
"\n",
"res_cluster=vearch_cluster.delete(['-4311783201092343475', '-2899734009733762895', '1342026762029067927'])\n",
"print(\"delete vearch cluster docid\",res_cluster)\n",
"query_c = \"你知道vearch是什么吗?\"\n",
"response_c, history = model.chat(tokenizer, query_c, history=[])\n",
"print(f\"Human: {query}\\nChatGLM:{response_c}\\n\")\n",
"\n",
"\n",
"get_delet_doc=vearch_standalone.get(['eee5e7468434427eb49829374c1e8220', '2776754da8fc4bb58d3e482006010716', '9223acd6d89d4c2c84ff42677ac0d47c'])\n",
"print(\"after delete docid to query again:\",get_delet_doc)\n",
"get_id_doc=vearch_standalone.get(['18ce6747dca04a2c833e60e8dfd83c04', 'aafacb0e46574b378a9f433877ab06a8', '9776bccfdd8643a8b219ccee0596f370','9223acd6d89d4c2c84ff42677ac0d47c'])\n",
"print(\"get existed docid\",get_id_doc)\n",
"\n",
"get_delet_doc=vearch_cluster.get(['-4311783201092343475', '-2899734009733762895', '1342026762029067927'])\n",
"print(\"after delete docid to query again:\",get_delet_doc)\n",
"get_id_doc=vearch_cluster.get(['1841638988191686991', '-4519586577642625749', '5028230008472292907','1342026762029067927'])\n",
"print(\"get existed docid\",get_id_doc)\n"
]
},
{
@ -385,7 +417,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.12 ('langchainGLM6B')",
"display_name": "Python 3.10.13 ('vearch_cluster_langchain')",
"language": "python",
"name": "python3"
},
@ -399,12 +431,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.10.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "1fd24e7ef183310e43cbf656d21568350c6a30580b6df7fe3b34654b3770f74d"
"hash": "f1da10a89896267ed34b497c9568817f36cc7ea79826b5cfca4d96376f5b4835"
}
}
},

@ -73,6 +73,7 @@ from langchain.vectorstores.tigris import Tigris
from langchain.vectorstores.typesense import Typesense
from langchain.vectorstores.usearch import USearch
from langchain.vectorstores.vald import Vald
from langchain.vectorstores.vearch import Vearch
from langchain.vectorstores.vectara import Vectara
from langchain.vectorstores.weaviate import Weaviate
from langchain.vectorstores.zep import ZepVectorStore
@ -137,6 +138,7 @@ __all__ = [
"Typesense",
"USearch",
"Vald",
"Vearch",
"Vectara",
"VectorStore",
"Weaviate",

@ -13,44 +13,68 @@ from langchain.vectorstores.base import VectorStore
if TYPE_CHECKING:
import vearch
DEFAULT_TOPN = 4
class VearchDb(VectorStore):
class Vearch(VectorStore):
_DEFAULT_TABLE_NAME = "langchain_vearch"
_DEFAULT_CLUSTER_DB_NAME = "cluster_client_db"
_DEFAULT_VERSION = 1
def __init__(
    self,
    embedding_function: Embeddings,
    path_or_url: Optional[str] = None,
    table_name: str = _DEFAULT_TABLE_NAME,
    db_name: str = _DEFAULT_CLUSTER_DB_NAME,
    flag: int = _DEFAULT_VERSION,
    **kwargs: Any,
) -> None:
    """Initialize vearch vector store.

    Args:
        embedding_function: embeddings object used to embed texts and queries.
        path_or_url: cluster router url when ``flag`` is 1; local metadata
            directory when ``flag`` is 0 (defaults to the current working
            directory).
        table_name: table (space) name; a random suffix is appended when empty.
        db_name: database name, used only in cluster mode; a random suffix is
            appended when empty.
        flag: 1 for cluster, 0 for standalone.
    """
    try:
        # Cluster and standalone deployments are backed by different SDKs.
        if flag:
            import vearch_cluster
        else:
            import vearch
    except ImportError:
        raise ValueError(
            "Could not import suitable python package. "
            "Please install it with `pip install vearch or vearch_cluster`."
        )

    if flag:
        # Cluster mode: talk to a vearch router over HTTP.
        if path_or_url is None:
            raise ValueError("Please input url of cluster")
        if not db_name:
            db_name = self._DEFAULT_CLUSTER_DB_NAME
            db_name += "_"
            db_name += str(uuid.uuid4()).split("-")[-1]
        self.using_db_name = db_name
        self.url = path_or_url
        self.vearch = vearch_cluster.VearchCluster(path_or_url)
    else:
        # Standalone mode: run the embedded engine against a local directory.
        if path_or_url is None:
            metadata_path = os.getcwd().replace("\\", "/")
        else:
            metadata_path = path_or_url
        if not os.path.isdir(metadata_path):
            os.makedirs(metadata_path)
        log_path = os.path.join(metadata_path, "log")
        if not os.path.isdir(log_path):
            os.makedirs(log_path)
        self.vearch = vearch.Engine(metadata_path, log_path)
        self.using_metapath = metadata_path
    if not table_name:
        table_name = self._DEFAULT_TABLE_NAME
        table_name += "_"
        table_name += str(uuid.uuid4()).split("-")[-1]
    self.using_table_name = table_name
    self.embedding_func = embedding_function
    self.flag = flag
@property
def embeddings(self) -> Optional[Embeddings]:
@ -58,13 +82,15 @@ class VearchDb(VectorStore):
@classmethod
def from_documents(
    cls: Type[Vearch],
    documents: List[Document],
    embedding: Embeddings,
    path_or_url: Optional[str] = None,
    table_name: str = _DEFAULT_TABLE_NAME,
    db_name: str = _DEFAULT_CLUSTER_DB_NAME,
    flag: int = _DEFAULT_VERSION,
    **kwargs: Any,
) -> Vearch:
    """Return Vearch VectorStore built from a list of documents.

    Splits each document into its text and metadata and delegates to
    ``from_texts``.

    Args:
        documents: documents to embed and store.
        embedding: embeddings object used to embed the documents.
        path_or_url: cluster url (flag=1) or local metadata path (flag=0).
        table_name: table (space) name to store into.
        db_name: database name, cluster mode only.
        flag: 1 for cluster, 0 for standalone.

    Returns:
        A Vearch vector store containing the documents.
    """
    texts = [d.page_content for d in documents]
    metadatas = [d.metadata for d in documents]
    return cls.from_texts(
        texts=texts,
        embedding=embedding,
        metadatas=metadatas,
        path_or_url=path_or_url,
        table_name=table_name,
        db_name=db_name,
        flag=flag,
        **kwargs,
    )
@classmethod
def from_texts(
    cls: Type[Vearch],
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    path_or_url: Optional[str] = None,
    table_name: str = _DEFAULT_TABLE_NAME,
    db_name: str = _DEFAULT_CLUSTER_DB_NAME,
    flag: int = _DEFAULT_VERSION,
    **kwargs: Any,
) -> Vearch:
    """Return Vearch VectorStore built from raw texts.

    Args:
        texts: strings to embed and store.
        embedding: embeddings object used to embed the texts.
        metadatas: optional per-text metadata dicts.
        path_or_url: cluster url (flag=1) or local metadata path (flag=0).
        table_name: table (space) name to store into.
        db_name: database name, cluster mode only.
        flag: 1 for cluster, 0 for standalone.

    Returns:
        A Vearch vector store containing the texts.
    """
    # __init__'s parameter is ``embedding_function``; a bare ``embedding=``
    # kwarg would be swallowed by **kwargs and the store would have no
    # embedder, so pass it under the correct name.
    vearch_db = cls(
        embedding_function=embedding,
        path_or_url=path_or_url,
        db_name=db_name,
        table_name=table_name,
        flag=flag,
    )
    vearch_db.add_texts(texts=texts, metadatas=metadatas)
    return vearch_db
@ -102,19 +135,20 @@ class VearchDb(VectorStore):
def _create_table(
self,
dim: int = 1024,
filed_list: List[dict] = [
{"filed": "text", "type": "str"},
{"filed": "metadata", "type": "str"},
field_list: List[dict] = [
{"field": "text", "type": "str"},
{"field": "metadata", "type": "str"},
],
) -> int:
"""
Create VectorStore Table
Args:
dim:dimension of vector
fileds_list: the filed you want to store
fields_list: the field you want to store
Return:
code,0 for success,1 for failed
"""
type_dict = {"int": vearch.dataType.INT, "str": vearch.dataType.STRING}
engine_info = {
"index_size": 10000,
@ -122,8 +156,8 @@ class VearchDb(VectorStore):
"retrieval_param": {"ncentroids": 2048, "nsubvector": 32},
}
fields = [
vearch.GammaFieldInfo(fi["filed"], type_dict[fi["type"]])
for fi in filed_list
vearch.GammaFieldInfo(fi["field"], type_dict[fi["type"]])
for fi in field_list
]
vector_field = vearch.GammaVectorInfo(
name="text_embedding",
@ -135,7 +169,7 @@ class VearchDb(VectorStore):
store_param={"cache_size": 10000},
has_source=False,
)
response_code = self.vearch_engine.create_table(
response_code = self.vearch.create_table(
engine_info,
name=self.using_table_name,
fields=fields,
@ -143,6 +177,48 @@ class VearchDb(VectorStore):
)
return response_code
def _create_space(
    self,
    dim: int = 1024,
) -> int:
    """Create a vearch space (table) in cluster mode.

    Args:
        dim: dimension of the stored embedding vectors.

    Returns:
        Response code from the cluster (1 for success, 0 for failure).
    """
    # Fields mirror what add_texts inserts: raw text, its source metadata,
    # and the embedding vector itself.
    field_properties = {
        "text": {
            "type": "string",
        },
        "metadata": {
            "type": "string",
        },
        "text_embedding": {
            "type": "vector",
            "index": True,
            "dimension": dim,
            "store_type": "MemoryOnly",
        },
    }
    space_config = {
        "name": self.using_table_name,
        "partition_num": 1,
        "replica_num": 1,
        "engine": {
            "name": "gamma",
            "index_size": 1,
            "retrieval_type": "FLAT",
            "retrieval_param": {
                "metric_type": "L2",
            },
        },
        "properties": field_properties,
    }
    return self.vearch.create_space(self.using_db_name, space_config)
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    **kwargs: Any,
) -> List[str]:
    """Embed texts and add them to the vector store.

    Args:
        texts: iterable of raw strings to embed and store.
        metadatas: per-text metadata dicts; each is expected to carry a
            "source" key (only that key is persisted).

    Returns:
        List of document ids assigned by vearch.

    Raises:
        ValueError: if no embeddings could be computed, or if creating the
            db/space (cluster) or table (standalone) fails.
    """
    embeddings = None
    if self.embedding_func is not None:
        embeddings = self.embedding_func.embed_documents(list(texts))
    if embeddings is None:
        raise ValueError("embeddings is None")
    # Initialize up front so the final return cannot raise
    # UnboundLocalError when ``metadatas`` is None and neither insertion
    # branch runs.
    docid: List[str] = []
    if self.flag:
        # Cluster mode: lazily create the db and space on first insert.
        dbs_list = self.vearch.list_dbs()
        if self.using_db_name not in dbs_list:
            create_db_code = self.vearch.create_db(self.using_db_name)
            if not create_db_code:
                raise ValueError("create db failed!!!")
        space_list = self.vearch.list_spaces(self.using_db_name)
        if self.using_table_name not in space_list:
            create_space_code = self._create_space(len(embeddings[0]))
            if not create_space_code:
                raise ValueError("create space failed!!!")
        if embeddings is not None and metadatas is not None:
            for text, metadata, embed in zip(texts, metadatas, embeddings):
                profiles: dict[str, Any] = {}
                profiles["text"] = text
                profiles["metadata"] = metadata["source"]
                # Store L2-normalized vectors so similarity scores are
                # comparable across documents.
                embed_np = np.array(embed)
                profiles["text_embedding"] = {
                    "feature": (embed_np / np.linalg.norm(embed_np)).tolist()
                }
                insert_res = self.vearch.insert_one(
                    self.using_db_name, self.using_table_name, profiles
                )
                if insert_res["status"] == 200:
                    docid.append(insert_res["_id"])
                else:
                    # Retry the insert once before recording whatever id
                    # the second attempt returns.
                    retry_insert = self.vearch.insert_one(
                        self.using_db_name, self.using_table_name, profiles
                    )
                    docid.append(retry_insert["_id"])
    else:
        # Standalone mode: the schema file marks an already-created table.
        table_path = os.path.join(
            self.using_metapath, self.using_table_name + ".schema"
        )
        if not os.path.exists(table_path):
            dim = len(embeddings[0])
            response_code = self._create_table(dim)
            if response_code:
                raise ValueError("create table failed!!!")
        if embeddings is not None and metadatas is not None:
            doc_items = []
            for text, metadata, embed in zip(texts, metadatas, embeddings):
                profiles_v: dict[str, Any] = {}
                profiles_v["text"] = text
                profiles_v["metadata"] = metadata["source"]
                embed_np = np.array(embed)
                profiles_v["text_embedding"] = embed_np / np.linalg.norm(embed_np)
                doc_items.append(profiles_v)
            docid = self.vearch.add(doc_items)
            # The embedded engine indexes asynchronously: poll briefly until
            # all ids are visible, then persist the table to disk.
            t_time = 0
            while len(docid) != len(embeddings):
                time.sleep(0.5)
                if t_time > 6:
                    break
                t_time += 1
            self.vearch.dump()
    return docid
def _load(self) -> None:
"""
load vearch engine
load vearch engine for standalone vearch
"""
self.vearch_engine.load()
self.vearch.load()
@classmethod
def load_local(
    cls,
    embedding: Embeddings,
    path_or_url: Optional[str] = None,
    table_name: str = _DEFAULT_TABLE_NAME,
    db_name: str = _DEFAULT_CLUSTER_DB_NAME,
    flag: int = _DEFAULT_VERSION,
    **kwargs: Any,
) -> "Vearch":
    """Load the local specified table of a standalone vearch deployment.

    Args:
        embedding: Embedding function the returned store will use.
        path_or_url: Local metadata directory that must already contain
            the persisted ``<table_name>.schema`` file (standalone mode).
        table_name: Name of the table to load.
        db_name: Database name (consumed by the cluster code path).
        flag: Deployment flag; a truthy value selects the cluster API in
            the rest of this class — presumably ``_DEFAULT_VERSION`` means
            cluster, so callers loading a standalone table pass 0 — TODO
            confirm against the flag constants defined above.

    Returns:
        A store instance bound to the existing local table (engine data
        loaded via ``_load``).

    Raises:
        ValueError: If the metadata path or table name is missing, or the
            table's schema file does not exist on disk.
    """
    # The merged diff residue mixed the old ``metadata_path`` parameter with
    # its replacement ``path_or_url``; only the new parameter is kept, and
    # both the existence check and the schema path use it consistently.
    if not path_or_url:
        raise ValueError("No metadata path!!!")
    if not table_name:
        raise ValueError("No table name!!!")
    table_path = os.path.join(path_or_url, table_name + ".schema")
    if not os.path.exists(table_path):
        raise ValueError("vearch vectorbase table not exist!!!")

    vearch_db = cls(
        embedding_function=embedding,
        path_or_url=path_or_url,
        table_name=table_name,
        db_name=db_name,
        flag=flag,
    )
    vearch_db._load()
    return vearch_db
@ -228,8 +344,6 @@ class VearchDb(VectorStore):
Return docs most similar to query.
"""
if self.vearch_engine is None:
raise ValueError("Vearch engine is None!!!")
if self.embedding_func is None:
raise ValueError("embedding_func is None!!!")
embeddings = self.embedding_func.embed_query(query)
@ -243,7 +357,6 @@ class VearchDb(VectorStore):
**kwargs: Any,
) -> List[Document]:
"""The most k similar documents and scores of the specified query.
Args:
embeddings: embedding vector of the query.
k: The k most similar documents to the text query.
@ -252,23 +365,45 @@ class VearchDb(VectorStore):
The k most similar documents to the specified text query.
0 is dissimilar, 1 is the most similar.
"""
query_data = {
"vector": [
{
"field": "text_embedding",
"feature": np.array(embedding),
}
],
"fields": [],
"is_brute_search": 1,
"retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20},
"topn": k,
}
query_result = self.vearch_engine.search(query_data)
embed = np.array(embedding)
if self.flag:
query_data = {
"query": {
"sum": [
{
"field": "text_embedding",
"feature": (embed / np.linalg.norm(embed)).tolist(),
}
],
},
"size": k,
"fields": ["text", "metadata"],
}
query_result = self.vearch.search(
self.using_db_name, self.using_table_name, query_data
)
res = query_result["hits"]["hits"]
else:
query_data = {
"vector": [
{
"field": "text_embedding",
"feature": embed / np.linalg.norm(embed),
}
],
"fields": [],
"is_brute_search": 1,
"retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20},
"topn": k,
}
query_result = self.vearch.search(query_data)
res = query_result[0]["result_items"]
docs = []
for item in query_result[0]["result_items"]:
for item in res:
content = ""
meta_data = {}
if self.flag:
item = item["_source"]
for item_key in item:
if item_key == "text":
content = item[item_key]
@ -286,7 +421,6 @@ class VearchDb(VectorStore):
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""The most k similar documents and scores of the specified query.
Args:
embeddings: embedding vector of the query.
k: The k most similar documents to the text query.
@ -298,23 +432,46 @@ class VearchDb(VectorStore):
if self.embedding_func is None:
raise ValueError("embedding_func is None!!!")
embeddings = self.embedding_func.embed_query(query)
query_data = {
"vector": [
{
"field": "text_embedding",
"feature": np.array(embeddings),
}
],
"fields": [],
"is_brute_search": 1,
"retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20},
"topn": k,
}
query_result = self.vearch_engine.search(query_data)
embed = np.array(embeddings)
if self.flag:
query_data = {
"query": {
"sum": [
{
"field": "text_embedding",
"feature": (embed / np.linalg.norm(embed)).tolist(),
}
],
},
"size": k,
"fields": ["text_embedding", "text", "metadata"],
}
query_result = self.vearch.search(
self.using_db_name, self.using_table_name, query_data
)
res = query_result["hits"]["hits"]
else:
query_data = {
"vector": [
{
"field": "text_embedding",
"feature": embed / np.linalg.norm(embed),
}
],
"fields": [],
"is_brute_search": 1,
"retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20},
"topn": k,
}
query_result = self.vearch.search(query_data)
res = query_result[0]["result_items"]
results: List[Tuple[Document, float]] = []
for item in query_result[0]["result_items"]:
for item in res:
content = ""
meta_data = {}
if self.flag:
score = item["_score"]
item = item["_source"]
for item_key in item:
if item_key == "text":
content = item[item_key]
@ -322,7 +479,7 @@ class VearchDb(VectorStore):
if item_key == "metadata":
meta_data["source"] = item[item_key]
continue
if item_key == "score":
if self.flag != 1 and item_key == "score":
score = item[item_key]
continue
tmp_res = (Document(page_content=content, metadata=meta_data), score)
@ -351,14 +508,16 @@ class VearchDb(VectorStore):
Optional[bool]: True if deletion is successful.
False otherwise, None if not implemented.
"""
if self.vearch_engine is None:
raise ValueError("Verach Engine is None!!!")
ret: Optional[bool] = None
tmp_res = []
if ids is None or ids.__len__() == 0:
return ret
for _id in ids:
ret = self.vearch_engine.del_doc(_id)
if self.flag:
ret = self.vearch.delete(self.using_db_name, self.using_table_name, _id)
else:
ret = self.vearch.del_doc(_id)
tmp_res.append(ret)
ret = all(i == 0 for i in tmp_res)
return ret
@ -376,26 +535,44 @@ class VearchDb(VectorStore):
Documents which satisfy the input conditions.
"""
if self.vearch_engine is None:
raise ValueError("vearch engine is None!!!")
results: Dict[str, Document] = {}
if ids is None or ids.__len__() == 0:
return results
for id in ids:
docs_detail = self.vearch_engine.get_doc_by_id(id)
if docs_detail == {}:
continue
content = ""
meta_info = {}
for field in docs_detail:
if field == "text":
content = docs_detail[field]
if self.flag:
query_data = {"query": {"ids": ids}}
docs_detail = self.vearch.mget_by_ids(
self.using_db_name, self.using_table_name, query_data
)
for record in docs_detail:
if record["found"] is False:
continue
elif field == "metadata":
meta_info["source"] = docs_detail[field]
content = ""
meta_info = {}
for field in record["_source"]:
if field == "text":
content = record["_source"][field]
continue
elif field == "metadata":
meta_info["source"] = record["_source"][field]
continue
results[record["_id"]] = Document(
page_content=content, metadata=meta_info
)
else:
for id in ids:
docs_detail = self.vearch.get_doc_by_id(id)
if docs_detail == {}:
continue
results[docs_detail["_id"]] = Document(
page_content=content, metadata=meta_info
)
content = ""
meta_info = {}
for field in docs_detail:
if field == "text":
content = docs_detail[field]
continue
elif field == "metadata":
meta_info["source"] = docs_detail[field]
continue
results[docs_detail["_id"]] = Document(
page_content=content, metadata=meta_info
)
return results