diff --git a/docs/extras/integrations/vectorstores/vearch.ipynb b/docs/extras/integrations/vectorstores/vearch.ipynb index efddac3eb5..eca0218a4f 100644 --- a/docs/extras/integrations/vectorstores/vearch.ipynb +++ b/docs/extras/integrations/vectorstores/vearch.ipynb @@ -2,52 +2,27 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/export/anaconda3/envs/langchainGLM6B/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + "/export/anaconda3/envs/vearch_cluster_langchain/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", - "INFO 2023-08-28 18:26:07,485-1d: \n", - "loading model config\n", - "llm device: cuda\n", - "embedding device: cuda\n", - "dir: /data/zhx/zhx/langchain-ChatGLM_new\n", - "flagging username: e2fc35b8e87c4de18d692e951a5f7c46\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "True\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading checkpoint shards: 100%|██████████| 7/7 [00:06<00:00, 1.01it/s]\n" + "Loading checkpoint shards: 100%|██████████| 7/7 [00:07<00:00, 1.01s/it]\n" ] } ], "source": [ - "\n", - "import os, sys, torch\n", - "from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel\n", - "from langchain.llms import HuggingFacePipeline\nfrom langchain.chains import ConversationChain\n", - "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", - "from langchain.vectorstores.vearch import VearchDb\n", "from langchain.document_loaders import TextLoader\n", - "from langchain.prompts import PromptTemplate\n", - 
"from langchain.chains import RetrievalQA\n", "from langchain.embeddings.huggingface import HuggingFaceEmbeddings\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "from transformers import AutoModel, AutoTokenizer\n", + "from langchain.vectorstores.vearch import Vearch\n", "\n", - "# your local model path\n", + "# repalce to your local model path\n", "model_path =\"/data/zhx/zhx/langchain-ChatGLM_new/chatglm2-6b\" \n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n", @@ -56,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -67,7 +42,7 @@ "ChatGLM:你好👋!我是人工智能助手 ChatGLM2-6B,很高兴见到你,欢迎问我任何问题。\n", "\n", "Human: 你知道凌波微步吗,你知道都有谁学会了吗?\n", - "ChatGLM:凌波微步是一种步伐,最早出自于《倚天屠龙记》。在小说中,灭绝师太曾因与练习凌波微步的杨过的恩怨纠葛,而留下了一部经书,内容是记载凌波微步的起源和作用。后来,凌波微步便成为杨过和小龙女的感情象征。在现实生活中,凌波微步是一句口号,是清华大学学生社团“模型社”的社训。\n", + "ChatGLM:凌波微步是一种步伐,最早出自《倚天屠龙记》。在电视剧《人民的名义》中,侯亮平也学会了凌波微步。\n", "\n" ] } @@ -83,16 +58,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "INFO 2023-08-28 18:27:36,037-1d: Load pretrained SentenceTransformer: /data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese\n", - "WARNING 2023-08-28 18:27:36,038-1d: No sentence-transformers model found with name /data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese. Creating a new one with MEAN pooling.\n", - "INFO 2023-08-28 18:27:38,936-1d: Use pytorch device: cuda\n" + "No sentence-transformers model found with name /data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese. 
Creating a new one with MEAN pooling.\n" ] } ], @@ -103,60 +76,45 @@ "documents = loader.load()\n", "\n", "# split text into sentences and embedding the sentences\n", - "text_splitter = RecursiveCharacterTextSplitter(\n", - " chunk_size=500, chunk_overlap=100)\n", + "text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n", "texts = text_splitter.split_documents(documents)\n", "\n", - "#your model path\n", + "#replace to your model path\n", "embedding_path = '/data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese'\n", - "embeddings = HuggingFaceEmbeddings(model_name=embedding_path)\n", - "\n" + "embeddings = HuggingFaceEmbeddings(model_name=embedding_path)\n" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Batches: 100%|██████████| 1/1 [00:00<00:00, 4.56it/s]" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "['7aae36236f784105a0004d8ff3c7c3ad', '7e495d4e5962497db2080e84d52e75ed', '9a640124fc324a8abb0eaa31acb638b7']\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" + "docids ['18ce6747dca04a2c833e60e8dfd83c04', 'aafacb0e46574b378a9f433877ab06a8', '9776bccfdd8643a8b219ccee0596f370']\n", + "***************after is cluster res*****************\n", + "docids ['1841638988191686991', '-4519586577642625749', '5028230008472292907']\n" ] } ], "source": [ "#first add your document into vearch vectorstore\n", - "vearch_db = VearchDb.from_documents(texts,embeddings,table_name=\"your_table_name\",metadata_path=\"/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/your_table_name\")" + "vearch_standalone = Vearch.from_documents(\n", + " texts,embeddings,path_or_url=\"/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/localdb_new_test\",table_name=\"localdb_new_test\",flag=0)\n", + "\n", + "print(\"***************after is cluster res*****************\")\n", + "\n", + 
"vearch_cluster = Vearch.from_documents(\n", + " texts,embeddings,path_or_url=\"http://test-vearch-langchain-router.vectorbase.svc.ht1.n.jd.local\",db_name=\"vearch_cluster_langchian\",table_name=\"tobenumone\",flag=1)\n" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Batches: 100%|██████████| 1/1 [00:00<00:00, 22.49it/s]\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -194,28 +152,76 @@ "段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\n", "卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。\n", "\n", - "********ChatGLM:凌波微步是一种轻功身法,属于逍遥派独门轻功。它以《易经》中的六十四卦为基础,按照特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。凌波微步精妙异常,可以让人内力相助,自身内力颇为深厚之后再练。《天龙八部》第五回中有描述。\n", + "********ChatGLM:凌波微步是一门极上乘的轻功,源于《易经》八八六十四卦。使用者按照特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。这门轻功精妙异常,可以使人内力大为提升,但需在练成“北冥神功”后才能真正掌握。凌波微步在金庸先生的《天龙八部》中得到了充分的描写。\n", + "\n", + "***************************after is cluster res******************************\n", + "####################第1段相关文档####################\n", + "\n", + "午饭过后,段誉又练“凌波微步”,走一步,吸一口气,走第二步时将气呼出,六十四卦走完,四肢全无麻痹之感,料想呼吸顺畅,便无害处。第二次再走时连走两步吸一口气,再走两步始行呼出。这“凌波微步”是以动功修习内功,脚步踏遍六十四卦一个周天,内息自然而然地也转了一个周天。因此他每走一遍,内力便有一分进益。\n", + "\n", + "这般练了几天,“凌波微步”已走得颇为纯熟,不须再数呼吸,纵然疾行,气息也已无所窒滞。心意既畅,跨步时渐渐想到《洛神赋》中那些与“凌波微步”有关的句子:“仿佛兮若轻云之蔽月,飘飘兮若流风之回雪”,“竦轻躯以鹤立,若将飞而未翔”,“体迅飞凫,飘忽若神”,“动无常则,若危若安。进止难期,若往若还”。\n", + "\n", + "\n", + "\n", + "百度简介\n", + "\n", + "凌波微步是「逍遥派」独门轻功身法,精妙异常。\n", + "\n", + "凌波微步乃是一门极上乘的轻功,所以列于卷轴之末,以易经八八六十四卦为基础,使用者按特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。此步法精妙异常,原是要待人练成「北冥神功」,吸人内力,自身内力已【颇为深厚】之后再练。\n", + "\n", + "####################第2段相关文档####################\n", + "\n", + "《天龙八部》第五回 微步縠纹生\n", + "\n", + "卷轴中此外诸种经脉修习之法甚多,皆是取人内力的法门,段誉虽自语宽解,总觉习之有违本性,单是贪多务得,便非好事,当下暂不理会。\n", + "\n", + 
"卷到卷轴末端,又见到了“凌波微步”那四字,登时便想起《洛神赋》中那些句子来:“凌波微步,罗袜生尘……转眄流精,光润玉颜。含辞未吐,气若幽兰。华容婀娜,令我忘餐。”曹子建那些千古名句,在脑海中缓缓流过:“秾纤得衷,修短合度,肩若削成,腰如约素。延颈秀项,皓质呈露。芳泽无加,铅华弗御。云髻峨峨,修眉连娟。丹唇外朗,皓齿内鲜。明眸善睐,靥辅承权。瑰姿艳逸,仪静体闲。柔情绰态,媚于语言……”这些句子用在木婉清身上,“这话倒也有理”;但如用之于神仙姊姊,只怕更为适合。想到神仙姊姊的姿容体态,“皎若太阳升朝霞,灼若芙蓉出绿波”,但觉依她吩咐行事,实为人生至乐,心想:“我先来练这‘凌波微步’,此乃逃命之妙法,非害人之手段也,练之有百利而无一害。”\n", + "\n", + "####################第3段相关文档####################\n", + "\n", + "《天龙八部》第二回 玉壁月华明\n", + "\n", + "再展帛卷,长卷上源源皆是裸女画像,或立或卧,或现前胸,或见后背。人像的面容都是一般,但或喜或愁,或含情凝眸,或轻嗔薄怒,神情各异。一共有三十六幅图像,每幅像上均有颜色细线,注明穴道部位及练功法诀。\n", + "\n", + "帛卷尽处题着“凌波微步”四字,其后绘的是无数足印,注明“妇妹”、“无妄”等等字样,尽是《易经》中的方位。段誉前几日还正全心全意地钻研《易经》,一见到这些名称,登时精神大振,便似遇到故交良友一般。只见足印密密麻麻,不知有几千百个,自一个足印至另一个足印均有绿线贯串,线上绘有箭头,最后写着一行字道:“步法神妙,保身避敌,待积内力,再取敌命。”\n", + "\n", + "段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\n", + "卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。\n", + "\n", + "********ChatGLM:凌波微步是一门极上乘的轻功,源于《易经》中的六十四卦。使用者按照特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。这门轻功精妙异常,可以使人内力增进,但需要谨慎练习,避免伤害他人。凌波微步在逍遥派中尤为流行,但并非所有逍遥派弟子都会凌波微步。\n", "\n" ] } ], "source": [ - "\n", - "res=vearch_db.similarity_search(query, 3)\n", "query = \"你知道凌波微步吗,你知道都有谁会凌波微步?\"\n", - "for idx,tmp in enumerate(res): \n", + "vearch_standalone_res=vearch_standalone.similarity_search(query, 3)\n", + "for idx,tmp in enumerate(vearch_standalone_res): \n", " print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n", "\n", "# combine your local knowleadge and query \n", - "context = \"\".join([tmp.page_content for tmp in res])\n", + "context = \"\".join([tmp.page_content for tmp in vearch_standalone_res])\n", "new_query = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context} \\n 回答用户这个问题:{query}\\n\\n\"\n", "response, history = model.chat(tokenizer, new_query, history=[])\n", - "print(f\"********ChatGLM:{response}\\n\")\n" + "print(f\"********ChatGLM:{response}\\n\")\n", + "\n", + "print(\"***************************after is cluster 
res******************************\")\n", + "\n", + "query_c = \"你知道凌波微步吗,你知道都有谁会凌波微步?\"\n", + "cluster_res=vearch_cluster.similarity_search(query_c, 3)\n", + "for idx,tmp in enumerate(cluster_res): \n", + " print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n", + "\n", + "# combine your local knowleadge and query \n", + "context_c = \"\".join([tmp.page_content for tmp in cluster_res])\n", + "new_query_c = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context_c} \\n 回答用户这个问题:{query_c}\\n\\n\"\n", + "response_c, history_c = model.chat(tokenizer, new_query_c, history=[])\n", + "print(f\"********ChatGLM:{response_c}\\n\")" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -223,42 +229,20 @@ "output_type": "stream", "text": [ "Human: 你知道vearch是什么吗?\n", - "ChatGLM:是的,我知道 Vearch。Vearch 是一种矩阵分解 technique,用于将矩阵分解为若干个不可约矩阵的乘积。它是由 Linus Torvalds 开发的,旨在提高 Linux 内核中矩阵操作的性能。\n", + "ChatGLM:是的,我知道 Vearch。Vearch 是一种用于计算机械系统极化子的工具,它可以用于模拟和优化电路的性能。它是一个基于Matlab的电路仿真软件,可以用于设计和分析各种类型的电路,包括交流电路和直流电路。\n", "\n", - "Vearch 可以通过使用特殊的操作来对矩阵进行操作,从而避免了使用昂贵的矩阵操作库。它也被广泛用于其他操作系统中,如 FreeBSD 和 Solaris。\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Batches: 100%|██████████| 1/1 [00:00<00:00, 31.59it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['04bc84fff5074b7b8990441e92e6df07', 'e221906153bb4e03bc7095dadea144de', '126034ba51934093920d8732860f340b']\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" + "docids ['eee5e7468434427eb49829374c1e8220', '2776754da8fc4bb58d3e482006010716', '9223acd6d89d4c2c84ff42677ac0d47c']\n", + "*****************after is cluster res********************\n", + "docids ['-4311783201092343475', '-2899734009733762895', '1342026762029067927']\n" ] }, { "data": { "text/plain": [ - "['04bc84fff5074b7b8990441e92e6df07',\n", - " 'e221906153bb4e03bc7095dadea144de',\n", - " '126034ba51934093920d8732860f340b']" + 
"['-4311783201092343475', '-2899734009733762895', '1342026762029067927']" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -268,27 +252,24 @@ "response, history = model.chat(tokenizer, query, history=history)\n", "print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n", "\n", - "\n", "vearch_info = [\"Vearch 是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量,可用于基于个人知识库的大模型应用\",\n", " \"Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库\",\n", " \"vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装\"]\n", "vearch_source=[{'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},{'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},{'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'}]\n", - "vearch_db.add_texts(vearch_info,vearch_source)\n", + "vearch_standalone.add_texts(vearch_info,vearch_source)\n", + "\n", + "print(\"*****************after is cluster res********************\")\n", + "\n", + "vearch_cluster.add_texts(vearch_info,vearch_source)\n", + "\n", "\n" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Batches: 100%|██████████| 1/1 [00:00<00:00, 25.57it/s]\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -305,56 +286,107 @@ "\n", "vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装\n", "\n", - "***************ChatGLM:是的,Varch是一个向量数据库,旨在存储和快速搜索模型embedding后的向量。它支持OpenAI、Llama和ChatGLM等模型,并可以直接通过pip安装。Varch是一个基于C语言和Go语言开发的项目,并提供了Python接口。\n", + "***************ChatGLM:是的,Varch是一个向量数据库,旨在存储和快速搜索模型embedding后的向量。它支持OpenAI、ChatGLM等模型,并可直接通过pip安装。\n", + "\n", + "***************after is cluster res******************\n", + "####################第1段相关文档####################\n", + "\n", + "Vearch 是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量,可用于基于个人知识库的大模型应用\n", + "\n", + "####################第2段相关文档####################\n", 
+ "\n", + "Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库\n", + "\n", + "####################第3段相关文档####################\n", + "\n", + "vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装\n", + "\n", + "***************ChatGLM:是的,Varch是一个向量数据库,旨在存储和快速搜索模型embedding后的向量。它支持OpenAI,ChatGLM等模型,并可用于基于个人知识库的大模型应用。Varch基于C语言和Go语言开发,并提供Python接口,可以通过pip安装。\n", "\n" ] } ], "source": [ "query3 = \"你知道vearch是什么吗?\"\n", - "res1 = vearch_db.similarity_search(query3, 3)\n", + "res1 = vearch_standalone.similarity_search(query3, 3)\n", "for idx,tmp in enumerate(res1): \n", " print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n", "\n", "context1 = \"\".join([tmp.page_content for tmp in res1])\n", "new_query1 = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context1} \\n 回答用户这个问题:{query3}\\n\\n\"\n", "response, history = model.chat(tokenizer, new_query1, history=[])\n", + "print(f\"***************ChatGLM:{response}\\n\")\n", "\n", - "print(f\"***************ChatGLM:{response}\\n\")" + "print(\"***************after is cluster res******************\")\n", + "\n", + "query3_c = \"你知道vearch是什么吗?\"\n", + "res1_c = vearch_standalone.similarity_search(query3_c, 3)\n", + "for idx,tmp in enumerate(res1_c): \n", + " print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n", + "\n", + "context1_C = \"\".join([tmp.page_content for tmp in res1_c])\n", + "new_query1_c = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context1_C} \\n 回答用户这个问题:{query3_c}\\n\\n\"\n", + "response_c, history_c = model.chat(tokenizer, new_query1_c, history=[])\n", + "\n", + "print(f\"***************ChatGLM:{response_c}\\n\")" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "delete docid True\n", + "delete vearch standalone docid True\n", + "Human: 你知道vearch是什么吗?\n", + "ChatGLM:Vearch是一种用于处理向量的库,可以轻松地将向量转换为矩阵,并提供许多有用的函数和算法,以操作向量。 Vearch支持许多常见的向量操作,例如加法、减法、乘法、除法、矩阵乘法、求和、统计和归一化等。 
Vearch还提供了一些高级功能,例如L2正则化、协方差矩阵、稀疏矩阵和奇异值分解等。\n", + "\n", + "delete vearch cluster docid True\n", "Human: 你知道vearch是什么吗?\n", - "ChatGLM:Vearch是一种高分子化合物,也称为聚合物、高分子材料或合成材料。它是由重复单元组成的大型聚合物,通常由一些重复单元组成,这些单元在聚合过程中结合在一起形成一个连续的高分子链。\n", + "ChatGLM:Vearch是一种用于处理向量数据的函数,可以应用于多种不同的编程语言和数据结构中。\n", + "\n", + "Vearch最初是作为Java中一个名为“vearch”的包而出现的,它的目的是提供一种高效的向量数据结构。它支持向量的多态性,可以轻松地实现不同类型的向量之间的转换,同时还支持向量的压缩和反向操作等操作。\n", + "\n", + "后来,Vearch被广泛应用于其他编程语言中,如Python、Ruby、JavaScript等。在Python中,它被称为“vectorize”,在Ruby中,它被称为“Vector”。\n", "\n", - "Vearch具有许多独特的性质,例如高强度、高刚性、耐磨、耐腐蚀、耐高温等。它们通常用于制造各种应用,例如塑料制品、橡胶、纤维、建筑材料等。\n", + "Vearch的主要优点是它的向量操作具有多态性,可以应用于不同类型的向量数据,同时还支持高效的向量操作和反向操作,因此可以提高程序的性能。\n", "\n", "after delete docid to query again: {}\n", - "get existed docid {'7aae36236f784105a0004d8ff3c7c3ad': Document(page_content='《天龙八部》第二回 玉壁月华明\\n\\n再展帛卷,长卷上源源皆是裸女画像,或立或卧,或现前胸,或见后背。人像的面容都是一般,但或喜或愁,或含情凝眸,或轻嗔薄怒,神情各异。一共有三十六幅图像,每幅像上均有颜色细线,注明穴道部位及练功法诀。\\n\\n帛卷尽处题着“凌波微步”四字,其后绘的是无数足印,注明“妇妹”、“无妄”等等字样,尽是《易经》中的方位。段誉前几日还正全心全意地钻研《易经》,一见到这些名称,登时精神大振,便似遇到故交良友一般。只见足印密密麻麻,不知有几千百个,自一个足印至另一个足印均有绿线贯串,线上绘有箭头,最后写着一行字道:“步法神妙,保身避敌,待积内力,再取敌命。”\\n\\n段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\\n卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), '7e495d4e5962497db2080e84d52e75ed': Document(page_content='《天龙八部》第五回 微步縠纹生\\n\\n卷轴中此外诸种经脉修习之法甚多,皆是取人内力的法门,段誉虽自语宽解,总觉习之有违本性,单是贪多务得,便非好事,当下暂不理会。\\n\\n卷到卷轴末端,又见到了“凌波微步”那四字,登时便想起《洛神赋》中那些句子来:“凌波微步,罗袜生尘……转眄流精,光润玉颜。含辞未吐,气若幽兰。华容婀娜,令我忘餐。”曹子建那些千古名句,在脑海中缓缓流过:“秾纤得衷,修短合度,肩若削成,腰如约素。延颈秀项,皓质呈露。芳泽无加,铅华弗御。云髻峨峨,修眉连娟。丹唇外朗,皓齿内鲜。明眸善睐,靥辅承权。瑰姿艳逸,仪静体闲。柔情绰态,媚于语言……”这些句子用在木婉清身上,“这话倒也有理”;但如用之于神仙姊姊,只怕更为适合。想到神仙姊姊的姿容体态,“皎若太阳升朝霞,灼若芙蓉出绿波”,但觉依她吩咐行事,实为人生至乐,心想:“我先来练这‘凌波微步’,此乃逃命之妙法,非害人之手段也,练之有百利而无一害。”', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'})}\n" + "get existed docid 
{'18ce6747dca04a2c833e60e8dfd83c04': Document(page_content='《天龙八部》第二回 玉壁月华明\\n\\n再展帛卷,长卷上源源皆是裸女画像,或立或卧,或现前胸,或见后背。人像的面容都是一般,但或喜或愁,或含情凝眸,或轻嗔薄怒,神情各异。一共有三十六幅图像,每幅像上均有颜色细线,注明穴道部位及练功法诀。\\n\\n帛卷尽处题着“凌波微步”四字,其后绘的是无数足印,注明“妇妹”、“无妄”等等字样,尽是《易经》中的方位。段誉前几日还正全心全意地钻研《易经》,一见到这些名称,登时精神大振,便似遇到故交良友一般。只见足印密密麻麻,不知有几千百个,自一个足印至另一个足印均有绿线贯串,线上绘有箭头,最后写着一行字道:“步法神妙,保身避敌,待积内力,再取敌命。”\\n\\n段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\\n卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), 'aafacb0e46574b378a9f433877ab06a8': Document(page_content='《天龙八部》第五回 微步縠纹生\\n\\n卷轴中此外诸种经脉修习之法甚多,皆是取人内力的法门,段誉虽自语宽解,总觉习之有违本性,单是贪多务得,便非好事,当下暂不理会。\\n\\n卷到卷轴末端,又见到了“凌波微步”那四字,登时便想起《洛神赋》中那些句子来:“凌波微步,罗袜生尘……转眄流精,光润玉颜。含辞未吐,气若幽兰。华容婀娜,令我忘餐。”曹子建那些千古名句,在脑海中缓缓流过:“秾纤得衷,修短合度,肩若削成,腰如约素。延颈秀项,皓质呈露。芳泽无加,铅华弗御。云髻峨峨,修眉连娟。丹唇外朗,皓齿内鲜。明眸善睐,靥辅承权。瑰姿艳逸,仪静体闲。柔情绰态,媚于语言……”这些句子用在木婉清身上,“这话倒也有理”;但如用之于神仙姊姊,只怕更为适合。想到神仙姊姊的姿容体态,“皎若太阳升朝霞,灼若芙蓉出绿波”,但觉依她吩咐行事,实为人生至乐,心想:“我先来练这‘凌波微步’,此乃逃命之妙法,非害人之手段也,练之有百利而无一害。”', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), '9776bccfdd8643a8b219ccee0596f370': Document(page_content='午饭过后,段誉又练“凌波微步”,走一步,吸一口气,走第二步时将气呼出,六十四卦走完,四肢全无麻痹之感,料想呼吸顺畅,便无害处。第二次再走时连走两步吸一口气,再走两步始行呼出。这“凌波微步”是以动功修习内功,脚步踏遍六十四卦一个周天,内息自然而然地也转了一个周天。因此他每走一遍,内力便有一分进益。\\n\\n这般练了几天,“凌波微步”已走得颇为纯熟,不须再数呼吸,纵然疾行,气息也已无所窒滞。心意既畅,跨步时渐渐想到《洛神赋》中那些与“凌波微步”有关的句子:“仿佛兮若轻云之蔽月,飘飘兮若流风之回雪”,“竦轻躯以鹤立,若将飞而未翔”,“体迅飞凫,飘忽若神”,“动无常则,若危若安。进止难期,若往若还”。\\n\\n\\n\\n百度简介\\n\\n凌波微步是「逍遥派」独门轻功身法,精妙异常。\\n\\n凌波微步乃是一门极上乘的轻功,所以列于卷轴之末,以易经八八六十四卦为基础,使用者按特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。此步法精妙异常,原是要待人练成「北冥神功」,吸人内力,自身内力已【颇为深厚】之后再练。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'})}\n", + "after delete docid to query again: {}\n", + "get existed docid {'1841638988191686991': Document(page_content='《天龙八部》第二回 
玉壁月华明\\n\\n再展帛卷,长卷上源源皆是裸女画像,或立或卧,或现前胸,或见后背。人像的面容都是一般,但或喜或愁,或含情凝眸,或轻嗔薄怒,神情各异。一共有三十六幅图像,每幅像上均有颜色细线,注明穴道部位及练功法诀。\\n\\n帛卷尽处题着“凌波微步”四字,其后绘的是无数足印,注明“妇妹”、“无妄”等等字样,尽是《易经》中的方位。段誉前几日还正全心全意地钻研《易经》,一见到这些名称,登时精神大振,便似遇到故交良友一般。只见足印密密麻麻,不知有几千百个,自一个足印至另一个足印均有绿线贯串,线上绘有箭头,最后写着一行字道:“步法神妙,保身避敌,待积内力,再取敌命。”\\n\\n段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\\n卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), '-4519586577642625749': Document(page_content='《天龙八部》第五回 微步縠纹生\\n\\n卷轴中此外诸种经脉修习之法甚多,皆是取人内力的法门,段誉虽自语宽解,总觉习之有违本性,单是贪多务得,便非好事,当下暂不理会。\\n\\n卷到卷轴末端,又见到了“凌波微步”那四字,登时便想起《洛神赋》中那些句子来:“凌波微步,罗袜生尘……转眄流精,光润玉颜。含辞未吐,气若幽兰。华容婀娜,令我忘餐。”曹子建那些千古名句,在脑海中缓缓流过:“秾纤得衷,修短合度,肩若削成,腰如约素。延颈秀项,皓质呈露。芳泽无加,铅华弗御。云髻峨峨,修眉连娟。丹唇外朗,皓齿内鲜。明眸善睐,靥辅承权。瑰姿艳逸,仪静体闲。柔情绰态,媚于语言……”这些句子用在木婉清身上,“这话倒也有理”;但如用之于神仙姊姊,只怕更为适合。想到神仙姊姊的姿容体态,“皎若太阳升朝霞,灼若芙蓉出绿波”,但觉依她吩咐行事,实为人生至乐,心想:“我先来练这‘凌波微步’,此乃逃命之妙法,非害人之手段也,练之有百利而无一害。”', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), '5028230008472292907': Document(page_content='午饭过后,段誉又练“凌波微步”,走一步,吸一口气,走第二步时将气呼出,六十四卦走完,四肢全无麻痹之感,料想呼吸顺畅,便无害处。第二次再走时连走两步吸一口气,再走两步始行呼出。这“凌波微步”是以动功修习内功,脚步踏遍六十四卦一个周天,内息自然而然地也转了一个周天。因此他每走一遍,内力便有一分进益。\\n\\n这般练了几天,“凌波微步”已走得颇为纯熟,不须再数呼吸,纵然疾行,气息也已无所窒滞。心意既畅,跨步时渐渐想到《洛神赋》中那些与“凌波微步”有关的句子:“仿佛兮若轻云之蔽月,飘飘兮若流风之回雪”,“竦轻躯以鹤立,若将飞而未翔”,“体迅飞凫,飘忽若神”,“动无常则,若危若安。进止难期,若往若还”。\\n\\n\\n\\n百度简介\\n\\n凌波微步是「逍遥派」独门轻功身法,精妙异常。\\n\\n凌波微步乃是一门极上乘的轻功,所以列于卷轴之末,以易经八八六十四卦为基础,使用者按特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。此步法精妙异常,原是要待人练成「北冥神功」,吸人内力,自身内力已【颇为深厚】之后再练。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'})}\n" ] } ], "source": [ "##delete and get function need to maintian docids \n", "##your docid\n", - "res_d=vearch_db.delete(['04bc84fff5074b7b8990441e92e6df07', 'e221906153bb4e03bc7095dadea144de', '126034ba51934093920d8732860f340b'])\n", - 
"print(\"delete docid\",res_d)\n", + "\n", + "res_d=vearch_standalone.delete(['eee5e7468434427eb49829374c1e8220', '2776754da8fc4bb58d3e482006010716', '9223acd6d89d4c2c84ff42677ac0d47c'])\n", + "print(\"delete vearch standalone docid\",res_d)\n", "query = \"你知道vearch是什么吗?\"\n", "response, history = model.chat(tokenizer, query, history=[])\n", "print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n", - "get_id_doc=vearch_db.get(['04bc84fff5074b7b8990441e92e6df07'])\n", - "print(\"after delete docid to query again:\",get_id_doc)\n", - "get_delet_doc=vearch_db.get(['7aae36236f784105a0004d8ff3c7c3ad', '7e495d4e5962497db2080e84d52e75ed'])\n", - "print(\"get existed docid\",get_delet_doc)" + "\n", + "res_cluster=vearch_cluster.delete(['-4311783201092343475', '-2899734009733762895', '1342026762029067927'])\n", + "print(\"delete vearch cluster docid\",res_cluster)\n", + "query_c = \"你知道vearch是什么吗?\"\n", + "response_c, history = model.chat(tokenizer, query_c, history=[])\n", + "print(f\"Human: {query}\\nChatGLM:{response_c}\\n\")\n", + "\n", + "\n", + "get_delet_doc=vearch_standalone.get(['eee5e7468434427eb49829374c1e8220', '2776754da8fc4bb58d3e482006010716', '9223acd6d89d4c2c84ff42677ac0d47c'])\n", + "print(\"after delete docid to query again:\",get_delet_doc)\n", + "get_id_doc=vearch_standalone.get(['18ce6747dca04a2c833e60e8dfd83c04', 'aafacb0e46574b378a9f433877ab06a8', '9776bccfdd8643a8b219ccee0596f370','9223acd6d89d4c2c84ff42677ac0d47c'])\n", + "print(\"get existed docid\",get_id_doc)\n", + "\n", + "get_delet_doc=vearch_cluster.get(['-4311783201092343475', '-2899734009733762895', '1342026762029067927'])\n", + "print(\"after delete docid to query again:\",get_delet_doc)\n", + "get_id_doc=vearch_cluster.get(['1841638988191686991', '-4519586577642625749', '5028230008472292907','1342026762029067927'])\n", + "print(\"get existed docid\",get_id_doc)\n" ] }, { @@ -385,7 +417,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.12 ('langchainGLM6B')", + 
"display_name": "Python 3.10.13 ('vearch_cluster_langchain')", "language": "python", "name": "python3" }, @@ -399,12 +431,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.13" }, "orig_nbformat": 4, "vscode": { "interpreter": { - "hash": "1fd24e7ef183310e43cbf656d21568350c6a30580b6df7fe3b34654b3770f74d" + "hash": "f1da10a89896267ed34b497c9568817f36cc7ea79826b5cfca4d96376f5b4835" } } }, diff --git a/libs/langchain/langchain/vectorstores/__init__.py b/libs/langchain/langchain/vectorstores/__init__.py index cb6e0b17a3..a398166580 100644 --- a/libs/langchain/langchain/vectorstores/__init__.py +++ b/libs/langchain/langchain/vectorstores/__init__.py @@ -73,6 +73,7 @@ from langchain.vectorstores.tigris import Tigris from langchain.vectorstores.typesense import Typesense from langchain.vectorstores.usearch import USearch from langchain.vectorstores.vald import Vald +from langchain.vectorstores.vearch import Vearch from langchain.vectorstores.vectara import Vectara from langchain.vectorstores.weaviate import Weaviate from langchain.vectorstores.zep import ZepVectorStore @@ -137,6 +138,7 @@ __all__ = [ "Typesense", "USearch", "Vald", + "Vearch", "Vectara", "VectorStore", "Weaviate", diff --git a/libs/langchain/langchain/vectorstores/vearch.py b/libs/langchain/langchain/vectorstores/vearch.py index 0dbbb48ca7..85c25a6f8e 100644 --- a/libs/langchain/langchain/vectorstores/vearch.py +++ b/libs/langchain/langchain/vectorstores/vearch.py @@ -13,44 +13,68 @@ from langchain.vectorstores.base import VectorStore if TYPE_CHECKING: import vearch + DEFAULT_TOPN = 4 -class VearchDb(VectorStore): +class Vearch(VectorStore): _DEFAULT_TABLE_NAME = "langchain_vearch" + _DEFAULT_CLUSTER_DB_NAME = "cluster_client_db" + _DEFAULT_VERSION = 1 def __init__( self, embedding_function: Embeddings, + path_or_url: Optional[str] = None, table_name: str = _DEFAULT_TABLE_NAME, - metadata_path: Optional[str] = None, + db_name: 
str = _DEFAULT_CLUSTER_DB_NAME, + flag: int = _DEFAULT_VERSION, **kwargs: Any, ) -> None: - """Initialize vearch vector store""" + """Initialize vearch vector store + flag 1 for cluster,0 for standalone + """ try: - import vearch + if flag: + import vearch_cluster + else: + import vearch except ImportError: raise ValueError( - "Could not import vearch python package. " - "Please install it with `pip install vearch`." + "Could not import suitable python package. " + "Please install it with `pip install vearch or vearch_cluster`." ) - if metadata_path is None: - metadata_path = os.getcwd().replace("\\", "/") - if not os.path.isdir(metadata_path): - os.makedirs(metadata_path) - log_path = os.path.join(metadata_path, "log") - if not os.path.isdir(log_path): - os.makedirs(log_path) - self.vearch_engine = vearch.Engine(metadata_path, log_path) - + if flag: + if path_or_url is None: + raise ValueError("Please input url of cluster") + if not db_name: + db_name = self._DEFAULT_CLUSTER_DB_NAME + db_name += "_" + db_name += str(uuid.uuid4()).split("-")[-1] + self.using_db_name = db_name + self.url = path_or_url + self.vearch = vearch_cluster.VearchCluster(path_or_url) + + else: + if path_or_url is None: + metadata_path = os.getcwd().replace("\\", "/") + else: + metadata_path = path_or_url + if not os.path.isdir(metadata_path): + os.makedirs(metadata_path) + log_path = os.path.join(metadata_path, "log") + if not os.path.isdir(log_path): + os.makedirs(log_path) + self.vearch = vearch.Engine(metadata_path, log_path) + self.using_metapath = metadata_path if not table_name: table_name = self._DEFAULT_TABLE_NAME table_name += "_" table_name += str(uuid.uuid4()).split("-")[-1] self.using_table_name = table_name - self.using_metapath = metadata_path self.embedding_func = embedding_function + self.flag = flag @property def embeddings(self) -> Optional[Embeddings]: @@ -58,13 +82,15 @@ class VearchDb(VectorStore): @classmethod def from_documents( - cls: Type[VearchDb], + cls: 
Type[Vearch], documents: List[Document], embedding: Embeddings, - table_name: str = "langchain_vearch", - metadata_path: Optional[str] = None, + path_or_url: Optional[str] = None, + table_name: str = _DEFAULT_TABLE_NAME, + db_name: str = _DEFAULT_CLUSTER_DB_NAME, + flag: int = _DEFAULT_VERSION, **kwargs: Any, - ) -> VearchDb: + ) -> Vearch: """Return Vearch VectorStore""" texts = [d.page_content for d in documents] @@ -74,27 +100,34 @@ class VearchDb(VectorStore): texts=texts, embedding=embedding, metadatas=metadatas, + path_or_url=path_or_url, table_name=table_name, - metadata_path=metadata_path, + db_name=db_name, + flag=flag, **kwargs, ) @classmethod def from_texts( - cls: Type[VearchDb], + cls: Type[Vearch], texts: List[str], embedding: Embeddings, metadatas: Optional[List[dict]] = None, + path_or_url: Optional[str] = None, table_name: str = _DEFAULT_TABLE_NAME, - metadata_path: Optional[str] = None, + db_name: str = _DEFAULT_CLUSTER_DB_NAME, + flag: int = _DEFAULT_VERSION, **kwargs: Any, - ) -> VearchDb: + ) -> Vearch: """Return Vearch VectorStore""" vearch_db = cls( embedding_function=embedding, + embedding=embedding, + path_or_url=path_or_url, + db_name=db_name, table_name=table_name, - metadata_path=metadata_path, + flag=flag, ) vearch_db.add_texts(texts=texts, metadatas=metadatas) return vearch_db @@ -102,19 +135,20 @@ class VearchDb(VectorStore): def _create_table( self, dim: int = 1024, - filed_list: List[dict] = [ - {"filed": "text", "type": "str"}, - {"filed": "metadata", "type": "str"}, + field_list: List[dict] = [ + {"field": "text", "type": "str"}, + {"field": "metadata", "type": "str"}, ], ) -> int: """ Create VectorStore Table Args: dim:dimension of vector - fileds_list: the filed you want to store + field_list: the field you want to store Return: code,0 for success,1 for failed """ + type_dict = {"int": vearch.dataType.INT, "str": vearch.dataType.STRING} engine_info = { "index_size": 10000, @@ -122,8 +156,8 @@ class VearchDb(VectorStore): 
"retrieval_param": {"ncentroids": 2048, "nsubvector": 32}, } fields = [ - vearch.GammaFieldInfo(fi["filed"], type_dict[fi["type"]]) - for fi in filed_list + vearch.GammaFieldInfo(fi["field"], type_dict[fi["type"]]) + for fi in field_list ] vector_field = vearch.GammaVectorInfo( name="text_embedding", @@ -135,7 +169,7 @@ class VearchDb(VectorStore): store_param={"cache_size": 10000}, has_source=False, ) - response_code = self.vearch_engine.create_table( + response_code = self.vearch.create_table( engine_info, name=self.using_table_name, fields=fields, @@ -143,6 +177,48 @@ class VearchDb(VectorStore): ) return response_code + def _create_space( + self, + dim: int = 1024, + ) -> int: + """ + Create VectorStore space + Args: + dim:dimension of vector + Return: + code,0 failed for ,1 for success + """ + space_config = { + "name": self.using_table_name, + "partition_num": 1, + "replica_num": 1, + "engine": { + "name": "gamma", + "index_size": 1, + "retrieval_type": "FLAT", + "retrieval_param": { + "metric_type": "L2", + }, + }, + "properties": { + "text": { + "type": "string", + }, + "metadata": { + "type": "string", + }, + "text_embedding": { + "type": "vector", + "index": True, + "dimension": dim, + "store_type": "MemoryOnly", + }, + }, + } + response_code = self.vearch.create_space(self.using_db_name, space_config) + + return response_code + def add_texts( self, texts: Iterable[str], @@ -156,64 +232,104 @@ class VearchDb(VectorStore): embeddings = None if self.embedding_func is not None: embeddings = self.embedding_func.embed_documents(list(texts)) - table_path = os.path.join( - self.using_metapath, self.using_table_name + ".schema" - ) - if not os.path.exists(table_path): - if embeddings is None: - raise ValueError("embeddings is None") - dim = len(embeddings[0]) - response_code = self._create_table(dim) - if response_code: - raise ValueError("create table failed!!!") - if embeddings is not None and metadatas is not None: - doc_items = [] - for text, metadata, embed 
in zip(texts, metadatas, embeddings): - profiles: dict[str, Any] = {} - profiles["text"] = text - profiles["metadata"] = metadata["source"] - profiles["text_embedding"] = embed - doc_items.append(profiles) - - docid = self.vearch_engine.add(doc_items) - t_time = 0 - while len(docid) != len(embeddings): - time.sleep(0.5) - if t_time > 6: - break - t_time += 1 - self.vearch_engine.dump() + if embeddings is None: + raise ValueError("embeddings is None") + if self.flag: + dbs_list = self.vearch.list_dbs() + if self.using_db_name not in dbs_list: + create_db_code = self.vearch.create_db(self.using_db_name) + if not create_db_code: + raise ValueError("create db failed!!!") + space_list = self.vearch.list_spaces(self.using_db_name) + if self.using_table_name not in space_list: + create_space_code = self._create_space(len(embeddings[0])) + if not create_space_code: + raise ValueError("create space failed!!!") + docid = [] + if embeddings is not None and metadatas is not None: + for text, metadata, embed in zip(texts, metadatas, embeddings): + profiles: dict[str, Any] = {} + profiles["text"] = text + profiles["metadata"] = metadata["source"] + embed_np = np.array(embed) + profiles["text_embedding"] = { + "feature": (embed_np / np.linalg.norm(embed_np)).tolist() + } + insert_res = self.vearch.insert_one( + self.using_db_name, self.using_table_name, profiles + ) + if insert_res["status"] == 200: + docid.append(insert_res["_id"]) + continue + else: + retry_insert = self.vearch.insert_one( + self.using_db_name, self.using_table_name, profiles + ) + docid.append(retry_insert["_id"]) + continue + else: + table_path = os.path.join( + self.using_metapath, self.using_table_name + ".schema" + ) + if not os.path.exists(table_path): + dim = len(embeddings[0]) + response_code = self._create_table(dim) + if response_code: + raise ValueError("create table failed!!!") + if embeddings is not None and metadatas is not None: + doc_items = [] + for text, metadata, embed in zip(texts, 
metadatas, embeddings): + profiles_v: dict[str, Any] = {} + profiles_v["text"] = text + profiles_v["metadata"] = metadata["source"] + embed_np = np.array(embed) + profiles_v["text_embedding"] = embed_np / np.linalg.norm(embed_np) + doc_items.append(profiles_v) + + docid = self.vearch.add(doc_items) + t_time = 0 + while len(docid) != len(embeddings): + time.sleep(0.5) + if t_time > 6: + break + t_time += 1 + self.vearch.dump() return docid def _load(self) -> None: """ - load vearch engine + load vearch engine for standalone vearch """ - self.vearch_engine.load() + self.vearch.load() @classmethod def load_local( cls, embedding: Embeddings, + path_or_url: Optional[str] = None, table_name: str = _DEFAULT_TABLE_NAME, - metadata_path: Optional[str] = None, + db_name: str = _DEFAULT_CLUSTER_DB_NAME, + flag: int = _DEFAULT_VERSION, **kwargs: Any, - ) -> VearchDb: - """Load the local specified table. + ) -> Vearch: + """Load the local specified table of standalone vearch. Returns: Success or failure of loading the local specified table """ - if not metadata_path: + if not path_or_url: raise ValueError("No metadata path!!!") if not table_name: raise ValueError("No table name!!!") - table_path = os.path.join(metadata_path, table_name + ".schema") + table_path = os.path.join(path_or_url, table_name + ".schema") if not os.path.exists(table_path): raise ValueError("vearch vectorbase table not exist!!!") + vearch_db = cls( embedding_function=embedding, + path_or_url=path_or_url, table_name=table_name, - metadata_path=metadata_path, + db_name=db_name, + flag=flag, ) vearch_db._load() return vearch_db @@ -228,8 +344,6 @@ class VearchDb(VectorStore): Return docs most similar to query. 
""" - if self.vearch_engine is None: - raise ValueError("Vearch engine is None!!!") if self.embedding_func is None: raise ValueError("embedding_func is None!!!") embeddings = self.embedding_func.embed_query(query) @@ -243,7 +357,6 @@ class VearchDb(VectorStore): **kwargs: Any, ) -> List[Document]: """The most k similar documents and scores of the specified query. - Args: embeddings: embedding vector of the query. k: The k most similar documents to the text query. @@ -252,23 +365,45 @@ class VearchDb(VectorStore): The k most similar documents to the specified text query. 0 is dissimilar, 1 is the most similar. """ - query_data = { - "vector": [ - { - "field": "text_embedding", - "feature": np.array(embedding), - } - ], - "fields": [], - "is_brute_search": 1, - "retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20}, - "topn": k, - } - query_result = self.vearch_engine.search(query_data) + embed = np.array(embedding) + if self.flag: + query_data = { + "query": { + "sum": [ + { + "field": "text_embedding", + "feature": (embed / np.linalg.norm(embed)).tolist(), + } + ], + }, + "size": k, + "fields": ["text", "metadata"], + } + query_result = self.vearch.search( + self.using_db_name, self.using_table_name, query_data + ) + res = query_result["hits"]["hits"] + else: + query_data = { + "vector": [ + { + "field": "text_embedding", + "feature": embed / np.linalg.norm(embed), + } + ], + "fields": [], + "is_brute_search": 1, + "retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20}, + "topn": k, + } + query_result = self.vearch.search(query_data) + res = query_result[0]["result_items"] docs = [] - for item in query_result[0]["result_items"]: + for item in res: content = "" meta_data = {} + if self.flag: + item = item["_source"] for item_key in item: if item_key == "text": content = item[item_key] @@ -286,7 +421,6 @@ class VearchDb(VectorStore): **kwargs: Any, ) -> List[Tuple[Document, float]]: """The most k similar documents and scores of the specified 
query. - Args: embeddings: embedding vector of the query. k: The k most similar documents to the text query. @@ -298,23 +432,46 @@ class VearchDb(VectorStore): if self.embedding_func is None: raise ValueError("embedding_func is None!!!") embeddings = self.embedding_func.embed_query(query) - query_data = { - "vector": [ - { - "field": "text_embedding", - "feature": np.array(embeddings), - } - ], - "fields": [], - "is_brute_search": 1, - "retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20}, - "topn": k, - } - query_result = self.vearch_engine.search(query_data) + embed = np.array(embeddings) + if self.flag: + query_data = { + "query": { + "sum": [ + { + "field": "text_embedding", + "feature": (embed / np.linalg.norm(embed)).tolist(), + } + ], + }, + "size": k, + "fields": ["text_embedding", "text", "metadata"], + } + query_result = self.vearch.search( + self.using_db_name, self.using_table_name, query_data + ) + res = query_result["hits"]["hits"] + else: + query_data = { + "vector": [ + { + "field": "text_embedding", + "feature": embed / np.linalg.norm(embed), + } + ], + "fields": [], + "is_brute_search": 1, + "retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20}, + "topn": k, + } + query_result = self.vearch.search(query_data) + res = query_result[0]["result_items"] results: List[Tuple[Document, float]] = [] - for item in query_result[0]["result_items"]: + for item in res: content = "" meta_data = {} + if self.flag: + score = item["_score"] + item = item["_source"] for item_key in item: if item_key == "text": content = item[item_key] @@ -322,7 +479,7 @@ class VearchDb(VectorStore): if item_key == "metadata": meta_data["source"] = item[item_key] continue - if item_key == "score": + if self.flag != 1 and item_key == "score": score = item[item_key] continue tmp_res = (Document(page_content=content, metadata=meta_data), score) @@ -351,14 +508,16 @@ class VearchDb(VectorStore): Optional[bool]: True if deletion is successful. 
False otherwise, None if not implemented. """ - if self.vearch_engine is None: - raise ValueError("Verach Engine is None!!!") + ret: Optional[bool] = None tmp_res = [] if ids is None or ids.__len__() == 0: return ret for _id in ids: - ret = self.vearch_engine.del_doc(_id) + if self.flag: + ret = self.vearch.delete(self.using_db_name, self.using_table_name, _id) + else: + ret = self.vearch.del_doc(_id) tmp_res.append(ret) ret = all(i == 0 for i in tmp_res) return ret @@ -376,26 +535,44 @@ class VearchDb(VectorStore): Documents which satisfy the input conditions. """ - if self.vearch_engine is None: - raise ValueError("vearch engine is None!!!") results: Dict[str, Document] = {} if ids is None or ids.__len__() == 0: return results - for id in ids: - docs_detail = self.vearch_engine.get_doc_by_id(id) - if docs_detail == {}: - continue - - content = "" - meta_info = {} - for field in docs_detail: - if field == "text": - content = docs_detail[field] + if self.flag: + query_data = {"query": {"ids": ids}} + docs_detail = self.vearch.mget_by_ids( + self.using_db_name, self.using_table_name, query_data + ) + for record in docs_detail: + if record["found"] is False: continue - elif field == "metadata": - meta_info["source"] = docs_detail[field] + content = "" + meta_info = {} + for field in record["_source"]: + if field == "text": + content = record["_source"][field] + continue + elif field == "metadata": + meta_info["source"] = record["_source"][field] + continue + results[record["_id"]] = Document( + page_content=content, metadata=meta_info + ) + else: + for id in ids: + docs_detail = self.vearch.get_doc_by_id(id) + if docs_detail == {}: continue - results[docs_detail["_id"]] = Document( - page_content=content, metadata=meta_info - ) + content = "" + meta_info = {} + for field in docs_detail: + if field == "text": + content = docs_detail[field] + continue + elif field == "metadata": + meta_info["source"] = docs_detail[field] + continue + results[docs_detail["_id"]] = 
Document( + page_content=content, metadata=meta_info + ) return results