From 62fa2bc51808531e31502a3c0c056204550d517d Mon Sep 17 00:00:00 2001 From: zhanghexian <96572405+zhanghexian@users.noreply.github.com> Date: Sat, 9 Sep 2023 07:51:14 +0800 Subject: [PATCH] Add Vearch vectorstore (#9846) --------- Co-authored-by: zhanghexian1 Co-authored-by: Bagatur Co-authored-by: Harrison Chase --- .../integrations/vectorstores/vearch.ipynb | 413 ++++++++++++++++++ docs/integrations/vearch.md | 15 + .../langchain/vectorstores/vearch.py | 401 +++++++++++++++++ .../vectorstores/test_vearch.py | 97 ++++ 4 files changed, 926 insertions(+) create mode 100644 docs/extras/integrations/vectorstores/vearch.ipynb create mode 100644 docs/integrations/vearch.md create mode 100644 libs/langchain/langchain/vectorstores/vearch.py create mode 100644 tests/integration_tests/vectorstores/test_vearch.py diff --git a/docs/extras/integrations/vectorstores/vearch.ipynb b/docs/extras/integrations/vectorstores/vearch.ipynb new file mode 100644 index 0000000000..8e14c12369 --- /dev/null +++ b/docs/extras/integrations/vectorstores/vearch.ipynb @@ -0,0 +1,413 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/export/anaconda3/envs/langchainGLM6B/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "INFO 2023-08-28 18:26:07,485-1d: \n", + "loading model config\n", + "llm device: cuda\n", + "embedding device: cuda\n", + "dir: /data/zhx/zhx/langchain-ChatGLM_new\n", + "flagging username: e2fc35b8e87c4de18d692e951a5f7c46\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 7/7 [00:06<00:00, 1.01it/s]\n" + ] + } + ], + "source": [ + "\n", + "import os, sys, torch\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel\n", + "from langchain import HuggingFacePipeline, ConversationChain\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "from langchain.vectorstores.vearch import VearchDb\n", + "from langchain.document_loaders import TextLoader\n", + "from langchain.prompts import PromptTemplate\n", + "from langchain.chains import RetrievalQA\n", + "from langchain.embeddings.huggingface import HuggingFaceEmbeddings\n", + "\n", + "# your local model path\n", + "model_path =\"/data/zhx/zhx/langchain-ChatGLM_new/chatglm2-6b\" \n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n", + "model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Human: 你好!\n", + "ChatGLM:你好👋!我是人工智能助手 ChatGLM2-6B,很高兴见到你,欢迎问我任何问题。\n", + "\n", + "Human: 你知道凌波微步吗,你知道都有谁学会了吗?\n", + "ChatGLM:凌波微步是一种步伐,最早出自于《倚天屠龙记》。在小说中,灭绝师太曾因与练习凌波微步的杨过的恩怨纠葛,而留下了一部经书,内容是记载凌波微步的起源和作用。后来,凌波微步便成为杨过和小龙女的感情象征。在现实生活中,凌波微步是一句口号,是清华大学学生社团“模型社”的社训。\n", + "\n" + ] + } + ], + "source": [ + "query = \"你好!\"\n", + "response, history = model.chat(tokenizer, query, 
history=[])\n", + "print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n", + "query = \"你知道凌波微步吗,你知道都有谁学会了吗?\"\n", + "response, history = model.chat(tokenizer, query, history=history)\n", + "print(f\"Human: {query}\\nChatGLM:{response}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO 2023-08-28 18:27:36,037-1d: Load pretrained SentenceTransformer: /data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese\n", + "WARNING 2023-08-28 18:27:36,038-1d: No sentence-transformers model found with name /data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese. Creating a new one with MEAN pooling.\n", + "INFO 2023-08-28 18:27:38,936-1d: Use pytorch device: cuda\n" + ] + } + ], + "source": [ + "# Add your local knowledge files\n", + "file_path = \"/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt\"#Your local file path\"\n", + "loader = TextLoader(file_path,encoding=\"utf-8\")\n", + "documents = loader.load()\n", + "\n", + "# split text into sentences and embedding the sentences\n", + "text_splitter = RecursiveCharacterTextSplitter(\n", + " chunk_size=500, chunk_overlap=100)\n", + "texts = text_splitter.split_documents(documents)\n", + "\n", + "#your model path\n", + "embedding_path = '/data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese'\n", + "embeddings = HuggingFaceEmbeddings(model_name=embedding_path)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 1/1 [00:00<00:00, 4.56it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['7aae36236f784105a0004d8ff3c7c3ad', '7e495d4e5962497db2080e84d52e75ed', '9a640124fc324a8abb0eaa31acb638b7']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], 
+ "source": [ + "#first add your document into vearch vectorstore\n", + "vearch_db = VearchDb.from_documents(texts,embeddings,table_name=\"your_table_name\",metadata_path=\"/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/your_table_name\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 1/1 [00:00<00:00, 22.49it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "####################第1段相关文档####################\n", + "\n", + "午饭过后,段誉又练“凌波微步”,走一步,吸一口气,走第二步时将气呼出,六十四卦走完,四肢全无麻痹之感,料想呼吸顺畅,便无害处。第二次再走时连走两步吸一口气,再走两步始行呼出。这“凌波微步”是以动功修习内功,脚步踏遍六十四卦一个周天,内息自然而然地也转了一个周天。因此他每走一遍,内力便有一分进益。\n", + "\n", + "这般练了几天,“凌波微步”已走得颇为纯熟,不须再数呼吸,纵然疾行,气息也已无所窒滞。心意既畅,跨步时渐渐想到《洛神赋》中那些与“凌波微步”有关的句子:“仿佛兮若轻云之蔽月,飘飘兮若流风之回雪”,“竦轻躯以鹤立,若将飞而未翔”,“体迅飞凫,飘忽若神”,“动无常则,若危若安。进止难期,若往若还”。\n", + "\n", + "\n", + "\n", + "百度简介\n", + "\n", + "凌波微步是「逍遥派」独门轻功身法,精妙异常。\n", + "\n", + "凌波微步乃是一门极上乘的轻功,所以列于卷轴之末,以易经八八六十四卦为基础,使用者按特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。此步法精妙异常,原是要待人练成「北冥神功」,吸人内力,自身内力已【颇为深厚】之后再练。\n", + "\n", + "####################第2段相关文档####################\n", + "\n", + "《天龙八部》第五回 微步縠纹生\n", + "\n", + "卷轴中此外诸种经脉修习之法甚多,皆是取人内力的法门,段誉虽自语宽解,总觉习之有违本性,单是贪多务得,便非好事,当下暂不理会。\n", + "\n", + "卷到卷轴末端,又见到了“凌波微步”那四字,登时便想起《洛神赋》中那些句子来:“凌波微步,罗袜生尘……转眄流精,光润玉颜。含辞未吐,气若幽兰。华容婀娜,令我忘餐。”曹子建那些千古名句,在脑海中缓缓流过:“秾纤得衷,修短合度,肩若削成,腰如约素。延颈秀项,皓质呈露。芳泽无加,铅华弗御。云髻峨峨,修眉连娟。丹唇外朗,皓齿内鲜。明眸善睐,靥辅承权。瑰姿艳逸,仪静体闲。柔情绰态,媚于语言……”这些句子用在木婉清身上,“这话倒也有理”;但如用之于神仙姊姊,只怕更为适合。想到神仙姊姊的姿容体态,“皎若太阳升朝霞,灼若芙蓉出绿波”,但觉依她吩咐行事,实为人生至乐,心想:“我先来练这‘凌波微步’,此乃逃命之妙法,非害人之手段也,练之有百利而无一害。”\n", + "\n", + "####################第3段相关文档####################\n", + "\n", + "《天龙八部》第二回 玉壁月华明\n", + "\n", + "再展帛卷,长卷上源源皆是裸女画像,或立或卧,或现前胸,或见后背。人像的面容都是一般,但或喜或愁,或含情凝眸,或轻嗔薄怒,神情各异。一共有三十六幅图像,每幅像上均有颜色细线,注明穴道部位及练功法诀。\n", + "\n", + 
"帛卷尽处题着“凌波微步”四字,其后绘的是无数足印,注明“妇妹”、“无妄”等等字样,尽是《易经》中的方位。段誉前几日还正全心全意地钻研《易经》,一见到这些名称,登时精神大振,便似遇到故交良友一般。只见足印密密麻麻,不知有几千百个,自一个足印至另一个足印均有绿线贯串,线上绘有箭头,最后写着一行字道:“步法神妙,保身避敌,待积内力,再取敌命。”\n", + "\n", + "段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\n", + "卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。\n", + "\n", + "********ChatGLM:凌波微步是一种轻功身法,属于逍遥派独门轻功。它以《易经》中的六十四卦为基础,按照特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。凌波微步精妙异常,可以让人内力相助,自身内力颇为深厚之后再练。《天龙八部》第五回中有描述。\n", + "\n" + ] + } + ], + "source": [ + "\n", + "res=vearch_db.similarity_search(query, 3)\n", + "query = \"你知道凌波微步吗,你知道都有谁会凌波微步?\"\n", + "for idx,tmp in enumerate(res): \n", + " print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n", + "\n", + "# combine your local knowleadge and query \n", + "context = \"\".join([tmp.page_content for tmp in res])\n", + "new_query = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context} \\n 回答用户这个问题:{query}\\n\\n\"\n", + "response, history = model.chat(tokenizer, new_query, history=[])\n", + "print(f\"********ChatGLM:{response}\\n\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Human: 你知道vearch是什么吗?\n", + "ChatGLM:是的,我知道 Vearch。Vearch 是一种矩阵分解 technique,用于将矩阵分解为若干个不可约矩阵的乘积。它是由 Linus Torvalds 开发的,旨在提高 Linux 内核中矩阵操作的性能。\n", + "\n", + "Vearch 可以通过使用特殊的操作来对矩阵进行操作,从而避免了使用昂贵的矩阵操作库。它也被广泛用于其他操作系统中,如 FreeBSD 和 Solaris。\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 1/1 [00:00<00:00, 31.59it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['04bc84fff5074b7b8990441e92e6df07', 'e221906153bb4e03bc7095dadea144de', '126034ba51934093920d8732860f340b']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + 
"['04bc84fff5074b7b8990441e92e6df07',\n", + " 'e221906153bb4e03bc7095dadea144de',\n", + " '126034ba51934093920d8732860f340b']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"你知道vearch是什么吗?\"\n", + "response, history = model.chat(tokenizer, query, history=history)\n", + "print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n", + "\n", + "\n", + "vearch_info = [\"Vearch 是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量,可用于基于个人知识库的大模型应用\",\n", + " \"Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库\",\n", + " \"vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装\"]\n", + "vearch_source=[{'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},{'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},{'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'}]\n", + "vearch_db.add_texts(vearch_info,vearch_source)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 1/1 [00:00<00:00, 25.57it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "####################第1段相关文档####################\n", + "\n", + "Vearch 是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量,可用于基于个人知识库的大模型应用\n", + "\n", + "####################第2段相关文档####################\n", + "\n", + "Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库\n", + "\n", + "####################第3段相关文档####################\n", + "\n", + "vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装\n", + "\n", + "***************ChatGLM:是的,Varch是一个向量数据库,旨在存储和快速搜索模型embedding后的向量。它支持OpenAI、Llama和ChatGLM等模型,并可以直接通过pip安装。Varch是一个基于C语言和Go语言开发的项目,并提供了Python接口。\n", + "\n" + ] + } + ], + "source": [ + "query3 = \"你知道vearch是什么吗?\"\n", + "res1 = vearch_db.similarity_search(query3, 3)\n", + "for idx,tmp in enumerate(res1): \n", + " 
print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n", + "\n", + "context1 = \"\".join([tmp.page_content for tmp in res1])\n", + "new_query1 = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context1} \\n 回答用户这个问题:{query3}\\n\\n\"\n", + "response, history = model.chat(tokenizer, new_query1, history=[])\n", + "\n", + "print(f\"***************ChatGLM:{response}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "delete docid True\n", + "Human: 你知道vearch是什么吗?\n", + "ChatGLM:Vearch是一种高分子化合物,也称为聚合物、高分子材料或合成材料。它是由重复单元组成的大型聚合物,通常由一些重复单元组成,这些单元在聚合过程中结合在一起形成一个连续的高分子链。\n", + "\n", + "Vearch具有许多独特的性质,例如高强度、高刚性、耐磨、耐腐蚀、耐高温等。它们通常用于制造各种应用,例如塑料制品、橡胶、纤维、建筑材料等。\n", + "\n", + "after delete docid to query again: {}\n", + "get existed docid {'7aae36236f784105a0004d8ff3c7c3ad': Document(page_content='《天龙八部》第二回 玉壁月华明\\n\\n再展帛卷,长卷上源源皆是裸女画像,或立或卧,或现前胸,或见后背。人像的面容都是一般,但或喜或愁,或含情凝眸,或轻嗔薄怒,神情各异。一共有三十六幅图像,每幅像上均有颜色细线,注明穴道部位及练功法诀。\\n\\n帛卷尽处题着“凌波微步”四字,其后绘的是无数足印,注明“妇妹”、“无妄”等等字样,尽是《易经》中的方位。段誉前几日还正全心全意地钻研《易经》,一见到这些名称,登时精神大振,便似遇到故交良友一般。只见足印密密麻麻,不知有几千百个,自一个足印至另一个足印均有绿线贯串,线上绘有箭头,最后写着一行字道:“步法神妙,保身避敌,待积内力,再取敌命。”\\n\\n段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\\n卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), '7e495d4e5962497db2080e84d52e75ed': Document(page_content='《天龙八部》第五回 微步縠纹生\\n\\n卷轴中此外诸种经脉修习之法甚多,皆是取人内力的法门,段誉虽自语宽解,总觉习之有违本性,单是贪多务得,便非好事,当下暂不理会。\\n\\n卷到卷轴末端,又见到了“凌波微步”那四字,登时便想起《洛神赋》中那些句子来:“凌波微步,罗袜生尘……转眄流精,光润玉颜。含辞未吐,气若幽兰。华容婀娜,令我忘餐。”曹子建那些千古名句,在脑海中缓缓流过:“秾纤得衷,修短合度,肩若削成,腰如约素。延颈秀项,皓质呈露。芳泽无加,铅华弗御。云髻峨峨,修眉连娟。丹唇外朗,皓齿内鲜。明眸善睐,靥辅承权。瑰姿艳逸,仪静体闲。柔情绰态,媚于语言……”这些句子用在木婉清身上,“这话倒也有理”;但如用之于神仙姊姊,只怕更为适合。想到神仙姊姊的姿容体态,“皎若太阳升朝霞,灼若芙蓉出绿波”,但觉依她吩咐行事,实为人生至乐,心想:“我先来练这‘凌波微步’,此乃逃命之妙法,非害人之手段也,练之有百利而无一害。”', 
metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'})}\n" + ] + } + ], + "source": [ + "##delete and get function need to maintian docids \n", + "##your docid\n", + "res_d=vearch_db.delete(['04bc84fff5074b7b8990441e92e6df07', 'e221906153bb4e03bc7095dadea144de', '126034ba51934093920d8732860f340b'])\n", + "print(\"delete docid\",res_d)\n", + "query = \"你知道vearch是什么吗?\"\n", + "response, history = model.chat(tokenizer, query, history=[])\n", + "print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n", + "get_id_doc=vearch_db.get(['04bc84fff5074b7b8990441e92e6df07'])\n", + "print(\"after delete docid to query again:\",get_id_doc)\n", + "get_delet_doc=vearch_db.get(['7aae36236f784105a0004d8ff3c7c3ad', '7e495d4e5962497db2080e84d52e75ed'])\n", + "print(\"get existed docid\",get_delet_doc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.12 ('langchainGLM6B')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "1fd24e7ef183310e43cbf656d21568350c6a30580b6df7fe3b34654b3770f74d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/integrations/vearch.md b/docs/integrations/vearch.md new file mode 100644 index 0000000000..da61bec98c --- /dev/null +++ b/docs/integrations/vearch.md @@ -0,0 +1,15 @@ +# Vearch + +Vearch is a scalable 
distributed system for efficient similarity search of deep learning vectors. + +# Installation and Setup + +The Vearch Python SDK lets you run Vearch locally. Install it with `pip install vearch`. + +# Vectorstore + +Vearch can also be used as a vectorstore. For details, see [this notebook](docs/modules/indexes/vectorstores/examples/vearch.ipynb). + +```python +from langchain.vectorstores.vearch import VearchDb +``` diff --git a/libs/langchain/langchain/vectorstores/vearch.py b/libs/langchain/langchain/vectorstores/vearch.py new file mode 100644 index 0000000000..99706d2e98 --- /dev/null +++ b/libs/langchain/langchain/vectorstores/vearch.py @@ -0,0 +1,401 @@ +from __future__ import annotations + +import os +import time +import uuid +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type + +import numpy as np + +from langchain.docstore.document import Document +from langchain.embeddings.base import Embeddings +from langchain.vectorstores.base import VectorStore + +if TYPE_CHECKING: + import vearch +DEFAULT_TOPN = 4 + + +class VearchDb(VectorStore): + _DEFAULT_TABLE_NAME = "langchain_vearch" + + def __init__( + self, + embedding_function: Embeddings, + table_name: str = _DEFAULT_TABLE_NAME, + metadata_path: Optional[str] = None, + **kwargs: Any, + ) -> None: + """Initialize vearch vector store""" + try: + import vearch + except ImportError: + raise ValueError( + "Could not import vearch python package. " + "Please install it with `pip install vearch`."
class VearchDb(VectorStore):
    """Vector store backed by a local ``vearch.Engine``.

    Every stored record has three fields: ``text`` (the document content),
    ``metadata`` (only the ``"source"`` entry of the document metadata) and
    ``text_embedding`` (the embedding vector of the text).
    """

    _DEFAULT_TABLE_NAME = "langchain_vearch"

    def __init__(
        self,
        embedding_function: Embeddings,
        table_name: str = _DEFAULT_TABLE_NAME,
        metadata_path: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Create the engine and remember the table name and storage path.

        Args:
            embedding_function: Embeddings used for documents and queries.
            table_name: Name of the table. An empty name falls back to the
                default name plus a random suffix.
            metadata_path: Directory handed to the engine; defaults to the
                current working directory. It and a ``log`` sub-directory
                are created when missing.
            **kwargs: Accepted for interface compatibility; unused.

        Raises:
            ValueError: If the ``vearch`` package cannot be imported.
        """
        try:
            import vearch
        except ImportError:
            raise ValueError(
                "Could not import vearch python package. "
                "Please install it with `pip install vearch`."
            )

        if metadata_path is None:
            metadata_path = os.getcwd().replace("\\", "/")
        log_path = os.path.join(metadata_path, "log")
        # Creating the log directory also creates metadata_path itself.
        os.makedirs(log_path, exist_ok=True)
        self.vearch_engine = vearch.Engine(metadata_path, log_path)

        if not table_name:
            # NOTE(review): the random suffix is applied only to the fallback
            # name; an explicit name must stay untouched because load_local()
            # and add_texts() look for "<table_name>.schema" -- confirm
            # against the upstream file, whose indentation is lost here.
            table_name = self._DEFAULT_TABLE_NAME
            table_name += "_"
            table_name += str(uuid.uuid4()).split("-")[-1]
        self.using_table_name = table_name
        self.using_metapath = metadata_path
        self.embedding_func = embedding_function

    @property
    def embeddings(self) -> Optional[Embeddings]:
        """Embedding function given at construction time."""
        return self.embedding_func

    @classmethod
    def from_documents(
        cls: Type[VearchDb],
        documents: List[Document],
        embedding: Embeddings,
        table_name: str = "langchain_vearch",
        metadata_path: Optional[str] = None,
        **kwargs: Any,
    ) -> VearchDb:
        """Build a store from documents by delegating to ``from_texts``."""
        texts = [d.page_content for d in documents]
        metadatas = [d.metadata for d in documents]

        return cls.from_texts(
            texts=texts,
            embedding=embedding,
            metadatas=metadatas,
            table_name=table_name,
            metadata_path=metadata_path,
            **kwargs,
        )

    @classmethod
    def from_texts(
        cls: Type[VearchDb],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        table_name: str = _DEFAULT_TABLE_NAME,
        metadata_path: Optional[str] = None,
        **kwargs: Any,
    ) -> VearchDb:
        """Create a store and add ``texts`` (with optional ``metadatas``)."""
        vearch_db = cls(
            embedding_function=embedding,
            table_name=table_name,
            metadata_path=metadata_path,
        )
        vearch_db.add_texts(texts=texts, metadatas=metadatas)
        return vearch_db

    def _create_table(
        self,
        dim: int = 1024,
        filed_list: Optional[List[dict]] = None,
    ) -> int:
        """Create the engine table used by this store.

        Args:
            dim: Dimension of the embedding vectors.
            filed_list: Scalar fields to create, each a dict with the keys
                ``"filed"`` (name) and ``"type"`` (``"int"`` or ``"str"``).
                Defaults to the ``text`` and ``metadata`` string fields.

        Returns:
            The engine's response code; ``add_texts`` treats any non-zero
            value as failure.
        """
        # `vearch` is only imported under TYPE_CHECKING at module level, so it
        # must be imported here to exist at runtime.
        import vearch

        if filed_list is None:
            # Built per call instead of using a shared mutable default.
            filed_list = [
                {"filed": "text", "type": "str"},
                {"filed": "metadata", "type": "str"},
            ]
        type_dict = {"int": vearch.dataType.INT, "str": vearch.dataType.STRING}
        engine_info = {
            "index_size": 10000,
            "retrieval_type": "IVFPQ",
            "retrieval_param": {"ncentroids": 2048, "nsubvector": 32},
        }
        fields = [
            vearch.GammaFieldInfo(fi["filed"], type_dict[fi["type"]])
            for fi in filed_list
        ]
        vector_field = vearch.GammaVectorInfo(
            name="text_embedding",
            type=vearch.dataType.VECTOR,
            is_index=True,
            dimension=dim,
            model_id="",
            store_type="MemoryOnly",
            store_param={"cache_size": 10000},
            has_source=False,
        )
        response_code = self.vearch_engine.create_table(
            engine_info,
            name=self.using_table_name,
            fields=fields,
            vector_field=vector_field,
        )
        return response_code

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed ``texts`` and add them to the table, creating it if needed.

        Args:
            texts: Texts to store. Consumed once, so generators are fine.
            metadatas: Optional per-text metadata; only the ``"source"``
                entry is stored (empty string when absent).
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The ids returned by the engine for the added records; an empty
            list when ``texts`` is empty.

        Raises:
            ValueError: If there is no embedding function, if ``metadatas``
                and ``texts`` differ in length, or if the table cannot be
                created.
        """
        text_list = list(texts)
        if not text_list:
            return []
        if self.embedding_func is None:
            raise ValueError("embeddings is None")
        if metadatas is None:
            metadatas = [{} for _ in text_list]
        elif len(metadatas) != len(text_list):
            # zip() would otherwise drop the unmatched texts silently.
            raise ValueError("texts and metadatas must have the same length")

        embeddings = self.embedding_func.embed_documents(text_list)

        table_path = os.path.join(
            self.using_metapath, self.using_table_name + ".schema"
        )
        if not os.path.exists(table_path):
            dim = len(embeddings[0])
            response_code = self._create_table(dim)
            if response_code:
                raise ValueError("create table failed!!!")

        doc_items = []
        for text, metadata, embed in zip(text_list, metadatas, embeddings):
            profiles: Dict[str, Any] = {}
            profiles["text"] = text
            profiles["metadata"] = metadata.get("source", "")
            profiles["text_embedding"] = embed
            doc_items.append(profiles)

        docid = self.vearch_engine.add(doc_items)
        # NOTE(review): bounded wait kept from the original code; it only
        # helps if the engine fills the returned list asynchronously --
        # TODO confirm against the vearch SDK.
        t_time = 0
        while len(docid) != len(embeddings):
            time.sleep(0.5)
            if t_time > 6:
                break
            t_time += 1
        self.vearch_engine.dump()
        return docid

    def _load(self) -> None:
        """Call ``load()`` on the underlying engine."""
        self.vearch_engine.load()

    @classmethod
    def load_local(
        cls,
        embedding: Embeddings,
        table_name: str = _DEFAULT_TABLE_NAME,
        metadata_path: Optional[str] = None,
        **kwargs: Any,
    ) -> VearchDb:
        """Open an existing local table.

        Args:
            embedding: Embeddings used for later queries and additions.
            table_name: Name of the table to open.
            metadata_path: Directory containing ``<table_name>.schema``.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A store whose engine has been loaded.

        Raises:
            ValueError: If ``metadata_path`` or ``table_name`` is empty, or
                the schema file does not exist.
        """
        if not metadata_path:
            raise ValueError("No metadata path!!!")
        if not table_name:
            raise ValueError("No table name!!!")
        table_path = os.path.join(metadata_path, table_name + ".schema")
        if not os.path.exists(table_path):
            raise ValueError("vearch vectorbase table not exist!!!")
        vearch_db = cls(
            embedding_function=embedding,
            table_name=table_name,
            metadata_path=metadata_path,
        )
        vearch_db._load()
        return vearch_db

    def _search_items(self, embedding: List[float], k: int) -> List[Any]:
        """Run a brute-force inner-product search and return the raw items."""
        query_data = {
            "vector": [
                {
                    "field": "text_embedding",
                    "feature": np.array(embedding),
                }
            ],
            "fields": [],
            "is_brute_search": 1,
            "retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20},
            "topn": k,
        }
        query_result = self.vearch_engine.search(query_data)
        return query_result[0]["result_items"]

    @staticmethod
    def _item_to_document(item: Any) -> Document:
        """Build a Document from the ``text``/``metadata`` fields of an item."""
        content = item["text"] if "text" in item else ""
        meta_data: Dict[str, Any] = {}
        if "metadata" in item:
            meta_data["source"] = item["metadata"]
        return Document(page_content=content, metadata=meta_data)

    def similarity_search(
        self,
        query: str,
        k: int = DEFAULT_TOPN,
        **kwargs: Any,
    ) -> List[Document]:
        """Return the ``k`` documents most similar to ``query``.

        Raises:
            ValueError: If the engine or the embedding function is missing.
        """
        if self.vearch_engine is None:
            raise ValueError("Vearch engine is None!!!")
        if self.embedding_func is None:
            raise ValueError("embedding_func is None!!!")
        embeddings = self.embedding_func.embed_query(query)
        docs = self.similarity_search_by_vector(embeddings, k)
        return docs

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = DEFAULT_TOPN,
        **kwargs: Any,
    ) -> List[Document]:
        """Return the ``k`` documents most similar to ``embedding``.

        Args:
            embedding: Embedding vector of the query.
            k: Number of documents to request from the engine.

        Returns:
            The matching documents, in the order the engine returned them.
        """
        return [
            self._item_to_document(item)
            for item in self._search_items(embedding, k)
        ]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = DEFAULT_TOPN,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return the ``k`` most similar documents with the engine's scores.

        Args:
            query: Text to search for.
            k: Number of documents to request from the engine.

        Returns:
            ``(document, score)`` pairs; ``score`` is the ``"score"`` field
            of each result item, as returned by the inner-product search.

        Raises:
            ValueError: If the embedding function is missing.
            KeyError: If a result item carries no ``"score"`` field.
        """
        if self.embedding_func is None:
            raise ValueError("embedding_func is None!!!")
        embeddings = self.embedding_func.embed_query(query)
        results: List[Tuple[Document, float]] = []
        for item in self._search_items(embeddings, k):
            # Read the score per item so a missing field can never reuse the
            # value left over from a previous item.
            results.append((self._item_to_document(item), item["score"]))
        return results

    def _similarity_search_with_relevance_scores(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return the result of ``similarity_search_with_score`` unchanged."""
        return self.similarity_search_with_score(query, k, **kwargs)

    def delete(
        self,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Optional[bool]:
        """Delete the documents with the given ids.

        Args:
            ids: Ids of the documents to delete.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            ``None`` when no ids are given; otherwise ``True`` only if the
            engine returned ``0`` for every id.

        Raises:
            ValueError: If the engine is missing.
        """
        if self.vearch_engine is None:
            raise ValueError("Verach Engine is None!!!")
        if not ids:
            return None
        # Delete every id before judging, so one failure does not stop the
        # remaining deletions.
        codes = [self.vearch_engine.del_doc(_id) for _id in ids]
        return all(code == 0 for code in codes)

    def get(
        self,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Dict[str, Document]:
        """Fetch documents by id.

        Args:
            ids: Ids of the documents to fetch.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A mapping from each found record's ``"_id"`` to its Document;
            ids for which the engine returns ``{}`` are skipped.

        Raises:
            ValueError: If the engine is missing.
        """
        if self.vearch_engine is None:
            raise ValueError("vearch engine is None!!!")
        results: Dict[str, Document] = {}
        if not ids:
            return results
        for doc_id in ids:
            docs_detail = self.vearch_engine.get_doc_by_id(doc_id)
            if docs_detail == {}:
                continue
            results[docs_detail["_id"]] = self._item_to_document(docs_detail)
        return results
from langchain.docstore.document import Document
from langchain.vectorstores.vearch import VearchDb
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings

# Shared fixtures: three short texts that all carry the same "source" value.
_SOURCE = "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt"
_TEXTS = [
    "Vearch 是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量,可用于基于个人知识库的大模型应用",
    "Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库",
    "vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装",
]
_QUERY = "Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库"


def _build_store() -> VearchDb:
    """Create a store holding ``_TEXTS``, one metadata dict per text."""
    return VearchDb.from_texts(
        texts=_TEXTS,
        embedding=FakeEmbeddings(),
        metadatas=[{"source": _SOURCE} for _ in _TEXTS],
        table_name="test_vearch",
        metadata_path="./",
    )


def test_vearch() -> None:
    """Create a store end to end, add vectors to it and search it."""
    vearch_db = _build_store()
    result = vearch_db.similarity_search(_QUERY, 1)
    assert result == [
        Document(page_content=_QUERY, metadata={"source": _SOURCE}),
    ]


def test_vearch_add_texts() -> None:
    """Add one more text to an existing store and find both copies."""
    vearch_db = _build_store()

    # `metadatas` must be a list with one dict per text. A bare dict would be
    # iterated key by key, so add_texts would index the string "source".
    vearch_db.add_texts(
        texts=[_QUERY],
        metadatas=[{"source": _SOURCE}],
    )
    result = vearch_db.similarity_search(_QUERY, 2)

    assert result == [
        Document(page_content=_QUERY, metadata={"source": _SOURCE}),
        Document(page_content=_QUERY, metadata={"source": _SOURCE}),
    ]