diff --git a/cookbook/multi_modal_RAG_vdms.ipynb b/cookbook/multi_modal_RAG_vdms.ipynb new file mode 100644 index 0000000000..01bdd28eb2 --- /dev/null +++ b/cookbook/multi_modal_RAG_vdms.ipynb @@ -0,0 +1,526 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9fc3897d-176f-4729-8fd1-cfb4add53abd", + "metadata": {}, + "source": [ + "## VDMS multi-modal RAG\n", + "\n", + "Many documents contain a mixture of content types, including text and images. \n", + "\n", + "Yet, information captured in images is lost in most RAG applications.\n", + "\n", + "With the emergence of multimodal LLMs, like [GPT-4V](https://openai.com/research/gpt-4v-system-card), it is worth considering how to utilize images in RAG. \n", + "\n", + "This cookbook highlights: \n", + "* Use of [Unstructured](https://unstructured.io/) to parse images, text, and tables from documents (PDFs).\n", + "* Use of multimodal embeddings (such as [CLIP](https://openai.com/research/clip)) to embed images and text\n", + "* Use of [VDMS](https://github.com/IntelLabs/vdms/blob/master/README.md) as a vector store with support for multi-modal\n", + "* Retrieval of both images and text using similarity search\n", + "* Passing raw images and text chunks to a multimodal LLM for answer synthesis \n", + "\n", + "\n", + "## Packages\n", + "\n", + "For `unstructured`, you will also need `poppler` ([installation instructions](https://pdf2image.readthedocs.io/en/latest/installation.html)) and `tesseract` ([installation instructions](https://tesseract-ocr.github.io/tessdoc/Installation.html)) in your system." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "febbc459-ebba-4c1a-a52b-fed7731593f8", + "metadata": {}, + "outputs": [], + "source": [ + "# (newest versions required for multi-modal)\n", + "! pip install --quiet -U vdms langchain-experimental\n", + "\n", + "# lock to 0.10.19 due to a persistent bug in more recent versions\n", + "! 
pip install --quiet pdf2image \"unstructured[all-docs]==0.10.19\" pillow pydantic lxml open_clip_torch" + ] + }, + { + "cell_type": "markdown", + "id": "6a6b6e73", + "metadata": {}, + "source": [ + "## Start VDMS Server\n", + "\n", + "Let's start a VDMS docker using port 55559 instead of default 55555. \n", + "Keep note of the port and hostname as this is needed for the vector store as it uses the VDMS Python client to connect to the server." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5f483872", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "docker: Error response from daemon: Conflict. The container name \"/vdms_rag_nb\" is already in use by container \"0c19ed281463ac10d7efe07eb815643e3e534ddf24844357039453ad2b0c27e8\". You have to remove (or rename) that container to be able to reuse that name.\n", + "See 'docker run --help'.\n" + ] + } + ], + "source": [ + "! docker run --rm -d -p 55559:55555 --name vdms_rag_nb intellabs/vdms:latest\n", + "\n", + "# Connect to VDMS Vector Store\n", + "from langchain_community.vectorstores.vdms import VDMS_Client\n", + "\n", + "vdms_client = VDMS_Client(port=55559)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78ac6543", + "metadata": {}, + "outputs": [], + "source": [ + "# from dotenv import load_dotenv, find_dotenv\n", + "# load_dotenv(find_dotenv(), override=True);" + ] + }, + { + "cell_type": "markdown", + "id": "1e94b3fb-8e3e-4736-be0a-ad881626c7bd", + "metadata": {}, + "source": [ + "## Data Loading\n", + "\n", + "### Partition PDF text and images\n", + " \n", + "Let's look at an example pdf containing interesting images.\n", + "\n", + "Famous photographs from library of congress:\n", + "\n", + "* https://www.loc.gov/lcm/pdf/LCM_2020_1112.pdf\n", + "* We'll use this as an example below\n", + "\n", + "We can use `partition_pdf` below from 
[Unstructured](https://unstructured-io.github.io/unstructured/introduction.html#key-concepts) to extract text and images." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9646b524-71a7-4b2a-bdc8-0b81f77e968f", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "import requests\n", + "\n", + "# Folder with pdf and extracted images\n", + "datapath = Path(\"./multimodal_files\").resolve()\n", + "datapath.mkdir(parents=True, exist_ok=True)\n", + "\n", + "pdf_url = \"https://www.loc.gov/lcm/pdf/LCM_2020_1112.pdf\"\n", + "pdf_path = str(datapath / pdf_url.split(\"/\")[-1])\n", + "with open(pdf_path, \"wb\") as f:\n", + " f.write(requests.get(pdf_url).content)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bc4839c0-8773-4a07-ba59-5364501269b2", + "metadata": {}, + "outputs": [], + "source": [ + "# Extract images, tables, and chunk text\n", + "from unstructured.partition.pdf import partition_pdf\n", + "\n", + "raw_pdf_elements = partition_pdf(\n", + " filename=pdf_path,\n", + " extract_images_in_pdf=True,\n", + " infer_table_structure=True,\n", + " chunking_strategy=\"by_title\",\n", + " max_characters=4000,\n", + " new_after_n_chars=3800,\n", + " combine_text_under_n_chars=2000,\n", + " image_output_dir_path=datapath,\n", + ")\n", + "\n", + "datapath = str(datapath)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "969545ad", + "metadata": {}, + "outputs": [], + "source": [ + "# Categorize text elements by type\n", + "tables = []\n", + "texts = []\n", + "for element in raw_pdf_elements:\n", + " if \"unstructured.documents.elements.Table\" in str(type(element)):\n", + " tables.append(str(element))\n", + " elif \"unstructured.documents.elements.CompositeElement\" in str(type(element)):\n", + " texts.append(str(element))" + ] + }, + { + "cell_type": "markdown", + "id": "5d8e6349-1547-4cbf-9c6f-491d8610ec10", + "metadata": {}, + "source": [ + "## Multi-modal embeddings with 
our document\n", + "\n", + "We will use [OpenClip multimodal embeddings](https://python.langchain.com/docs/integrations/text_embedding/open_clip).\n", + "\n", + "We use a larger model for better performance (set in `langchain_experimental.open_clip.py`).\n", + "\n", + "```\n", + "model_name = \"ViT-g-14\"\n", + "checkpoint = \"laion2b_s34b_b88k\"\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4bc15842-cb95-4f84-9eb5-656b0282a800", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from langchain_community.vectorstores import VDMS\n", + "from langchain_experimental.open_clip import OpenCLIPEmbeddings\n", + "\n", + "# Create VDMS\n", + "vectorstore = VDMS(\n", + " client=vdms_client,\n", + " collection_name=\"mm_rag_clip_photos\",\n", + " embedding_function=OpenCLIPEmbeddings(\n", + " model_name=\"ViT-g-14\", checkpoint=\"laion2b_s34b_b88k\"\n", + " ),\n", + ")\n", + "\n", + "# Get image URIs with .jpg extension only\n", + "image_uris = sorted(\n", + " [\n", + " os.path.join(datapath, image_name)\n", + " for image_name in os.listdir(datapath)\n", + " if image_name.endswith(\".jpg\")\n", + " ]\n", + ")\n", + "\n", + "# Add images\n", + "if image_uris:\n", + " vectorstore.add_images(uris=image_uris)\n", + "\n", + "# Add documents\n", + "if texts:\n", + " vectorstore.add_texts(texts=texts)\n", + "\n", + "# Make retriever\n", + "retriever = vectorstore.as_retriever()" + ] + }, + { + "cell_type": "markdown", + "id": "02a186d0-27e0-4820-8092-63b5349dd25d", + "metadata": {}, + "source": [ + "## RAG\n", + "\n", + "`vectorstore.add_images` will store / retrieve images as base64 encoded strings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "344f56a8-0dc3-433e-851c-3f7600c7a72b", + "metadata": {}, + "outputs": [], + "source": [ + "import base64\n", + "from io import BytesIO\n", + "\n", + "from PIL import Image\n", + "\n", + "\n", + "def resize_base64_image(base64_string, size=(128, 128)):\n", + " \"\"\"\n", + " Resize an image encoded as a Base64 string.\n", + "\n", + " Args:\n", + " base64_string (str): Base64 string of the original image.\n", + " size (tuple): Desired size of the image as (width, height).\n", + "\n", + " Returns:\n", + " str: Base64 string of the resized image.\n", + " \"\"\"\n", + " # Decode the Base64 string\n", + " img_data = base64.b64decode(base64_string)\n", + " img = Image.open(BytesIO(img_data))\n", + "\n", + " # Resize the image\n", + " resized_img = img.resize(size, Image.LANCZOS)\n", + "\n", + " # Save the resized image to a bytes buffer\n", + " buffered = BytesIO()\n", + " resized_img.save(buffered, format=img.format)\n", + "\n", + " # Encode the resized image to Base64\n", + " return base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n", + "\n", + "\n", + "def is_base64(s):\n", + " \"\"\"Check if a string is Base64 encoded\"\"\"\n", + " try:\n", + " return base64.b64encode(base64.b64decode(s)) == s.encode()\n", + " except Exception:\n", + " return False\n", + "\n", + "\n", + "def split_image_text_types(docs):\n", + " \"\"\"Split numpy array images and texts\"\"\"\n", + " images = []\n", + " text = []\n", + " for doc in docs:\n", + " doc = doc.page_content # Extract Document contents\n", + " if is_base64(doc):\n", + " # Resize image to avoid OAI server error\n", + " images.append(\n", + " resize_base64_image(doc, size=(250, 250))\n", + " ) # base64 encoded str\n", + " else:\n", + " text.append(doc)\n", + " return {\"images\": images, \"texts\": text}" + ] + }, + { + "cell_type": "markdown", + "id": "23a2c1d8-fea6-4152-b184-3172dd46c735", + "metadata": {}, + "source": [ + "Currently, we format the 
inputs using a `RunnableLambda` while we add image support to `ChatPromptTemplates`.\n", + "\n", + "Our runnable follows the classic RAG flow - \n", + "\n", + "* We first compute the context (both \"texts\" and \"images\" in this case) and the question (just a RunnablePassthrough here) \n", + "* Then we pass this into our prompt template, which is a custom function that formats the message for the llava model. \n", + "* And finally we parse the output as a string.\n", + "\n", + "Here we are using Ollama to serve the Llava model. Please see [Ollama](https://python.langchain.com/docs/integrations/llms/ollama) for setup instructions." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "4c93fab3-74c4-4f1d-958a-0bc4cdd0797e", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.llms.ollama import Ollama\n", + "from langchain_core.messages import HumanMessage\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.runnables import RunnableLambda, RunnablePassthrough\n", + "\n", + "\n", + "def prompt_func(data_dict):\n", + " # Joining the context texts into a single string\n", + " formatted_texts = \"\\n\".join(data_dict[\"context\"][\"texts\"])\n", + " messages = []\n", + "\n", + " # Adding image(s) to the messages if present\n", + " if data_dict[\"context\"][\"images\"]:\n", + " image_message = {\n", + " \"type\": \"image_url\",\n", + " \"image_url\": {\n", + " \"url\": f\"data:image/jpeg;base64,{data_dict['context']['images'][0]}\"\n", + " },\n", + " }\n", + " messages.append(image_message)\n", + "\n", + " # Adding the text message for analysis\n", + " text_message = {\n", + " \"type\": \"text\",\n", + " \"text\": (\n", + " \"As an expert art critic and historian, your task is to analyze and interpret images, \"\n", + " \"considering their historical and cultural significance. Alongside the images, you will be \"\n", + " \"provided with related text to offer context. 
Both will be retrieved from a vectorstore based \"\n", + " \"on user-input keywords. Please convert answers to english and use your extensive knowledge \"\n", + " \"and analytical skills to provide a comprehensive summary that includes:\\n\"\n", + " \"- A detailed description of the visual elements in the image.\\n\"\n", + " \"- The historical and cultural context of the image.\\n\"\n", + " \"- An interpretation of the image's symbolism and meaning.\\n\"\n", + " \"- Connections between the image and the related text.\\n\\n\"\n", + " f\"User-provided keywords: {data_dict['question']}\\n\\n\"\n", + " \"Text and / or tables:\\n\"\n", + " f\"{formatted_texts}\"\n", + " ),\n", + " }\n", + " messages.append(text_message)\n", + " return [HumanMessage(content=messages)]\n", + "\n", + "\n", + "def multi_modal_rag_chain(retriever):\n", + " \"\"\"Multi-modal RAG chain\"\"\"\n", + "\n", + " # Multi-modal LLM\n", + " llm_model = Ollama(\n", + " verbose=True, temperature=0.5, model=\"llava\", base_url=\"http://localhost:11434\"\n", + " )\n", + "\n", + " # RAG pipeline\n", + " chain = (\n", + " {\n", + " \"context\": retriever | RunnableLambda(split_image_text_types),\n", + " \"question\": RunnablePassthrough(),\n", + " }\n", + " | RunnableLambda(prompt_func)\n", + " | llm_model\n", + " | StrOutputParser()\n", + " )\n", + "\n", + " return chain" + ] + }, + { + "cell_type": "markdown", + "id": "1566096d-97c2-4ddc-ba4a-6ef88c525e4e", + "metadata": {}, + "source": [ + "## Test retrieval and run RAG" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "90121e56-674b-473b-871d-6e4753fd0c45", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GREAT PHOTOGRAPHS\n", + "The subject of the photo, Florence Owens Thompson, a Cherokee from Oklahoma, initially regretted that Lange ever made this photograph. “She was a very strong woman. She was a leader,” her daughter Katherine later said. 
“I think that's one of the reasons she resented the photo — because it didn't show her in that light.”\n", + "\n", + "DOROTHEA LANGE. “DESTITUTE PEA PICKERS IN CALIFORNIA. MOTHER OF SEVEN CHILDREN. AGE THIRTY-TWO. NIPOMO, CALIFORNIA.” MARCH 1936. NITRATE NEGATIVE. FARM SECURITY ADMINISTRATION-OFFICE OF WAR INFORMATION COLLECTION. PRINTS AND PHOTOGRAPHS DIVISION.\n", + "\n", + "—Helena Zinkham\n", + "\n", + "—Helena Zinkham\n", + "\n", + "NOVEMBER/DECEMBER 2020 LOC.GOV/LCM\n" + ] + }, + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython.display import HTML, display\n", + "\n", + "\n", + "def plt_img_base64(img_base64):\n", + " # Create an HTML img tag with the base64 string as the source\n", + " image_html = f''\n", + "\n", + " # Display the image by rendering the HTML\n", + " display(HTML(image_html))\n", + "\n", + "\n", + "query = \"Woman with children\"\n", + "docs = retriever.get_relevant_documents(query, k=10)\n", + "\n", + "for doc in docs:\n", + " if is_base64(doc.page_content):\n", + " plt_img_base64(doc.page_content)\n", + " else:\n", + " print(doc.page_content)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "69fb15fd-76fc-49b4-806d-c4db2990027d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. Detailed description of the visual elements in the image: The image features a woman with children, likely a mother and her family, standing together outside. They appear to be poor or struggling financially, as indicated by their attire and surroundings.\n", + "2. Historical and cultural context of the image: The photo was taken in 1936 during the Great Depression, when many families struggled to make ends meet. 
Dorothea Lange, a renowned American photographer, took this iconic photograph that became an emblem of poverty and hardship experienced by many Americans at that time.\n", + "3. Interpretation of the image's symbolism and meaning: The image conveys a sense of unity and resilience despite adversity. The woman and her children are standing together, displaying their strength as a family unit in the face of economic challenges. The photograph also serves as a reminder of the importance of empathy and support for those who are struggling.\n", + "4. Connections between the image and the related text: The text provided offers additional context about the woman in the photo, her background, and her feelings towards the photograph. It highlights the historical backdrop of the Great Depression and emphasizes the significance of this particular image as a representation of that time period.\n" + ] + } + ], + "source": [ + "chain = multi_modal_rag_chain(retriever)\n", + "response = chain.invoke(query)\n", + "print(response)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ec2ea7e6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "vdms_rag_nb\n" + ] + } + ], + "source": [ + "! 
docker kill vdms_rag_nb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ba652da", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".langchain-venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 41564f7184..fd6a877c29 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -4,14 +4,14 @@ # ATTENTION: When adding a service below use a non-standard port # increment by one from the preceding port. # For credentials always use `langchain` and `langchain` for the -# username and password. +# username and password. version: "3" name: langchain-tests services: redis: image: redis/redis-stack-server:latest - # We use non standard ports since + # We use non standard ports since # these instances are used for testing # and users may already have existing # redis instances set up locally @@ -73,6 +73,11 @@ services: retries: 60 volumes: - postgres_data_pgvector:/var/lib/postgresql/data + vdms: + image: intellabs/vdms:latest + container_name: vdms_container + ports: + - "6025:55555" volumes: postgres_data: diff --git a/docs/docs/integrations/providers/vdms.mdx b/docs/docs/integrations/providers/vdms.mdx new file mode 100644 index 0000000000..b64d63b5fb --- /dev/null +++ b/docs/docs/integrations/providers/vdms.mdx @@ -0,0 +1,62 @@ +# VDMS + +> [VDMS](https://github.com/IntelLabs/vdms/blob/master/README.md) is a storage solution for efficient access +> of big-”visual”-data that aims to achieve cloud scale by searching for relevant visual data via visual metadata +> stored as a graph and enabling 
machine friendly enhancements to visual data for faster access. + +## Installation and Setup + +### Install Client + +```bash +pip install vdms +``` + +### Install Database + +There are two ways to get started with VDMS: + +#### Install VDMS on your local machine via docker +```bash + docker run -d -p 55555:55555 intellabs/vdms:latest +``` + +#### Install VDMS directly on your local machine +Please see [installation instructions](https://github.com/IntelLabs/vdms/blob/master/INSTALL.md). + + + +## VectorStore + +The vector store is a simple wrapper around VDMS. It provides a simple interface to store and retrieve data. + +```python +from langchain_community.document_loaders import TextLoader +from langchain.text_splitter import CharacterTextSplitter + +loader = TextLoader("./state_of_the_union.txt") +documents = loader.load() +text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0) +docs = text_splitter.split_documents(documents) + +from langchain_community.vectorstores import VDMS +from langchain_community.vectorstores.vdms import VDMS_Client +from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings + +client = VDMS_Client("localhost", 55555) +vectorstore = VDMS.from_documents( + docs, + client=client, + collection_name="langchain-demo", + embedding_function=HuggingFaceEmbeddings(), + engine="FaissFlat", + distance_strategy="L2", +) + +query = "What did the president say about Ketanji Brown Jackson" +results = vectorstore.similarity_search(query) +``` + +For a more detailed walkthrough of the VDMS wrapper, see [this notebook](/docs/integrations/vectorstores/vdms) + + diff --git a/docs/docs/integrations/vectorstores/vdms.ipynb b/docs/docs/integrations/vectorstores/vdms.ipynb new file mode 100644 index 0000000000..acfeec141f --- /dev/null +++ b/docs/docs/integrations/vectorstores/vdms.ipynb @@ -0,0 +1,1125 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "683953b3", + "metadata": {}, + "source": [ + "# Intel's Visual Data 
Management System (VDMS)\n", + "\n", + ">Intel's [VDMS](https://github.com/IntelLabs/vdms) is a storage solution for efficient access of big-“visual”-data that aims to achieve cloud scale by searching for relevant visual data via visual metadata stored as a graph and enabling machine friendly enhancements to visual data for faster access. VDMS is licensed under MIT.\n", + "\n", + "VDMS supports:\n", + "* K nearest neighbor search\n", + "* Euclidean distance (L2) and inner product (IP)\n", + "* Libraries for indexing and computing distances: TileDBDense, TileDBSparse, FaissFlat (Default), FaissIVFFlat\n", + "* Vector and metadata searches\n", + "\n", + "VDMS has server and client components. To set up the server, see the [installation instructions](https://github.com/IntelLabs/vdms/blob/master/INSTALL.md) or use the [docker image](https://hub.docker.com/r/intellabs/vdms).\n", + "\n", + "This notebook shows how to use VDMS as a vector store using the docker image.\n", + "\n", + "To begin, install the Python packages for the VDMS client and Sentence Transformers:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2167badd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "# Pip install necessary package\n", + "%pip install --upgrade --quiet pip sentence-transformers vdms \"unstructured-inference==0.6.6\";" + ] + }, + { + "cell_type": "markdown", + "id": "af2b4512", + "metadata": {}, + "source": [ + "## Start VDMS Server\n", + "Here we start the VDMS server with port 55555." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "4b1537c7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "e6061b270eef87de5319a6c5af709b36badcad8118069a8f6b577d2e01ad5e2d\n" + ] + } + ], + "source": [ + "!docker run --rm -d -p 55555:55555 --name vdms_vs_test_nb intellabs/vdms:latest" + ] + }, + { + "cell_type": "markdown", + "id": "2b5ffbf8", + "metadata": {}, + "source": [ + "## Basic Example (using the Docker Container)\n", + "\n", + "In this basic example, we demonstrate adding documents into VDMS and using it as a vector database.\n", + "\n", + "You can run the VDMS Server in a Docker container separately to use with LangChain which connects to the server via the VDMS Python Client. \n", + "\n", + "VDMS has the ability to handle multiple collections of documents, but the LangChain interface expects one, so we need to specify the name of the collection. The default collection name used by LangChain is \"langchain\".\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5201ba0c", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "from langchain_community.document_loaders.text import TextLoader\n", + "from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings\n", + "from langchain_community.vectorstores import VDMS\n", + "from langchain_community.vectorstores.vdms import VDMS_Client\n", + "from langchain_text_splitters.character import CharacterTextSplitter\n", + "\n", + "time.sleep(2)\n", + "DELIMITER = \"-\" * 50\n", + "\n", + "# Connect to VDMS Vector Store\n", + "vdms_client = VDMS_Client(host=\"localhost\", port=55555)" + ] + }, + { + "cell_type": "markdown", + "id": "935069bc", + "metadata": {}, + "source": [ + "Here are some helper functions for printing results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e78814eb", + "metadata": {}, + "outputs": [], + "source": [ + "def print_document_details(doc):\n", + " print(f\"Content:\\n\\t{doc.page_content}\\n\")\n", + " print(\"Metadata:\")\n", + " for key, value in doc.metadata.items():\n", + " if value != \"Missing property\":\n", + " print(f\"\\t{key}:\\t{value}\")\n", + "\n", + "\n", + "def print_results(similarity_results, score=True):\n", + " print(f\"{DELIMITER}\\n\")\n", + " if score:\n", + " for doc, score in similarity_results:\n", + " print(f\"Score:\\t{score}\\n\")\n", + " print_document_details(doc)\n", + " print(f\"{DELIMITER}\\n\")\n", + " else:\n", + " for doc in similarity_results:\n", + " print_document_details(doc)\n", + " print(f\"{DELIMITER}\\n\")\n", + "\n", + "\n", + "def print_response(list_of_entities):\n", + " for ent in list_of_entities:\n", + " for key, value in ent.items():\n", + " if value != \"Missing property\":\n", + " print(f\"\\n{key}:\\n\\t{value}\")\n", + " print(f\"{DELIMITER}\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "88229867", + "metadata": {}, + "source": [ + "### Load Document and Obtain Embedding Function\n", + "Here we load the most recent State of the Union Address and split the document into chunks. \n", + "\n", + "LangChain vector stores use a string/keyword `id` for bookkeeping documents. By default, `id` is a uuid but here we're defining it as an integer cast as a string. Additional metadata is also provided with the documents and the HuggingFaceEmbeddings are used for this example as the embedding function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2ebfc16c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Documents: 42\n", + "# Embedding Dimensions: 768\n" + ] + } + ], + "source": [ + "# load the document and split it into chunks\n", + "document_path = \"../../modules/state_of_the_union.txt\"\n", + "raw_documents = TextLoader(document_path).load()\n", + "\n", + "# split it into chunks\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(raw_documents)\n", + "ids = []\n", + "for doc_idx, doc in enumerate(docs):\n", + " ids.append(str(doc_idx + 1))\n", + " docs[doc_idx].metadata[\"id\"] = str(doc_idx + 1)\n", + " docs[doc_idx].metadata[\"page_number\"] = int(doc_idx + 1)\n", + " docs[doc_idx].metadata[\"president_included\"] = (\n", + " \"president\" in doc.page_content.lower()\n", + " )\n", + "print(f\"# Documents: {len(docs)}\")\n", + "\n", + "\n", + "# create the open-source embedding function\n", + "embedding = HuggingFaceEmbeddings()\n", + "print(\n", + " f\"# Embedding Dimensions: {len(embedding.embed_query('This is a test document.'))}\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a6a596f0", + "metadata": {}, + "source": [ + "### Similarity Search using Faiss Flat and Euclidean Distance (Default)\n", + "\n", + "In this section, we add the documents to VDMS using FAISS IndexFlat indexing (default) and Euclidean distance (default) as the distance metric for similarity search. We search for three documents (`k=3`) related to the query `What did the president say about Ketanji Brown Jackson`." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1f3f43d4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------\n", + "\n", + "Content:\n", + "\tTonight. 
I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "\n", + "Metadata:\n", + "\tid:\t32\n", + "\tpage_number:\t32\n", + "\tpresident_included:\tTrue\n", + "\tsource:\t../../modules/state_of_the_union.txt\n", + "--------------------------------------------------\n", + "\n", + "Content:\n", + "\tAs Frances Haugen, who is here with us tonight, has shown, we must hold social media platforms accountable for the national experiment they’re conducting on our children for profit. \n", + "\n", + "It’s time to strengthen privacy protections, ban targeted advertising to children, demand tech companies stop collecting personal data on our children. \n", + "\n", + "And let’s get all Americans the mental health services they need. More people they can turn to for help, and full parity between physical and mental health care. \n", + "\n", + "Third, support our veterans. \n", + "\n", + "Veterans are the best of us. \n", + "\n", + "I’ve always believed that we have a sacred obligation to equip all those we send to war and care for them and their families when they come home. \n", + "\n", + "My administration is providing assistance with job training and housing, and now helping lower-income veterans get VA care debt-free. 
\n", + "\n", + "Our troops in Iraq and Afghanistan faced many dangers.\n", + "\n", + "Metadata:\n", + "\tid:\t37\n", + "\tpage_number:\t37\n", + "\tpresident_included:\tFalse\n", + "\tsource:\t../../modules/state_of_the_union.txt\n", + "--------------------------------------------------\n", + "\n", + "Content:\n", + "\tA former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n", + "\n", + "And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n", + "\n", + "We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \n", + "\n", + "We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \n", + "\n", + "We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. 
\n", + "\n", + "We’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.\n", + "\n", + "Metadata:\n", + "\tid:\t33\n", + "\tpage_number:\t33\n", + "\tpresident_included:\tFalse\n", + "\tsource:\t../../modules/state_of_the_union.txt\n", + "--------------------------------------------------\n", + "\n" + ] + } + ], + "source": [ + "# add data\n", + "collection_name = \"my_collection_faiss_L2\"\n", + "db = VDMS.from_documents(\n", + " docs,\n", + " client=vdms_client,\n", + " ids=ids,\n", + " collection_name=collection_name,\n", + " embedding=embedding,\n", + ")\n", + "\n", + "# Query (No metadata filtering)\n", + "k = 3\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "returned_docs = db.similarity_search(query, k=k, filter=None)\n", + "print_results(returned_docs, score=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c2e36c18", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------\n", + "\n", + "Content:\n", + "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "\n", + "Metadata:\n", + "\tid:\t32\n", + "\tpage_number:\t32\n", + "\tpresident_included:\tTrue\n", + "\tsource:\t../../modules/state_of_the_union.txt\n", + "--------------------------------------------------\n", + "\n", + "Content:\n", + "\tAnd for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \n", + "\n", + "As I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \n", + "\n", + "While it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \n", + "\n", + "And soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \n", + "\n", + "So tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. \n", + "\n", + "First, beat the opioid epidemic.\n", + "\n", + "Metadata:\n", + "\tid:\t35\n", + "\tpage_number:\t35\n", + "\tpresident_included:\tTrue\n", + "\tsource:\t../../modules/state_of_the_union.txt\n", + "--------------------------------------------------\n", + "\n", + "Content:\n", + "\tLast month, I announced our plan to supercharge \n", + "the Cancer Moonshot that President Obama asked me to lead six years ago. \n", + "\n", + "Our goal is to cut the cancer death rate by at least 50% over the next 25 years, turn more cancers from death sentences into treatable diseases. \n", + "\n", + "More support for patients and families. 
\n", + "\n", + "To get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. \n", + "\n", + "It’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more. \n", + "\n", + "ARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more. \n", + "\n", + "A unity agenda for the nation. \n", + "\n", + "We can do this. \n", + "\n", + "My fellow Americans—tonight , we have gathered in a sacred space—the citadel of our democracy. \n", + "\n", + "In this Capitol, generation after generation, Americans have debated great questions amid great strife, and have done great things. \n", + "\n", + "We have fought for freedom, expanded liberty, defeated totalitarianism and terror.\n", + "\n", + "Metadata:\n", + "\tid:\t40\n", + "\tpage_number:\t40\n", + "\tpresident_included:\tTrue\n", + "\tsource:\t../../modules/state_of_the_union.txt\n", + "--------------------------------------------------\n", + "\n" + ] + } + ], + "source": [ + "# Query (with filtering)\n", + "k = 3\n", + "constraints = {\"page_number\": [\">\", 30], \"president_included\": [\"==\", True]}\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "returned_docs = db.similarity_search(query, k=k, filter=constraints)\n", + "print_results(returned_docs, score=False)" + ] + }, + { + "cell_type": "markdown", + "id": "a5984766", + "metadata": {}, + "source": [ + "### Similarity Search using TileDBDense and Euclidean Distance\n", + "\n", + "In this section, we add the documents to VDMS using TileDB Dense indexing and L2 as the distance metric for similarity search. 
We search for three documents (`k=3`) related to the query `What did the president say about Ketanji Brown Jackson` and also return the score along with the document.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3001ba6e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------\n", + "\n", + "Score:\t1.2032090425491333\n", + "\n", + "Content:\n", + "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "\n", + "Metadata:\n", + "\tid:\t32\n", + "\tpage_number:\t32\n", + "\tpresident_included:\tTrue\n", + "\tsource:\t../../modules/state_of_the_union.txt\n", + "--------------------------------------------------\n", + "\n", + "Score:\t1.495247483253479\n", + "\n", + "Content:\n", + "\tAs Frances Haugen, who is here with us tonight, has shown, we must hold social media platforms accountable for the national experiment they’re conducting on our children for profit. \n", + "\n", + "It’s time to strengthen privacy protections, ban targeted advertising to children, demand tech companies stop collecting personal data on our children. 
\n", + "\n", + "And let’s get all Americans the mental health services they need. More people they can turn to for help, and full parity between physical and mental health care. \n", + "\n", + "Third, support our veterans. \n", + "\n", + "Veterans are the best of us. \n", + "\n", + "I’ve always believed that we have a sacred obligation to equip all those we send to war and care for them and their families when they come home. \n", + "\n", + "My administration is providing assistance with job training and housing, and now helping lower-income veterans get VA care debt-free. \n", + "\n", + "Our troops in Iraq and Afghanistan faced many dangers.\n", + "\n", + "Metadata:\n", + "\tid:\t37\n", + "\tpage_number:\t37\n", + "\tpresident_included:\tFalse\n", + "\tsource:\t../../modules/state_of_the_union.txt\n", + "--------------------------------------------------\n", + "\n", + "Score:\t1.5008409023284912\n", + "\n", + "Content:\n", + "\tA former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n", + "\n", + "And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n", + "\n", + "We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \n", + "\n", + "We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \n", + "\n", + "We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. 
\n", + "\n", + "We’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.\n", + "\n", + "Metadata:\n", + "\tid:\t33\n", + "\tpage_number:\t33\n", + "\tpresident_included:\tFalse\n", + "\tsource:\t../../modules/state_of_the_union.txt\n", + "--------------------------------------------------\n", + "\n" + ] + } + ], + "source": [ + "db_tiledbD = VDMS.from_documents(\n", + " docs,\n", + " client=vdms_client,\n", + " ids=ids,\n", + " collection_name=\"my_collection_tiledbD_L2\",\n", + " embedding=embedding,\n", + " engine=\"TileDBDense\",\n", + " distance_strategy=\"L2\",\n", + ")\n", + "\n", + "k = 3\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs_with_score = db_tiledbD.similarity_search_with_score(query, k=k, filter=None)\n", + "print_results(docs_with_score)" + ] + }, + { + "cell_type": "markdown", + "id": "92ab3370", + "metadata": {}, + "source": [ + "### Similarity Search using Faiss IVFFlat and Euclidean Distance\n", + "\n", + "In this section, we add the documents to VDMS using Faiss IndexIVFFlat indexing and L2 as the distance metric for similarity search. We search for three documents (`k=3`) related to the query `What did the president say about Ketanji Brown Jackson` and also return the score along with the document.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "78f502cf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------\n", + "\n", + "Score:\t1.2032090425491333\n", + "\n", + "Content:\n", + "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 
\n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "\n", + "Metadata:\n", + "\tid:\t32\n", + "\tpage_number:\t32\n", + "\tpresident_included:\tTrue\n", + "\tsource:\t../../modules/state_of_the_union.txt\n", + "--------------------------------------------------\n", + "\n", + "Score:\t1.495247483253479\n", + "\n", + "Content:\n", + "\tAs Frances Haugen, who is here with us tonight, has shown, we must hold social media platforms accountable for the national experiment they’re conducting on our children for profit. \n", + "\n", + "It’s time to strengthen privacy protections, ban targeted advertising to children, demand tech companies stop collecting personal data on our children. \n", + "\n", + "And let’s get all Americans the mental health services they need. More people they can turn to for help, and full parity between physical and mental health care. \n", + "\n", + "Third, support our veterans. \n", + "\n", + "Veterans are the best of us. \n", + "\n", + "I’ve always believed that we have a sacred obligation to equip all those we send to war and care for them and their families when they come home. \n", + "\n", + "My administration is providing assistance with job training and housing, and now helping lower-income veterans get VA care debt-free. 
\n", + "\n", + "Our troops in Iraq and Afghanistan faced many dangers.\n", + "\n", + "Metadata:\n", + "\tid:\t37\n", + "\tpage_number:\t37\n", + "\tpresident_included:\tFalse\n", + "\tsource:\t../../modules/state_of_the_union.txt\n", + "--------------------------------------------------\n", + "\n", + "Score:\t1.5008409023284912\n", + "\n", + "Content:\n", + "\tA former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n", + "\n", + "And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n", + "\n", + "We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \n", + "\n", + "We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \n", + "\n", + "We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. 
\n", + "\n", + "We’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.\n", + "\n", + "Metadata:\n", + "\tid:\t33\n", + "\tpage_number:\t33\n", + "\tpresident_included:\tFalse\n", + "\tsource:\t../../modules/state_of_the_union.txt\n", + "--------------------------------------------------\n", + "\n" + ] + } + ], + "source": [ + "db_FaissIVFFlat = VDMS.from_documents(\n", + " docs,\n", + " client=vdms_client,\n", + " ids=ids,\n", + " collection_name=\"my_collection_FaissIVFFlat_L2\",\n", + " embedding=embedding,\n", + " engine=\"FaissIVFFlat\",\n", + " distance_strategy=\"L2\",\n", + ")\n", + "# Query\n", + "k = 3\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs_with_score = db_FaissIVFFlat.similarity_search_with_score(query, k=k, filter=None)\n", + "print_results(docs_with_score)" + ] + }, + { + "cell_type": "markdown", + "id": "9ed3ec50", + "metadata": {}, + "source": [ + "### Update and Delete\n", + "\n", + "While building toward a real application, you want to go beyond adding data, and also update and delete data.\n", + "\n", + "Here is a basic example showing how to do so. First, we will update the metadata for the document most relevant to the query." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "81a02810", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original metadata: \n", + "\t{'id': '32', 'page_number': 32, 'president_included': True, 'source': '../../modules/state_of_the_union.txt'}\n", + "new metadata: \n", + "\t{'id': '32', 'page_number': 32, 'president_included': True, 'source': '../../modules/state_of_the_union.txt', 'new_value': 'hello world'}\n", + "--------------------------------------------------\n", + "\n", + "UPDATED ENTRY (id=32):\n", + "\n", + "content:\n", + "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. 
Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "\n", + "id:\n", + "\t32\n", + "\n", + "new_value:\n", + "\thello world\n", + "\n", + "page_number:\n", + "\t32\n", + "\n", + "president_included:\n", + "\tTrue\n", + "\n", + "source:\n", + "\t../../modules/state_of_the_union.txt\n", + "--------------------------------------------------\n", + "\n" + ] + } + ], + "source": [ + "doc = db.similarity_search(query)[0]\n", + "print(f\"Original metadata: \\n\\t{doc.metadata}\")\n", + "\n", + "# update the metadata for a document\n", + "doc.metadata[\"new_value\"] = \"hello world\"\n", + "print(f\"new metadata: \\n\\t{doc.metadata}\")\n", + "print(f\"{DELIMITER}\\n\")\n", + "\n", + "# Update document in VDMS\n", + "id_to_update = doc.metadata[\"id\"]\n", + "db.update_document(collection_name, id_to_update, doc)\n", + "response, response_array = db.get(\n", + " collection_name, constraints={\"id\": [\"==\", id_to_update]}\n", + ")\n", + "\n", + "# Display Results\n", + "print(f\"UPDATED ENTRY (id={id_to_update}):\")\n", + "print_response([response[0][\"FindDescriptor\"][\"entities\"][0]])" + ] + }, + { + "cell_type": "markdown", + "id": "872a7dff", + "metadata": {}, + "source": [ + "Next we will delete the last document by ID (id=42)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "95537fe8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Documents before deletion: 42\n", + "Documents after deletion (id=42): 41\n" + ] + } + ], + "source": [ + "print(\"Documents before deletion: \", db.count(collection_name))\n", + "\n", + "id_to_remove = ids[-1]\n", + "db.delete(collection_name=collection_name, ids=[id_to_remove])\n", + "print(f\"Documents after deletion (id={id_to_remove}): {db.count(collection_name)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "18152965", + "metadata": {}, + "source": [ + "## Other Information\n", + "VDMS supports various types of visual data and operations. Some of the capabilities are integrated in the LangChain interface but additional workflow improvements will be added as VDMS is under continuous development.\n", + "\n", + "Additional capabilities integrated into LangChain are below.\n", + "\n", + "### Similarity search by vector\n", + "Instead of searching by string query, you can also search by embedding/vector." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1db4d6ed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Content:\n", + "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 
\n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "\n", + "Metadata:\n", + "\tid:\t32\n", + "\tnew_value:\thello world\n", + "\tpage_number:\t32\n", + "\tpresident_included:\tTrue\n", + "\tsource:\t../../modules/state_of_the_union.txt\n" + ] + } + ], + "source": [ + "embedding_vector = embedding.embed_query(query)\n", + "returned_docs = db.similarity_search_by_vector(embedding_vector)\n", + "\n", + "# Print Results\n", + "print_document_details(returned_docs[0])" + ] + }, + { + "cell_type": "markdown", + "id": "daf718b2", + "metadata": {}, + "source": [ + "### Filtering on metadata\n", + "\n", + "It can be helpful to narrow down the collection before working with it.\n", + "\n", + "For example, collections can be filtered on metadata using the get method. A dictionary is used to filter metadata. Here we retrieve the document where `id = 2` and remove it from the vector store." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "2bc0313b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Returned entry:\n", + "\n", + "blob:\n", + "\tTrue\n", + "\n", + "content:\n", + "\tGroups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland. \n", + "\n", + "In this struggle as President Zelenskyy said in his speech to the European Parliament “Light will win over darkness.” The Ukrainian Ambassador to the United States is here tonight. \n", + "\n", + "Let each of us here tonight in this Chamber send an unmistakable signal to Ukraine and to the world. \n", + "\n", + "Please rise if you are able and show that, Yes, we the United States of America stand with the Ukrainian people. 
\n", + "\n", + "Throughout our history we’ve learned this lesson when dictators do not pay a price for their aggression they cause more chaos. \n", + "\n", + "They keep moving. \n", + "\n", + "And the costs and the threats to America and the world keep rising. \n", + "\n", + "That’s why the NATO Alliance was created to secure peace and stability in Europe after World War 2. \n", + "\n", + "The United States is a member along with 29 other nations. \n", + "\n", + "It matters. American diplomacy matters. American resolve matters.\n", + "\n", + "id:\n", + "\t2\n", + "\n", + "page_number:\n", + "\t2\n", + "\n", + "president_included:\n", + "\tTrue\n", + "\n", + "source:\n", + "\t../../modules/state_of_the_union.txt\n", + "--------------------------------------------------\n", + "\n" + ] + } + ], + "source": [ + "response, response_array = db.get(\n", + " collection_name,\n", + " limit=1,\n", + " include=[\"metadata\", \"embeddings\"],\n", + " constraints={\"id\": [\"==\", \"2\"]},\n", + ")\n", + "\n", + "print(\"Returned entry:\")\n", + "print_response([response[0][\"FindDescriptor\"][\"entities\"][0]])\n", + "\n", + "# Delete id=2\n", + "db.delete(collection_name=collection_name, ids=[\"2\"]);" + ] + }, + { + "cell_type": "markdown", + "id": "794a7552", + "metadata": {}, + "source": [ + "### Retriever options\n", + "\n", + "This section goes over different options for how to use VDMS as a retriever.\n", + "\n", + "\n", + "#### Similarity Search\n", + "\n", + "Here we use similarity search in the retriever object.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "120f55eb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Content:\n", + "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 
\n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "\n", + "Metadata:\n", + "\tid:\t32\n", + "\tnew_value:\thello world\n", + "\tpage_number:\t32\n", + "\tpresident_included:\tTrue\n", + "\tsource:\t../../modules/state_of_the_union.txt\n" + ] + } + ], + "source": [ + "retriever = db.as_retriever()\n", + "relevant_docs = retriever.get_relevant_documents(query)[0]\n", + "\n", + "print_document_details(relevant_docs)" + ] + }, + { + "cell_type": "markdown", + "id": "e8c0fb24", + "metadata": {}, + "source": [ + "#### Maximal Marginal Relevance Search (MMR)\n", + "\n", + "In addition to using similarity search in the retriever object, you can also use `mmr`." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "f00be6d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Content:\n", + "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 
\n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "\n", + "Metadata:\n", + "\tid:\t32\n", + "\tnew_value:\thello world\n", + "\tpage_number:\t32\n", + "\tpresident_included:\tTrue\n", + "\tsource:\t../../modules/state_of_the_union.txt\n" + ] + } + ], + "source": [ + "retriever = db.as_retriever(search_type=\"mmr\")\n", + "relevant_docs = retriever.get_relevant_documents(query)[0]\n", + "\n", + "print_document_details(relevant_docs)" + ] + }, + { + "cell_type": "markdown", + "id": "ffadbafc", + "metadata": {}, + "source": [ + "We can also use MMR directly." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "ab911470", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------\n", + "\n", + "Score:\t1.2032092809677124\n", + "\n", + "Content:\n", + "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "\n", + "Metadata:\n", + "\tid:\t32\n", + "\tnew_value:\thello world\n", + "\tpage_number:\t32\n", + "\tpresident_included:\tTrue\n", + "\tsource:\t../../modules/state_of_the_union.txt\n", + "--------------------------------------------------\n", + "\n", + "Score:\t1.507053256034851\n", + "\n", + "Content:\n", + "\tBut cancer from prolonged exposure to burn pits ravaged Heath’s lungs and body. \n", + "\n", + "Danielle says Heath was a fighter to the very end. \n", + "\n", + "He didn’t know how to stop fighting, and neither did she. \n", + "\n", + "Through her pain she found purpose to demand we do better. \n", + "\n", + "Tonight, Danielle—we are. \n", + "\n", + "The VA is pioneering new ways of linking toxic exposures to diseases, already helping more veterans get benefits. \n", + "\n", + "And tonight, I’m announcing we’re expanding eligibility to veterans suffering from nine respiratory cancers. \n", + "\n", + "I’m also calling on Congress: pass a law to make sure veterans devastated by toxic exposures in Iraq and Afghanistan finally get the benefits and comprehensive health care they deserve. \n", + "\n", + "And fourth, let’s end cancer as we know it. \n", + "\n", + "This is personal to me and Jill, to Kamala, and to so many of you. \n", + "\n", + "Cancer is the #2 cause of death in America–second only to heart disease.\n", + "\n", + "Metadata:\n", + "\tid:\t39\n", + "\tpage_number:\t39\n", + "\tpresident_included:\tFalse\n", + "\tsource:\t../../modules/state_of_the_union.txt\n", + "--------------------------------------------------\n", + "\n" + ] + } + ], + "source": [ + "mmr_resp = db.max_marginal_relevance_search_with_score(query, k=2, fetch_k=10)\n", + "print_results(mmr_resp)" + ] + }, + { + "cell_type": "markdown", + "id": "190bc4b5", + "metadata": {}, + "source": [ + "### Delete collection\n", + "Previously, we removed documents based on its `id`. 
Here, all documents are removed since no ID is provided." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "874e7af9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Documents before deletion: 40\n", + "Documents after deletion: 0\n" + ] + } + ], + "source": [ + "print(\"Documents before deletion: \", db.count(collection_name))\n", + "\n", + "db.delete(collection_name=collection_name)\n", + "\n", + "print(\"Documents after deletion: \", db.count(collection_name))" + ] + }, + { + "cell_type": "markdown", + "id": "68b7a400", + "metadata": {}, + "source": [ + "## Stop VDMS Server" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "08931796", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "vdms_vs_test_nb\n" + ] + } + ], + "source": [ + "!docker kill vdms_vs_test_nb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0386ea81", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/docs/modules/data_connection/indexing.ipynb 
b/docs/docs/modules/data_connection/indexing.ipynb index 2264c8e70a..1260c1b8d0 100644 --- a/docs/docs/modules/data_connection/indexing.ipynb +++ b/docs/docs/modules/data_connection/indexing.ipynb @@ -60,7 +60,7 @@ " * document addition by id (`add_documents` method with `ids` argument)\n", " * delete by id (`delete` method with `ids` argument)\n", "\n", - "Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MyScale`, `OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`.\n", + "Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MyScale`, `OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `VDMS`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`.\n", " \n", "## Caution\n", "\n", diff --git a/libs/community/langchain_community/vectorstores/__init__.py b/libs/community/langchain_community/vectorstores/__init__.py index f2a272b1a9..b480a7f8a8 100644 --- a/libs/community/langchain_community/vectorstores/__init__.py +++ b/libs/community/langchain_community/vectorstores/__init__.py @@ -102,6 +102,7 @@ _module_lookup = { "Typesense": "langchain_community.vectorstores.typesense", "USearch": "langchain_community.vectorstores.usearch", "Vald": "langchain_community.vectorstores.vald", + "VDMS": "langchain_community.vectorstores.vdms", "Vearch": "langchain_community.vectorstores.vearch", "Vectara": 
"langchain_community.vectorstores.vectara", "VectorStore": "langchain_core.vectorstores", diff --git a/libs/community/langchain_community/vectorstores/vdms.py b/libs/community/langchain_community/vectorstores/vdms.py new file mode 100644 index 0000000000..d367dce649 --- /dev/null +++ b/libs/community/langchain_community/vectorstores/vdms.py @@ -0,0 +1,1580 @@ +from __future__ import annotations + +import base64 +import logging +import uuid +from copy import deepcopy +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Iterable, + List, + Literal, + Optional, + Sized, + Tuple, + Type, + Union, + get_args, +) + +import numpy as np +from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings +from langchain_core.vectorstores import VectorStore + +from langchain_community.vectorstores.utils import maximal_marginal_relevance + +if TYPE_CHECKING: + import vdms + + +DISTANCE_METRICS = Literal[ + "L2", # Euclidean Distance + "IP", # Inner Product +] +AVAILABLE_DISTANCE_METRICS: List[DISTANCE_METRICS] = list(get_args(DISTANCE_METRICS)) +ENGINES = Literal[ + "TileDBDense", # TileDB Dense + "TileDBSparse", # TileDB Sparse + "FaissFlat", # FAISS IndexFlat + "FaissIVFFlat", # FAISS IndexIVFFlat + "Flinng", # FLINNG +] +AVAILABLE_ENGINES: List[ENGINES] = list(get_args(ENGINES)) +DEFAULT_COLLECTION_NAME = "langchain" +DEFAULT_INSERT_BATCH_SIZE = 32 +# Number of Documents to return. +DEFAULT_K = 3 +# Number of Documents to fetch to pass to knn when filters applied. 
+DEFAULT_FETCH_K = DEFAULT_K * 5 +DEFAULT_PROPERTIES = ["_distance", "id", "content"] +INVALID_DOC_METADATA_KEYS = ["_distance", "content", "blob"] +INVALID_METADATA_VALUE = ["Missing property", None, {}] # type: List + + +logger = logging.getLogger(__name__) + + +def _len_check_if_sized(x: Any, y: Any, x_name: str, y_name: str) -> None: + """ + Check that sizes of two variables are the same + + Args: + x: Variable to compare + y: Variable to compare + x_name: Name for variable x + y_name: Name for variable y + """ + if isinstance(x, Sized) and isinstance(y, Sized) and len(x) != len(y): + raise ValueError( + f"{x_name} and {y_name} expected to be equal length but " + f"len({x_name})={len(x)} and len({y_name})={len(y)}" + ) + return + + +def VDMS_Client(host: str = "localhost", port: int = 55555) -> vdms.vdms: + """ + Wrapper to initiate and connect a VDMS client to a VDMS server + + Args: + host: IP or hostname of VDMS server + port: Port to connect to VDMS server + """ + try: + import vdms + except ImportError: + raise ImportError( + "Could not import vdms python package. " + "Please install it with `pip install vdms." + ) + + client = vdms.vdms() + client.connect(host, port) + return client + + +class VDMS(VectorStore): + """Wrapper around Intel Labs' VDMS for vector-store workloads. + + To use, you should have both: + - the ``vdms`` python package installed + - a host (str) and port (int) associated with a deployed VDMS Server + + Visit https://github.com/IntelLabs/vdms/wiki for more information. + + IT IS HIGHLY SUGGESTED TO NORMALIZE YOUR DATA. + + Args: + client: VDMS Client used to connect to VDMS server + collection_name: Name of data collection [Default: langchain] + distance_strategy: Method used to calculate distances. VDMS supports + "L2" (euclidean distance) or "IP" (inner product) [Default: L2] + engine: Underlying implementation for indexing and computing distances.
+ VDMS supports TileDBDense, TileDBSparse, FaissFlat, FaissIVFFlat, + and Flinng [Default: FaissFlat] + embedding: Any embedding function implementing + `langchain_core.embeddings.Embeddings` interface. + relevance_score_fn: Function for obtaining relevance score + + Example: + .. code-block:: python + + from langchain_community.embeddings import HuggingFaceEmbeddings + from langchain_community.vectorstores.vdms import VDMS, VDMS_Client + + vectorstore = VDMS( + client=VDMS_Client("localhost", 55555), + embedding=HuggingFaceEmbeddings(), + collection_name="langchain-demo", + distance_strategy="L2", + engine="FaissFlat", + ) + """ + + def __init__( + self, + client: vdms.vdms, + *, + embedding: Optional[Embeddings] = None, + collection_name: str = DEFAULT_COLLECTION_NAME, # DescriptorSet name + distance_strategy: DISTANCE_METRICS = "L2", + engine: ENGINES = "FaissFlat", + relevance_score_fn: Optional[Callable[[float], float]] = None, + ) -> None: + # Check required parameters + self._client = client + self.similarity_search_engine = engine + self.distance_strategy = distance_strategy + self.embedding = embedding + self._check_required_inputs(collection_name) + + # Update other parameters + self.override_relevance_score_fn = relevance_score_fn + + # Initialize collection + self._collection_name = self.__add_set( + collection_name, + engine=self.similarity_search_engine, + metric=self.distance_strategy, + ) + + @property + def embeddings(self) -> Optional[Embeddings]: + return self.embedding + + def _embed_documents(self, texts: List[str]) -> List[List[float]]: + if isinstance(self.embedding, Embeddings): + return self.embedding.embed_documents(texts) + else: + p_str = "Must provide `embedding` which is expected" + p_str += " to be an Embeddings object" + raise ValueError(p_str) + + def _embed_image(self, uris: List[str]) -> List[List[float]]: + if self.embedding is not None and hasattr(self.embedding, "embed_image"): + return self.embedding.embed_image(uris=uris) + 
else: + raise ValueError( + "Must provide `embedding` which has attribute `embed_image`" + ) + + def _embed_query(self, text: str) -> List[float]: + if isinstance(self.embedding, Embeddings): + return self.embedding.embed_query(text) + else: + raise ValueError( + "Must provide `embedding` which is expected" + " to be an Embeddings object" + ) + + def _select_relevance_score_fn(self) -> Callable[[float], float]: + """ + The 'correct' relevance function + may differ depending on a few things, including: + - the distance / similarity metric used by the VectorStore + - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) + - embedding dimensionality + - etc. + """ + if self.override_relevance_score_fn is not None: + return self.override_relevance_score_fn + + # Default strategy is to rely on distance strategy provided + # in vectorstore constructor + if self.distance_strategy.lower() in ["ip", "l2"]: + return lambda x: x + else: + raise ValueError( + "No supported normalization function" + f" for distance_strategy of {self.distance_strategy}." + "Consider providing relevance_score_fn to VDMS constructor." 
+ ) + + def _similarity_search_with_relevance_scores( + self, + query: str, + k: int = DEFAULT_K, + fetch_k: int = DEFAULT_FETCH_K, + filter: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs and their similarity scores on a scale from 0 to 1.""" + if self.override_relevance_score_fn is None: + kwargs["normalize_distance"] = True + docs_and_scores = self.similarity_search_with_score( + query, + k, + fetch_k, + filter, + **kwargs, + ) + + docs_and_rel_scores: List[Any] = [] + for doc, score in docs_and_scores: + if self.override_relevance_score_fn is None: + docs_and_rel_scores.append((doc, score)) + else: + docs_and_rel_scores.append( + (doc, self.override_relevance_score_fn(score)) + ) + return docs_and_rel_scores + + def __add( + self, + collection_name: str, + texts: List[str], + embeddings: List[List[float]], + metadatas: Optional[Union[List[None], List[Dict[str, Any]]]] = None, + ids: Optional[List[str]] = None, + ) -> List: + _len_check_if_sized(texts, embeddings, "texts", "embeddings") + + metadatas = metadatas if metadatas is not None else [None for _ in texts] + _len_check_if_sized(texts, metadatas, "texts", "metadatas") + + ids = ids if ids is not None else [str(uuid.uuid1()) for _ in texts] + _len_check_if_sized(texts, ids, "texts", "ids") + + all_queries: List[Any] = [] + all_blobs: List[Any] = [] + inserted_ids: List[Any] = [] + for meta, emb, doc, id in zip(metadatas, embeddings, texts, ids): + query, blob = self.__get_add_query( + collection_name, metadata=meta, embedding=emb, document=doc, id=id + ) + + if blob is not None: + all_queries.append(query) + all_blobs.append(blob) + inserted_ids.append(id) + + response, response_array = self.__run_vdms_query(all_queries, all_blobs) + + return inserted_ids + + def __add_set( + self, + collection_name: str, + engine: ENGINES = "FaissFlat", + metric: DISTANCE_METRICS = "L2", + ) -> str: + query = _add_descriptorset( + "AddDescriptorSet", + 
collection_name, + self.embedding_dimension, + engine=getattr(engine, "value", engine), + metric=getattr(metric, "value", metric), + ) + + response, _ = self.__run_vdms_query([query]) + + if "FailedCommand" in response[0]: + raise ValueError(f"Failed to add collection {collection_name}") + + return collection_name + + def __delete( + self, + collection_name: str, + ids: Union[None, List[str]] = None, + constraints: Union[None, Dict[str, Any]] = None, + ) -> bool: + """ + Deletes entire collection if id is not provided + """ + all_queries: List[Any] = [] + all_blobs: List[Any] = [] + + collection_properties = self.__get_properties(collection_name) + results = {"list": collection_properties} + + if constraints is None: + constraints = {"_deletion": ["==", 1]} + else: + constraints["_deletion"] = ["==", 1] + + if ids is not None: + constraints["id"] = ["==", ids[0]] # if len(ids) > 1 else ids[0]] + + query = _add_descriptor( + "FindDescriptor", + collection_name, + label=None, + ref=None, + props=None, + link=None, + k_neighbors=None, + constraints=constraints, + results=results, + ) + + all_queries.append(query) + response, response_array = self.__run_vdms_query(all_queries, all_blobs) + return "FindDescriptor" in response[0] + + def __get_add_query( + self, + collection_name: str, + metadata: Optional[Any] = None, + embedding: Union[List[float], None] = None, + document: Optional[Any] = None, + id: Optional[str] = None, + ) -> Tuple[Dict[str, Dict[str, Any]], Union[bytes, None]]: + if id is None: + props: Dict[str, Any] = {} + else: + props = {"id": id} + id_exists, query = _check_descriptor_exists_by_id( + self._client, collection_name, id + ) + if id_exists: + skipped_value = { + prop_key: prop_val[-1] + for prop_key, prop_val in query["FindDescriptor"][ + "constraints" + ].items() + } + pstr = f"[!] 
Embedding with id ({id}) exists in DB;" + pstr += "Therefore, skipped and not inserted" + print(pstr) # noqa: T201 + print(f"\tSkipped values are: {skipped_value}") # noqa: T201 + return query, None + + if metadata: + props.update(metadata) + if document: + props["content"] = document + + for k in props.keys(): + if k not in self.collection_properties: + self.collection_properties.append(k) + + query = _add_descriptor( + "AddDescriptor", + collection_name, + label=None, + ref=None, + props=props, + link=None, + k_neighbors=None, + constraints=None, + results=None, + ) + + blob = embedding2bytes(embedding) + + return ( + query, + blob, + ) + + def __get_properties( + self, + collection_name: str, + unique_entity: Optional[bool] = False, + deletion: Optional[bool] = False, + ) -> List[str]: + find_query = _find_property_entity( + collection_name, unique_entity=unique_entity, deletion=deletion + ) + response, response_blob = self.__run_vdms_query([find_query]) + if len(response_blob) > 0: + collection_properties = _bytes2str(response_blob[0]).split(",") + else: + collection_properties = deepcopy(DEFAULT_PROPERTIES) + return collection_properties + + def __run_vdms_query( + self, + all_queries: List[Dict], + all_blobs: Optional[List] = [], + print_last_response: Optional[bool] = False, + ) -> Tuple[Any, Any]: + response, response_array = self._client.query(all_queries, all_blobs) + + _ = _check_valid_response(all_queries, response) + if print_last_response: + self._client.print_last_response() + return response, response_array + + def __update( + self, + collection_name: str, + ids: List[str], + documents: List[str], + embeddings: List[List[float]], + metadatas: Optional[Union[List[None], List[Dict[str, Any]]]] = None, + ) -> None: + """ + Updates (find, delete, add) a collection based on id. 
+ If more than one collection returned with id, error occurs + """ + _len_check_if_sized(ids, documents, "ids", "documents") + + _len_check_if_sized(ids, embeddings, "ids", "embeddings") + + metadatas = metadatas if metadatas is not None else [None for _ in ids] + _len_check_if_sized(ids, metadatas, "ids", "metadatas") + + orig_props = self.__get_properties(collection_name) + + updated_ids: List[Any] = [] + for meta, emb, doc, id in zip(metadatas, embeddings, documents, ids): + results = {"list": self.collection_properties} + + constraints = {"_deletion": ["==", 1]} + + if id is not None: + constraints["id"] = ["==", id] + + query = _add_descriptor( + "FindDescriptor", + collection_name, + label=None, + ref=None, + props=None, + link=None, + k_neighbors=None, + constraints=constraints, + results=results, + ) + + response, response_array = self.__run_vdms_query([query]) + + query, blob = self.__get_add_query( + collection_name, + metadata=meta, + embedding=emb, + document=doc, + id=id, + ) + if blob is not None: + response, response_array = self.__run_vdms_query([query], [blob]) + updated_ids.append(id) + + self.__update_properties( + collection_name, orig_props, self.collection_properties + ) + + def __update_properties( + self, + collection_name: str, + current_collection_properties: List, + new_collection_properties: Optional[List], + ) -> None: + if new_collection_properties is not None: + old_collection_properties = deepcopy(current_collection_properties) + for prop in new_collection_properties: + if prop not in current_collection_properties: + current_collection_properties.append(prop) + + if current_collection_properties != old_collection_properties: + all_queries, blob_arr = _build_property_query( + collection_name, + command_type="update", + all_properties=current_collection_properties, + ) + response, _ = self.__run_vdms_query(all_queries, [blob_arr]) + + def add_images( + self, + uris: List[str], + metadatas: Optional[List[dict]] = None, + ids:
Optional[List[str]] = None, + batch_size: int = DEFAULT_INSERT_BATCH_SIZE, + add_path: Optional[bool] = True, + **kwargs: Any, + ) -> List[str]: + """Run more images through the embeddings and add to the vectorstore. + + Images are added as embeddings (AddDescriptor) instead of separate + entity (AddImage) within VDMS to leverage similarity search capability + + Args: + uris: List of paths to the images to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + ids: Optional list of unique IDs. + batch_size (int): Number of concurrent requests to send to the server. + add_path: Bool to add image path as metadata + + Returns: + List of ids from adding images into the vectorstore. + """ + # Map from uris to blobs to base64 + b64_texts = [self.encode_image(image_path=uri) for uri in uris] + + if add_path and metadatas: + for midx, uri in enumerate(uris): + metadatas[midx]["image_path"] = uri + elif add_path: + metadatas = [] + for uri in uris: + metadatas.append({"image_path": uri}) + + # Populate IDs + ids = ids if ids is not None else [str(uuid.uuid1()) for _ in uris] + + # Set embeddings + embeddings = self._embed_image(uris=uris) + + if metadatas is None: + metadatas = [{} for _ in uris] + else: + metadatas = [_validate_vdms_properties(m) for m in metadatas] + + self.__from( + texts=b64_texts, + embeddings=embeddings, + ids=ids, + metadatas=metadatas, + batch_size=batch_size, + **kwargs, + ) + return ids + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + batch_size: int = DEFAULT_INSERT_BATCH_SIZE, + **kwargs: Any, + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore. + + Args: + texts: List of strings to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + ids: Optional list of unique IDs. + batch_size (int): Number of concurrent requests to send to the server. 
+ + Returns: + List of ids from adding the texts into the vectorstore. + """ + + texts = list(texts) + if ids is None: + ids = [str(uuid.uuid1()) for _ in texts] + + embeddings = self._embed_documents(texts) + + if metadatas is None: + metadatas = [{} for _ in texts] + else: + metadatas = [_validate_vdms_properties(m) for m in metadatas] + + inserted_ids = self.__from( + texts=texts, + embeddings=embeddings, + ids=ids, + metadatas=metadatas, + batch_size=batch_size, + **kwargs, + ) + return inserted_ids + + def __from( + self, + texts: List[str], + embeddings: List[List[float]], + ids: List[str], + metadatas: Optional[List[dict]] = None, + batch_size: int = DEFAULT_INSERT_BATCH_SIZE, + **kwargs: Any, + ) -> List[str]: + # Get initial properties + orig_props = self.__get_properties(self._collection_name) + inserted_ids: List[str] = [] + for start_idx in range(0, len(texts), batch_size): + end_idx = min(start_idx + batch_size, len(texts)) + + batch_texts = texts[start_idx:end_idx] + batch_embedding_vectors = embeddings[start_idx:end_idx] + batch_ids = ids[start_idx:end_idx] + if metadatas: + batch_metadatas = metadatas[start_idx:end_idx] + + result = self.__add( + self._collection_name, + embeddings=batch_embedding_vectors, + texts=batch_texts, + metadatas=batch_metadatas, + ids=batch_ids, + ) + + inserted_ids.extend(result) + + # Update Properties + self.__update_properties( + self._collection_name, orig_props, self.collection_properties + ) + return inserted_ids + + def _check_required_inputs(self, collection_name: str) -> None: + # Check connection to client + if not self._client.is_connected(): + raise ValueError( + "VDMS client must be connected to a VDMS server." 
+ + "Please use VDMS_Client to establish a connection" + ) + + # Check Distance Metric + if self.distance_strategy not in AVAILABLE_DISTANCE_METRICS: + raise ValueError("distance_strategy must be either 'L2' or 'IP'") + + # Check Engines + if self.similarity_search_engine not in AVAILABLE_ENGINES: + raise ValueError( + "engine must be either 'TileDBDense', 'TileDBSparse', " + + "'FaissFlat', 'FaissIVFFlat', or 'Flinng'" + ) + + # Check Embedding Func is provided and store dimension size + if self.embedding is None: + raise ValueError("Must provide embedding function") + + self.embedding_dimension = len(self._embed_query("This is a sample sentence.")) + + # Check for properties + current_props = self.__get_properties(collection_name) + if hasattr(self, "collection_properties"): + self.collection_properties.extend(current_props) + else: + self.collection_properties: List[str] = current_props + + def count(self, collection_name: str) -> int: + all_queries: List[Any] = [] + all_blobs: List[Any] = [] + + results = {"count": "", "list": ["id"]} # collection_properties} + query = _add_descriptor( + "FindDescriptor", + collection_name, + label=None, + ref=None, + props=None, + link=None, + k_neighbors=None, + constraints=None, + results=results, + ) + + all_queries.append(query) + + response, response_array = self.__run_vdms_query(all_queries, all_blobs) + return response[0]["FindDescriptor"]["returned"] + + def decode_image(self, base64_image: str) -> bytes: + return base64.b64decode(base64_image) + + def delete( + self, + ids: Optional[List[str]] = None, + collection_name: Optional[str] = None, + constraints: Optional[Dict] = None, + **kwargs: Any, + ) -> bool: + """Delete by ID. These are the IDs in the vectorstore. + + Args: + ids: List of ids to delete. + + Returns: + Optional[bool]: True if deletion is successful, + False otherwise, None if not implemented. 
+ """ + name = collection_name if collection_name is not None else self._collection_name + return self.__delete(name, ids=ids, constraints=constraints) + + def get_k_candidates( + self, + setname: str, + fetch_k: Optional[int], + results: Optional[Dict[str, Any]] = None, + all_blobs: Optional[List] = None, + normalize: Optional[bool] = False, + ) -> Tuple[List[Dict[str, Any]], List, float]: + max_dist = 1 + command_str = "FindDescriptor" + query = _add_descriptor( + command_str, + setname, + k_neighbors=fetch_k, + results=results, + ) + response, response_array = self.__run_vdms_query([query], all_blobs) + + if normalize: + max_dist = response[0][command_str]["entities"][-1]["_distance"] + + return response, response_array, max_dist + + def get_descriptor_response( + self, + command_str: str, + setname: str, + k_neighbors: int = DEFAULT_K, + fetch_k: int = DEFAULT_FETCH_K, + constraints: Optional[dict] = None, + results: Optional[Dict[str, Any]] = None, + query_embedding: Optional[List[float]] = None, + normalize_distance: bool = False, + ) -> Tuple[List[Dict[str, Any]], List]: + all_blobs: List[Any] = [] + blob = embedding2bytes(query_embedding) + all_blobs.append(blob) + + if constraints is None: + # K results returned + response, response_array, max_dist = self.get_k_candidates( + setname, k_neighbors, results, all_blobs, normalize=normalize_distance + ) + else: + if results is None: + results = {"list": ["id"]} + elif "list" not in results: + results["list"] = ["id"] + elif "id" not in results["list"]: + results["list"].append("id") + + # (1) Find docs satisfy constraints + query = _add_descriptor( + command_str, + setname, + constraints=constraints, + results=results, + ) + response, response_array = self.__run_vdms_query([query]) + ids_of_interest = [ + ent["id"] for ent in response[0][command_str]["entities"] + ] + + # (2) Find top fetch_k results + response, response_array, max_dist = self.get_k_candidates( + setname, fetch_k, results, all_blobs, 
normalize=normalize_distance + ) + + # (3) Intersection of (1) & (2) using ids + new_entities: List[Dict] = [] + for ent in response[0][command_str]["entities"]: + if ent["id"] in ids_of_interest: + new_entities.append(ent) + if len(new_entities) == k_neighbors: + break + response[0][command_str]["entities"] = new_entities + response[0][command_str]["returned"] = len(new_entities) + if len(new_entities) < k_neighbors: + p_str = "Returned items < k_neighbors; Try increasing fetch_k" + print(p_str) # noqa: T201 + + if normalize_distance: + max_dist = 1.0 if max_dist == 0 else max_dist + for ent_idx, ent in enumerate(response[0][command_str]["entities"]): + ent["_distance"] = ent["_distance"] / max_dist + response[0][command_str]["entities"][ent_idx]["_distance"] = ent[ + "_distance" + ] + + return response, response_array + + def encode_image(self, image_path: str) -> str: + with open(image_path, "rb") as f: + blob = f.read() + return base64.b64encode(blob).decode("utf-8") + + @classmethod + def from_documents( + cls: Type[VDMS], + documents: List[Document], + embedding: Optional[Embeddings] = None, + ids: Optional[List[str]] = None, + batch_size: int = DEFAULT_INSERT_BATCH_SIZE, + collection_name: str = DEFAULT_COLLECTION_NAME, # Add this line + **kwargs: Any, + ) -> VDMS: + """Create a VDMS vectorstore from a list of documents. + + Args: + collection_name (str): Name of the collection to create. + documents (List[Document]): List of documents to add to vectorstore. + embedding (Embeddings): Embedding function. Defaults to None. + ids (Optional[List[str]]): List of document IDs. Defaults to None. + batch_size (int): Number of concurrent requests to send to the server. + + Returns: + VDMS: VDMS vectorstore. 
+ """ + client: vdms.vdms = kwargs["client"] + + return cls.from_texts( + client=client, + texts=[doc.page_content for doc in documents], + metadatas=[doc.metadata for doc in documents], + embedding=embedding, + ids=ids, + batch_size=batch_size, + collection_name=collection_name, + # **kwargs, + ) + + @classmethod + def from_texts( + cls: Type[VDMS], + texts: List[str], + embedding: Optional[Embeddings] = None, + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + batch_size: int = DEFAULT_INSERT_BATCH_SIZE, + collection_name: str = DEFAULT_COLLECTION_NAME, + **kwargs: Any, + ) -> VDMS: + """Create a VDMS vectorstore from a raw documents. + + Args: + texts (List[str]): List of texts to add to the collection. + embedding (Embeddings): Embedding function. Defaults to None. + metadatas (Optional[List[dict]]): List of metadatas. Defaults to None. + ids (Optional[List[str]]): List of document IDs. Defaults to None. + batch_size (int): Number of concurrent requests to send to the server. + collection_name (str): Name of the collection to create. + + Returns: + VDMS: VDMS vectorstore. + """ + client: vdms.vdms = kwargs["client"] + vdms_collection = cls( + collection_name=collection_name, + embedding=embedding, + client=client, + # **kwargs, + ) + if ids is None: + ids = [str(uuid.uuid1()) for _ in texts] + vdms_collection.add_texts( + texts=texts, + metadatas=metadatas, + ids=ids, + batch_size=batch_size, # **kwargs + ) + return vdms_collection + + def get( + self, + collection_name: str, + constraints: Optional[Dict] = None, + limit: Optional[int] = None, + include: List[str] = ["metadata"], + ) -> Tuple[Any, Any]: + """Gets the collection. + Get embeddings and their associated data from the data store. + If no constraints provided returns all embeddings up to limit. + + Args: + constraints: A dict used to filter results by. + E.g. `{"color" : ["==", "red"], "price": [">", 4.00]}`. Optional. + limit: The number of documents to return. Optional. 
+ include: A list of what to include in the results. + Can contain `"metadata"` and `"embeddings"`. + Ids are always included. + Defaults to `["metadata"]`. Optional. + """ + all_queries: List[Any] = [] + all_blobs: List[Any] = [] + + results: Dict[str, Any] = {"count": ""} + + if limit is not None: + results["limit"] = limit + + # Include metadata + if "metadata" in include: + collection_properties = self.__get_properties(collection_name) + results["list"] = collection_properties + + # Include embedding + if "embeddings" in include: + results["blob"] = True + + query = _add_descriptor( + "FindDescriptor", + collection_name, + k_neighbors=None, + constraints=constraints, + results=results, + ) + + all_queries.append(query) + + response, response_array = self.__run_vdms_query(all_queries, all_blobs) + return response, response_array + + def max_marginal_relevance_search( + self, + query: str, + k: int = DEFAULT_K, + fetch_k: int = DEFAULT_FETCH_K, + lambda_mult: float = 0.5, + filter: Optional[Dict[str, List]] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 3. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + + Returns: + List of Documents selected by maximal marginal relevance. + """ + if self.embedding is None: + raise ValueError( + "For MMR search, you must specify an embedding function on" "creation."
+ ) + + embedding_vector: List[float] = self._embed_query(query) + docs = self.max_marginal_relevance_search_by_vector( + embedding_vector, + k, + fetch_k, + lambda_mult=lambda_mult, + filter=filter, + ) + return docs + + def max_marginal_relevance_search_by_vector( + self, + embedding: List[float], + k: int = DEFAULT_K, + fetch_k: int = DEFAULT_FETCH_K, + lambda_mult: float = 0.5, + filter: Optional[Dict[str, List]] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 3. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + + Returns: + List of Documents selected by maximal marginal relevance.
+ """ + results = self.query_collection_embeddings( + query_embeddings=[embedding], + n_results=fetch_k, + filter=filter, + include=["metadatas", "documents", "distances", "embeddings"], + ) + + embedding_list = [list(_bytes2embedding(result)) for result in results[0][1]] + + mmr_selected = maximal_marginal_relevance( + np.array(embedding, dtype=np.float32), + embedding_list, + k=k, + lambda_mult=lambda_mult, + ) + + candidates = _results_to_docs(results) + + selected_results = [r for i, r in enumerate(candidates) if i in mmr_selected] + return selected_results + + def max_marginal_relevance_search_with_score( + self, + query: str, + k: int = DEFAULT_K, + fetch_k: int = DEFAULT_FETCH_K, + lambda_mult: float = 0.5, + filter: Optional[Dict[str, List]] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs selected using the maximal marginal relevance. + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 3. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + + Returns: + List of (Document, score) tuples selected by maximal marginal relevance. + """ + if self.embedding is None: + raise ValueError( + "For MMR search, you must specify an embedding function on" "creation."
+ ) + + embedding = self._embed_query(query) + docs = self.max_marginal_relevance_search_with_score_by_vector( + embedding, + k, + fetch_k, + lambda_mult=lambda_mult, + filter=filter, + ) + return docs + + def max_marginal_relevance_search_with_score_by_vector( + self, + embedding: List[float], + k: int = DEFAULT_K, + fetch_k: int = DEFAULT_FETCH_K, + lambda_mult: float = 0.5, + filter: Optional[Dict[str, List]] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs selected using the maximal marginal relevance. + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 3. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + + Returns: + List of (Document, score) tuples selected by maximal marginal relevance.
+ """ + results = self.query_collection_embeddings( + query_embeddings=[embedding], + n_results=fetch_k, + filter=filter, + include=["metadatas", "documents", "distances", "embeddings"], + ) + + embedding_list = [list(_bytes2embedding(result)) for result in results[0][1]] + + mmr_selected = maximal_marginal_relevance( + np.array(embedding, dtype=np.float32), + embedding_list, + k=k, + lambda_mult=lambda_mult, + ) + + candidates = _results_to_docs_and_scores(results) + + selected_results = [ + (r, s) for i, (r, s) in enumerate(candidates) if i in mmr_selected + ] + return selected_results + + def query_collection_embeddings( + self, + query_embeddings: Optional[List[List[float]]] = None, + collection_name: Optional[str] = None, + n_results: int = DEFAULT_K, + fetch_k: int = DEFAULT_FETCH_K, + filter: Union[None, Dict[str, Any]] = None, + results: Union[None, Dict[str, Any]] = None, + normalize_distance: bool = False, + **kwargs: Any, + ) -> List[Tuple[Dict[str, Any], List]]: + all_responses: List[Any] = [] + + if collection_name is None: + collection_name = self._collection_name + + if query_embeddings is None: + return all_responses + + include = kwargs.get("include", ["metadatas"]) + if results is None and "metadatas" in include: + results = { + "list": self.collection_properties, + "blob": "embeddings" in include, + } + + for qemb in query_embeddings: + response, response_array = self.get_descriptor_response( + "FindDescriptor", + collection_name, + k_neighbors=n_results, + fetch_k=fetch_k, + constraints=filter, + results=results, + normalize_distance=normalize_distance, + query_embedding=qemb, + ) + all_responses.append([response, response_array]) + + return all_responses + + def similarity_search( + self, + query: str, + k: int = DEFAULT_K, + fetch_k: int = DEFAULT_FETCH_K, + filter: Optional[Dict[str, List]] = None, + **kwargs: Any, + ) -> List[Document]: + """Run similarity search with VDMS. + + Args: + query (str): Query text to search for. 
+ k (int): Number of results to return. Defaults to 3. + fetch_k (int): Number of candidates to fetch for knn (>= k). + filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + + Returns: + List[Document]: List of documents most similar to the query text. + """ + docs_and_scores = self.similarity_search_with_score( + query, k, fetch_k, filter=filter, **kwargs + ) + return [doc for doc, _ in docs_and_scores] + + def similarity_search_by_vector( + self, + embedding: List[float], + k: int = DEFAULT_K, + fetch_k: int = DEFAULT_FETCH_K, + filter: Optional[Dict[str, List]] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs most similar to embedding vector. + Args: + embedding (List[float]): Embedding to look up documents similar to. + k (int): Number of Documents to return. Defaults to 3. + fetch_k (int): Number of candidates to fetch for knn (>= k). + filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + Returns: + List of Documents most similar to the query vector. + """ + results = self.query_collection_embeddings( + query_embeddings=[embedding], + n_results=k, + fetch_k=fetch_k, + filter=filter, + **kwargs, + ) + + return _results_to_docs(results) + + def similarity_search_with_score( + self, + query: str, + k: int = DEFAULT_K, + fetch_k: int = DEFAULT_FETCH_K, + filter: Optional[Dict[str, List]] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Run similarity search with VDMS with distance. + + Args: + query (str): Query text to search for. + k (int): Number of results to return. Defaults to 3. + fetch_k (int): Number of candidates to fetch for knn (>= k). + filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + + Returns: + List[Tuple[Document, float]]: List of documents most similar to + the query text and cosine distance in float for each. + Lower score represents more similarity. 
def update_document(
    self, collection_name: str, document_id: str, document: Document
) -> None:
    """Replace a single stored document.

    Convenience wrapper: wraps the id and the document in one-element lists
    and delegates to ``update_documents``.

    Args:
        collection_name (str): Name of the collection holding the document.
        document_id (str): ID of the document to update.
        document (Document): Document carrying the new content and metadata.
    """
    ids = [document_id]
    documents = [document]
    return self.update_documents(collection_name, ids, documents)
def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
    """Convert the first query's ``FindDescriptor`` reply into scored documents.

    Only ``results[0]`` (the reply to the first query embedding) is inspected.
    Each returned entity becomes a ``Document`` whose ``page_content`` is the
    entity's ``"content"`` and whose metadata is the remaining properties,
    minus keys listed in ``INVALID_DOC_METADATA_KEYS`` and values listed in
    ``INVALID_METADATA_VALUE``.

    The response entities are read but never modified, so the same ``results``
    object can be converted again or inspected afterwards.

    Args:
        results: Sequence of ``[responses, blobs]`` pairs, one per query
            embedding.

    Returns:
        ``(Document, distance)`` tuples in the order the entities were
        returned. Empty when there are no results, no responses, or the first
        response carries no ``FindDescriptor`` entities.

    Raises:
        KeyError: If an entity lacks ``"_distance"`` or ``"content"``.
    """
    # An empty result set means no query was run; nothing to convert.
    if not results:
        return []

    responses, _blobs = results[0]
    if not responses:
        return []

    first_response = responses[0]
    if (
        "FindDescriptor" not in first_response
        or "entities" not in first_response["FindDescriptor"]
    ):
        return []

    docs_and_scores: List[Any] = []
    for ent in first_response["FindDescriptor"]["entities"]:
        distance = ent["_distance"]
        txt_contents = ent["content"]
        # Filter into a new dict instead of deleting keys from the caller's
        # response entity.
        props = {
            mkey: mval
            for mkey, mval in ent.items()
            if mkey not in INVALID_DOC_METADATA_KEYS
            and mval not in INVALID_METADATA_VALUE
        }
        docs_and_scores.append(
            (Document(page_content=txt_contents, metadata=props), distance)
        )
    return docs_and_scores
def _add_descriptorset(
    command_str: str,
    name: str,
    num_dims: Optional[int] = None,
    engine: Optional[str] = None,
    metric: Optional[str] = None,
    ref: Optional[int] = None,
    props: Optional[Dict] = None,
    link: Optional[Dict] = None,
    storeIndex: bool = False,
    constraints: Optional[Dict] = None,
    results: Optional[Dict] = None,
) -> Dict[str, Any]:
    """Build an ``AddDescriptorSet`` or ``FindDescriptorSet`` query dict.

    Args:
        command_str: Either ``"AddDescriptorSet"`` or ``"FindDescriptorSet"``.
        name: Descriptor set name (``"name"`` when adding, ``"set"`` when
            finding).
        num_dims: Descriptor dimensionality; required for ``AddDescriptorSet``.
        engine: Optional ``"engine"`` entry (add only).
        metric: Optional ``"metric"`` entry (add only).
        ref: Optional ``"_ref"`` entry (add only).
        props: Optional ``"properties"`` entry (add only); ``None``/``{}`` are
            omitted.
        link: Optional ``"link"`` entry (add only).
        storeIndex: When true, adds ``"storeIndex"`` (find only).
        constraints: Optional ``"constraints"`` entry (find only);
            ``None``/``{}`` are omitted.
        results: Optional ``"results"`` entry (find only).

    Returns:
        ``{command_str: entity}`` ready to be sent as one query command.

    Raises:
        ValueError: If ``command_str`` is not one of the two supported
            commands, or if ``AddDescriptorSet`` is requested without both
            ``name`` and ``num_dims``.
    """
    entity: Dict[str, Any]

    if command_str == "AddDescriptorSet":
        # Report missing arguments explicitly instead of falling through to
        # the misleading "Unknown command" error.
        if name is None or num_dims is None:
            raise ValueError(
                "AddDescriptorSet requires both `name` and `num_dims`"
            )

        entity = {
            "name": name,
            "dimensions": num_dims,
        }

        if engine is not None:
            entity["engine"] = engine

        if metric is not None:
            entity["metric"] = metric

        if ref is not None:
            entity["_ref"] = ref

        if props not in [None, {}]:
            entity["properties"] = props

        if link is not None:
            entity["link"] = link

    elif command_str == "FindDescriptorSet":
        entity = {"set": name}

        if storeIndex:
            entity["storeIndex"] = storeIndex

        if constraints not in [None, {}]:
            entity["constraints"] = constraints

        if results is not None:
            entity["results"] = results

    else:
        raise ValueError(f"Unknown command: {command_str}")

    return {command_str: entity}
def _check_valid_response(all_queries: List[dict], response: Any) -> bool:
    """Report whether the first response entry shows a non-empty result.

    The reply counts as valid when ``response`` is a non-empty list and, for
    at least one command name used in ``all_queries``, ``response[0]`` has
    that command with a ``"returned"`` count greater than zero.

    Args:
        all_queries: The query dicts that were sent; their top-level keys are
            the command names to look for.
        response: The value returned by the client for those queries.

    Returns:
        ``True`` if some command in the first response entry returned at
        least one item, ``False`` otherwise (including non-list or empty
        responses).
    """
    # A non-list or empty reply has no first entry to inspect; an empty list
    # would otherwise raise IndexError on response[0].
    if not isinstance(response, list) or not response:
        return False

    first_entry = response[0]
    return any(
        cmd in first_entry
        and "returned" in first_entry[cmd]
        and first_entry[cmd]["returned"] > 0
        for cmd in _get_cmds_from_query(all_queries)
    )
def embedding2bytes(embedding: Union[List[float], None]) -> Union[bytes, None]:
    """Serialize an embedding to raw ``float32`` bytes.

    Args:
        embedding: Sequence of floats (a list or a NumPy array), or ``None``.

    Returns:
        The bytes of the embedding converted to a ``float32`` array, or
        ``None`` when ``embedding`` is ``None`` or empty.
    """
    # Test emptiness with len() rather than truthiness: truth-testing a
    # multi-element NumPy array raises ValueError, and a one-element array
    # holding 0.0 would be wrongly treated as "no embedding".
    if embedding is None or len(embedding) == 0:
        return None
    return np.array(embedding, dtype="float32").tobytes()
"sha256:d4198877797a83cbfe9bffa3803602bbe1625dc30d8a097365dbc762e5790faa"}, - {file = "protobuf-4.25.3-cp310-abi3-win_amd64.whl", hash = "sha256:209ba4cc916bab46f64e56b85b090607a676f66b473e6b762e6f1d9d591eb2e8"}, - {file = "protobuf-4.25.3-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:f1279ab38ecbfae7e456a108c5c0681e4956d5b1090027c1de0f934dfdb4b35c"}, - {file = "protobuf-4.25.3-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:e7cb0ae90dd83727f0c0718634ed56837bfeeee29a5f82a7514c03ee1364c019"}, - {file = "protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:7c8daa26095f82482307bc717364e7c13f4f1c99659be82890dcfc215194554d"}, - {file = "protobuf-4.25.3-cp38-cp38-win32.whl", hash = "sha256:f4f118245c4a087776e0a8408be33cf09f6c547442c00395fbfb116fac2f8ac2"}, - {file = "protobuf-4.25.3-cp38-cp38-win_amd64.whl", hash = "sha256:c053062984e61144385022e53678fbded7aea14ebb3e0305ae3592fb219ccfa4"}, - {file = "protobuf-4.25.3-cp39-cp39-win32.whl", hash = "sha256:19b270aeaa0099f16d3ca02628546b8baefe2955bbe23224aaf856134eccf1e4"}, - {file = "protobuf-4.25.3-cp39-cp39-win_amd64.whl", hash = "sha256:e3c97a1555fd6388f857770ff8b9703083de6bf1f9274a002a332d65fbb56c8c"}, - {file = "protobuf-4.25.3-py3-none-any.whl", hash = "sha256:f0700d54bcf45424477e46a9f0944155b46fb0639d69728739c0e47bab83f2b9"}, - {file = "protobuf-4.25.3.tar.gz", hash = "sha256:25b5d0b42fd000320bd7830b349e3b696435f3b329810427a6bcce6a5492cc5c"}, + {file = "protobuf-4.24.2-cp310-abi3-win32.whl", hash = "sha256:58e12d2c1aa428ece2281cef09bbaa6938b083bcda606db3da4e02e991a0d924"}, + {file = "protobuf-4.24.2-cp310-abi3-win_amd64.whl", hash = "sha256:77700b55ba41144fc64828e02afb41901b42497b8217b558e4a001f18a85f2e3"}, + {file = "protobuf-4.24.2-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:237b9a50bd3b7307d0d834c1b0eb1a6cd47d3f4c2da840802cd03ea288ae8880"}, + {file = "protobuf-4.24.2-cp37-abi3-manylinux2014_aarch64.whl", hash = 
"sha256:25ae91d21e3ce8d874211110c2f7edd6384816fb44e06b2867afe35139e1fd1c"}, + {file = "protobuf-4.24.2-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:c00c3c7eb9ad3833806e21e86dca448f46035242a680f81c3fe068ff65e79c74"}, + {file = "protobuf-4.24.2-cp37-cp37m-win32.whl", hash = "sha256:4e69965e7e54de4db989289a9b971a099e626f6167a9351e9d112221fc691bc1"}, + {file = "protobuf-4.24.2-cp37-cp37m-win_amd64.whl", hash = "sha256:c5cdd486af081bf752225b26809d2d0a85e575b80a84cde5172a05bbb1990099"}, + {file = "protobuf-4.24.2-cp38-cp38-win32.whl", hash = "sha256:6bd26c1fa9038b26c5c044ee77e0ecb18463e957fefbaeb81a3feb419313a54e"}, + {file = "protobuf-4.24.2-cp38-cp38-win_amd64.whl", hash = "sha256:bb7aa97c252279da65584af0456f802bd4b2de429eb945bbc9b3d61a42a8cd16"}, + {file = "protobuf-4.24.2-cp39-cp39-win32.whl", hash = "sha256:2b23bd6e06445699b12f525f3e92a916f2dcf45ffba441026357dea7fa46f42b"}, + {file = "protobuf-4.24.2-cp39-cp39-win_amd64.whl", hash = "sha256:839952e759fc40b5d46be319a265cf94920174d88de31657d5622b5d8d6be5cd"}, + {file = "protobuf-4.24.2-py3-none-any.whl", hash = "sha256:3b7b170d3491ceed33f723bbf2d5a260f8a4e23843799a3906f16ef736ef251e"}, + {file = "protobuf-4.24.2.tar.gz", hash = "sha256:7fda70797ddec31ddfa3576cbdcc3ddbb6b3078b737a1a87ab9136af0570cd6e"}, ] [[package]] @@ -8700,6 +8702,20 @@ yarl = "*" [package.extras] tests = ["Werkzeug (==2.0.3)", "aiohttp", "boto3", "httplib2", "httpx", "pytest", "pytest-aiohttp", "pytest-asyncio", "pytest-cov", "pytest-httpbin", "requests (>=2.22.0)", "tornado", "urllib3"] +[[package]] +name = "vdms" +version = "0.0.20" +description = "VDMS Client Module" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, <4" +files = [ + {file = "vdms-0.0.20-py3-none-any.whl", hash = "sha256:7b81127f2981f2dabdcc5880ad7eb4bc2c7833a25aaf79a7b1a560e86bf7b5ec"}, + {file = "vdms-0.0.20.tar.gz", hash = "sha256:746c21a96e420b9b034495537b42d70f2326b020a1c6907677f7851a926e8605"}, +] + +[package.dependencies] +protobuf = "4.24.2" + 
[[package]] name = "watchdog" version = "4.0.0" @@ -9247,9 +9263,9 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [extras] cli = ["typer"] -extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "azure-ai-documentintelligence", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cloudpickle", "cloudpickle", "cohere", "databricks-vectorsearch", "datasets", "dgml-utils", "elasticsearch", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "friendli-client", "geopandas", "gitpython", "google-cloud-documentai", "gql", "gradientai", "hdbcli", "hologres-vector", "html2text", "httpx", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "nvidia-riva-client", "oci", "openai", "openapi-pydantic", "oracle-ads", "pandas", "pdfminer-six", "pgvector", "praw", "premai", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "rdflib", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "tidb-vector", "timescale-vector", "tqdm", "tree-sitter", "tree-sitter-languages", "upstash-redis", "xata", "xmltodict", "zhipuai"] +extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "azure-ai-documentintelligence", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cloudpickle", "cloudpickle", "cohere", "databricks-vectorsearch", "datasets", "dgml-utils", "elasticsearch", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "friendli-client", "geopandas", "gitpython", "google-cloud-documentai", "gql", "gradientai", "hdbcli", "hologres-vector", "html2text", "httpx", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "nvidia-riva-client", 
"oci", "openai", "openapi-pydantic", "oracle-ads", "pandas", "pdfminer-six", "pgvector", "praw", "premai", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "rdflib", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "tidb-vector", "timescale-vector", "tqdm", "tree-sitter", "tree-sitter-languages", "upstash-redis", "vdms", "xata", "xmltodict", "zhipuai"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "c3f981923b0ba3a6b3ffa99e2ba23ebb0bb548f9f09f979c46e675eb8233cd81" +content-hash = "310c6e7bd72b09bf42f3fd3565c33072c11438d23cb160cb4666e44bce41a068" diff --git a/libs/community/pyproject.toml b/libs/community/pyproject.toml index 1016076e3d..5eba959520 100644 --- a/libs/community/pyproject.toml +++ b/libs/community/pyproject.toml @@ -98,6 +98,7 @@ nvidia-riva-client = {version = "^2.14.0", optional = true} tidb-vector = {version = ">=0.0.3,<1.0.0", optional = true} friendli-client = {version = "^1.2.4", optional = true} premai = {version = "^0.3.25", optional = true} +vdms = {version = "^0.0.20", optional = true} [tool.poetry.group.test] optional = true @@ -156,6 +157,7 @@ tiktoken = ">=0.3.2,<0.6.0" anthropic = "^0.3.11" langchain-core = { path = "../core", develop = true } fireworks-ai = "^0.9.0" +vdms = "^0.0.20" [tool.poetry.group.lint] optional = true @@ -269,7 +271,8 @@ extended_testing = [ "tidb-vector", "cloudpickle", "friendli-client", - "premai" + "premai", + "vdms" ] [tool.ruff] diff --git a/libs/community/tests/integration_tests/vectorstores/test_vdms.py b/libs/community/tests/integration_tests/vectorstores/test_vdms.py new file mode 100644 index 0000000000..e5d5fdbef7 --- /dev/null +++ b/libs/community/tests/integration_tests/vectorstores/test_vdms.py @@ -0,0 +1,365 @@ +"""Test VDMS functionality.""" +from __future__ import annotations + +import logging +import os +from typing import TYPE_CHECKING + 
+import pytest +from langchain_core.documents import Document + +from langchain_community.vectorstores import VDMS +from langchain_community.vectorstores.vdms import VDMS_Client, embedding2bytes +from tests.integration_tests.vectorstores.fake_embeddings import ( + ConsistentFakeEmbeddings, + FakeEmbeddings, +) + +if TYPE_CHECKING: + import vdms + +logging.basicConfig(level=logging.DEBUG) + + +# The connection string matches the default settings in the docker-compose file +# located in the root of the repository: [root]/docker/docker-compose.yml +# To spin up a detached VDMS server: +# cd [root]/docker +# docker compose up -d vdms +@pytest.fixture +def vdms_client() -> vdms.vdms: + return VDMS_Client( + host=os.getenv("VDMS_DBHOST", "localhost"), + port=int(os.getenv("VDMS_DBPORT", 6025)), + ) + + +@pytest.mark.requires("vdms") +def test_init_from_client(vdms_client: vdms.vdms) -> None: + embedding_function = FakeEmbeddings() + _ = VDMS( + embedding_function=embedding_function, + client=vdms_client, + ) + + +@pytest.mark.requires("vdms") +def test_from_texts_with_metadatas(vdms_client: vdms.vdms) -> None: + """Test end to end construction and search.""" + collection_name = "test_from_texts_with_metadatas" + embedding_function = FakeEmbeddings() + texts = ["foo", "bar", "baz"] + ids = [f"test_from_texts_with_metadatas_{i}" for i in range(len(texts))] + metadatas = [{"page": str(i)} for i in range(1, len(texts) + 1)] + docsearch = VDMS.from_texts( + texts=texts, + ids=ids, + embedding=embedding_function, + metadatas=metadatas, + collection_name=collection_name, + client=vdms_client, + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [ + Document(page_content="foo", metadata={"page": "1", "id": ids[0]}) + ] + + +@pytest.mark.requires("vdms") +def test_from_texts_with_metadatas_with_scores(vdms_client: vdms.vdms) -> None: + """Test end to end construction and scored search.""" + collection_name = "test_from_texts_with_metadatas_with_scores" + 
@pytest.mark.requires("vdms")
def test_from_texts_with_metadatas_with_scores_using_vector(
    vdms_client: vdms.vdms,
) -> None:
    """Test end to end construction and scored search, using embedding vector.

    Builds a three-text collection with per-text ``page`` metadata and checks
    that the top hit for ``"foo"`` is the ``"foo"`` document with score 0.0.
    """
    # NOTE(review): despite the test name, the query below is issued with the
    # text "foo" through `_similarity_search_with_relevance_scores`, not with
    # a pre-computed embedding vector -- confirm whether a `*_by_vector`
    # search was intended here.
    collection_name = "test_from_texts_with_metadatas_with_scores_using_vector"
    embedding_function = FakeEmbeddings()
    texts = ["foo", "bar", "baz"]
    # NOTE(review): the id prefix is "test_from_texts_with_metadatas_", not
    # this test's own name -- looks like a copy/paste; verify it is harmless.
    ids = [f"test_from_texts_with_metadatas_{i}" for i in range(len(texts))]
    # Pages are numbered from 1 and stored as strings.
    metadatas = [{"page": str(i)} for i in range(1, len(texts) + 1)]
    docsearch = VDMS.from_texts(
        texts=texts,
        ids=ids,
        embedding=embedding_function,
        metadatas=metadatas,
        collection_name=collection_name,
        client=vdms_client,
    )
    output = docsearch._similarity_search_with_relevance_scores("foo", k=1)
    # The returned metadata is expected to carry the document id alongside
    # the user-supplied "page" value.
    assert output == [
        (Document(page_content="foo", metadata={"page": "1", "id": ids[0]}), 0.0)
    ]
) + output = docsearch.similarity_search( + "far", k=1, filter={"first_letter": ["==", "f"]} + ) + assert output == [ + Document(page_content="far", metadata={"first_letter": "f", "id": ids[0]}) + ] + output = docsearch.similarity_search( + "far", k=2, filter={"first_letter": ["==", "b"]} + ) + assert output == [ + Document(page_content="bar", metadata={"first_letter": "b", "id": ids[1]}), + Document(page_content="baz", metadata={"first_letter": "b", "id": ids[2]}), + ] + + +@pytest.mark.requires("vdms") +def test_search_filter_with_scores(vdms_client: vdms.vdms) -> None: + """Test end to end construction and scored search with metadata filtering.""" + collection_name = "test_search_filter_with_scores" + embedding_function = FakeEmbeddings() + texts = ["far", "bar", "baz"] + ids = [f"test_search_filter_with_scores_{i}" for i in range(len(texts))] + metadatas = [{"first_letter": "{}".format(text[0])} for text in texts] + docsearch = VDMS.from_texts( + texts=texts, + ids=ids, + embedding=embedding_function, + metadatas=metadatas, + collection_name=collection_name, + client=vdms_client, + ) + output = docsearch.similarity_search_with_score( + "far", k=1, filter={"first_letter": ["==", "f"]} + ) + assert output == [ + ( + Document(page_content="far", metadata={"first_letter": "f", "id": ids[0]}), + 0.0, + ) + ] + + output = docsearch.similarity_search_with_score( + "far", k=2, filter={"first_letter": ["==", "b"]} + ) + assert output == [ + ( + Document(page_content="bar", metadata={"first_letter": "b", "id": ids[1]}), + 1.0, + ), + ( + Document(page_content="baz", metadata={"first_letter": "b", "id": ids[2]}), + 4.0, + ), + ] + + +@pytest.mark.requires("vdms") +def test_mmr(vdms_client: vdms.vdms) -> None: + """Test end to end construction and search.""" + collection_name = "test_mmr" + embedding_function = FakeEmbeddings() + texts = ["foo", "bar", "baz"] + ids = [f"test_mmr_{i}" for i in range(len(texts))] + docsearch = VDMS.from_texts( + texts=texts, + ids=ids, + 
@pytest.mark.requires("vdms")
def test_mmr_by_vector(vdms_client: vdms.vdms) -> None:
    """MMR search driven by a pre-computed query embedding finds the match."""
    embedder = FakeEmbeddings()
    corpus = ["foo", "bar", "baz"]
    doc_ids = [f"test_mmr_by_vector_{idx}" for idx in range(len(corpus))]

    store = VDMS.from_texts(
        texts=corpus,
        ids=doc_ids,
        embedding=embedder,
        collection_name="test_mmr_by_vector",
        client=vdms_client,
    )

    # Embed the query up front so the vector-based entry point is exercised.
    query_vector = embedder.embed_query("foo")
    hits = store.max_marginal_relevance_search_by_vector(query_vector, k=1)

    expected = [Document(page_content="foo", metadata={"id": doc_ids[0]})]
    assert hits == expected
Document(page_content=initial_content, metadata={"page": "1"}) + + # Initialize a VDMS instance with the original document + docsearch = VDMS.from_documents( + client=vdms_client, + collection_name=collection_name, + documents=[original_doc], + embedding=embedding_function, + ids=[document_id], + ) + response, old_embedding = docsearch.get( + collection_name, + constraints={"id": ["==", document_id]}, + include=["metadata", "embeddings"], + ) + # old_embedding = response_array[0] + + # Define updated content for the document + updated_content = "updated foo" + + # Create a new Document instance with the updated content and the same id + updated_doc = Document(page_content=updated_content, metadata={"page": "1"}) + + # Update the document in the VDMS instance + docsearch.update_document( + collection_name, document_id=document_id, document=updated_doc + ) + + # Perform a similarity search with the updated content + output = docsearch.similarity_search(updated_content, k=1) + + # Assert that the updated document is returned by the search + assert output == [ + Document( + page_content=updated_content, metadata={"page": "1", "id": document_id} + ) + ] + + # Assert that the new embedding is correct + response, new_embedding = docsearch.get( + collection_name, + constraints={"id": ["==", document_id]}, + include=["metadata", "embeddings"], + ) + # new_embedding = response_array[0] + + assert new_embedding[0] == embedding2bytes( + embedding_function.embed_documents([updated_content])[0] + ) + assert new_embedding != old_embedding + + +@pytest.mark.requires("vdms") +def test_with_relevance_score(vdms_client: vdms.vdms) -> None: + """Test to make sure the relevance score is scaled to 0-1.""" + collection_name = "test_with_relevance_score" + embedding_function = FakeEmbeddings() + texts = ["foo", "bar", "baz"] + ids = [f"test_relevance_scores_{i}" for i in range(len(texts))] + metadatas = [{"page": str(i)} for i in range(1, len(texts) + 1)] + docsearch = VDMS.from_texts( + 
@pytest.mark.requires("vdms")
def test_add_documents_mixed_metadata(vdms_client: vdms.vdms) -> None:
    """Documents with and without metadata can be added together and found."""
    embedder = FakeEmbeddings()
    store = VDMS(
        collection_name="test_add_documents_mixed_metadata",
        embedding_function=embedder,
        client=vdms_client,
    )

    bare_doc = Document(page_content="foo")
    tagged_doc = Document(page_content="bar", metadata={"baz": 1})
    docs = [bare_doc, tagged_doc]
    ids = ["10", "11"]

    returned_ids = store.add_documents(docs, ids=ids)
    assert returned_ids == ids

    found = store.similarity_search("foo bar", k=2)

    # Search results are expected to include each document's id in its
    # metadata, so add the ids to the local copies before comparing.
    bare_doc.metadata = {"id": ids[0]}
    tagged_doc.metadata["id"] = ids[1]

    def by_content(doc: Document) -> str:
        return doc.page_content

    assert sorted(found, key=by_content) == sorted(docs, key=by_content)
"EcloudESVectorStore", "Vald", + "VDMS", "Vearch", "VespaStore", "Weaviate", diff --git a/libs/community/tests/unit_tests/vectorstores/test_public_api.py b/libs/community/tests/unit_tests/vectorstores/test_public_api.py index b092e0fba2..c2007f1113 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_public_api.py +++ b/libs/community/tests/unit_tests/vectorstores/test_public_api.py @@ -77,6 +77,7 @@ _EXPECTED = [ "Typesense", "USearch", "Vald", + "VDMS", "Vearch", "Vectara", "VespaStore",