From b30f449dae39d046059b3c485530a30761304eb3 Mon Sep 17 00:00:00 2001 From: Xiaoyu Xee Date: Wed, 16 Aug 2023 07:19:30 +0800 Subject: [PATCH] Add dashvector vectorstore (#9163) ## Description Add `Dashvector` vectorstore for langchain - [dashvector quick start](https://help.aliyun.com/document_detail/2510223.html) - [dashvector package description](https://pypi.org/project/dashvector/) ## How to use ```python from langchain.vectorstores.dashvector import DashVector dashvector = DashVector.from_documents(docs, embeddings) ``` --------- Co-authored-by: smallrain.xuxy Co-authored-by: Bagatur --- .../integrations/providers/dashvector.mdx | 24 ++ .../vectorstores/dashvector.ipynb | 236 +++++++++++ .../langchain/vectorstores/__init__.py | 2 + .../langchain/vectorstores/dashvector.py | 365 ++++++++++++++++++ .../vectorstores/test_dashvector.py | 75 ++++ 5 files changed, 702 insertions(+) create mode 100644 docs/extras/integrations/providers/dashvector.mdx create mode 100644 docs/extras/integrations/vectorstores/dashvector.ipynb create mode 100644 libs/langchain/langchain/vectorstores/dashvector.py create mode 100644 libs/langchain/tests/integration_tests/vectorstores/test_dashvector.py diff --git a/docs/extras/integrations/providers/dashvector.mdx b/docs/extras/integrations/providers/dashvector.mdx new file mode 100644 index 0000000000..d1ab24a0d0 --- /dev/null +++ b/docs/extras/integrations/providers/dashvector.mdx @@ -0,0 +1,24 @@ +# DashVector + +> [DashVector](https://help.aliyun.com/document_detail/2510225.html) is a fully-managed vectorDB service that supports high-dimension dense and sparse vectors, real-time insertion and filtered search. It is built to scale automatically and can adapt to different application requirements. + +This document demonstrates how to leverage DashVector within the LangChain ecosystem. In particular, it shows how to install DashVector, and how to use it as a VectorStore plugin in LangChain. 
+It is broken into two parts: installation and setup, and then references to specific DashVector wrappers. + +## Installation and Setup +Install the Python SDK: +```bash +pip install dashvector +``` + +## VectorStore + +A DashVector Collection is wrapped as a familiar VectorStore for native usage within LangChain, +which allows it to be readily used for various scenarios, such as semantic search or example selection. + +You may import the vectorstore by: +```python +from langchain.vectorstores import DashVector +``` + +For a detailed walkthrough of the DashVector wrapper, please refer to [this notebook](/docs/integrations/vectorstores/dashvector.html) diff --git a/docs/extras/integrations/vectorstores/dashvector.ipynb b/docs/extras/integrations/vectorstores/dashvector.ipynb new file mode 100644 index 0000000000..2352cf6f27 --- /dev/null +++ b/docs/extras/integrations/vectorstores/dashvector.ipynb @@ -0,0 +1,236 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# DashVector\n", + "\n", + "> [DashVector](https://help.aliyun.com/document_detail/2510225.html) is a fully-managed vectorDB service that supports high-dimension dense and sparse vectors, real-time insertion and filtered search. It is built to scale automatically and can adapt to different application requirements.\n", + "\n", + "This notebook shows how to use functionality related to the `DashVector` vector database.\n", + "\n", + "To use DashVector, you must have an API key.\n", + "Here are the [installation instructions](https://help.aliyun.com/document_detail/2510223.html)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Install" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!pip install dashvector dashscope" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We want to use `DashScopeEmbeddings` so we also have to get the Dashscope API Key." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "pycharm": { + "name": "#%%\n", + "is_executing": true + }, + "ExecuteTime": { + "end_time": "2023-08-11T10:37:15.091585Z", + "start_time": "2023-08-11T10:36:51.859753Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "import getpass\n", + "\n", + "os.environ[\"DASHVECTOR_API_KEY\"] = getpass.getpass(\"DashVector API Key:\")\n", + "os.environ[\"DASHSCOPE_API_KEY\"] = getpass.getpass(\"DashScope API Key:\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Example" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "pycharm": { + "name": "#%%\n", + "is_executing": true + }, + "ExecuteTime": { + "end_time": "2023-08-11T10:42:30.243460Z", + "start_time": "2023-08-11T10:42:27.783785Z" + } + }, + "outputs": [], + "source": [ + "from langchain.embeddings.dashscope import DashScopeEmbeddings\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import DashVector" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": { + "is_executing": true, + "name": "#%%\n" + }, + "ExecuteTime": { + "end_time": "2023-08-11T10:42:30.391580Z", + "start_time": "2023-08-11T10:42:30.249021Z" + } + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import TextLoader\n", + "\n", + "loader = 
TextLoader(\"../../modules/state_of_the_union.txt\")\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "\n", + "embeddings = DashScopeEmbeddings()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We can create DashVector from documents." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "dashvector = DashVector.from_documents(docs, embeddings)\n", + "\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = dashvector.similarity_search(query)\n", + "print(docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We can add texts with metadata and ids, and search with a metadata filter." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "ExecuteTime": { + "end_time": "2023-08-11T10:42:51.641309Z", + "start_time": "2023-08-11T10:42:51.132109Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='baz', metadata={'key': 2})]\n" + ] + } + ], + "source": [ + "texts = [\"foo\", \"bar\", \"baz\"]\n", + "metadatas = [{\"key\": i} for i in range(len(texts))]\n", + "ids = [\"0\", \"1\", \"2\"]\n", + "\n", + "dashvector.add_texts(texts, metadatas=metadatas, ids=ids)\n", + "\n", + "docs = dashvector.similarity_search(\"foo\", filter=\"key = 2\")\n", + "print(docs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/libs/langchain/langchain/vectorstores/__init__.py b/libs/langchain/langchain/vectorstores/__init__.py index ab75c524ad..74e70804f0 100644 --- a/libs/langchain/langchain/vectorstores/__init__.py +++ b/libs/langchain/langchain/vectorstores/__init__.py @@ -33,6 +33,7 @@ from langchain.vectorstores.cassandra import Cassandra from langchain.vectorstores.chroma import Chroma from langchain.vectorstores.clarifai import Clarifai from langchain.vectorstores.clickhouse import Clickhouse, ClickhouseSettings +from langchain.vectorstores.dashvector import DashVector from langchain.vectorstores.deeplake import DeepLake from langchain.vectorstores.dingo import Dingo from langchain.vectorstores.docarray import DocArrayHnswSearch, DocArrayInMemorySearch @@ -83,6 +84,7 @@ __all__ = [ "Chroma", "Clickhouse", "ClickhouseSettings", + "DashVector", "DeepLake", "Dingo", "DocArrayHnswSearch", diff --git a/libs/langchain/langchain/vectorstores/dashvector.py b/libs/langchain/langchain/vectorstores/dashvector.py new file mode 100644 index 0000000000..714c76c008 --- /dev/null +++ b/libs/langchain/langchain/vectorstores/dashvector.py @@ -0,0 +1,365 @@ +"""Wrapper around DashVector vector database.""" +from __future__ import annotations + +import logging +import uuid +from typing import ( + Any, + Iterable, + List, + Optional, + Tuple, +) + +import numpy as np + +from langchain.docstore.document import Document +from langchain.embeddings.base import Embeddings +from langchain.utils import get_from_env +from langchain.vectorstores.base import VectorStore +from langchain.vectorstores.utils import maximal_marginal_relevance + +logger = logging.getLogger(__name__) + + +class DashVector(VectorStore): + """Wrapper 
around DashVector vector database. + + To use, you should have the ``dashvector`` python package installed. + + Example: + .. code-block:: python + + from langchain.vectorstores import DashVector + from langchain.embeddings.openai import OpenAIEmbeddings + import dashvector + + client = dashvector.Client(api_key="***") + client.create("langchain") + collection = client.get("langchain") + embeddings = OpenAIEmbeddings() + vectorstore = DashVector(collection, embeddings, "text") + """ + + def __init__( + self, + collection: Any, + embedding: Embeddings, + text_field: str, + ): + """Initialize with DashVector collection.""" + + try: + import dashvector + except ImportError: + raise ValueError( + "Could not import dashvector python package. " + "Please install it with `pip install dashvector`." + ) + + if not isinstance(collection, dashvector.Collection): + raise ValueError( + f"collection should be an instance of dashvector.Collection, " + f"but got {type(collection)}" + ) + + self._collection = collection + self._embedding = embedding + self._text_field = text_field + + def _similarity_search_with_score_by_vector( + self, + embedding: List[float], + k: int = 4, + filter: Optional[str] = None, + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query vector, along with scores""" + + # query by vector + ret = self._collection.query(embedding, topk=k, filter=filter) + if not ret: + raise ValueError( + f"Fail to query docs by vector, error {ret.message}" + ) + + docs = [] + for doc in ret: + metadata = doc.fields + text = metadata.pop(self._text_field) + score = doc.score + docs.append((Document(page_content=text, metadata=metadata), score)) + return docs + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + batch_size: int = 25, + **kwargs: Any, + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore. 
+ + Args: + texts: Iterable of strings to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + ids: Optional list of ids associated with the texts. + batch_size: Optional batch size to upsert docs. + kwargs: vectorstore specific parameters + + Returns: + List of ids from adding the texts into the vectorstore. + """ + text_list = list(texts) + ids = ids or [str(uuid.uuid4().hex) for _ in text_list] + for i in range(0, len(text_list), batch_size): + # batch end + end = min(i + batch_size, len(text_list)) + + batch_texts = text_list[i:end] + batch_ids = ids[i:end] + batch_embeddings = self._embedding.embed_documents(list(batch_texts)) + + # batch metadatas + if metadatas: + batch_metadatas = [dict(m) for m in metadatas[i:end]] + else: + batch_metadatas = [{} for _ in range(i, end)] + for metadata, text in zip(batch_metadatas, batch_texts): + metadata[self._text_field] = text + + # batch upsert to collection + docs = list(zip(batch_ids, batch_embeddings, batch_metadatas)) + ret = self._collection.upsert(docs) + if not ret: + raise ValueError( + f"Fail to upsert docs to dashvector vector database, " + f"Error: {ret.message}" + ) + return ids + + def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> bool: + """Delete by vector ID. + + Args: + ids: List of ids to delete. + + Returns: + True if deletion is successful, + False otherwise. + """ + return bool(self._collection.delete(ids)) + + def similarity_search( + self, + query: str, + k: int = 4, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs most similar to query. + + Args: + query: Text to search documents similar to. + k: Number of documents to return. Defaults to 4. + filter: Doc fields filter conditions that meet the SQL where clause + specification. + + Returns: + List of Documents most similar to the query text. 
+ """ + + docs_and_scores = self.similarity_search_with_relevance_scores(query, k, filter) + return [doc for doc, _ in docs_and_scores] + + def similarity_search_with_relevance_scores( + self, + query: str, + k: int = 4, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query text, along with relevance scores. + + Less is more similar, more is more dissimilar. + + Args: + query: input text + k: Number of Documents to return. Defaults to 4. + filter: Doc fields filter conditions that meet the SQL where clause + specification. + + Returns: + List of Tuples of (doc, similarity_score) + """ + + embedding = self._embedding.embed_query(query) + return self._similarity_search_with_score_by_vector( + embedding, k=k, filter=filter + ) + + def similarity_search_by_vector( + self, + embedding: List[float], + k: int = 4, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs most similar to embedding vector. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter: Doc fields filter conditions that meet the SQL where clause + specification. + + Returns: + List of Documents most similar to the query vector. + """ + docs_and_scores = self._similarity_search_with_score_by_vector( + embedding, k, filter + ) + return [doc for doc, _ in docs_and_scores] + + def max_marginal_relevance_search( + self, + query: str, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. 
+ lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + filter: Doc fields filter conditions that meet the SQL where clause + specification. + + Returns: + List of Documents selected by maximal marginal relevance. + """ + embedding = self._embedding.embed_query(query) + return self.max_marginal_relevance_search_by_vector( + embedding, k, fetch_k, lambda_mult, filter + ) + + def max_marginal_relevance_search_by_vector( + self, + embedding: List[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + filter: Doc fields filter conditions that meet the SQL where clause + specification. + + Returns: + List of Documents selected by maximal marginal relevance. 
+ """ + + # query by vector + ret = self._collection.query( + embedding, topk=fetch_k, filter=filter, include_vector=True + ) + if not ret: + raise ValueError( + f"Fail to query docs by vector, error {ret.message}" + ) + + candidate_embeddings = [doc.vector for doc in ret] + mmr_selected = maximal_marginal_relevance( + np.array(embedding), candidate_embeddings, lambda_mult, k + ) + + metadatas = [ret.output[i].fields for i in mmr_selected] + return [ + Document(page_content=metadata.pop(self._text_field), metadata=metadata) + for metadata in metadatas + ] + + @classmethod + def from_texts( + cls, + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + dashvector_api_key: Optional[str] = None, + collection_name: str = "langchain", + text_field: str = "text", + batch_size: int = 25, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> DashVector: + """Return DashVector VectorStore initialized from texts and embeddings. + + This is the quick way to get started with dashvector vector store. + + Example: + .. code-block:: python + + from langchain.vectorstores import DashVector + from langchain.embeddings import OpenAIEmbeddings + import dashvector + + embeddings = OpenAIEmbeddings() + dashvector = DashVector.from_documents( + docs, + embeddings, + dashvector_api_key="{DASHVECTOR_API_KEY}" + ) + """ + try: + import dashvector + except ImportError: + raise ValueError( + "Could not import dashvector python package. " + "Please install it with `pip install dashvector`." 
+ ) + + dashvector_api_key = dashvector_api_key or get_from_env( + "dashvector_api_key", "DASHVECTOR_API_KEY" + ) + + dashvector_client = dashvector.Client(api_key=dashvector_api_key) + dashvector_client.delete(collection_name) + collection = dashvector_client.get(collection_name) + if not collection: + dim = len(embedding.embed_query(texts[0])) + # create collection if not existed + resp = dashvector_client.create(collection_name, dimension=dim) + if resp: + collection = dashvector_client.get(collection_name) + else: + raise ValueError( + "Fail to create collection. " f"Error: {resp.message}." + ) + + dashvector_vector_db = cls(collection, embedding, text_field) + dashvector_vector_db.add_texts(texts, metadatas, ids, batch_size) + return dashvector_vector_db diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_dashvector.py b/libs/langchain/tests/integration_tests/vectorstores/test_dashvector.py new file mode 100644 index 0000000000..e01a12073e --- /dev/null +++ b/libs/langchain/tests/integration_tests/vectorstores/test_dashvector.py @@ -0,0 +1,75 @@ +from time import sleep + +from langchain.schema import Document +from langchain.vectorstores import DashVector +from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings + +texts = ["foo", "bar", "baz"] +ids = ["1", "2", "3"] + + +def test_dashvector_from_texts() -> None: + dashvector = DashVector.from_texts( + texts=texts, + embedding=FakeEmbeddings(), + ids=ids, + ) + + # the vector insert operation is async by design, we wait here a bit for the + # insertion to complete. 
+ sleep(0.5) + output = dashvector.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + +def test_dashvector_with_text_with_metadatas() -> None: + metadatas = [{"meta": i} for i in range(len(texts))] + dashvector = DashVector.from_texts( + texts=texts, + embedding=FakeEmbeddings(), + metadatas=metadatas, + ids=ids, + ) + + # the vector insert operation is async by design, we wait here a bit for the + # insertion to complete. + sleep(0.5) + output = dashvector.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"meta": 0})] + + +def test_dashvector_search_with_filter() -> None: + metadatas = [{"meta": i} for i in range(len(texts))] + dashvector = DashVector.from_texts( + texts=texts, + embedding=FakeEmbeddings(), + metadatas=metadatas, + ids=ids, + ) + + # the vector insert operation is async by design, we wait here a bit for the + # insertion to complete. + sleep(0.5) + output = dashvector.similarity_search("foo", filter="meta=2") + assert output == [Document(page_content="baz", metadata={"meta": 2})] + + +def test_dashvector_search_with_scores() -> None: + dashvector = DashVector.from_texts( + texts=texts, + embedding=FakeEmbeddings(), + ids=ids, + ) + + # the vector insert operation is async by design, we wait here a bit for the + # insertion to complete. + sleep(0.5) + output = dashvector.similarity_search_with_relevance_scores("foo") + docs, scores = zip(*output) + + assert scores[0] < scores[1] < scores[2] + assert list(docs) == [ + Document(page_content="foo"), + Document(page_content="bar"), + Document(page_content="baz"), + ]