diff --git a/docs/extras/integrations/providers/dingo.mdx b/docs/extras/integrations/providers/dingo.mdx
new file mode 100644
index 0000000000..5fd59675b1
--- /dev/null
+++ b/docs/extras/integrations/providers/dingo.mdx
@@ -0,0 +1,19 @@
+# Dingo
+
+This page covers how to use the Dingo ecosystem within LangChain.
+It is broken into two parts: installation and setup, and then references to specific Dingo wrappers.
+
+## Installation and Setup
+- Install the Python SDK with `pip install dingodb`
+
+## VectorStore
+
+There exists a wrapper around Dingo indexes, allowing you to use it as a vectorstore,
+whether for semantic search or example selection.
+
+To import this vectorstore:
+```python
+from langchain.vectorstores import Dingo
+```
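+
+A minimal usage sketch (assuming a running DingoDB instance; the host address and index name below are placeholders):
+
+```python
+from dingodb import DingoDB
+
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.vectorstores import Dingo
+
+# connect to a running DingoDB instance (placeholder address)
+dingo_client = DingoDB(user="", password="", host=["127.0.0.1:13000"])
+
+# create an index sized for OpenAI's 1536-dimensional embeddings
+if "langchain-demo" not in dingo_client.get_index():
+    dingo_client.create_index(
+        index_name="langchain-demo", dimension=1536, metric_type="cosine", auto_id=False
+    )
+
+vectorstore = Dingo(OpenAIEmbeddings(), "text", client=dingo_client, index_name="langchain-demo")
+vectorstore.add_texts(["DingoDB combines data lake and vector database features."])
+docs = vectorstore.similarity_search("What is DingoDB?")
+```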
+
+For a more detailed walkthrough of the Dingo wrapper, see [this notebook](/docs/integrations/vectorstores/dingo.html)
diff --git a/docs/extras/integrations/vectorstores/dingo.ipynb b/docs/extras/integrations/vectorstores/dingo.ipynb
new file mode 100644
index 0000000000..fab91b3273
--- /dev/null
+++ b/docs/extras/integrations/vectorstores/dingo.ipynb
@@ -0,0 +1,244 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "683953b3",
+   "metadata": {},
+   "source": [
+    "# Dingo\n",
+    "\n",
+    ">[Dingo](https://dingodb.readthedocs.io/en/latest/) is a distributed multi-modal vector database that combines the characteristics of data lakes and vector databases and can store data of any type and size (key-value, PDF, audio, video, etc.). It offers real-time, low-latency processing for rapid insight and response, and can efficiently analyze and process multi-modal data.\n",
+    "\n",
+    "This notebook shows how to use functionality related to the DingoDB vector database.\n",
+    "\n",
+    "To run, you should have a [DingoDB instance up and running](https://github.com/dingodb/dingo-deploy/blob/main/README.md)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a62cff8a-bcf7-4e33-bbbc-76999c2e3e20",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "!pip install dingodb"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7a0f9e02-8eb0-4aef-b11f-8861360472ee",
+   "metadata": {},
+   "source": [
+    "We want to use OpenAIEmbeddings so we have to get the OpenAI API Key."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "8b6ed9cd-81b9-46e5-9c20-5aafca2844d0",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "OpenAI API Key:········\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import getpass\n",
+    "\n",
+    "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "aac9563e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
+    "from langchain.text_splitter import CharacterTextSplitter\n",
+    "from langchain.vectorstores import Dingo\n",
+    "from langchain.document_loaders import TextLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "a3c3999a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
+    "documents = loader.load()\n",
+    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
+    "docs = text_splitter.split_documents(documents)\n",
+    "\n",
+    "embeddings = OpenAIEmbeddings()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "dcf88bdf",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from dingodb import DingoDB\n",
+    "\n",
+    "index_name = \"langchain-demo\"\n",
+    "\n",
+    "dingo_client = DingoDB(user=\"\", password=\"\", host=[\"127.0.0.1:13000\"])\n",
+    "# First, check if our index already exists. If it doesn't, we create it.\n",
+    "# The OpenAI embedding model `text-embedding-ada-002` uses 1536 dimensions.\n",
+    "if index_name not in dingo_client.get_index():\n",
+    "    dingo_client.create_index(\n",
+    "        index_name=index_name,\n",
+    "        dimension=1536,\n",
+    "        metric_type='cosine',\n",
+    "        auto_id=False\n",
+    "    )\n",
+    "\n",
+    "docsearch = Dingo.from_documents(docs, embeddings, client=dingo_client, index_name=index_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "a8c513ab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"What did the president say about Ketanji Brown Jackson\"\n",
+    "docs = docsearch.similarity_search(query)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "fc516993",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(docs[0].page_content)"
+   ]
+  },
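+  {
+   "cell_type": "markdown",
+   "id": "b4b6c1d2",
+   "metadata": {},
+   "source": [
+    "The wrapper also exposes `similarity_search_with_score`, which returns the distance Dingo reports for each hit alongside the document. A minimal sketch, reusing `query` from above:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b7f9d2e1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs_with_scores = docsearch.similarity_search_with_score(query)\n",
+    "for doc, score in docs_with_scores:\n",
+    "    # each result is a (Document, distance) tuple\n",
+    "    print(score, doc.page_content[:80])"
+   ]
+  },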
+  {
+   "cell_type": "markdown",
+   "id": "1eca81e4",
+   "metadata": {},
+   "source": [
+    "### Adding More Text to an Existing Index\n",
+    "\n",
+    "More text can be embedded and upserted into an existing Dingo index using the `add_texts` function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e40d558b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vectorstore = Dingo(embeddings, \"text\", client=dingo_client, index_name=index_name)\n",
+    "\n",
+    "vectorstore.add_texts([\"More text!\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bcb858a8",
+   "metadata": {},
+   "source": [
+    "### Maximal Marginal Relevance Searches\n",
+    "\n",
+    "In addition to using similarity search in the retriever object, you can also use `mmr` as the retriever's search type."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "649083ab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "retriever = docsearch.as_retriever(search_type=\"mmr\")\n",
+    "matched_docs = retriever.get_relevant_documents(query)\n",
+    "for i, d in enumerate(matched_docs):\n",
+    "    print(f\"\\n## Document {i}\\n\")\n",
+    "    print(d.page_content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7d3831ad",
+   "metadata": {},
+   "source": [
+    "Or use `max_marginal_relevance_search` directly:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "732f58b1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "found_docs = docsearch.max_marginal_relevance_search(query, k=2, fetch_k=10)\n",
+    "for i, doc in enumerate(found_docs):\n",
+    "    print(f\"{i + 1}.\", doc.page_content, \"\\n\")"
+   ]
+  },
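+  {
+   "cell_type": "markdown",
+   "id": "c2d4e6f8",
+   "metadata": {},
+   "source": [
+    "### Deleting Vectors\n",
+    "\n",
+    "Vectors can be removed by id with the wrapper's `delete` method. A minimal sketch, reusing the `vectorstore` from the `add_texts` example (the added text is a throwaway placeholder):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5e7f9a1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# add_texts returns the ids it assigned; pass them back to delete\n",
+    "ids = vectorstore.add_texts([\"Some text to remove again!\"])\n",
+    "vectorstore.delete(ids)"
+   ]
+  }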
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/libs/langchain/langchain/vectorstores/__init__.py b/libs/langchain/langchain/vectorstores/__init__.py
index 4c254daede..5e3b904a48 100644
--- a/libs/langchain/langchain/vectorstores/__init__.py
+++ b/libs/langchain/langchain/vectorstores/__init__.py
@@ -34,6 +34,7 @@ from langchain.vectorstores.chroma import Chroma
 from langchain.vectorstores.clarifai import Clarifai
 from langchain.vectorstores.clickhouse import Clickhouse, ClickhouseSettings
 from langchain.vectorstores.deeplake import DeepLake
+from langchain.vectorstores.dingo import Dingo
 from langchain.vectorstores.docarray import DocArrayHnswSearch, DocArrayInMemorySearch
 from langchain.vectorstores.elastic_vector_search import (
     ElasticKnnSearch,
@@ -82,6 +83,7 @@ __all__ = [
     "Clickhouse",
     "ClickhouseSettings",
     "DeepLake",
+    "Dingo",
     "DocArrayHnswSearch",
     "DocArrayInMemorySearch",
     "ElasticVectorSearch",
diff --git a/libs/langchain/langchain/vectorstores/dingo.py b/libs/langchain/langchain/vectorstores/dingo.py
new file mode 100644
index 0000000000..a1f31eeb40
--- /dev/null
+++ b/libs/langchain/langchain/vectorstores/dingo.py
@@ -0,0 +1,349 @@
+"""Wrapper around the Dingo vector database."""
+from __future__ import annotations
+
+import logging
+import uuid
+from typing import Any, Iterable, List, Optional, Tuple
+
+import numpy as np
+
+from langchain.docstore.document import Document
+from langchain.embeddings.base import Embeddings
+from langchain.vectorstores.base import VectorStore
+from langchain.vectorstores.utils import maximal_marginal_relevance
+
+logger = logging.getLogger(__name__)
+
+
+class Dingo(VectorStore):
+    """Wrapper around Dingo vector database.
+
+    To use, you should have the ``dingodb`` python package installed.
+
+    Example:
+        .. code-block:: python
+
+            from langchain.vectorstores import Dingo
+            from langchain.embeddings.openai import OpenAIEmbeddings
+
+            embeddings = OpenAIEmbeddings()
+            dingo = Dingo(embeddings, "text")
+    """
+
+    def __init__(
+        self,
+        embedding: Embeddings,
+        text_key: str,
+        *,
+        client: Any = None,
+        index_name: Optional[str] = None,
+        host: Optional[List[str]] = None,
+        user: str = "root",
+        password: str = "123123",
+        self_id: bool = False,
+    ):
+        """Initialize with Dingo client."""
+        try:
+            import dingodb
+        except ImportError:
+            raise ImportError(
+                "Could not import dingodb python package. "
+                "Please install it with `pip install dingodb`."
+            )
+
+        host = host if host is not None else ["172.20.31.10:13000"]
+
+        if client is not None:
+            dingo_client = client
+        else:
+            try:
+                # connect to dingo db
+                dingo_client = dingodb.DingoDB(user, password, host)
+            except ValueError as e:
+                raise ValueError(f"Dingo failed to connect: {e}")
+
+        self._text_key = text_key
+        self._client = dingo_client
+
+        if index_name is not None and index_name not in dingo_client.get_index():
+            if self_id is True:
+                dingo_client.create_index(index_name, 1024, auto_id=False)
+            else:
+                dingo_client.create_index(index_name, 1024)
+
+        self._index_name = index_name
+        self._embedding = embedding
+
+    @property
+    def embeddings(self) -> Optional[Embeddings]:
+        return self._embedding
+
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None,
+        text_key: str = "text",
+        batch_size: int = 500,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Run more texts through the embeddings and add to the vectorstore.
+
+        Args:
+            texts: Iterable of strings to add to the vectorstore.
+            metadatas: Optional list of metadatas associated with the texts.
+            ids: Optional list of ids to associate with the texts.
+
+        Returns:
+            List of ids from adding the texts into the vectorstore.
+
+        """
+        # Embed and create the documents
+        texts = list(texts)
+        ids = ids or [str(uuid.uuid1().int)[:13] for _ in texts]
+        metadatas_list = []
+        embeds = self._embedding.embed_documents(texts)
+        for i, text in enumerate(texts):
+            metadata = metadatas[i] if metadatas else {}
+            metadata[self._text_key] = text
+            metadatas_list.append(metadata)
+        # upsert to Dingo in batches
+        for i in range(0, len(texts), batch_size):
+            j = i + batch_size
+            self._client.vector_add(
+                self._index_name, metadatas_list[i:j], embeds[i:j], ids[i:j]
+            )
+
+        return ids
+
+    def similarity_search(
+        self,
+        query: str,
+        k: int = 4,
+        search_params: Optional[dict] = None,
+        timeout: Optional[int] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return Dingo documents most similar to query.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            search_params: Dictionary of argument(s) to filter on metadata
+
+        Returns:
+            List of Documents most similar to the query.
+        """
+        docs_and_scores = self.similarity_search_with_score(
+            query, k=k, search_params=search_params
+        )
+        return [doc for doc, _ in docs_and_scores]
+
+    def similarity_search_with_score(
+        self,
+        query: str,
+        k: int = 4,
+        search_params: Optional[dict] = None,
+        timeout: Optional[int] = None,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """Return Dingo documents most similar to query, along with scores.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            search_params: Dictionary of argument(s) to filter on metadata
+
+        Returns:
+            List of Documents most similar to the query and score for each
+        """
+        docs = []
+        query_obj = self._embedding.embed_query(query)
+        results = self._client.vector_search(
+            self._index_name, xq=query_obj, top_k=k, search_params=search_params
+        )
+
+        if not results:
+            return []
+
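+        # Each hit carries Dingo's stored scalar fields (including the raw
+        # text under `self._text_key`), the vector id and the distance;
+        # unpack them into (Document, score) tuples.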
+        for res in results[0]["vectorWithDistances"]:
+            metadatas = res["scalarData"]
+            id = res["id"]
+            score = res["distance"]
+            text = metadatas[self._text_key]["fields"][0]["data"]
+
+            metadata = {"id": id, "text": text, "score": score}
+            docs.append((Document(page_content=text, metadata=metadata), score))
+
+        return docs
+
+    def max_marginal_relevance_search_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        search_params: Optional[dict] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return docs selected using the maximal marginal relevance.
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        Args:
+            embedding: Embedding to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+            lambda_mult: Number between 0 and 1 that determines the degree
+                of diversity among the results with 0 corresponding
+                to maximum diversity and 1 to minimum diversity.
+                Defaults to 0.5.
+
+        Returns:
+            List of Documents selected by maximal marginal relevance.
+        """
+        # fetch `fetch_k` candidates, then let MMR pick `k` diverse results
+        results = self._client.vector_search(
+            self._index_name, xq=[embedding], top_k=fetch_k, search_params=search_params
+        )
+
+        mmr_selected = maximal_marginal_relevance(
+            np.array([embedding], dtype=np.float32),
+            [item["floatValues"] for item in results[0]["vectorWithDistances"]],
+            k=k,
+            lambda_mult=lambda_mult,
+        )
+        selected = [
+            results[0]["vectorWithDistances"][i]["metaData"] for i in mmr_selected
+        ]
+        return [
+            Document(page_content=metadata.pop(self._text_key), metadata=metadata)
+            for metadata in selected
+        ]
+
+    def max_marginal_relevance_search(
+        self,
+        query: str,
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        search_params: Optional[dict] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return docs selected using the maximal marginal relevance.
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+            lambda_mult: Number between 0 and 1 that determines the degree
+                of diversity among the results with 0 corresponding
+                to maximum diversity and 1 to minimum diversity.
+                Defaults to 0.5.
+
+        Returns:
+            List of Documents selected by maximal marginal relevance.
+        """
+        embedding = self._embedding.embed_query(query)
+        return self.max_marginal_relevance_search_by_vector(
+            embedding, k, fetch_k, lambda_mult, search_params
+        )
+
+    @classmethod
+    def from_texts(
+        cls,
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None,
+        text_key: str = "text",
+        index_name: Optional[str] = None,
+        client: Any = None,
+        host: Optional[List[str]] = None,
+        user: str = "root",
+        password: str = "123123",
+        batch_size: int = 500,
+        **kwargs: Any,
+    ) -> Dingo:
+        """Construct Dingo wrapper from raw documents.
+
+        This is a user friendly interface that:
+            1. Embeds documents.
+            2. Adds the documents to a provided Dingo index
+
+        This is intended to be a quick way to get started.
+
+        Example:
+            .. code-block:: python
+
+                from langchain.vectorstores import Dingo
+                from langchain.embeddings import OpenAIEmbeddings
+
+                embeddings = OpenAIEmbeddings()
+                dingo = Dingo.from_texts(
+                    texts,
+                    embeddings,
+                    index_name="langchain-demo"
+                )
+        """
+        try:
+            import dingodb
+        except ImportError:
+            raise ImportError(
+                "Could not import dingodb python package. "
+                "Please install it with `pip install dingodb`."
+            )
+
+        host = host if host is not None else ["172.20.31.10:13000"]
+
+        if client is not None:
+            dingo_client = client
+        else:
+            try:
+                # connect to dingo db
+                dingo_client = dingodb.DingoDB(user, password, host)
+            except ValueError as e:
+                raise ValueError(f"Dingo failed to connect: {e}")
+
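+        # As in __init__, `self_id=True` (forwarded via kwargs) creates the
+        # index with auto_id=False so that caller-supplied ids are used
+        # instead of ids generated by Dingo.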
+        if kwargs.get("self_id") is True:
+            if index_name not in dingo_client.get_index():
+                dingo_client.create_index(index_name, 1024, auto_id=False)
+        else:
+            if index_name not in dingo_client.get_index():
+                dingo_client.create_index(index_name, 1024)
+
+        # Embed and create the documents
+        texts = list(texts)
+        ids = ids or [str(uuid.uuid1().int)[:13] for _ in texts]
+        metadatas_list = []
+        embeds = embedding.embed_documents(texts)
+        for i, text in enumerate(texts):
+            metadata = metadatas[i] if metadatas else {}
+            metadata[text_key] = text
+            metadatas_list.append(metadata)
+
+        # upsert to Dingo in batches
+        for i in range(0, len(texts), batch_size):
+            j = i + batch_size
+            dingo_client.vector_add(
+                index_name, metadatas_list[i:j], embeds[i:j], ids[i:j]
+            )
+        return cls(embedding, text_key, client=dingo_client, index_name=index_name)
+
+    def delete(
+        self,
+        ids: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> Any:
+        """Delete by vector IDs.
+
+        Args:
+            ids: List of ids to delete.
+        """
+        if ids is None:
+            raise ValueError("No ids provided to delete.")
+
+        return self._client.vector_delete(self._index_name, ids=ids)