mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
parent
926c64da60
commit
8cb2594562
19
docs/extras/integrations/providers/dingo.mdx
Normal file
19
docs/extras/integrations/providers/dingo.mdx
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
# Dingo
|
||||||
|
|
||||||
|
This page covers how to use the Dingo ecosystem within LangChain.
|
||||||
|
It is broken into two parts: installation and setup, and then references to specific Dingo wrappers.
|
||||||
|
|
||||||
|
## Installation and Setup
|
||||||
|
- Install the Python SDK with `pip install dingodb`
|
||||||
|
|
||||||
|
## VectorStore
|
||||||
|
|
||||||
|
There exists a wrapper around Dingo indexes, allowing you to use them as a vectorstore,
|
||||||
|
whether for semantic search or example selection.
|
||||||
|
|
||||||
|
To import this vectorstore:
|
||||||
|
```python
|
||||||
|
from langchain.vectorstores import Dingo
|
||||||
|
```
|
||||||
|
|
||||||
|
For a more detailed walkthrough of the Dingo wrapper, see [this notebook](/docs/integrations/vectorstores/dingo.html)
|
244
docs/extras/integrations/vectorstores/dingo.ipynb
Normal file
244
docs/extras/integrations/vectorstores/dingo.ipynb
Normal file
@ -0,0 +1,244 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "683953b3",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Dingo\n",
|
||||||
|
"\n",
|
||||||
|
">[Dingo](https://dingodb.readthedocs.io/en/latest/) is a distributed multi-mode vector database, which combines the characteristics of data lakes and vector databases, and can store data of any type and size (Key-Value, PDF, audio, video, etc.). It has real-time low-latency processing capabilities to achieve rapid insight and response, and can efficiently conduct instant analysis and process multi-modal data.\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook shows how to use functionality related to the DingoDB vector database.\n",
|
||||||
|
"\n",
|
||||||
|
"To run, you should have a [DingoDB instance up and running](https://github.com/dingodb/dingo-deploy/blob/main/README.md)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a62cff8a-bcf7-4e33-bbbc-76999c2e3e20",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!pip install dingodb"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "7a0f9e02-8eb0-4aef-b11f-8861360472ee",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We want to use OpenAIEmbeddings so we have to get the OpenAI API Key."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "8b6ed9cd-81b9-46e5-9c20-5aafca2844d0",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"OpenAI API Key:········\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import getpass\n",
|
||||||
|
"\n",
|
||||||
|
"os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "aac9563e",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
||||||
|
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||||
|
"from langchain.vectorstores import Dingo\n",
|
||||||
|
"from langchain.document_loaders import TextLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "a3c3999a",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import TextLoader\n",
|
||||||
|
"\n",
|
||||||
|
"loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
|
||||||
|
"documents = loader.load()\n",
|
||||||
|
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
||||||
|
"docs = text_splitter.split_documents(documents)\n",
|
||||||
|
"\n",
|
||||||
|
"embeddings = OpenAIEmbeddings()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "dcf88bdf",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from dingodb import DingoDB\n",
|
||||||
|
"\n",
|
||||||
|
"index_name = \"langchain-demo\"\n",
|
||||||
|
"\n",
|
||||||
|
"dingo_client = DingoDB(user=\"\", password=\"\", host=[\"127.0.0.1:13000\"])\n",
|
||||||
|
"# First, check if our index already exists. If it doesn't, we create it\n",
|
||||||
|
"if index_name not in dingo_client.get_index():\n",
|
||||||
|
" # we create a new index\n",
|
||||||
|
" dingo_client.create_index(\n",
|
||||||
|
" index_name=index_name,\n",
|
||||||
|
" dimension=1536,\n",
|
||||||
|
" metric_type='cosine',\n",
|
||||||
|
" auto_id=False\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# The OpenAI embedding model `text-embedding-ada-002` uses 1536 dimensions\n",
|
||||||
|
"docsearch = Dingo.from_documents(docs, embeddings, client=dingo_client, index_name=index_name)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "c3aae49e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
||||||
|
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||||
|
"from langchain.vectorstores import Dingo\n",
|
||||||
|
"from langchain.document_loaders import TextLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "a8c513ab",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||||
|
"docs = docsearch.similarity_search(query)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "fc516993",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"print(docs[0][1])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "1eca81e4",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Adding More Text to an Existing Index\n",
|
||||||
|
"\n",
|
||||||
|
"More text can be embedded and upserted to an existing Dingo index using the `add_texts` function."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "e40d558b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"vectorstore = Dingo(client, embeddings.embed_query, \"text\")\n",
|
||||||
|
"\n",
|
||||||
|
"vectorstore.add_texts(\"More text!\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "bcb858a8",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Maximal Marginal Relevance Searches\n",
|
||||||
|
"\n",
|
||||||
|
"In addition to using similarity search in the retriever object, you can also use `mmr` as retriever."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "649083ab",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"retriever = docsearch.as_retriever(search_type=\"mmr\")\n",
|
||||||
|
"matched_docs = retriever.get_relevant_documents(query)\n",
|
||||||
|
"for i, d in enumerate(matched_docs):\n",
|
||||||
|
" print(f\"\\n## Document {i}\\n\")\n",
|
||||||
|
" print(d.page_content)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "7d3831ad",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Or use `max_marginal_relevance_search` directly:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "732f58b1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"found_docs = docsearch.max_marginal_relevance_search(query, k=2, fetch_k=10)\n",
|
||||||
|
"for i, doc in enumerate(found_docs):\n",
|
||||||
|
" print(f\"{i + 1}.\", doc.page_content, \"\\n\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.11"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -34,6 +34,7 @@ from langchain.vectorstores.chroma import Chroma
|
|||||||
from langchain.vectorstores.clarifai import Clarifai
|
from langchain.vectorstores.clarifai import Clarifai
|
||||||
from langchain.vectorstores.clickhouse import Clickhouse, ClickhouseSettings
|
from langchain.vectorstores.clickhouse import Clickhouse, ClickhouseSettings
|
||||||
from langchain.vectorstores.deeplake import DeepLake
|
from langchain.vectorstores.deeplake import DeepLake
|
||||||
|
from langchain.vectorstores.dingo import Dingo
|
||||||
from langchain.vectorstores.docarray import DocArrayHnswSearch, DocArrayInMemorySearch
|
from langchain.vectorstores.docarray import DocArrayHnswSearch, DocArrayInMemorySearch
|
||||||
from langchain.vectorstores.elastic_vector_search import (
|
from langchain.vectorstores.elastic_vector_search import (
|
||||||
ElasticKnnSearch,
|
ElasticKnnSearch,
|
||||||
@ -82,6 +83,7 @@ __all__ = [
|
|||||||
"Clickhouse",
|
"Clickhouse",
|
||||||
"ClickhouseSettings",
|
"ClickhouseSettings",
|
||||||
"DeepLake",
|
"DeepLake",
|
||||||
|
"Dingo",
|
||||||
"DocArrayHnswSearch",
|
"DocArrayHnswSearch",
|
||||||
"DocArrayInMemorySearch",
|
"DocArrayInMemorySearch",
|
||||||
"ElasticVectorSearch",
|
"ElasticVectorSearch",
|
||||||
|
349
libs/langchain/langchain/vectorstores/dingo.py
Normal file
349
libs/langchain/langchain/vectorstores/dingo.py
Normal file
@ -0,0 +1,349 @@
|
|||||||
|
"""Wrapper around the Dingo vector database."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import uuid
|
||||||
|
from typing import Any, Iterable, List, Optional, Tuple
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.embeddings.base import Embeddings
|
||||||
|
from langchain.vectorstores.base import VectorStore
|
||||||
|
from langchain.vectorstores.utils import maximal_marginal_relevance
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class Dingo(VectorStore):
|
||||||
|
"""Wrapper around Dingo vector database.
|
||||||
|
|
||||||
|
To use, you should have the ``dingodb`` python package installed.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
from langchain.vectorstores import Dingo
|
||||||
|
from langchain.embeddings.openai import OpenAIEmbeddings
|
||||||
|
|
||||||
|
embeddings = OpenAIEmbeddings()
|
||||||
|
dingo = Dingo(embeddings, "text")
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
    self,
    embedding: Embeddings,
    text_key: str,
    *,
    client: Any = None,
    index_name: Optional[str] = None,
    host: Optional[List[str]] = None,
    user: str = "root",
    password: str = "123123",
    self_id: bool = False,
    dimension: int = 1024,
):
    """Initialize the store with a Dingo client, creating the index if needed.

    Args:
        embedding: Embedding model used to embed texts and queries.
        text_key: Metadata key under which each document's text is stored.
        client: Existing Dingo client. When omitted, a new
            ``dingodb.DingoDB`` client is built from ``user``, ``password``
            and ``host``.
        index_name: Name of the index to use. If it is given and is not
            listed by ``client.get_index()``, the index is created.
        host: Server addresses used when a new client is built. Defaults
            to ``["172.20.31.10:13000"]``.
        user: User name used when a new client is built.
        password: Password used when a new client is built.
        self_id: If True, a newly created index is created with
            ``auto_id=False``.
        dimension: Vector dimension of a newly created index. Defaults to
            1024; it should match the size of the vectors produced by
            ``embedding``.

    Raises:
        ImportError: If the ``dingodb`` package is not installed.
        ValueError: If a new client fails to connect.
    """
    try:
        import dingodb
    except ImportError as e:
        raise ImportError(
            "Could not import dingo python package. "
            "Please install it with `pip install dingodb`."
        ) from e

    if host is None:
        host = ["172.20.31.10:13000"]

    if client is not None:
        dingo_client = client
    else:
        try:
            # connect to dingo db
            dingo_client = dingodb.DingoDB(user, password, host)
        except ValueError as e:
            raise ValueError(f"Dingo failed to connect: {e}") from e

    self._text_key = text_key
    self._client = dingo_client

    # Only create the index when a name was given and it does not exist yet.
    if index_name is not None and index_name not in dingo_client.get_index():
        if self_id is True:
            dingo_client.create_index(index_name, dimension, auto_id=False)
        else:
            dingo_client.create_index(index_name, dimension)

    self._index_name = index_name
    self._embedding = embedding
|
||||||
|
|
||||||
|
@property
def embeddings(self) -> Optional[Embeddings]:
    """Return the embedding model this store was constructed with."""
    embedding_model = self._embedding
    return embedding_model
|
||||||
|
|
||||||
|
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,
    text_key: str = "text",
    batch_size: int = 500,
    **kwargs: Any,
) -> List[str]:
    """Run more texts through the embeddings and add to the vectorstore.

    Args:
        texts: Iterable of strings to add to the vectorstore. One-shot
            iterators (e.g. generators) are supported.
        metadatas: Optional list of metadatas associated with the texts.
            The dicts are copied, never modified in place.
        ids: Optional list of ids to associate with the texts. When
            omitted (or empty), ids are generated from ``uuid.uuid1()``.
        text_key: Accepted for signature compatibility but not used; the
            text is stored under the key given to the constructor.
        batch_size: Maximum number of vectors sent per ``vector_add`` call.

    Returns:
        List of ids from adding the texts into the vectorstore.

    """
    # Materialize first: `texts` may be a one-shot iterator, and it is
    # needed for id generation, embedding and batching.
    texts = list(texts)
    ids = ids or [str(uuid.uuid1().int)[:13] for _ in texts]
    embeds = self._embedding.embed_documents(texts)

    metadatas_list = []
    for i, text in enumerate(texts):
        # Copy so the caller's metadata dicts do not gain the text key.
        metadata = dict(metadatas[i]) if metadatas else {}
        metadata[self._text_key] = text
        metadatas_list.append(metadata)

    # upsert to Dingo in batches
    for start in range(0, len(texts), batch_size):
        end = start + batch_size
        self._client.vector_add(
            self._index_name,
            metadatas_list[start:end],
            embeds[start:end],
            ids[start:end],
        )

    return ids
|
||||||
|
|
||||||
|
def similarity_search(
    self,
    query: str,
    k: int = 4,
    search_params: Optional[dict] = None,
    timeout: Optional[int] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return Dingo documents most similar to query.

    Thin wrapper over ``similarity_search_with_score`` that drops the
    scores.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        search_params: Dictionary of argument(s) to filter on metadata
        timeout: Forwarded unchanged to ``similarity_search_with_score``.
        **kwargs: Forwarded unchanged to ``similarity_search_with_score``.

    Returns:
        List of Documents most similar to the query, without scores.
    """
    docs_and_scores = self.similarity_search_with_score(
        query, k=k, search_params=search_params, timeout=timeout, **kwargs
    )
    return [doc for doc, _ in docs_and_scores]
|
||||||
|
|
||||||
|
def similarity_search_with_score(
    self,
    query: str,
    k: int = 4,
    search_params: Optional[dict] = None,
    timeout: Optional[int] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return Dingo documents most similar to query, along with scores.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        search_params: Dictionary of argument(s) to filter on metadata
        timeout: Accepted for interface compatibility; not used here.

    Returns:
        List of ``(Document, score)`` tuples, where the score is the
        ``distance`` value reported by Dingo for that hit. Each Document's
        metadata holds the stored scalar fields plus ``id``, ``text`` and
        ``score``.
    """
    docs = []
    query_obj = self._embedding.embed_query(query)
    results = self._client.vector_search(
        self._index_name, xq=query_obj, top_k=k, search_params=search_params
    )

    if not results:
        return []

    for res in results[0]["vectorWithDistances"]:
        scalar_data = res["scalarData"]
        score = res["distance"]
        text = scalar_data[self._text_key]["fields"][0]["data"]

        # Restore the metadata stored next to the text so it is not lost on
        # retrieval.
        # NOTE(review): assumes every scalar field uses the same
        # {"fields": [{"data": ...}]} layout as the text field; entries
        # with another layout are skipped instead of failing the search.
        metadata = {}
        for key, entry in scalar_data.items():
            try:
                metadata[key] = entry["fields"][0]["data"]
            except (KeyError, IndexError, TypeError):
                continue
        # These three keys were always present; keep their values stable.
        metadata.update({"id": res["id"], "text": text, "score": score})
        docs.append((Document(page_content=text, metadata=metadata), score))

    return docs
|
||||||
|
|
||||||
|
def max_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    search_params: Optional[dict] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND diversity
    among selected documents.

    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
                    of diversity among the results with 0 corresponding
                    to maximum diversity and 1 to minimum diversity.
                    Defaults to 0.5.
        search_params: Dictionary of argument(s) to filter on metadata
    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    # Keyword arguments keep this call in step with the one in
    # similarity_search_with_score; fetch_k (not k) sizes the candidate
    # pool that MMR then narrows down to k results.
    results = self._client.vector_search(
        self._index_name,
        xq=[embedding],
        top_k=fetch_k,
        search_params=search_params,
    )

    if not results:
        return []

    candidates = results[0]["vectorWithDistances"]
    # NOTE(review): this reads "floatValues" and "metaData" from each hit,
    # whereas similarity_search_with_score reads "scalarData" -- confirm
    # the response schema against the dingodb SDK.
    mmr_selected = maximal_marginal_relevance(
        np.array([embedding], dtype=np.float32),
        [item["floatValues"] for item in candidates],
        k=k,
        lambda_mult=lambda_mult,
    )

    documents = []
    for i in mmr_selected:
        # Copy before popping so the client's response is left untouched.
        metadata = dict(candidates[i]["metaData"])
        page_content = metadata.pop(self._text_key)
        documents.append(Document(page_content=page_content, metadata=metadata))
    return documents
|
||||||
|
|
||||||
|
def max_marginal_relevance_search(
    self,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    search_params: Optional[dict] = None,
    **kwargs: Any,
) -> List[Document]:
    """Embed ``query`` and run a maximal-marginal-relevance search with it.

    The query text is embedded with the store's embedding model and the
    result is handed to ``max_marginal_relevance_search_by_vector``.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1; 0 asks for maximum diversity
            among the results and 1 for minimum diversity. Defaults to 0.5.
        search_params: Dictionary of argument(s) to filter on metadata
    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    query_vector = self._embedding.embed_query(query)
    search_by_vector = self.max_marginal_relevance_search_by_vector
    return search_by_vector(
        query_vector,
        k=k,
        fetch_k=fetch_k,
        lambda_mult=lambda_mult,
        search_params=search_params,
    )
|
||||||
|
|
||||||
|
@classmethod
def from_texts(
    cls,
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,
    text_key: str = "text",
    index_name: Optional[str] = None,
    client: Any = None,
    host: Optional[List[str]] = None,
    user: str = "root",
    password: str = "123123",
    batch_size: int = 500,
    dimension: int = 1024,
    **kwargs: Any,
) -> Dingo:
    """Construct Dingo wrapper from raw documents.

    This is a user friendly interface that:
        1. Embeds documents.
        2. Adds the documents to a provided Dingo index

    This is intended to be a quick way to get started.

    Args:
        texts: Texts to embed and store.
        embedding: Embedding model used to embed the texts.
        metadatas: Optional list of metadatas associated with the texts.
            The dicts are copied, never modified in place.
        ids: Optional list of ids to associate with the texts. When
            omitted (or empty), ids are generated from ``uuid.uuid1()``.
        text_key: Metadata key under which each text is stored.
        index_name: Name of the index to add to; it is created when it is
            not listed by ``client.get_index()``.
        client: Existing Dingo client. When omitted, a new
            ``dingodb.DingoDB`` client is built from ``user``, ``password``
            and ``host``.
        host: Server addresses used when a new client is built. Defaults
            to ``["172.20.31.10:13000"]``.
        user: User name used when a new client is built.
        password: Password used when a new client is built.
        batch_size: Maximum number of vectors sent per ``vector_add`` call.
        dimension: Vector dimension of a newly created index. Defaults to
            1024; it should match the size of the vectors produced by
            ``embedding``.
        **kwargs: ``self_id=True`` creates a new index with
            ``auto_id=False``; other keys are ignored.

    Returns:
        A ``Dingo`` instance bound to the client and index that were used.

    Raises:
        ImportError: If the ``dingodb`` package is not installed.
        ValueError: If a new client fails to connect.

    Example:
        .. code-block:: python

            from langchain.vectorstores import Dingo
            from langchain.embeddings import OpenAIEmbeddings

            embeddings = OpenAIEmbeddings()
            dingo = Dingo.from_texts(
                texts,
                embeddings,
                index_name="langchain-demo"
            )
    """
    try:
        import dingodb
    except ImportError as e:
        raise ImportError(
            "Could not import dingo python package. "
            "Please install it with `pip install dingodb`."
        ) from e

    # Resolved per call so no list object is shared as a default argument.
    if host is None:
        host = ["172.20.31.10:13000"]

    if client is not None:
        dingo_client = client
    else:
        try:
            # connect to dingo db
            dingo_client = dingodb.DingoDB(user, password, host)
        except ValueError as e:
            raise ValueError(f"Dingo failed to connect: {e}") from e

    if index_name not in dingo_client.get_index():
        if kwargs.get("self_id") is True:
            dingo_client.create_index(index_name, dimension, auto_id=False)
        else:
            dingo_client.create_index(index_name, dimension)

    # Materialize first so id generation, embedding and batching all see
    # the same items even if an iterator was passed in.
    texts = list(texts)
    ids = ids or [str(uuid.uuid1().int)[:13] for _ in texts]
    embeds = embedding.embed_documents(texts)

    metadatas_list = []
    for i, text in enumerate(texts):
        # Copy so the caller's metadata dicts do not gain the text key.
        metadata = dict(metadatas[i]) if metadatas else {}
        metadata[text_key] = text
        metadatas_list.append(metadata)

    # upsert to Dingo in batches
    for start in range(0, len(texts), batch_size):
        end = start + batch_size
        dingo_client.vector_add(
            index_name,
            metadatas_list[start:end],
            embeds[start:end],
            ids[start:end],
        )
    return cls(embedding, text_key, client=dingo_client, index_name=index_name)
|
||||||
|
|
||||||
|
def delete(
    self,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> Any:
    """Remove vectors from the index by id.

    Args:
        ids: List of ids to delete.

    Returns:
        Whatever the client's ``vector_delete`` call returns.

    Raises:
        ValueError: If ``ids`` is None.
    """
    if ids is not None:
        target_index = self._index_name
        return self._client.vector_delete(target_index, ids=ids)

    raise ValueError("No ids provided to delete.")
|
Loading…
Reference in New Issue
Block a user