You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/community/langchain_community/vectorstores/zep_cloud.py

478 lines
15 KiB
Python

from __future__ import annotations
import logging
import warnings
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore
if TYPE_CHECKING:
from zep_cloud import CreateDocumentRequest, DocumentCollectionResponse, SearchType
logger = logging.getLogger()
class ZepCloudVectorStore(VectorStore):
"""`Zep` vector store.
It provides methods for adding texts or documents to the store,
searching for similar documents, and deleting documents.
Search scores are calculated using cosine similarity normalized to [0, 1].
Args:
collection_name (str): The name of the collection in the Zep store.
api_key (str): The API key for the Zep API.
"""
def __init__(
self,
collection_name: str,
api_key: str,
) -> None:
super().__init__()
if not collection_name:
raise ValueError(
"collection_name must be specified when using ZepVectorStore."
)
try:
from zep_cloud.client import AsyncZep, Zep
except ImportError:
raise ImportError(
"Could not import zep-python python package. "
"Please install it with `pip install zep-python`."
)
self._client = Zep(api_key=api_key)
self._client_async = AsyncZep(api_key=api_key)
self.collection_name = collection_name
self._load_collection()
@property
def embeddings(self) -> Optional[Embeddings]:
"""Unavailable for ZepCloud"""
return None
def _load_collection(self) -> DocumentCollectionResponse:
"""
Load the collection from the Zep backend.
"""
from zep_cloud import NotFoundError
try:
collection = self._client.document.get_collection(self.collection_name)
except NotFoundError:
logger.info(
f"Collection {self.collection_name} not found. Creating new collection."
)
collection = self._create_collection()
return collection
def _create_collection(self) -> DocumentCollectionResponse:
"""
Create a new collection in the Zep backend.
"""
self._client.document.add_collection(self.collection_name)
collection = self._client.document.get_collection(self.collection_name)
return collection
def _generate_documents_to_add(
self,
texts: Iterable[str],
metadatas: Optional[List[Dict[Any, Any]]] = None,
document_ids: Optional[List[str]] = None,
) -> List[CreateDocumentRequest]:
from zep_cloud import CreateDocumentRequest as ZepDocument
documents: List[ZepDocument] = []
for i, d in enumerate(texts):
documents.append(
ZepDocument(
content=d,
metadata=metadatas[i] if metadatas else None,
document_id=document_ids[i] if document_ids else None,
)
)
return documents
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[Dict[str, Any]]] = None,
document_ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Run more texts through the embeddings and add to the vectorstore.
Args:
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
document_ids: Optional list of document ids associated with the texts.
kwargs: vectorstore specific parameters
Returns:
List of ids from adding the texts into the vectorstore.
"""
documents = self._generate_documents_to_add(texts, metadatas, document_ids)
uuids = self._client.document.add_documents(
self.collection_name, request=documents
)
return uuids
async def aadd_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[Dict[str, Any]]] = None,
document_ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Run more texts through the embeddings and add to the vectorstore."""
documents = self._generate_documents_to_add(texts, metadatas, document_ids)
uuids = await self._client_async.document.add_documents(
self.collection_name, request=documents
)
return uuids
def search(
self,
query: str,
search_type: SearchType,
metadata: Optional[Dict[str, Any]] = None,
k: int = 3,
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to query using specified search type."""
if search_type == "similarity":
return self.similarity_search(query, k=k, metadata=metadata, **kwargs)
elif search_type == "mmr":
return self.max_marginal_relevance_search(
query, k=k, metadata=metadata, **kwargs
)
else:
raise ValueError(
f"search_type of {search_type} not allowed. Expected "
"search_type to be 'similarity' or 'mmr'."
)
async def asearch(
self,
query: str,
search_type: str,
metadata: Optional[Dict[str, Any]] = None,
k: int = 3,
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to query using specified search type."""
if search_type == "similarity":
return await self.asimilarity_search(
query, k=k, metadata=metadata, **kwargs
)
elif search_type == "mmr":
return await self.amax_marginal_relevance_search(
query, k=k, metadata=metadata, **kwargs
)
else:
raise ValueError(
f"search_type of {search_type} not allowed. Expected "
"search_type to be 'similarity' or 'mmr'."
)
def similarity_search(
self,
query: str,
k: int = 4,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to query."""
results = self._similarity_search_with_relevance_scores(
query, k=k, metadata=metadata, **kwargs
)
return [doc for doc, _ in results]
def similarity_search_with_score(
self,
query: str,
k: int = 4,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Run similarity search with distance."""
return self._similarity_search_with_relevance_scores(
query, k=k, metadata=metadata, **kwargs
)
def _similarity_search_with_relevance_scores(
self,
query: str,
k: int = 4,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""
Default similarity search with relevance scores. Modify if necessary
in subclass.
Return docs and relevance scores in the range [0, 1].
0 is dissimilar, 1 is most similar.
Args:
query: input text
k: Number of Documents to return. Defaults to 4.
metadata: Optional, metadata filter
**kwargs: kwargs to be passed to similarity search. Should include:
score_threshold: Optional, a floating point value between 0 to 1 and
filter the resulting set of retrieved docs
Returns:
List of Tuples of (doc, similarity_score)
"""
results = self._client.document.search(
collection_name=self.collection_name,
text=query,
limit=k,
metadata=metadata,
**kwargs,
)
return [
(
Document(
page_content=str(doc.content),
metadata=doc.metadata,
),
doc.score or 0.0,
)
for doc in results.results or []
]
async def asimilarity_search_with_relevance_scores(
self,
query: str,
k: int = 4,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query."""
results = await self._client_async.document.search(
collection_name=self.collection_name,
text=query,
limit=k,
metadata=metadata,
**kwargs,
)
return [
(
Document(
page_content=str(doc.content),
metadata=doc.metadata,
),
doc.score or 0.0,
)
for doc in results.results or []
]
async def asimilarity_search(
self,
query: str,
k: int = 4,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to query."""
results = await self.asimilarity_search_with_relevance_scores(
query, k, metadata=metadata, **kwargs
)
return [doc for doc, _ in results]
def similarity_search_by_vector(
self,
embedding: List[float],
k: int = 4,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[Document]:
"""Unsupported in Zep Cloud"""
warnings.warn("similarity_search_by_vector is not supported in Zep Cloud")
return []
async def asimilarity_search_by_vector(
self,
embedding: List[float],
k: int = 4,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[Document]:
"""Unsupported in Zep Cloud"""
warnings.warn("asimilarity_search_by_vector is not supported in Zep Cloud")
return []
def max_marginal_relevance_search(
self,
query: str,
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
Zep determines this automatically and this parameter is
ignored.
lambda_mult: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
metadata: Optional, metadata to filter the resulting set of retrieved docs
Returns:
List of Documents selected by maximal marginal relevance.
"""
results = self._client.document.search(
collection_name=self.collection_name,
text=query,
limit=k,
metadata=metadata,
search_type="mmr",
mmr_lambda=lambda_mult,
**kwargs,
)
return [
Document(page_content=str(d.content), metadata=d.metadata)
for d in results.results or []
]
async def amax_marginal_relevance_search(
self,
query: str,
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance."""
results = await self._client_async.document.search(
collection_name=self.collection_name,
text=query,
limit=k,
metadata=metadata,
search_type="mmr",
mmr_lambda=lambda_mult,
**kwargs,
)
return [
Document(page_content=str(d.content), metadata=d.metadata)
for d in results.results or []
]
def max_marginal_relevance_search_by_vector(
self,
embedding: List[float],
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[Document]:
"""Unsupported in Zep Cloud"""
warnings.warn(
"max_marginal_relevance_search_by_vector is not supported in Zep Cloud"
)
return []
async def amax_marginal_relevance_search_by_vector(
self,
embedding: List[float],
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[Document]:
"""Unsupported in Zep Cloud"""
warnings.warn(
"amax_marginal_relevance_search_by_vector is not supported in Zep Cloud"
)
return []
@classmethod
def from_texts(
cls,
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
collection_name: str = "",
api_key: Optional[str] = None,
**kwargs: Any,
) -> ZepCloudVectorStore:
"""
Class method that returns a ZepVectorStore instance initialized from texts.
If the collection does not exist, it will be created.
Args:
texts (List[str]): The list of texts to add to the vectorstore.
metadatas (Optional[List[Dict[str, Any]]]): Optional list of metadata
associated with the texts.
collection_name (str): The name of the collection in the Zep store.
api_key (str): The API key for the Zep API.
**kwargs: Additional parameters specific to the vectorstore.
Returns:
ZepVectorStore: An instance of ZepVectorStore.
"""
if not api_key:
raise ValueError("api_key must be specified when using ZepVectorStore.")
vecstore = cls(
collection_name=collection_name,
api_key=api_key,
)
vecstore.add_texts(texts, metadatas)
return vecstore
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> None:
"""Delete by Zep vector UUIDs.
Parameters
----------
ids : Optional[List[str]]
The UUIDs of the vectors to delete.
Raises
------
ValueError
If no UUIDs are provided.
"""
if ids is None or len(ids) == 0:
raise ValueError("No uuids provided to delete.")
for u in ids:
self._client.document.delete_document(self.collection_name, u)