from __future__ import annotations

import logging
import warnings
from dataclasses import asdict, dataclass
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore

if TYPE_CHECKING:
    from zep_python.document import Document as ZepDocument
    from zep_python.document import DocumentCollection


logger = logging.getLogger()


@dataclass
class CollectionConfig:
    """Configuration for a `Zep Collection`.

    If the collection does not exist, it will be created.

    Attributes:
        name (str): The name of the collection.
        description (Optional[str]): An optional description of the collection.
        metadata (Optional[Dict[str, Any]]): Optional metadata for the collection.
        embedding_dimensions (int): The number of dimensions for the embeddings in
            the collection. This should match the Zep server configuration
            if auto-embed is true.
        is_auto_embedded (bool): A flag indicating whether the collection is
            automatically embedded by Zep.
    """

    name: str
    description: Optional[str]
    metadata: Optional[Dict[str, Any]]
    embedding_dimensions: int
    is_auto_embedded: bool
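

# Illustrative sketch of a collection config. The specific values (collection
# name, description, 1536 embedding dimensions, auto-embedding enabled) are
# assumptions for the example, not defaults of this module; match them to your
# own Zep server and embedding model.
#
#     config = CollectionConfig(
#         name="my_documents",
#         description="Documents for semantic search",
#         metadata={},
#         embedding_dimensions=1536,
#         is_auto_embedded=True,
#     )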


class ZepVectorStore(VectorStore):
    """`Zep` vector store.

    It provides methods for adding texts or documents to the store,
    searching for similar documents, and deleting documents.

    Search scores are calculated using cosine similarity normalized to [0, 1].

    Args:
        api_url (str): The URL of the Zep API.
        collection_name (str): The name of the collection in the Zep store.
        api_key (Optional[str]): The API key for the Zep API.
        config (Optional[CollectionConfig]): The configuration for the collection.
            Required if the collection does not already exist.
        embedding (Optional[Embeddings]): Optional embedding function to use to
            embed the texts. Required if the collection is not auto-embedded.
    """

    def __init__(
        self,
        collection_name: str,
        api_url: str,
        *,
        api_key: Optional[str] = None,
        config: Optional[CollectionConfig] = None,
        embedding: Optional[Embeddings] = None,
    ) -> None:
        super().__init__()
        if not collection_name:
            raise ValueError(
                "collection_name must be specified when using ZepVectorStore."
            )
        try:
            from zep_python import ZepClient
        except ImportError:
            raise ImportError(
                "Could not import zep-python python package. "
                "Please install it with `pip install zep-python`."
            )
        self._client = ZepClient(api_url, api_key=api_key)

        self.collection_name = collection_name
        # If for some reason the collection name is not the same as the one in the
        # config, update it.
        if config and config.name != self.collection_name:
            config.name = self.collection_name

        self._collection_config = config
        self._collection = self._load_collection()
        self._embedding = embedding
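
    # Illustrative sketch of constructing a store against an existing,
    # auto-embedded collection. The URL, API key, and collection name are
    # placeholder assumptions, not values required by this class:
    #
    #     vectorstore = ZepVectorStore(
    #         collection_name="my_documents",
    #         api_url="http://localhost:8000",
    #         api_key="<zep-api-key>",
    #         config=config,  # only needed if the collection must be created
    #     )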

    @property
    def embeddings(self) -> Optional[Embeddings]:
        """Access the query embedding object if available."""
        return self._embedding

    def _load_collection(self) -> DocumentCollection:
        """
        Load the collection from the Zep backend.
        """
        from zep_python import NotFoundError

        try:
            collection = self._client.document.get_collection(self.collection_name)
        except NotFoundError:
            logger.info(
                f"Collection {self.collection_name} not found. Creating new collection."
            )
            collection = self._create_collection()

        return collection

    def _create_collection(self) -> DocumentCollection:
        """
        Create a new collection in the Zep backend.
        """
        if not self._collection_config:
            raise ValueError(
                "Collection config must be specified when creating a new collection."
            )
        collection = self._client.document.add_collection(
            **asdict(self._collection_config)
        )
        return collection

    def _generate_documents_to_add(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[Dict[Any, Any]]] = None,
        document_ids: Optional[List[str]] = None,
    ) -> List[ZepDocument]:
        from zep_python.document import Document as ZepDocument

        # Materialize the iterable once: it is consumed both for embedding and
        # for building the document list below.
        texts = list(texts)

        embeddings = None
        if self._collection and self._collection.is_auto_embedded:
            if self._embedding is not None:
                warnings.warn(
                    """The collection is set to auto-embed and an embedding
                    function is present. Ignoring the embedding function.""",
                    stacklevel=2,
                )
        elif self._embedding is not None:
            embeddings = self._embedding.embed_documents(texts)
            if self._collection and self._collection.embedding_dimensions != len(
                embeddings[0]
            ):
                raise ValueError(
                    "The embedding dimensions of the collection and the embedding"
                    " function do not match. Collection dimensions:"
                    f" {self._collection.embedding_dimensions}, Embedding dimensions:"
                    f" {len(embeddings[0])}"
                )

        documents: List[ZepDocument] = []
        for i, d in enumerate(texts):
            documents.append(
                ZepDocument(
                    content=d,
                    metadata=metadatas[i] if metadatas else None,
                    document_id=document_ids[i] if document_ids else None,
                    embedding=embeddings[i] if embeddings else None,
                )
            )
        return documents

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[Dict[str, Any]]] = None,
        document_ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            document_ids: Optional list of document ids associated with the texts.
            kwargs: vectorstore specific parameters

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        if not self._collection:
            raise ValueError(
                "collection should be an instance of a Zep DocumentCollection"
            )

        documents = self._generate_documents_to_add(texts, metadatas, document_ids)
        uuids = self._collection.add_documents(documents)

        return uuids
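
    # Illustrative sketch: adding a couple of texts with per-text metadata.
    # The texts and metadata values are placeholders.
    #
    #     uuids = vectorstore.add_texts(
    #         ["Zep is a memory store.", "LangChain integrates with Zep."],
    #         metadatas=[{"source": "docs"}, {"source": "blog"}],
    #     )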

    async def aadd_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[Dict[str, Any]]] = None,
        document_ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore."""
        if not self._collection:
            raise ValueError(
                "collection should be an instance of a Zep DocumentCollection"
            )

        documents = self._generate_documents_to_add(texts, metadatas, document_ids)
        uuids = await self._collection.aadd_documents(documents)

        return uuids

    def search(
        self,
        query: str,
        search_type: str,
        metadata: Optional[Dict[str, Any]] = None,
        k: int = 3,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to query using specified search type."""
        if search_type == "similarity":
            return self.similarity_search(query, k=k, metadata=metadata, **kwargs)
        elif search_type == "mmr":
            return self.max_marginal_relevance_search(
                query, k=k, metadata=metadata, **kwargs
            )
        else:
            raise ValueError(
                f"search_type of {search_type} not allowed. Expected "
                "search_type to be 'similarity' or 'mmr'."
            )
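
    # Illustrative sketch: an MMR search with a metadata filter. The filter
    # dict is passed through to Zep as-is; its exact shape (shown here only as
    # a placeholder) must match what your Zep server expects.
    #
    #     docs = vectorstore.search(
    #         "What is Zep?",
    #         search_type="mmr",
    #         metadata={"where": {"jsonpath": "$[*] ? (@.source == 'docs')"}},
    #         k=3,
    #     )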

    async def asearch(
        self,
        query: str,
        search_type: str,
        metadata: Optional[Dict[str, Any]] = None,
        k: int = 3,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to query using specified search type."""
        if search_type == "similarity":
            return await self.asimilarity_search(
                query, k=k, metadata=metadata, **kwargs
            )
        elif search_type == "mmr":
            return await self.amax_marginal_relevance_search(
                query, k=k, metadata=metadata, **kwargs
            )
        else:
            raise ValueError(
                f"search_type of {search_type} not allowed. Expected "
                "search_type to be 'similarity' or 'mmr'."
            )

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to query."""
        results = self._similarity_search_with_relevance_scores(
            query, k=k, metadata=metadata, **kwargs
        )
        return [doc for doc, _ in results]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Run similarity search and return docs with relevance scores in [0, 1]."""
        return self._similarity_search_with_relevance_scores(
            query, k=k, metadata=metadata, **kwargs
        )
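
    # Illustrative sketch: scored retrieval. Scores are cosine similarities
    # normalized to [0, 1], so a simple threshold can be applied client-side.
    #
    #     for doc, score in vectorstore.similarity_search_with_score("query", k=4):
    #         if score >= 0.75:  # 0.75 is an arbitrary example threshold
    #             print(score, doc.page_content[:80])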

    def _similarity_search_with_relevance_scores(
        self,
        query: str,
        k: int = 4,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """
        Default similarity search with relevance scores. Modify if necessary
        in subclass.
        Return docs and relevance scores in the range [0, 1].

        0 is dissimilar, 1 is most similar.

        Args:
            query: input text
            k: Number of Documents to return. Defaults to 4.
            metadata: Optional, metadata filter
            **kwargs: kwargs to be passed to similarity search. Should include:
                score_threshold: Optional, a floating point value between 0 and 1
                    used to filter the resulting set of retrieved docs.

        Returns:
            List of Tuples of (doc, similarity_score)
        """
        if not self._collection:
            raise ValueError(
                "collection should be an instance of a Zep DocumentCollection"
            )

        if not self._collection.is_auto_embedded and self._embedding:
            query_vector = self._embedding.embed_query(query)
            results = self._collection.search(
                embedding=query_vector, limit=k, metadata=metadata, **kwargs
            )
        else:
            results = self._collection.search(
                query, limit=k, metadata=metadata, **kwargs
            )

        return [
            (
                Document(
                    page_content=doc.content,
                    metadata=doc.metadata,
                ),
                doc.score or 0.0,
            )
            for doc in results
        ]

    async def asimilarity_search_with_relevance_scores(
        self,
        query: str,
        k: int = 4,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query."""
        if not self._collection:
            raise ValueError(
                "collection should be an instance of a Zep DocumentCollection"
            )

        if not self._collection.is_auto_embedded and self._embedding:
            query_vector = self._embedding.embed_query(query)
            results = await self._collection.asearch(
                embedding=query_vector, limit=k, metadata=metadata, **kwargs
            )
        else:
            results = await self._collection.asearch(
                query, limit=k, metadata=metadata, **kwargs
            )

        return [
            (
                Document(
                    page_content=doc.content,
                    metadata=doc.metadata,
                ),
                doc.score or 0.0,
            )
            for doc in results
        ]

    async def asimilarity_search(
        self,
        query: str,
        k: int = 4,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to query."""
        results = await self.asimilarity_search_with_relevance_scores(
            query, k, metadata=metadata, **kwargs
        )

        return [doc for doc, _ in results]

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            metadata: Optional, metadata filter

        Returns:
            List of Documents most similar to the query vector.
        """
        if not self._collection:
            raise ValueError(
                "collection should be an instance of a Zep DocumentCollection"
            )

        results = self._collection.search(
            embedding=embedding, limit=k, metadata=metadata, **kwargs
        )

        return [
            Document(
                page_content=doc.content,
                metadata=doc.metadata,
            )
            for doc in results
        ]

    async def asimilarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to embedding vector."""
        if not self._collection:
            raise ValueError(
                "collection should be an instance of a Zep DocumentCollection"
            )

        results = await self._collection.asearch(
            embedding=embedding, limit=k, metadata=metadata, **kwargs
        )

        return [
            Document(
                page_content=doc.content,
                metadata=doc.metadata,
            )
            for doc in results
        ]

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
                Zep determines this automatically and this parameter is
                ignored.
            lambda_mult: Number between 0 and 1 that determines the degree
                of diversity among the results with 0 corresponding
                to maximum diversity and 1 to minimum diversity.
                Defaults to 0.5.
            metadata: Optional, metadata to filter the resulting set of retrieved docs

        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        if not self._collection:
            raise ValueError(
                "collection should be an instance of a Zep DocumentCollection"
            )

        if not self._collection.is_auto_embedded and self._embedding:
            query_vector = self._embedding.embed_query(query)
            results = self._collection.search(
                embedding=query_vector,
                limit=k,
                metadata=metadata,
                search_type="mmr",
                mmr_lambda=lambda_mult,
                **kwargs,
            )
        else:
            results, query_vector = self._collection.search_return_query_vector(
                query,
                limit=k,
                metadata=metadata,
                search_type="mmr",
                mmr_lambda=lambda_mult,
                **kwargs,
            )

        return [Document(page_content=d.content, metadata=d.metadata) for d in results]
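
    # Illustrative sketch: trading off relevance vs. diversity via lambda_mult.
    # The query and value are arbitrary examples; per the docstring above, the
    # MMR re-ranking happens on the Zep server and fetch_k is ignored.
    #
    #     diverse_docs = vectorstore.max_marginal_relevance_search(
    #         "zep architecture", k=4, lambda_mult=0.3
    #     )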

    async def amax_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance."""
        if not self._collection:
            raise ValueError(
                "collection should be an instance of a Zep DocumentCollection"
            )

        if not self._collection.is_auto_embedded and self._embedding:
            query_vector = self._embedding.embed_query(query)
            results = await self._collection.asearch(
                embedding=query_vector,
                limit=k,
                metadata=metadata,
                search_type="mmr",
                mmr_lambda=lambda_mult,
                **kwargs,
            )
        else:
            results, query_vector = await self._collection.asearch_return_query_vector(
                query,
                limit=k,
                metadata=metadata,
                search_type="mmr",
                mmr_lambda=lambda_mult,
                **kwargs,
            )

        return [Document(page_content=d.content, metadata=d.metadata) for d in results]

    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
                Zep determines this automatically and this parameter is
                ignored.
            lambda_mult: Number between 0 and 1 that determines the degree
                of diversity among the results with 0 corresponding
                to maximum diversity and 1 to minimum diversity.
                Defaults to 0.5.
            metadata: Optional, metadata to filter the resulting set of retrieved docs

        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        if not self._collection:
            raise ValueError(
                "collection should be an instance of a Zep DocumentCollection"
            )

        results = self._collection.search(
            embedding=embedding,
            limit=k,
            metadata=metadata,
            search_type="mmr",
            mmr_lambda=lambda_mult,
            **kwargs,
        )

        return [Document(page_content=d.content, metadata=d.metadata) for d in results]

    async def amax_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance."""
        if not self._collection:
            raise ValueError(
                "collection should be an instance of a Zep DocumentCollection"
            )

        results = await self._collection.asearch(
            embedding=embedding,
            limit=k,
            metadata=metadata,
            search_type="mmr",
            mmr_lambda=lambda_mult,
            **kwargs,
        )

        return [Document(page_content=d.content, metadata=d.metadata) for d in results]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Optional[Embeddings] = None,
        metadatas: Optional[List[dict]] = None,
        collection_name: str = "",
        api_url: str = "",
        api_key: Optional[str] = None,
        config: Optional[CollectionConfig] = None,
        **kwargs: Any,
    ) -> ZepVectorStore:
        """
        Class method that returns a ZepVectorStore instance initialized from texts.

        If the collection does not exist, it will be created.

        Args:
            texts (List[str]): The list of texts to add to the vectorstore.
            embedding (Optional[Embeddings]): Optional embedding function to use to
                embed the texts.
            metadatas (Optional[List[Dict[str, Any]]]): Optional list of metadata
                associated with the texts.
            collection_name (str): The name of the collection in the Zep store.
            api_url (str): The URL of the Zep API.
            api_key (Optional[str]): The API key for the Zep API.
            config (Optional[CollectionConfig]): The configuration for the collection.
            **kwargs: Additional parameters specific to the vectorstore.

        Returns:
            ZepVectorStore: An instance of ZepVectorStore.
        """
        vecstore = cls(
            collection_name,
            api_url,
            api_key=api_key,
            config=config,
            embedding=embedding,
        )
        vecstore.add_texts(texts, metadatas)
        return vecstore
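
    # Illustrative sketch: building a store in one call. The connection details
    # are placeholder assumptions, and `my_embeddings` stands in for any
    # langchain_core.embeddings.Embeddings implementation.
    #
    #     vectorstore = ZepVectorStore.from_texts(
    #         texts=["Zep is a memory store."],
    #         embedding=my_embeddings,
    #         collection_name="my_documents",
    #         api_url="http://localhost:8000",
    #         config=config,
    #     )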

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> None:
        """Delete by Zep vector UUIDs.

        Parameters
        ----------
        ids : Optional[List[str]]
            The UUIDs of the vectors to delete.

        Raises
        ------
        ValueError
            If no UUIDs are provided.
        """
        if ids is None or len(ids) == 0:
            raise ValueError("No uuids provided to delete.")

        if self._collection is None:
            raise ValueError("No collection name provided.")

        for u in ids:
            self._collection.delete_document(u)
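
    # Illustrative sketch: deleting previously added documents by their Zep
    # UUIDs (for example, the uuids returned by add_texts above).
    #
    #     vectorstore.delete(ids=uuids)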