# langchain/libs/community/langchain_community/vectorstores/usearch.py

from __future__ import annotations

from typing import Any, Dict, Iterable, List, Optional, Tuple

import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore

from langchain_community.docstore.base import AddableMixin, Docstore
from langchain_community.docstore.in_memory import InMemoryDocstore


def dependable_usearch_import() -> Any:
    """Import ``usearch.index`` on demand and return the module.

    The import is performed inside the function so that this file can be
    imported without the optional ``usearch`` dependency being installed.

    Returns:
        The imported ``usearch.index`` module.

    Raises:
        ImportError: If ``usearch.index`` cannot be imported. The original
            import failure is attached as ``__cause__``.
    """
    try:
        import usearch.index
    except ImportError as e:
        # Chain the original error so the root cause (e.g. a missing native
        # library rather than a missing package) stays visible in tracebacks.
        raise ImportError(
            "Could not import usearch python package. "
            "Please install it with `pip install usearch` "
        ) from e
    return usearch.index


class USearch(VectorStore):
    """`USearch` vector store.

    To use, you should have the ``usearch`` python package installed.

    Documents are kept in a docstore keyed by the string form of the key
    under which their embedding is stored in the index.
    """

    def __init__(
        self,
        embedding: Embeddings,
        index: Any,
        docstore: Docstore,
        ids: List[str],
    ):
        """Initialize with necessary components.

        Args:
            embedding: Embedding model used for both documents and queries.
            index: Index object; this class calls its ``add`` and ``search``
                methods.
            docstore: Store that maps string ids to documents.
            ids: Ids already present in the index. ``add_texts`` extends this
                list in place and derives new ids from its last element.
        """
        self.embedding = embedding
        self.index = index
        self.docstore = docstore
        self.ids = ids

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[Dict]] = None,
        ids: Optional[np.ndarray] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional array or list of unique IDs. When omitted, ids are
                generated by counting up from the last stored id (or from 0
                if the store is empty).
            **kwargs: Unused.

        Returns:
            List of ids from adding the texts into the vectorstore.

        Raises:
            ValueError: If the underlying docstore does not support adding.
        """
        if not isinstance(self.docstore, AddableMixin):
            raise ValueError(
                "If trying to add texts, the underlying docstore should support "
                f"adding items, which {self.docstore} does not"
            )

        # Materialize once: ``texts`` may be a one-shot iterator, and it is
        # needed both for embedding and for building the documents.
        text_list = list(texts)
        if not text_list:
            return []

        embeddings = self.embedding.embed_documents(text_list)
        documents = [
            Document(page_content=text, metadata=metadatas[i] if metadatas else {})
            for i, text in enumerate(text_list)
        ]

        if ids is None:
            # Only look at the stored ids when new ones must be generated;
            # an empty store starts counting at 0.
            next_id = int(self.ids[-1]) + 1 if self.ids else 0
            id_array = np.array(
                [str(next_id + offset) for offset in range(len(text_list))]
            )
        else:
            # Accept plain sequences as well as arrays.
            id_array = np.asarray(ids)

        self.index.add(id_array, np.array(embeddings))
        # Searches look documents up with ``str(key)``, so store them under
        # the same string form.
        self.docstore.add(
            {str(key): doc for key, doc in zip(id_array, documents)}
        )
        self.ids.extend(id_array)
        return id_array.tolist()

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of documents most similar to the query with distance.

        Raises:
            ValueError: If a key returned by the index has no matching
                document in the docstore.
        """
        query_embedding = self.embedding.embed_query(query)
        matches = self.index.search(np.array(query_embedding), k)

        docs_with_scores: List[Tuple[Document, float]] = []
        for key, score in zip(matches.keys, matches.distances):
            doc = self.docstore.search(str(key))
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {key}, got {doc}")
            docs_with_scores.append((doc, score))

        return docs_with_scores

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            **kwargs: Unused.

        Returns:
            List of Documents most similar to the query.

        Raises:
            ValueError: If a key returned by the index has no matching
                document in the docstore.
        """
        # Same lookup as the scored variant; just drop the distances.
        return [doc for doc, _ in self.similarity_search_with_score(query, k)]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[Dict]] = None,
        ids: Optional[np.ndarray] = None,
        metric: str = "cos",
        **kwargs: Any,
    ) -> USearch:
        """Construct USearch wrapper from raw documents.
        This is a user friendly interface that:
            1. Embeds documents.
            2. Creates an in memory docstore
            3. Initializes the USearch database
        This is intended to be a quick way to get started.

        Args:
            texts: Texts to embed and index. The index dimensionality is
                taken from the first embedding, so at least one text is
                required.
            embedding: Embedding model used for documents and queries.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional array or list of unique IDs. Defaults to
                ``"0"``, ``"1"``, ... in input order.
            metric: Metric name passed to ``usearch.index.Index``.
            **kwargs: Unused.

        Returns:
            A ``USearch`` store containing the given texts.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import USearch
                from langchain_community.embeddings import OpenAIEmbeddings

                embeddings = OpenAIEmbeddings()
                usearch = USearch.from_texts(texts, embeddings)
        """
        embeddings = embedding.embed_documents(texts)

        if ids is None:
            id_array = np.array([str(i) for i in range(len(texts))])
        else:
            # Accept plain sequences as well as arrays.
            id_array = np.asarray(ids)

        documents: List[Document] = [
            Document(page_content=text, metadata=metadatas[i] if metadatas else {})
            for i, text in enumerate(texts)
        ]

        # Searches look documents up with ``str(key)``, so store them under
        # the same string form.
        docstore = InMemoryDocstore(
            {str(key): doc for key, doc in zip(id_array, documents)}
        )
        usearch = dependable_usearch_import()
        index = usearch.Index(ndim=len(embeddings[0]), metric=metric)
        index.add(id_array, np.array(embeddings))
        return cls(embedding, index, docstore, id_array.tolist())