# langchain/libs/community/langchain_community/vectorstores/elastic_vector_search.py


from __future__ import annotations

import uuid
import warnings
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Mapping,
    Optional,
    Tuple,
    Union,
)

from langchain_core._api import deprecated
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.utils import get_from_dict_or_env
from langchain_core.vectorstores import VectorStore

if TYPE_CHECKING:
    from elasticsearch import Elasticsearch


def _default_text_mapping(dim: int) -> Dict:
    return {
        "properties": {
            "text": {"type": "text"},
            "vector": {"type": "dense_vector", "dims": dim},
        }
    }


def _default_script_query(query_vector: List[float], filter: Optional[dict]) -> Dict:
    if filter:
        ((key, value),) = filter.items()
        filter = {"match": {f"metadata.{key}.keyword": f"{value}"}}
    else:
        filter = {"match_all": {}}
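    # NOTE: Elasticsearch requires script_score scripts to produce
    # non-negative values, and cosineSimilarity ranges over [-1, 1], so the
    # query below adds 1.0 to shift scores into [0, 2].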
    return {
        "script_score": {
            "query": filter,
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                "params": {"query_vector": query_vector},
            },
        }
    }


class ElasticVectorSearch(VectorStore):
    """
    ElasticVectorSearch uses the brute force method of searching on vectors.

    It is recommended to use ElasticsearchStore instead, which gives you the
    option to use the approximate HNSW algorithm, which performs better on
    large datasets. ElasticsearchStore also supports metadata filtering,
    customising the query retriever and much more!

    You can read more on ElasticsearchStore:
    https://python.langchain.com/docs/integrations/vectorstores/elasticsearch

    To connect to an `Elasticsearch` instance that does not require
    login credentials, pass the Elasticsearch URL and index name along with the
    embedding object to the constructor.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import ElasticVectorSearch
            from langchain_community.embeddings import OpenAIEmbeddings

            embedding = OpenAIEmbeddings()
            elastic_vector_search = ElasticVectorSearch(
                elasticsearch_url="http://localhost:9200",
                index_name="test_index",
                embedding=embedding,
            )

    To connect to an Elasticsearch instance that requires login credentials,
    including Elastic Cloud, use the Elasticsearch URL format
    https://username:password@es_host:9243. For example, to connect to Elastic
    Cloud, create the Elasticsearch URL with the required authentication details
    and pass it to the ElasticVectorSearch constructor as the named parameter
    elasticsearch_url.

    You can obtain your Elastic Cloud URL and login credentials by logging in to
    the Elastic Cloud console at https://cloud.elastic.co, selecting your
    deployment, and navigating to the "Deployments" page.

    To obtain your Elastic Cloud password for the default "elastic" user:

    1. Log in to the Elastic Cloud console at https://cloud.elastic.co
    2. Go to "Security" > "Users"
    3. Locate the "elastic" user and click "Edit"
    4. Click "Reset password"
    5. Follow the prompts to reset the password

    The format for Elastic Cloud URLs is
    https://username:password@cluster_id.region_id.gcp.cloud.es.io:9243.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import ElasticVectorSearch
            from langchain_community.embeddings import OpenAIEmbeddings

            embedding = OpenAIEmbeddings()
            elastic_host = "cluster_id.region_id.gcp.cloud.es.io"
            elasticsearch_url = f"https://username:password@{elastic_host}:9243"
            elastic_vector_search = ElasticVectorSearch(
                elasticsearch_url=elasticsearch_url,
                index_name="test_index",
                embedding=embedding,
            )

    Args:
        elasticsearch_url (str): The URL for the Elasticsearch instance.
        index_name (str): The name of the Elasticsearch index for the embeddings.
        embedding (Embeddings): An object that provides the ability to embed text.
            It should be an instance of a class that subclasses the Embeddings
            abstract base class, such as OpenAIEmbeddings().

    Raises:
        ImportError: If the elasticsearch python package is not installed.
        ValueError: If the Elasticsearch URL is misformatted.
    """

    def __init__(
        self,
        elasticsearch_url: str,
        index_name: str,
        embedding: Embeddings,
        *,
        ssl_verify: Optional[Dict[str, Any]] = None,
    ):
        """Initialize with necessary components."""
        warnings.warn(
            "ElasticVectorSearch will be removed in a future release. See "
            "Elasticsearch integration docs on how to upgrade."
        )
        try:
            import elasticsearch
        except ImportError:
            raise ImportError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            )
        self.embedding = embedding
        self.index_name = index_name
        _ssl_verify = ssl_verify or {}
        try:
            self.client = elasticsearch.Elasticsearch(
                elasticsearch_url,
                **_ssl_verify,
                headers={"user-agent": self.get_user_agent()},
            )
        except ValueError as e:
            raise ValueError(
                f"Your elasticsearch client string is misformatted. Got error: {e}"
            )

    @staticmethod
    def get_user_agent() -> str:
        from langchain_community import __version__

        return f"langchain-py-dvs/{__version__}"

    @property
    def embeddings(self) -> Embeddings:
        return self.embedding

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        refresh_indices: bool = True,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of unique IDs.
            refresh_indices: Whether to refresh the Elasticsearch indices
                after adding the texts. Defaults to True.

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        try:
            from elasticsearch.exceptions import NotFoundError
            from elasticsearch.helpers import bulk
        except ImportError:
            raise ImportError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            )
        # Materialize the iterable once; this method iterates over `texts`
        # several times, which would exhaust a generator.
        texts = list(texts)
        requests = []
        ids = ids or [str(uuid.uuid4()) for _ in texts]
        embeddings = self.embedding.embed_documents(texts)
        dim = len(embeddings[0])
        mapping = _default_text_mapping(dim)

        # Check to see if the index already exists.
        try:
            self.client.indices.get(index=self.index_name)
        except NotFoundError:
            # TODO would be nice to create index before embedding,
            # just to save expensive steps for last
            self.create_index(self.client, self.index_name, mapping)

        for i, text in enumerate(texts):
            metadata = metadatas[i] if metadatas else {}
            request = {
                "_op_type": "index",
                "_index": self.index_name,
                "vector": embeddings[i],
                "text": text,
                "metadata": metadata,
                "_id": ids[i],
            }
            requests.append(request)
        bulk(self.client, requests)

        if refresh_indices:
            self.client.indices.refresh(index=self.index_name)
        return ids
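
    # Usage sketch (editor's example; assumes `store` is an
    # ElasticVectorSearch instance wired to a reachable cluster):
    #
    #   ids = store.add_texts(
    #       ["hello world", "foo bar"],
    #       metadatas=[{"source": "a.txt"}, {"source": "b.txt"}],
    #   )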

    def similarity_search(
        self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: Optional single-key metadata dict to filter on.

        Returns:
            List of Documents most similar to the query.
        """
        docs_and_scores = self.similarity_search_with_score(query, k, filter=filter)
        documents = [d[0] for d in docs_and_scores]
        return documents

    def similarity_search_with_score(
        self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query, with scores.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: Optional single-key metadata dict to filter on.

        Returns:
            List of (Document, score) tuples most similar to the query.
        """
        embedding = self.embedding.embed_query(query)
        script_query = _default_script_query(embedding, filter)
        response = self.client_search(
            self.client, self.index_name, script_query, size=k
        )
        hits = response["hits"]["hits"]
        docs_and_scores = [
            (
                Document(
                    page_content=hit["_source"]["text"],
                    metadata=hit["_source"]["metadata"],
                ),
                hit["_score"],
            )
            for hit in hits
        ]
        return docs_and_scores
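
    # Filter format (editor's note, derived from _default_script_query):
    # `filter` takes exactly one key/value pair and is matched against the
    # `metadata.<key>.keyword` subfield, e.g.:
    #
    #   store.similarity_search("some query", k=4, filter={"source": "a.txt"})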

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        index_name: Optional[str] = None,
        refresh_indices: bool = True,
        **kwargs: Any,
    ) -> ElasticVectorSearch:
        """Construct ElasticVectorSearch wrapper from raw documents.

        This is a user-friendly interface that:
            1. Embeds documents.
            2. Creates a new index for the embeddings in the Elasticsearch instance.
            3. Adds the documents to the newly created Elasticsearch index.

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import ElasticVectorSearch
                from langchain_community.embeddings import OpenAIEmbeddings

                embeddings = OpenAIEmbeddings()
                elastic_vector_search = ElasticVectorSearch.from_texts(
                    texts,
                    embeddings,
                    elasticsearch_url="http://localhost:9200",
                )
        """
        elasticsearch_url = get_from_dict_or_env(
            kwargs, "elasticsearch_url", "ELASTICSEARCH_URL"
        )
        if "elasticsearch_url" in kwargs:
            del kwargs["elasticsearch_url"]
        index_name = index_name or uuid.uuid4().hex
        vectorsearch = cls(elasticsearch_url, index_name, embedding, **kwargs)
        vectorsearch.add_texts(
            texts, metadatas=metadatas, ids=ids, refresh_indices=refresh_indices
        )
        return vectorsearch
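
    # Editor's note: the two helpers below branch on the server's major
    # version because the Elasticsearch 8.x Python client accepts
    # `mappings=` / `query=` as top-level keyword arguments, while older
    # clients expect a single `body=` dict.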
    def create_index(self, client: Any, index_name: str, mapping: Dict) -> None:
        # Take the major version, e.g. "8.11.0" -> 8.
        version_num = int(client.info()["version"]["number"].split(".")[0])
        if version_num >= 8:
            client.indices.create(index=index_name, mappings=mapping)
        else:
            client.indices.create(index=index_name, body={"mappings": mapping})

    def client_search(
        self, client: Any, index_name: str, script_query: Dict, size: int
    ) -> Any:
        version_num = int(client.info()["version"]["number"].split(".")[0])
        if version_num >= 8:
            response = client.search(index=index_name, query=script_query, size=size)
        else:
            response = client.search(
                index=index_name, body={"query": script_query, "size": size}
            )
        return response

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> None:
        """Delete by vector IDs.

        Args:
            ids: List of ids to delete.
        """
        if ids is None:
            raise ValueError("No ids provided to delete.")
        # TODO: Check if this can be done in bulk
        for id in ids:
            self.client.delete(index=self.index_name, id=id)


@deprecated("0.0.265", alternative="ElasticsearchStore class.", pending=True)
class ElasticKnnSearch(VectorStore):
    """[DEPRECATED] `Elasticsearch` with k-nearest neighbor search
    (`k-NN`) vector store.

    Recommended to use ElasticsearchStore instead, which supports
    metadata filtering, customising the query retriever and much more!

    You can read more on ElasticsearchStore:
    https://python.langchain.com/docs/integrations/vectorstores/elasticsearch

    It creates an Elasticsearch index of text data that
    can be searched using k-NN search. The text data is transformed into
    vector embeddings using a provided embedding model, and these embeddings
    are stored in the Elasticsearch index.

    Attributes:
        index_name (str): The name of the Elasticsearch index.
        embedding (Embeddings): The embedding model to use for transforming text
            data into vector embeddings.
        es_connection (Elasticsearch, optional): An existing Elasticsearch
            connection.
        es_cloud_id (str, optional): The Cloud ID of your Elasticsearch Service
            deployment.
        es_user (str, optional): The username for your Elasticsearch Service
            deployment.
        es_password (str, optional): The password for your Elasticsearch Service
            deployment.
        vector_query_field (str, optional): The name of the field in the
            Elasticsearch index that contains the vector embeddings.
        query_field (str, optional): The name of the field in the Elasticsearch
            index that contains the original text data.

    Usage:
        >>> from embeddings import Embeddings
        >>> embedding = Embeddings.load('glove')
        >>> es_search = ElasticKnnSearch('my_index', embedding)
        >>> es_search.add_texts(['Hello world!', 'Another text'])
        >>> results = es_search.knn_search('Hello')
        [(Document(page_content='Hello world!', metadata={}), 0.9)]
    """

    def __init__(
        self,
        index_name: str,
        embedding: Embeddings,
        es_connection: Optional["Elasticsearch"] = None,
        es_cloud_id: Optional[str] = None,
        es_user: Optional[str] = None,
        es_password: Optional[str] = None,
        vector_query_field: Optional[str] = "vector",
        query_field: Optional[str] = "text",
    ):
        try:
            import elasticsearch
        except ImportError:
            raise ImportError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            )

        warnings.warn(
            "ElasticKnnSearch will be removed in a future release. "
            "Use ElasticsearchStore instead. See Elasticsearch "
            "integration docs on how to upgrade."
        )

        self.embedding = embedding
        self.index_name = index_name
        self.query_field = query_field
        self.vector_query_field = vector_query_field

        # If a pre-existing Elasticsearch connection is provided, use it.
        if es_connection is not None:
            self.client = es_connection
        else:
            # If credentials for a new Elasticsearch connection are provided,
            # create a new connection.
            if es_cloud_id and es_user and es_password:
                self.client = elasticsearch.Elasticsearch(
                    cloud_id=es_cloud_id, basic_auth=(es_user, es_password)
                )
            else:
                raise ValueError(
                    "Either provide a pre-existing Elasticsearch connection, "
                    "or valid credentials for creating a new connection."
                )

    @staticmethod
    def _default_knn_mapping(
        dims: int, similarity: Optional[str] = "dot_product"
    ) -> Dict:
        return {
            "properties": {
                "text": {"type": "text"},
                "vector": {
                    "type": "dense_vector",
                    "dims": dims,
                    "index": True,
                    "similarity": similarity,
                },
            }
        }
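
    # Editor's note: unlike the brute-force mapping in _default_text_mapping
    # above, this mapping sets "index": True and a "similarity" function,
    # which makes Elasticsearch build an ANN (HNSW) structure for the field
    # so it can serve true kNN queries instead of script_score scans.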

    def _default_knn_query(
        self,
        query_vector: Optional[List[float]] = None,
        query: Optional[str] = None,
        model_id: Optional[str] = None,
        k: Optional[int] = 10,
        num_candidates: Optional[int] = 10,
    ) -> Dict:
        knn: Dict = {
            "field": self.vector_query_field,
            "k": k,
            "num_candidates": num_candidates,
        }

        # Case 1: `query_vector` is provided, but not `model_id` -> use query_vector
        if query_vector and not model_id:
            knn["query_vector"] = query_vector
        # Case 2: `query` and `model_id` are provided -> use query_vector_builder
        elif query and model_id:
            knn["query_vector_builder"] = {
                "text_embedding": {
                    "model_id": model_id,  # use 'model_id' argument
                    "model_text": query,  # use 'query' argument
                }
            }
        else:
            raise ValueError(
                "Provide either `query_vector` (without `model_id`) or both "
                "`query` and `model_id`, but not `query_vector` together with "
                "`model_id`."
            )

        return knn
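
    # The two calling patterns this helper accepts (editor's illustration;
    # the vector and model id are placeholders):
    #
    #   self._default_knn_query(query_vector=[0.1, 0.2, 0.3], k=5)
    #   self._default_knn_query(query="hello", model_id="my-embedding-model", k=5)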

    def similarity_search(
        self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any
    ) -> List[Document]:
        """
        Pass through to `knn_search`.

        Note: `filter` is accepted for interface compatibility but is not
        passed on to `knn_search`.
        """
        results = self.knn_search(query=query, k=k, **kwargs)
        return [doc for doc, score in results]

    def similarity_search_with_score(
        self, query: str, k: int = 10, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Pass through to `knn_search`, including scores."""
        return self.knn_search(query=query, k=k, **kwargs)

    def knn_search(
        self,
        query: Optional[str] = None,
        k: Optional[int] = 10,
        query_vector: Optional[List[float]] = None,
        model_id: Optional[str] = None,
        size: Optional[int] = 10,
        source: Optional[bool] = True,
        fields: Optional[
            Union[List[Mapping[str, Any]], Tuple[Mapping[str, Any], ...], None]
        ] = None,
        page_content: Optional[str] = "text",
    ) -> List[Tuple[Document, float]]:
        """
        Perform a k-NN search on the Elasticsearch index.

        Args:
            query (str, optional): The query text to search for.
            k (int, optional): The number of nearest neighbors to return.
            query_vector (List[float], optional): The query vector to search for.
            model_id (str, optional): The ID of the model to use for transforming
                the query text into a vector.
            size (int, optional): The number of search results to return.
            source (bool, optional): Whether to return the source of the search
                results.
            fields (List[Mapping[str, Any]], optional): The fields to return in
                the search results.
            page_content (str, optional): The name of the field that contains the
                page content.

        Returns:
            A list of tuples, where each tuple contains a Document object and a
            score.
        """
        if not source and (
            fields is None or not any(page_content in field for field in fields)
        ):
            raise ValueError(
                "If `source=False`, the `page_content` field must be in `fields`."
            )

        knn_query_body = self._default_knn_query(
            query_vector=query_vector, query=query, model_id=model_id, k=k
        )

        # Perform the kNN search on the Elasticsearch index and return the results.
        response = self.client.search(
            index=self.index_name,
            knn=knn_query_body,
            size=size,
            source=source,
            fields=fields,
        )

        hits = response["hits"]["hits"]
        docs_and_scores = [
            (
                Document(
                    page_content=hit["_source"][page_content]
                    if source
                    else hit["fields"][page_content][0],
                    metadata=hit["fields"] if fields else {},
                ),
                hit["_score"],
            )
            for hit in hits
        ]
        return docs_and_scores
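
    # Usage sketch (editor's example; "my-model" must be a text-embedding
    # model deployed in your cluster, and `embedding` an Embeddings object):
    #
    #   results = es_search.knn_search(query="hello", model_id="my-model", k=5)
    #   # or, embedding the query client-side:
    #   results = es_search.knn_search(query_vector=embedding.embed_query("hello"))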

    def knn_hybrid_search(
        self,
        query: Optional[str] = None,
        k: Optional[int] = 10,
        query_vector: Optional[List[float]] = None,
        model_id: Optional[str] = None,
        size: Optional[int] = 10,
        source: Optional[bool] = True,
        knn_boost: Optional[float] = 0.9,
        query_boost: Optional[float] = 0.1,
        fields: Optional[
            Union[List[Mapping[str, Any]], Tuple[Mapping[str, Any], ...], None]
        ] = None,
        page_content: Optional[str] = "text",
    ) -> List[Tuple[Document, float]]:
        """
        Perform a hybrid k-NN and text search on the Elasticsearch index.

        Args:
            query (str, optional): The query text to search for.
            k (int, optional): The number of nearest neighbors to return.
            query_vector (List[float], optional): The query vector to search for.
            model_id (str, optional): The ID of the model to use for transforming
                the query text into a vector.
            size (int, optional): The number of search results to return.
            source (bool, optional): Whether to return the source of the search
                results.
            knn_boost (float, optional): The boost value to apply to the k-NN
                search results.
            query_boost (float, optional): The boost value to apply to the text
                search results.
            fields (List[Mapping[str, Any]], optional): The fields to return in
                the search results.
            page_content (str, optional): The name of the field that contains the
                page content.

        Returns:
            A list of tuples, where each tuple contains a Document object and a
            score.
        """
        if not source and (
            fields is None or not any(page_content in field for field in fields)
        ):
            raise ValueError(
                "If `source=False`, the `page_content` field must be in `fields`."
            )

        knn_query_body = self._default_knn_query(
            query_vector=query_vector, query=query, model_id=model_id, k=k
        )

        # Modify the knn_query_body to add a "boost" parameter.
        knn_query_body["boost"] = knn_boost

        # Generate the body of the standard Elasticsearch query.
        match_query_body = {
            "match": {self.query_field: {"query": query, "boost": query_boost}}
        }
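
        # When a request contains both `query` and `knn`, Elasticsearch sums
        # the two (boosted) scores, which is why the knn_boost and query_boost
        # defaults add up to 1.0.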

        # Perform the hybrid search on the Elasticsearch index and return the
        # results.
        response = self.client.search(
            index=self.index_name,
            query=match_query_body,
            knn=knn_query_body,
            fields=fields,
            size=size,
            source=source,
        )

        hits = response["hits"]["hits"]
        docs_and_scores = [
            (
                Document(
                    page_content=hit["_source"][page_content]
                    if source
                    else hit["fields"][page_content][0],
                    metadata=hit["fields"] if fields else {},
                ),
                hit["_score"],
            )
            for hit in hits
        ]
        return docs_and_scores

    def create_knn_index(self, mapping: Dict) -> None:
        """
        Create a new k-NN index in Elasticsearch.

        Args:
            mapping (Dict): The mapping to use for the new index.

        Returns:
            None
        """
        self.client.indices.create(index=self.index_name, mappings=mapping)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[Dict[Any, Any]]] = None,
        model_id: Optional[str] = None,
        refresh_indices: bool = False,
        **kwargs: Any,
    ) -> List[str]:
        """
        Add a list of texts to the Elasticsearch index.

        Args:
            texts (Iterable[str]): The texts to add to the index.
            metadatas (List[Dict[Any, Any]], optional): A list of metadata
                dictionaries to associate with the texts. Note that, as written,
                this method does not include the metadata in the indexed
                documents.
            model_id (str, optional): The ID of the model to use for transforming
                the texts into vectors.
            refresh_indices (bool, optional): Whether to refresh the Elasticsearch
                indices after adding the texts.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            A list of IDs for the added texts.
        """
        # Check if the index exists.
        if not self.client.indices.exists(index=self.index_name):
            dims = kwargs.get("dims")
            if dims is None:
                raise ValueError("ElasticKnnSearch requires 'dims' parameter")
            similarity = kwargs.get("similarity")
            optional_args = {}
            if similarity is not None:
                optional_args["similarity"] = similarity
            mapping = self._default_knn_mapping(dims=dims, **optional_args)
            self.create_knn_index(mapping)

        # Materialize the iterable once so the zip below sees the same texts
        # that were embedded.
        texts = list(texts)
        embeddings = self.embedding.embed_documents(texts)

        body: List[Mapping[str, Any]] = []
        for text, vector in zip(texts, embeddings):
            body.extend(
                [
                    {"index": {"_index": self.index_name}},
                    {"text": text, "vector": vector},
                ]
            )
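
        # The bulk API consumes alternating action/document pairs, which is
        # why the loop above appends two entries per text: an {"index": ...}
        # action line followed by the document source.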
        responses = self.client.bulk(operations=body)

        ids = [
            item["index"]["_id"]
            for item in responses["items"]
            if item["index"]["result"] == "created"
        ]

        if refresh_indices:
            self.client.indices.refresh(index=self.index_name)

        return ids

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[Dict[Any, Any]]] = None,
        **kwargs: Any,
    ) -> ElasticKnnSearch:
        """
        Create a new ElasticKnnSearch instance and add a list of texts to the
        Elasticsearch index.

        Args:
            texts (List[str]): The texts to add to the index.
            embedding (Embeddings): The embedding model to use for transforming
                the texts into vectors.
            metadatas (List[Dict[Any, Any]], optional): A list of metadata
                dictionaries to associate with the texts.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            A new ElasticKnnSearch instance.
        """
        index_name = kwargs.get("index_name", str(uuid.uuid4()))
        es_connection = kwargs.get("es_connection")
        es_cloud_id = kwargs.get("es_cloud_id")
        es_user = kwargs.get("es_user")
        es_password = kwargs.get("es_password")
        vector_query_field = kwargs.get("vector_query_field", "vector")
        query_field = kwargs.get("query_field", "text")
        model_id = kwargs.get("model_id")
        dims = kwargs.get("dims")

        if dims is None:
            raise ValueError("ElasticKnnSearch requires 'dims' parameter")

        optional_args = {}
        if vector_query_field is not None:
            optional_args["vector_query_field"] = vector_query_field
        if query_field is not None:
            optional_args["query_field"] = query_field

        knnvectorsearch = cls(
            index_name=index_name,
            embedding=embedding,
            es_connection=es_connection,
            es_cloud_id=es_cloud_id,
            es_user=es_user,
            es_password=es_password,
            **optional_args,
        )
        # Encode the provided texts and add them to the newly created index.
        knnvectorsearch.add_texts(texts, model_id=model_id, dims=dims, **optional_args)

        return knnvectorsearch
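

# End-to-end sketch for ElasticKnnSearch (editor's example; the cloud id and
# credentials are placeholders, and `dims` must match your embedding model,
# e.g. 1536 for OpenAI's text-embedding-ada-002):
#
#   from langchain_community.embeddings import OpenAIEmbeddings
#
#   embedding = OpenAIEmbeddings()
#   knn_store = ElasticKnnSearch.from_texts(
#       ["hello world", "another text"],
#       embedding,
#       es_cloud_id="<cloud-id>",
#       es_user="elastic",
#       es_password="<password>",
#       dims=1536,
#   )
#   results = knn_store.knn_search(query_vector=embedding.embed_query("hello"))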