# langchain/libs/community/langchain_community/vectorstores/elastic_vector_search.py


from __future__ import annotations

import uuid
import warnings
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Mapping,
    Optional,
    Tuple,
    Union,
)

from langchain_core._api import deprecated
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.utils import get_from_dict_or_env
from langchain_core.vectorstores import VectorStore

if TYPE_CHECKING:
    from elasticsearch import Elasticsearch


def _default_text_mapping(dim: int) -> Dict:
    return {
        "properties": {
            "text": {"type": "text"},
            "vector": {"type": "dense_vector", "dims": dim},
        }
    }


def _default_script_query(query_vector: List[float], filter: Optional[dict]) -> Dict:
    if filter:
        ((key, value),) = filter.items()
        filter = {"match": {f"metadata.{key}.keyword": f"{value}"}}
    else:
        filter = {"match_all": {}}
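    # NOTE: Elasticsearch requires script_score scripts to produce
    # non-negative values, and cosineSimilarity ranges over [-1, 1], so the
    # query below adds 1.0 to shift scores into [0, 2].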
    return {
        "script_score": {
            "query": filter,
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                "params": {"query_vector": query_vector},
            },
        }
    }


class ElasticVectorSearch(VectorStore):
    """
    ElasticVectorSearch uses the brute force method of searching on vectors.

    It is recommended to use ElasticsearchStore instead, which gives you the
    option to use the approximate HNSW algorithm, which performs better on
    large datasets. ElasticsearchStore also supports metadata filtering,
    customising the query retriever and much more!

    You can read more on ElasticsearchStore:
    https://python.langchain.com/docs/integrations/vectorstores/elasticsearch

    To connect to an `Elasticsearch` instance that does not require
    login credentials, pass the Elasticsearch URL and index name along with the
    embedding object to the constructor.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import ElasticVectorSearch
            from langchain_community.embeddings import OpenAIEmbeddings

            embedding = OpenAIEmbeddings()
            elastic_vector_search = ElasticVectorSearch(
                elasticsearch_url="http://localhost:9200",
                index_name="test_index",
                embedding=embedding,
            )

    To connect to an Elasticsearch instance that requires login credentials,
    including Elastic Cloud, use the Elasticsearch URL format
    https://username:password@es_host:9243. For example, to connect to Elastic
    Cloud, create the Elasticsearch URL with the required authentication details
    and pass it to the ElasticVectorSearch constructor as the named parameter
    elasticsearch_url.

    You can obtain your Elastic Cloud URL and login credentials by logging in to
    the Elastic Cloud console at https://cloud.elastic.co, selecting your
    deployment, and navigating to the "Deployments" page.

    To obtain your Elastic Cloud password for the default "elastic" user:

    1. Log in to the Elastic Cloud console at https://cloud.elastic.co
    2. Go to "Security" > "Users"
    3. Locate the "elastic" user and click "Edit"
    4. Click "Reset password"
    5. Follow the prompts to reset the password

    The format for Elastic Cloud URLs is
    https://username:password@cluster_id.region_id.gcp.cloud.es.io:9243.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import ElasticVectorSearch
            from langchain_community.embeddings import OpenAIEmbeddings

            embedding = OpenAIEmbeddings()
            elastic_host = "cluster_id.region_id.gcp.cloud.es.io"
            elasticsearch_url = f"https://username:password@{elastic_host}:9243"
            elastic_vector_search = ElasticVectorSearch(
                elasticsearch_url=elasticsearch_url,
                index_name="test_index",
                embedding=embedding,
            )

    Args:
        elasticsearch_url (str): The URL for the Elasticsearch instance.
        index_name (str): The name of the Elasticsearch index for the embeddings.
        embedding (Embeddings): An object that provides the ability to embed text.
            It should be an instance of a class that subclasses the Embeddings
            abstract base class, such as OpenAIEmbeddings().

    Raises:
        ImportError: If the elasticsearch python package is not installed.
        ValueError: If the Elasticsearch URL is misformatted.
    """

    def __init__(
        self,
        elasticsearch_url: str,
        index_name: str,
        embedding: Embeddings,
        *,
        ssl_verify: Optional[Dict[str, Any]] = None,
    ):
        """Initialize with necessary components."""
        warnings.warn(
            "ElasticVectorSearch will be removed in a future release. See "
            "Elasticsearch integration docs on how to upgrade."
        )
        try:
            import elasticsearch
        except ImportError:
            raise ImportError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            )
        self.embedding = embedding
        self.index_name = index_name
        _ssl_verify = ssl_verify or {}
        try:
            self.client = elasticsearch.Elasticsearch(
                elasticsearch_url,
                **_ssl_verify,
                headers={"user-agent": self.get_user_agent()},
            )
        except ValueError as e:
            raise ValueError(
                f"Your elasticsearch client string is misformatted. Got error: {e}"
            )

    @staticmethod
    def get_user_agent() -> str:
        from langchain_community import __version__

        return f"langchain-py-dvs/{__version__}"

    @property
    def embeddings(self) -> Embeddings:
        return self.embedding

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        refresh_indices: bool = True,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of unique IDs.
            refresh_indices: Whether to refresh the Elasticsearch indices
                after adding the texts. Defaults to True.

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        try:
            from elasticsearch.exceptions import NotFoundError
            from elasticsearch.helpers import bulk
        except ImportError:
            raise ImportError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            )
        # Materialize the iterable once; this method iterates over `texts`
        # several times, which would exhaust a generator.
        texts = list(texts)
        requests = []
        ids = ids or [str(uuid.uuid4()) for _ in texts]
        embeddings = self.embedding.embed_documents(texts)
        dim = len(embeddings[0])
        mapping = _default_text_mapping(dim)

        # Check to see if the index already exists.
        try:
            self.client.indices.get(index=self.index_name)
        except NotFoundError:
            # TODO would be nice to create index before embedding,
            # just to save expensive steps for last
            self.create_index(self.client, self.index_name, mapping)

        for i, text in enumerate(texts):
            metadata = metadatas[i] if metadatas else {}
            request = {
                "_op_type": "index",
                "_index": self.index_name,
                "vector": embeddings[i],
                "text": text,
                "metadata": metadata,
                "_id": ids[i],
            }
            requests.append(request)
        bulk(self.client, requests)

        if refresh_indices:
            self.client.indices.refresh(index=self.index_name)
        return ids
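
    # Usage sketch (editor's example; assumes `store` is an
    # ElasticVectorSearch instance wired to a reachable cluster):
    #
    #   ids = store.add_texts(
    #       ["hello world", "foo bar"],
    #       metadatas=[{"source": "a.txt"}, {"source": "b.txt"}],
    #   )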

    def similarity_search(
        self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: Optional single-key metadata dict to filter on.

        Returns:
            List of Documents most similar to the query.
        """
        docs_and_scores = self.similarity_search_with_score(query, k, filter=filter)
        documents = [d[0] for d in docs_and_scores]
        return documents

    def similarity_search_with_score(
        self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query, with scores.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: Optional single-key metadata dict to filter on.

        Returns:
            List of (Document, score) tuples most similar to the query.
        """
        embedding = self.embedding.embed_query(query)
        script_query = _default_script_query(embedding, filter)
        response = self.client_search(
            self.client, self.index_name, script_query, size=k
        )
        hits = response["hits"]["hits"]
        docs_and_scores = [
            (
                Document(
                    page_content=hit["_source"]["text"],
                    metadata=hit["_source"]["metadata"],
                ),
                hit["_score"],
            )
            for hit in hits
        ]
        return docs_and_scores
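
    # Filter format (editor's note, derived from _default_script_query):
    # `filter` takes exactly one key/value pair and is matched against the
    # `metadata.<key>.keyword` subfield, e.g.:
    #
    #   store.similarity_search("some query", k=4, filter={"source": "a.txt"})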

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        index_name: Optional[str] = None,
        refresh_indices: bool = True,
        **kwargs: Any,
    ) -> ElasticVectorSearch:
        """Construct ElasticVectorSearch wrapper from raw documents.

        This is a user-friendly interface that:
            1. Embeds documents.
            2. Creates a new index for the embeddings in the Elasticsearch instance.
            3. Adds the documents to the newly created Elasticsearch index.

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import ElasticVectorSearch
                from langchain_community.embeddings import OpenAIEmbeddings

                embeddings = OpenAIEmbeddings()
                elastic_vector_search = ElasticVectorSearch.from_texts(
                    texts,
                    embeddings,
                    elasticsearch_url="http://localhost:9200",
                )
        """
        elasticsearch_url = get_from_dict_or_env(
            kwargs, "elasticsearch_url", "ELASTICSEARCH_URL"
        )
        if "elasticsearch_url" in kwargs:
            del kwargs["elasticsearch_url"]
        index_name = index_name or uuid.uuid4().hex
        vectorsearch = cls(elasticsearch_url, index_name, embedding, **kwargs)
        vectorsearch.add_texts(
            texts, metadatas=metadatas, ids=ids, refresh_indices=refresh_indices
        )
        return vectorsearch
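
    # Editor's note: the two helpers below branch on the server's major
    # version because the Elasticsearch 8.x Python client accepts
    # `mappings=` / `query=` as top-level keyword arguments, while older
    # clients expect a single `body=` dict.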
    def create_index(self, client: Any, index_name: str, mapping: Dict) -> None:
        # Take the major version, e.g. "8.11.0" -> 8.
        version_num = int(client.info()["version"]["number"].split(".")[0])
        if version_num >= 8:
            client.indices.create(index=index_name, mappings=mapping)
        else:
            client.indices.create(index=index_name, body={"mappings": mapping})

    def client_search(
        self, client: Any, index_name: str, script_query: Dict, size: int
    ) -> Any:
        version_num = int(client.info()["version"]["number"].split(".")[0])
        if version_num >= 8:
            response = client.search(index=index_name, query=script_query, size=size)
        else:
            response = client.search(
                index=index_name, body={"query": script_query, "size": size}
            )
        return response

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> None:
        """Delete by vector IDs.

        Args:
            ids: List of ids to delete.
        """
        if ids is None:
            raise ValueError("No ids provided to delete.")
        # TODO: Check if this can be done in bulk
        for id in ids:
            self.client.delete(index=self.index_name, id=id)


@deprecated("0.0.265", alternative="ElasticsearchStore class.", pending=True)
class ElasticKnnSearch(VectorStore):
    """[DEPRECATED] `Elasticsearch` with k-nearest neighbor search
    (`k-NN`) vector store.

    Recommended to use ElasticsearchStore instead, which supports
    metadata filtering, customising the query retriever and much more!

    You can read more on ElasticsearchStore:
    https://python.langchain.com/docs/integrations/vectorstores/elasticsearch

    It creates an Elasticsearch index of text data that
    can be searched using k-NN search. The text data is transformed into
    vector embeddings using a provided embedding model, and these embeddings
    are stored in the Elasticsearch index.

    Attributes:
        index_name (str): The name of the Elasticsearch index.
        embedding (Embeddings): The embedding model to use for transforming text
            data into vector embeddings.
        es_connection (Elasticsearch, optional): An existing Elasticsearch
            connection.
        es_cloud_id (str, optional): The Cloud ID of your Elasticsearch Service
            deployment.
        es_user (str, optional): The username for your Elasticsearch Service
            deployment.
        es_password (str, optional): The password for your Elasticsearch Service
            deployment.
        vector_query_field (str, optional): The name of the field in the
            Elasticsearch index that contains the vector embeddings.
        query_field (str, optional): The name of the field in the Elasticsearch
            index that contains the original text data.

    Usage:
        >>> from embeddings import Embeddings
        >>> embedding = Embeddings.load('glove')
        >>> es_search = ElasticKnnSearch('my_index', embedding)
        >>> es_search.add_texts(['Hello world!', 'Another text'])
        >>> results = es_search.knn_search('Hello')
        [(Document(page_content='Hello world!', metadata={}), 0.9)]
    """

    def __init__(
        self,
        index_name: str,
        embedding: Embeddings,
        es_connection: Optional["Elasticsearch"] = None,
        es_cloud_id: Optional[str] = None,
        es_user: Optional[str] = None,
        es_password: Optional[str] = None,
        vector_query_field: Optional[str] = "vector",
        query_field: Optional[str] = "text",
    ):
        try:
            import elasticsearch
        except ImportError:
            raise ImportError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            )

        warnings.warn(
            "ElasticKnnSearch will be removed in a future release. "
            "Use ElasticsearchStore instead. See Elasticsearch "
            "integration docs on how to upgrade."
        )

        self.embedding = embedding
        self.index_name = index_name
        self.query_field = query_field
        self.vector_query_field = vector_query_field

        # If a pre-existing Elasticsearch connection is provided, use it.
        if es_connection is not None:
            self.client = es_connection
        else:
            # If credentials for a new Elasticsearch connection are provided,
            # create a new connection.
            if es_cloud_id and es_user and es_password:
                self.client = elasticsearch.Elasticsearch(
                    cloud_id=es_cloud_id, basic_auth=(es_user, es_password)
                )
            else:
                raise ValueError(
                    "Either provide a pre-existing Elasticsearch connection, "
                    "or valid credentials for creating a new connection."
                )

    @staticmethod
    def _default_knn_mapping(
        dims: int, similarity: Optional[str] = "dot_product"
    ) -> Dict:
        return {
            "properties": {
                "text": {"type": "text"},
                "vector": {
                    "type": "dense_vector",
                    "dims": dims,
                    "index": True,
                    "similarity": similarity,
                },
            }
        }
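
    # Editor's note: unlike the brute-force mapping in _default_text_mapping
    # above, this mapping sets "index": True and a "similarity" function,
    # which makes Elasticsearch build an ANN (HNSW) structure for the field
    # so it can serve true kNN queries instead of script_score scans.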

    def _default_knn_query(
        self,
        query_vector: Optional[List[float]] = None,
        query: Optional[str] = None,
        model_id: Optional[str] = None,
        k: Optional[int] = 10,
        num_candidates: Optional[int] = 10,
    ) -> Dict:
        knn: Dict = {
            "field": self.vector_query_field,
            "k": k,
            "num_candidates": num_candidates,
        }

        # Case 1: `query_vector` is provided, but not `model_id` -> use query_vector
        if query_vector and not model_id:
            knn["query_vector"] = query_vector
        # Case 2: `query` and `model_id` are provided -> use query_vector_builder
        elif query and model_id:
            knn["query_vector_builder"] = {
                "text_embedding": {
                    "model_id": model_id,  # use 'model_id' argument
                    "model_text": query,  # use 'query' argument
                }
            }
        else:
            raise ValueError(
                "Provide either `query_vector` (without `model_id`) or both "
                "`query` and `model_id`, but not `query_vector` together with "
                "`model_id`."
            )

        return knn
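
    # The two calling patterns this helper accepts (editor's illustration;
    # the vector and model id are placeholders):
    #
    #   self._default_knn_query(query_vector=[0.1, 0.2, 0.3], k=5)
    #   self._default_knn_query(query="hello", model_id="my-embedding-model", k=5)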

    def similarity_search(
        self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any
    ) -> List[Document]:
        """
        Pass through to `knn_search`.

        Note: `filter` is accepted for interface compatibility but is not
        passed on to `knn_search`.
        """
        results = self.knn_search(query=query, k=k, **kwargs)
        return [doc for doc, score in results]

    def similarity_search_with_score(
        self, query: str, k: int = 10, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Pass through to `knn_search`, including scores."""
        return self.knn_search(query=query, k=k, **kwargs)

    def knn_search(
        self,
        query: Optional[str] = None,
        k: Optional[int] = 10,
        query_vector: Optional[List[float]] = None,
        model_id: Optional[str] = None,
        size: Optional[int] = 10,
        source: Optional[bool] = True,
        fields: Optional[
            Union[List[Mapping[str, Any]], Tuple[Mapping[str, Any], ...], None]
        ] = None,
        page_content: Optional[str] = "text",
    ) -> List[Tuple[Document, float]]:
        """
        Perform a k-NN search on the Elasticsearch index.

        Args:
            query (str, optional): The query text to search for.
            k (int, optional): The number of nearest neighbors to return.
            query_vector (List[float], optional): The query vector to search for.
            model_id (str, optional): The ID of the model to use for transforming
                the query text into a vector.
            size (int, optional): The number of search results to return.
            source (bool, optional): Whether to return the source of the search
                results.
            fields (List[Mapping[str, Any]], optional): The fields to return in
                the search results.
            page_content (str, optional): The name of the field that contains the
                page content.

        Returns:
            A list of tuples, where each tuple contains a Document object and a
            score.
        """
        if not source and (
            fields is None or not any(page_content in field for field in fields)
        ):
            raise ValueError(
                "If `source=False`, the `page_content` field must be in `fields`."
            )

        knn_query_body = self._default_knn_query(
            query_vector=query_vector, query=query, model_id=model_id, k=k
        )

        # Perform the kNN search on the Elasticsearch index and return the results.
        response = self.client.search(
            index=self.index_name,
            knn=knn_query_body,
            size=size,
            source=source,
            fields=fields,
        )

        hits = response["hits"]["hits"]
        docs_and_scores = [
            (
                Document(
                    page_content=hit["_source"][page_content]
                    if source
                    else hit["fields"][page_content][0],
                    metadata=hit["fields"] if fields else {},
                ),
                hit["_score"],
            )
            for hit in hits
        ]
        return docs_and_scores
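
    # Usage sketch (editor's example; "my-model" must be a text-embedding
    # model deployed in your cluster, and `embedding` an Embeddings object):
    #
    #   results = es_search.knn_search(query="hello", model_id="my-model", k=5)
    #   # or, embedding the query client-side:
    #   results = es_search.knn_search(query_vector=embedding.embed_query("hello"))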

    def knn_hybrid_search(
        self,
        query: Optional[str] = None,
        k: Optional[int] = 10,
        query_vector: Optional[List[float]] = None,
        model_id: Optional[str] = None,
        size: Optional[int] = 10,
        source: Optional[bool] = True,
        knn_boost: Optional[float] = 0.9,
        query_boost: Optional[float] = 0.1,
        fields: Optional[
            Union[List[Mapping[str, Any]], Tuple[Mapping[str, Any], ...], None]
        ] = None,
        page_content: Optional[str] = "text",
    ) -> List[Tuple[Document, float]]:
        """
        Perform a hybrid k-NN and text search on the Elasticsearch index.

        Args:
            query (str, optional): The query text to search for.
            k (int, optional): The number of nearest neighbors to return.
            query_vector (List[float], optional): The query vector to search for.
            model_id (str, optional): The ID of the model to use for transforming
                the query text into a vector.
            size (int, optional): The number of search results to return.
            source (bool, optional): Whether to return the source of the search
                results.
            knn_boost (float, optional): The boost value to apply to the k-NN
                search results.
            query_boost (float, optional): The boost value to apply to the text
                search results.
            fields (List[Mapping[str, Any]], optional): The fields to return in
                the search results.
            page_content (str, optional): The name of the field that contains the
                page content.

        Returns:
            A list of tuples, where each tuple contains a Document object and a
            score.
        """
        if not source and (
            fields is None or not any(page_content in field for field in fields)
        ):
            raise ValueError(
                "If `source=False`, the `page_content` field must be in `fields`."
            )

        knn_query_body = self._default_knn_query(
            query_vector=query_vector, query=query, model_id=model_id, k=k
        )

        # Modify the knn_query_body to add a "boost" parameter.
        knn_query_body["boost"] = knn_boost

        # Generate the body of the standard Elasticsearch query.
        match_query_body = {
            "match": {self.query_field: {"query": query, "boost": query_boost}}
        }
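
        # When a request contains both `query` and `knn`, Elasticsearch sums
        # the two (boosted) scores, which is why the knn_boost and query_boost
        # defaults add up to 1.0.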

        # Perform the hybrid search on the Elasticsearch index and return the
        # results.
        response = self.client.search(
            index=self.index_name,
            query=match_query_body,
            knn=knn_query_body,
            fields=fields,
            size=size,
            source=source,
        )

        hits = response["hits"]["hits"]
        docs_and_scores = [
            (
                Document(
                    page_content=hit["_source"][page_content]
                    if source
                    else hit["fields"][page_content][0],
                    metadata=hit["fields"] if fields else {},
                ),
                hit["_score"],
            )
            for hit in hits
        ]
        return docs_and_scores

    def create_knn_index(self, mapping: Dict) -> None:
        """
        Create a new k-NN index in Elasticsearch.

        Args:
            mapping (Dict): The mapping to use for the new index.

        Returns:
            None
        """
        self.client.indices.create(index=self.index_name, mappings=mapping)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[Dict[Any, Any]]] = None,
        model_id: Optional[str] = None,
        refresh_indices: bool = False,
        **kwargs: Any,
    ) -> List[str]:
        """
        Add a list of texts to the Elasticsearch index.

        Args:
            texts (Iterable[str]): The texts to add to the index.
            metadatas (List[Dict[Any, Any]], optional): A list of metadata
                dictionaries to associate with the texts. Note that, as written,
                this method does not include the metadata in the indexed
                documents.
            model_id (str, optional): The ID of the model to use for transforming
                the texts into vectors.
            refresh_indices (bool, optional): Whether to refresh the Elasticsearch
                indices after adding the texts.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            A list of IDs for the added texts.
        """
        # Check if the index exists.
        if not self.client.indices.exists(index=self.index_name):
            dims = kwargs.get("dims")
            if dims is None:
                raise ValueError("ElasticKnnSearch requires 'dims' parameter")
            similarity = kwargs.get("similarity")
            optional_args = {}
            if similarity is not None:
                optional_args["similarity"] = similarity
            mapping = self._default_knn_mapping(dims=dims, **optional_args)
            self.create_knn_index(mapping)

        # Materialize the iterable once so the zip below sees the same texts
        # that were embedded.
        texts = list(texts)
        embeddings = self.embedding.embed_documents(texts)

        body: List[Mapping[str, Any]] = []
        for text, vector in zip(texts, embeddings):
            body.extend(
                [
                    {"index": {"_index": self.index_name}},
                    {"text": text, "vector": vector},
                ]
            )
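
        # The bulk API consumes alternating action/document pairs, which is
        # why the loop above appends two entries per text: an {"index": ...}
        # action line followed by the document source.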
        responses = self.client.bulk(operations=body)

        ids = [
            item["index"]["_id"]
            for item in responses["items"]
            if item["index"]["result"] == "created"
        ]

        if refresh_indices:
            self.client.indices.refresh(index=self.index_name)

        return ids

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[Dict[Any, Any]]] = None,
        **kwargs: Any,
    ) -> ElasticKnnSearch:
        """
        Create a new ElasticKnnSearch instance and add a list of texts to the
        Elasticsearch index.

        Args:
            texts (List[str]): The texts to add to the index.
            embedding (Embeddings): The embedding model to use for transforming
                the texts into vectors.
            metadatas (List[Dict[Any, Any]], optional): A list of metadata
                dictionaries to associate with the texts.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            A new ElasticKnnSearch instance.
        """
        index_name = kwargs.get("index_name", str(uuid.uuid4()))
        es_connection = kwargs.get("es_connection")
        es_cloud_id = kwargs.get("es_cloud_id")
        es_user = kwargs.get("es_user")
        es_password = kwargs.get("es_password")
        vector_query_field = kwargs.get("vector_query_field", "vector")
        query_field = kwargs.get("query_field", "text")
        model_id = kwargs.get("model_id")
        dims = kwargs.get("dims")

        if dims is None:
            raise ValueError("ElasticKnnSearch requires 'dims' parameter")

        optional_args = {}
        if vector_query_field is not None:
            optional_args["vector_query_field"] = vector_query_field
        if query_field is not None:
            optional_args["query_field"] = query_field

        knnvectorsearch = cls(
            index_name=index_name,
            embedding=embedding,
            es_connection=es_connection,
            es_cloud_id=es_cloud_id,
            es_user=es_user,
            es_password=es_password,
            **optional_args,
        )
        # Encode the provided texts and add them to the newly created index.
        knnvectorsearch.add_texts(texts, model_id=model_id, dims=dims, **optional_args)

        return knnvectorsearch
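

# End-to-end sketch for ElasticKnnSearch (editor's example; the cloud id and
# credentials are placeholders, and `dims` must match your embedding model,
# e.g. 1536 for OpenAI's text-embedding-ada-002):
#
#   from langchain_community.embeddings import OpenAIEmbeddings
#
#   embedding = OpenAIEmbeddings()
#   knn_store = ElasticKnnSearch.from_texts(
#       ["hello world", "another text"],
#       embedding,
#       es_cloud_id="<cloud-id>",
#       es_user="elastic",
#       es_password="<password>",
#       dims=1536,
#   )
#   results = knn_store.knn_search(query_vector=embedding.embed_query("hello"))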