@ -3,13 +3,26 @@ from __future__ import annotations
import uuid
import uuid
from abc import ABC
from abc import ABC
from typing import Any , Dict , Iterable , List , Optional , Tuple
from typing import (
TYPE_CHECKING ,
Any ,
Dict ,
Iterable ,
List ,
Mapping ,
Optional ,
Tuple ,
Union ,
)
from langchain . docstore . document import Document
from langchain . docstore . document import Document
from langchain . embeddings . base import Embeddings
from langchain . embeddings . base import Embeddings
from langchain . utils import get_from_env
from langchain . utils import get_from_env
from langchain . vectorstores . base import VectorStore
from langchain . vectorstores . base import VectorStore
if TYPE_CHECKING :
from elasticsearch import Elasticsearch
def _default_text_mapping ( dim : int ) - > Dict :
def _default_text_mapping ( dim : int ) - > Dict :
return {
return {
@ -304,3 +317,239 @@ class ElasticVectorSearch(VectorStore, ABC):
index = index_name , body = { " query " : script_query , " size " : size }
index = index_name , body = { " query " : script_query , " size " : size }
)
)
return response
return response
class ElasticKnnSearch(ElasticVectorSearch):
    """Run k-nearest-neighbour (k-NN) searches against an Elasticsearch index.

    The class targets a text-search scenario: documents are text strings and
    their embeddings are vector representations of those strings. It offers a
    pure k-NN search (`knn_search`) and a hybrid search that combines k-NN with
    a text `match` query (`knn_hybrid_search`).
    """

    def __init__(
        self,
        index_name: str,
        embedding: Embeddings,
        es_connection: Optional["Elasticsearch"] = None,
        es_cloud_id: Optional[str] = None,
        es_user: Optional[str] = None,
        es_password: Optional[str] = None,
    ):
        """Store the index/embedding settings and set up the Elasticsearch client.

        Args:
            index_name: The name of the Elasticsearch index.
            embedding: An instance of the Embeddings class, used to generate
                vector representations of text strings.
            es_connection: An existing Elasticsearch connection. When given, it
                is used as-is and the credential arguments are ignored.
            es_cloud_id: The Cloud ID of the Elasticsearch instance. Required
                if creating a new connection.
            es_user: The username for the Elasticsearch instance. Required if
                creating a new connection.
            es_password: The password for the Elasticsearch instance. Required
                if creating a new connection.

        Raises:
            ImportError: If the `elasticsearch` package is not installed.
            ValueError: If neither an existing connection nor a complete set of
                credentials (cloud id, user and password) is provided.
        """
        # Imported lazily so the module can be loaded without the optional
        # `elasticsearch` dependency installed.
        try:
            import elasticsearch
        except ImportError as err:
            raise ImportError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            ) from err

        self.embedding = embedding
        self.index_name = index_name

        if es_connection is not None:
            # A pre-existing connection always takes precedence.
            self.client = es_connection
        elif es_cloud_id and es_user and es_password:
            # All three credentials are needed to open a new connection.
            self.client = elasticsearch.Elasticsearch(
                cloud_id=es_cloud_id, basic_auth=(es_user, es_password)
            )
        else:
            # Plain concatenated literals: the previous triple-quoted string
            # with a backslash continuation leaked source indentation into
            # the message.
            raise ValueError(
                "Either provide a pre-existing Elasticsearch connection, "
                "or valid credentials for creating a new connection."
            )

    @staticmethod
    def _default_knn_mapping(dims: int) -> Dict:
        """Generate a default index mapping for kNN search.

        Args:
            dims: Dimensionality declared for the `vector` field.

        Returns:
            A mapping with a `text` field of type `text` and an indexed
            `dense_vector` field named `vector` using `dot_product` similarity.
        """
        # NOTE(review): Elasticsearch documents `dot_product` as requiring
        # unit-length vectors - confirm the embedding model satisfies this.
        return {
            "properties": {
                "text": {"type": "text"},
                "vector": {
                    "type": "dense_vector",
                    "dims": dims,
                    "index": True,
                    "similarity": "dot_product",
                },
            }
        }

    @staticmethod
    def _default_knn_query(
        query_vector: Optional[List[float]] = None,
        query: Optional[str] = None,
        model_id: Optional[str] = None,
        field: Optional[str] = "vector",
        k: Optional[int] = 10,
        num_candidates: Optional[int] = 10,
    ) -> Dict:
        """Build the `knn` section of an Elasticsearch search request.

        Args:
            query_vector: Pre-computed query vector. Used only when `model_id`
                is not given.
            query: Query text handed to the model named by `model_id`.
            model_id: ID of the text-embedding model that should turn `query`
                into a vector on the Elasticsearch side.
            field: Name of the vector field to search. Defaults to "vector".
            k: Number of nearest neighbours to return. Defaults to 10.
            num_candidates: Number of candidates to consider. Raised to `k`
                when it is smaller, because Elasticsearch rejects requests
                where `num_candidates` is less than `k`.

        Returns:
            The `knn` dictionary, containing either `query_vector` or
            `query_vector_builder`.

        Raises:
            ValueError: If no usable input is given, i.e. neither
                (`query_vector` without `model_id`) nor (`query` together
                with `model_id`).
        """
        # Callers forward an arbitrary `k` but never `num_candidates`; without
        # this adjustment any k above the default of 10 yields an invalid
        # request.
        if k is not None and num_candidates is not None and num_candidates < k:
            num_candidates = k

        knn: Dict = {
            "field": field,
            "k": k,
            "num_candidates": num_candidates,
        }

        if query_vector and not model_id:
            # Case 1: a raw vector and no model -> send the vector directly.
            knn["query_vector"] = query_vector
        elif query and model_id:
            # Case 2: text plus a model -> let Elasticsearch embed the text.
            # A `query_vector` passed alongside is ignored in this case.
            knn["query_vector_builder"] = {
                "text_embedding": {
                    "model_id": model_id,
                    "model_text": query,
                }
            }
        else:
            raise ValueError(
                "Either `query_vector` or `model_id` must be provided, but not both."
            )
        return knn

    def knn_search(
        self,
        query: Optional[str] = None,
        k: Optional[int] = 10,
        query_vector: Optional[List[float]] = None,
        model_id: Optional[str] = None,
        size: Optional[int] = 10,
        source: Optional[bool] = True,
        fields: Optional[
            Union[List[Mapping[str, Any]], Tuple[Mapping[str, Any], ...], None]
        ] = None,
    ) -> Dict:
        """Perform a k-nearest neighbor (k-NN) search on the Elasticsearch index.

        The search can be conducted using either a raw query vector or a model
        ID together with query text. The method builds the `knn` body, runs the
        search on `self.index_name` and returns the response.

        Args:
            query: The query text. Required (with `model_id`) if `query_vector`
                is not provided.
            k: The number of nearest neighbors to return. Defaults to 10.
            query_vector: The query vector to be used for the search. Required
                if `query` and `model_id` are not provided.
            model_id: The ID of the model to use for generating the query
                vector from `query`.
            size: The number of search hits to return. Defaults to 10.
            source: Whether to include the source of each hit in the results.
            fields: Passed through as the `fields` option of the search
                request. Defaults to None.

        Returns:
            The search response converted to a plain `dict`.

        Raises:
            ValueError: If neither (`query_vector` without `model_id`) nor
                (`query` together with `model_id`) is provided.
        """
        knn_query_body = self._default_knn_query(
            query_vector=query_vector, query=query, model_id=model_id, k=k
        )

        # Perform the kNN search on the Elasticsearch index.
        res = self.client.search(
            index=self.index_name,
            knn=knn_query_body,
            size=size,
            source=source,
            fields=fields,
        )
        return dict(res)

    def knn_hybrid_search(
        self,
        query: Optional[str] = None,
        k: Optional[int] = 10,
        query_vector: Optional[List[float]] = None,
        model_id: Optional[str] = None,
        size: Optional[int] = 10,
        source: Optional[bool] = True,
        knn_boost: Optional[float] = 0.9,
        query_boost: Optional[float] = 0.1,
        fields: Optional[
            Union[List[Mapping[str, Any]], Tuple[Mapping[str, Any], ...], None]
        ] = None,
    ) -> Dict[Any, Any]:
        """Perform a hybrid k-NN and text-based search on the Elasticsearch index.

        The k-NN part can use either a raw query vector or a model ID together
        with the query text. The text part is a `match` query on the `text`
        field, so `query` is always required here. Both parts are sent in one
        search request, each with its own boost.

        Args:
            query: The query text. Always required for the text-based part;
                also used for the k-NN part when `model_id` is given.
            k: The number of nearest neighbors to return. Defaults to 10.
            query_vector: The query vector for the k-NN part. Required if
                `model_id` is not provided.
            model_id: The ID of the model to use for generating the query
                vector from `query`.
            size: The number of search hits to return. Defaults to 10.
            source: Whether to include the source of each hit in the results.
            knn_boost: The boost factor for the k-NN part of the search.
            query_boost: The boost factor for the text-based part of the
                search.
            fields: Passed through as the `fields` option of the search
                request. Defaults to None.

        Returns:
            The search response converted to a plain `dict`.

        Raises:
            ValueError: If neither (`query_vector` without `model_id`) nor
                (`query` together with `model_id`) is provided, or if `query`
                is None.
        """
        knn_query_body = self._default_knn_query(
            query_vector=query_vector, query=query, model_id=model_id, k=k
        )

        # The text half of the hybrid search cannot be built without a query.
        # Fail here with a clear message instead of sending a null `match`
        # query to Elasticsearch. An empty string is still passed through.
        if query is None:
            raise ValueError(
                "`query` must be provided for hybrid search; it is used for the "
                "text-based part of the search."
            )

        # Weight the k-NN part of the request.
        knn_query_body["boost"] = knn_boost

        # Generate the body of the standard Elasticsearch text query.
        match_query_body = {"match": {"text": {"query": query, "boost": query_boost}}}

        # Perform the hybrid search on the Elasticsearch index.
        res = self.client.search(
            index=self.index_name,
            query=match_query_body,
            knn=knn_query_body,
            fields=fields,
            size=size,
            source=source,
        )
        return dict(res)