@ -65,6 +65,8 @@ def _bulk_ingest_embeddings(
embeddings : List [ List [ float ] ] ,
embeddings : List [ List [ float ] ] ,
texts : Iterable [ str ] ,
texts : Iterable [ str ] ,
metadatas : Optional [ List [ dict ] ] = None ,
metadatas : Optional [ List [ dict ] ] = None ,
vector_field : str = " vector_field " ,
text_field : str = " text " ,
) - > List [ str ] :
) - > List [ str ] :
""" Bulk Ingest Embeddings into given index. """
""" Bulk Ingest Embeddings into given index. """
bulk = _import_bulk ( )
bulk = _import_bulk ( )
@ -76,8 +78,8 @@ def _bulk_ingest_embeddings(
request = {
request = {
" _op_type " : " index " ,
" _op_type " : " index " ,
" _index " : index_name ,
" _index " : index_name ,
" vector_field " : embeddings [ i ] ,
vector_field : embeddings [ i ] ,
" text " : text ,
text_field : text ,
" metadata " : metadata ,
" metadata " : metadata ,
" _id " : _id ,
" _id " : _id ,
}
}
@ -88,12 +90,15 @@ def _bulk_ingest_embeddings(
return ids
return ids
def _default_scripting_text_mapping ( dim : int ) - > Dict :
def _default_scripting_text_mapping (
dim : int ,
vector_field : str = " vector_field " ,
) - > Dict :
""" For Painless Scripting or Script Scoring,the default mapping to create index. """
""" For Painless Scripting or Script Scoring,the default mapping to create index. """
return {
return {
" mappings " : {
" mappings " : {
" properties " : {
" properties " : {
" vector_field " : { " type " : " knn_vector " , " dimension " : dim } ,
vector_field : { " type " : " knn_vector " , " dimension " : dim } ,
}
}
}
}
}
}
@ -106,13 +111,14 @@ def _default_text_mapping(
ef_search : int = 512 ,
ef_search : int = 512 ,
ef_construction : int = 512 ,
ef_construction : int = 512 ,
m : int = 16 ,
m : int = 16 ,
vector_field : str = " vector_field " ,
) - > Dict :
) - > Dict :
""" For Approximate k-NN Search, this is the default mapping to create index. """
""" For Approximate k-NN Search, this is the default mapping to create index. """
return {
return {
" settings " : { " index " : { " knn " : True , " knn.algo_param.ef_search " : ef_search } } ,
" settings " : { " index " : { " knn " : True , " knn.algo_param.ef_search " : ef_search } } ,
" mappings " : {
" mappings " : {
" properties " : {
" properties " : {
" vector_field " : {
vector_field : {
" type " : " knn_vector " ,
" type " : " knn_vector " ,
" dimension " : dim ,
" dimension " : dim ,
" method " : {
" method " : {
@ -165,10 +171,18 @@ def _default_script_query(
}
}
def __get_painless_scripting_source ( space_type : str , query_vector : List [ float ] ) - > str :
def __get_painless_scripting_source (
space_type : str , query_vector : List [ float ] , vector_field : str = " vector_field "
) - > str :
""" For Painless Scripting, it returns the script source based on space type. """
""" For Painless Scripting, it returns the script source based on space type. """
source_value = (
source_value = (
" (1.0 + " + space_type + " ( " + str ( query_vector ) + " , doc[ ' vector_field ' ])) "
" (1.0 + "
+ space_type
+ " ( "
+ str ( query_vector )
+ " , doc[ ' "
+ vector_field
+ " ' ])) "
)
)
if space_type == " cosineSimilarity " :
if space_type == " cosineSimilarity " :
return source_value
return source_value
@ -250,13 +264,26 @@ class OpenSearchVectorSearch(VectorStore):
Returns :
Returns :
List of ids from adding the texts into the vectorstore .
List of ids from adding the texts into the vectorstore .
Optional Args :
vector_field : Document field embeddings are stored in . Defaults to
" vector_field " .
text_field : Document field the text of the document is stored in . Defaults
to " text " .
"""
"""
embeddings = [
embeddings = self . embedding_function . embed_documents ( list ( texts ) )
self . embedding_function . embed_documents ( [ text ] ) [ 0 ] for text in texts
]
_validate_embeddings_and_bulk_size ( len ( embeddings ) , bulk_size )
_validate_embeddings_and_bulk_size ( len ( embeddings ) , bulk_size )
vector_field = _get_kwargs_value ( kwargs , " vector_field " , " vector_field " )
text_field = _get_kwargs_value ( kwargs , " text_field " , " text " )
return _bulk_ingest_embeddings (
return _bulk_ingest_embeddings (
self . client , self . index_name , embeddings , texts , metadatas
self . client ,
self . index_name ,
embeddings ,
texts ,
metadatas ,
vector_field ,
text_field ,
)
)
def similarity_search (
def similarity_search (
@ -277,14 +304,17 @@ class OpenSearchVectorSearch(VectorStore):
Optional Args :
Optional Args :
vector_field : Document field embeddings are stored in . Defaults to
vector_field : Document field embeddings are stored in . Defaults to
" vector_field " .
" vector_field " .
text_field : Document field the text of the document is stored in . Defaults
text_field : Document field the text of the document is stored in . Defaults
to " text " .
to " text " .
metadata_field : Document field that metadata is stored in . Defaults to
metadata_field : Document field that metadata is stored in . Defaults to
" metadata " .
" metadata " .
Can be set to a special value " * " to include the entire document .
Can be set to a special value " * " to include the entire document .
Optional Args for Approximate Search :
Optional Args for Approximate Search :
search_type : " approximate_search " ; default : " approximate_search "
search_type : " approximate_search " ; default : " approximate_search "
size : number of results the query actually returns ; default : 4
size : number of results the query actually returns ; default : 4
Optional Args for Script Scoring Search :
Optional Args for Script Scoring Search :
@ -298,6 +328,7 @@ class OpenSearchVectorSearch(VectorStore):
Optional Args for Painless Scripting Search :
Optional Args for Painless Scripting Search :
search_type : " painless_scripting " ; default : " approximate_search "
search_type : " painless_scripting " ; default : " approximate_search "
space_type : " l2Squared " , " l1Norm " , " cosineSimilarity " ; default : " l2Squared "
space_type : " l2Squared " , " l1Norm " , " cosineSimilarity " ; default : " l2Squared "
pre_filter : script_score query to pre - filter documents before identifying
pre_filter : script_score query to pre - filter documents before identifying
@ -307,23 +338,21 @@ class OpenSearchVectorSearch(VectorStore):
search_type = _get_kwargs_value ( kwargs , " search_type " , " approximate_search " )
search_type = _get_kwargs_value ( kwargs , " search_type " , " approximate_search " )
text_field = _get_kwargs_value ( kwargs , " text_field " , " text " )
text_field = _get_kwargs_value ( kwargs , " text_field " , " text " )
metadata_field = _get_kwargs_value ( kwargs , " metadata_field " , " metadata " )
metadata_field = _get_kwargs_value ( kwargs , " metadata_field " , " metadata " )
vector_field = _get_kwargs_value ( kwargs , " vector_field " , " vector_field " )
if search_type == " approximate_search " :
if search_type == " approximate_search " :
size = _get_kwargs_value ( kwargs , " size " , 4 )
size = _get_kwargs_value ( kwargs , " size " , 4 )
vector_field = _get_kwargs_value ( kwargs , " vector_field " , " vector_field " )
search_query = _default_approximate_search_query (
search_query = _default_approximate_search_query (
embedding , size , k , vector_field
embedding , size , k , vector_field
)
)
elif search_type == SCRIPT_SCORING_SEARCH :
elif search_type == SCRIPT_SCORING_SEARCH :
space_type = _get_kwargs_value ( kwargs , " space_type " , " l2 " )
space_type = _get_kwargs_value ( kwargs , " space_type " , " l2 " )
pre_filter = _get_kwargs_value ( kwargs , " pre_filter " , MATCH_ALL_QUERY )
pre_filter = _get_kwargs_value ( kwargs , " pre_filter " , MATCH_ALL_QUERY )
vector_field = _get_kwargs_value ( kwargs , " vector_field " , " vector_field " )
search_query = _default_script_query (
search_query = _default_script_query (
embedding , space_type , pre_filter , vector_field
embedding , space_type , pre_filter , vector_field
)
)
elif search_type == PAINLESS_SCRIPTING_SEARCH :
elif search_type == PAINLESS_SCRIPTING_SEARCH :
space_type = _get_kwargs_value ( kwargs , " space_type " , " l2Squared " )
space_type = _get_kwargs_value ( kwargs , " space_type " , " l2Squared " )
pre_filter = _get_kwargs_value ( kwargs , " pre_filter " , MATCH_ALL_QUERY )
pre_filter = _get_kwargs_value ( kwargs , " pre_filter " , MATCH_ALL_QUERY )
vector_field = _get_kwargs_value ( kwargs , " vector_field " , " vector_field " )
search_query = _default_painless_scripting_query (
search_query = _default_painless_scripting_query (
embedding , space_type , pre_filter , vector_field
embedding , space_type , pre_filter , vector_field
)
)
@ -370,6 +399,13 @@ class OpenSearchVectorSearch(VectorStore):
and lucene engines recommended for large datasets . Also supports brute force
and lucene engines recommended for large datasets . Also supports brute force
search through Script Scoring and Painless Scripting .
search through Script Scoring and Painless Scripting .
Optional Args :
vector_field : Document field embeddings are stored in . Defaults to
" vector_field " .
text_field : Document field the text of the document is stored in . Defaults
to " text " .
Optional Keyword Args for Approximate Search :
Optional Keyword Args for Approximate Search :
engine : " nmslib " , " faiss " , " hnsw " ; default : " nmslib "
engine : " nmslib " , " faiss " , " hnsw " ; default : " nmslib "
@ -402,6 +438,8 @@ class OpenSearchVectorSearch(VectorStore):
kwargs , " index_name " , " OPENSEARCH_INDEX_NAME " , default = uuid . uuid4 ( ) . hex
kwargs , " index_name " , " OPENSEARCH_INDEX_NAME " , default = uuid . uuid4 ( ) . hex
)
)
is_appx_search = _get_kwargs_value ( kwargs , " is_appx_search " , True )
is_appx_search = _get_kwargs_value ( kwargs , " is_appx_search " , True )
vector_field = _get_kwargs_value ( kwargs , " vector_field " , " vector_field " )
text_field = _get_kwargs_value ( kwargs , " text_field " , " text " )
if is_appx_search :
if is_appx_search :
engine = _get_kwargs_value ( kwargs , " engine " , " nmslib " )
engine = _get_kwargs_value ( kwargs , " engine " , " nmslib " )
space_type = _get_kwargs_value ( kwargs , " space_type " , " l2 " )
space_type = _get_kwargs_value ( kwargs , " space_type " , " l2 " )
@ -410,11 +448,13 @@ class OpenSearchVectorSearch(VectorStore):
m = _get_kwargs_value ( kwargs , " m " , 16 )
m = _get_kwargs_value ( kwargs , " m " , 16 )
mapping = _default_text_mapping (
mapping = _default_text_mapping (
dim , engine , space_type , ef_search , ef_construction , m
dim , engine , space_type , ef_search , ef_construction , m , vector_field
)
)
else :
else :
mapping = _default_scripting_text_mapping ( dim )
mapping = _default_scripting_text_mapping ( dim )
client . indices . create ( index = index_name , body = mapping )
client . indices . create ( index = index_name , body = mapping )
_bulk_ingest_embeddings ( client , index_name , embeddings , texts , metadatas )
_bulk_ingest_embeddings (
client , index_name , embeddings , texts , metadatas , vector_field , text_field
)
return cls ( opensearch_url , index_name , embedding )
return cls ( opensearch_url , index_name , embedding )