Add custom vector fields and text fields for OpenSearch (#2652)

**Description**
Add support for custom vector field and text field names when indexing and querying with OpenSearch.

**Issues**
https://github.com/hwchase17/langchain/issues/2500

Signed-off-by: Naveen Tatikonda <navtat@amazon.com>
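
**Usage** (reviewer sketch, not part of the diff): assuming a local OpenSearch at `http://localhost:9200` and any `Embeddings` implementation (OpenAIEmbeddings is used here only as an example), the new keyword arguments are passed at both index and query time; the field names `my_vector` and `custom_text` are illustrative.

```python
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import OpenSearchVectorSearch

texts = ["foo", "bar", "baz"]      # any list of strings to index
embeddings = OpenAIEmbeddings()    # or any other Embeddings implementation

# Index into caller-chosen field names instead of the defaults
# ("vector_field" for the embedding, "text" for the raw text).
docsearch = OpenSearchVectorSearch.from_texts(
    texts,
    embeddings,
    opensearch_url="http://localhost:9200",
    vector_field="my_vector",
    text_field="custom_text",
)

# The same field names must be repeated at query time.
docs = docsearch.similarity_search(
    "foo", k=1, vector_field="my_vector", text_field="custom_text"
)
```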

@@ -55,7 +55,7 @@
},
"outputs": [],
"source": [
"docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\")\n",
"docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\")\n",
"\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)"
@@ -94,7 +94,7 @@
},
"outputs": [],
"source": [
"docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", engine=\"faiss\", space_type=\"innerproduct\", ef_construction=256, m=48)\n",
"docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\", engine=\"faiss\", space_type=\"innerproduct\", ef_construction=256, m=48)\n",
"\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)"
@@ -133,7 +133,7 @@
},
"outputs": [],
"source": [
"docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n",
"docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n",
"\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", k=1, search_type=\"script_scoring\")"
@@ -172,10 +172,10 @@
},
"outputs": [],
"source": [
"docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n",
"docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n",
"filter = {\"bool\": {\"filter\": {\"term\": {\"text\": \"smuggling\"}}}}\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", search_type=\"painless_scripting\", space_type=\"cosinesimil\", pre_filter=filter)"
"docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", search_type=\"painless_scripting\", space_type=\"cosineSimilarity\", pre_filter=filter)"
]
},
{
@@ -238,4 +238,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
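
Reviewer note on the notebook changes above: the base `VectorStore.from_documents` classmethod forwards `**kwargs` to `from_texts`, so the updated cells can also exercise the new field-name overrides. A hedged sketch, assuming the notebook's usual loading cells for `state_of_the_union.txt` (path and splitter settings are assumptions):

```python
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import OpenSearchVectorSearch

# Load and split the sample document (assumed path, as in the other examples).
documents = TextLoader("../../state_of_the_union.txt").load()
docs = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_documents(documents)

# Custom field names are forwarded through from_documents -> from_texts.
docsearch = OpenSearchVectorSearch.from_documents(
    docs,
    OpenAIEmbeddings(),
    opensearch_url="http://localhost:9200",
    vector_field="my_vector",
    text_field="custom_text",
)
found = docsearch.similarity_search(
    "What did the president say about Ketanji Brown Jackson",
    vector_field="my_vector",
    text_field="custom_text",
)
```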

@@ -65,6 +65,8 @@ def _bulk_ingest_embeddings(
embeddings: List[List[float]],
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
vector_field: str = "vector_field",
text_field: str = "text",
) -> List[str]:
"""Bulk Ingest Embeddings into given index."""
bulk = _import_bulk()
@@ -76,8 +78,8 @@ def _bulk_ingest_embeddings(
request = {
"_op_type": "index",
"_index": index_name,
"vector_field": embeddings[i],
"text": text,
vector_field: embeddings[i],
text_field: text,
"metadata": metadata,
"_id": _id,
}
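
With the field names parameterized, each bulk action keys the embedding and the raw text by the caller-supplied names. A sketch of one action as assembled above, for `vector_field="my_vector"` and `text_field="custom_text"` (index name, vector values, and `_id` are illustrative):

```python
# One bulk "index" action produced by _bulk_ingest_embeddings when called
# with vector_field="my_vector" and text_field="custom_text".
request = {
    "_op_type": "index",
    "_index": "langchain-demo",                  # illustrative index name
    "my_vector": [0.1, 0.2, 0.3],                # embeddings[i]
    "custom_text": "foo",                        # the corresponding text
    "metadata": {},                              # metadatas[i] if provided
    "_id": "0f1e2d3c4b5a69788796a5b4c3d2e1f0",   # uuid4().hex
}
```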
@@ -88,12 +90,15 @@ def _bulk_ingest_embeddings(
return ids
def _default_scripting_text_mapping(dim: int) -> Dict:
def _default_scripting_text_mapping(
dim: int,
vector_field: str = "vector_field",
) -> Dict:
"""For Painless Scripting or Script Scoring,the default mapping to create index."""
return {
"mappings": {
"properties": {
"vector_field": {"type": "knn_vector", "dimension": dim},
vector_field: {"type": "knn_vector", "dimension": dim},
}
}
}
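
The brute-force (Script Scoring / Painless Scripting) mapping now also honours the custom name; per the function above, with an illustrative dimension:

```python
# _default_scripting_text_mapping(dim=1536, vector_field="my_vector")
# returns a mapping keyed by the custom field name:
expected_mapping = {
    "mappings": {
        "properties": {
            "my_vector": {"type": "knn_vector", "dimension": 1536},
        }
    }
}
```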
@@ -106,13 +111,14 @@ def _default_text_mapping(
ef_search: int = 512,
ef_construction: int = 512,
m: int = 16,
vector_field: str = "vector_field",
) -> Dict:
"""For Approximate k-NN Search, this is the default mapping to create index."""
return {
"settings": {"index": {"knn": True, "knn.algo_param.ef_search": ef_search}},
"mappings": {
"properties": {
"vector_field": {
vector_field: {
"type": "knn_vector",
"dimension": dim,
"method": {
@@ -165,10 +171,18 @@ def _default_script_query(
}
def __get_painless_scripting_source(space_type: str, query_vector: List[float]) -> str:
def __get_painless_scripting_source(
space_type: str, query_vector: List[float], vector_field: str = "vector_field"
) -> str:
"""For Painless Scripting, it returns the script source based on space type."""
source_value = (
"(1.0 + " + space_type + "(" + str(query_vector) + ", doc['vector_field']))"
"(1.0 + "
+ space_type
+ "("
+ str(query_vector)
+ ", doc['"
+ vector_field
+ "']))"
)
if space_type == "cosineSimilarity":
return source_value
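
The generated Painless source now interpolates the custom name into the `doc[...]` accessor; for example:

```python
# With space_type="cosineSimilarity", query_vector=[0.1, 0.2] and
# vector_field="my_vector", the source returned above reads:
source = "(1.0 + cosineSimilarity([0.1, 0.2], doc['my_vector']))"
```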
@@ -250,13 +264,26 @@ class OpenSearchVectorSearch(VectorStore):
Returns:
List of ids from adding the texts into the vectorstore.
Optional Args:
vector_field: Document field embeddings are stored in. Defaults to
"vector_field".
text_field: Document field the text of the document is stored in. Defaults
to "text".
"""
embeddings = [
self.embedding_function.embed_documents([text])[0] for text in texts
]
embeddings = self.embedding_function.embed_documents(list(texts))
_validate_embeddings_and_bulk_size(len(embeddings), bulk_size)
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
text_field = _get_kwargs_value(kwargs, "text_field", "text")
return _bulk_ingest_embeddings(
self.client, self.index_name, embeddings, texts, metadatas
self.client,
self.index_name,
embeddings,
texts,
metadatas,
vector_field,
text_field,
)
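
Caller-side sketch for `add_texts` (mirrors the new integration test further down): the custom names have to be repeated on every call, otherwise the defaults (`vector_field`/`text`) are used and new documents land in different fields than the existing ones.

```python
# Append more texts to a store created with custom field names.
docsearch.add_texts(
    ["test", "add", "text", "method"],
    vector_field="my_vector",
    text_field="custom_text",
)
```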
def similarity_search(
@@ -277,14 +304,17 @@ class OpenSearchVectorSearch(VectorStore):
Optional Args:
vector_field: Document field embeddings are stored in. Defaults to
"vector_field".
text_field: Document field the text of the document is stored in. Defaults
to "text".
metadata_field: Document field that metadata is stored in. Defaults to
"metadata".
Can be set to a special value "*" to include the entire document.
Optional Args for Approximate Search:
search_type: "approximate_search"; default: "approximate_search"
size: number of results the query actually returns; default: 4
Optional Args for Script Scoring Search:
@@ -298,6 +328,7 @@ class OpenSearchVectorSearch(VectorStore):
Optional Args for Painless Scripting Search:
search_type: "painless_scripting"; default: "approximate_search"
space_type: "l2Squared", "l1Norm", "cosineSimilarity"; default: "l2Squared"
pre_filter: script_score query to pre-filter documents before identifying
@@ -307,23 +338,21 @@ class OpenSearchVectorSearch(VectorStore):
search_type = _get_kwargs_value(kwargs, "search_type", "approximate_search")
text_field = _get_kwargs_value(kwargs, "text_field", "text")
metadata_field = _get_kwargs_value(kwargs, "metadata_field", "metadata")
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
if search_type == "approximate_search":
size = _get_kwargs_value(kwargs, "size", 4)
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
search_query = _default_approximate_search_query(
embedding, size, k, vector_field
)
elif search_type == SCRIPT_SCORING_SEARCH:
space_type = _get_kwargs_value(kwargs, "space_type", "l2")
pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY)
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
search_query = _default_script_query(
embedding, space_type, pre_filter, vector_field
)
elif search_type == PAINLESS_SCRIPTING_SEARCH:
space_type = _get_kwargs_value(kwargs, "space_type", "l2Squared")
pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY)
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
search_query = _default_painless_scripting_query(
embedding, space_type, pre_filter, vector_field
)
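
Because the `vector_field` lookup is now hoisted above the branches, every search type honours the custom name; e.g. a Script Scoring query against the store from the earlier sketches:

```python
docs = docsearch.similarity_search(
    "What did the president say about Ketanji Brown Jackson",
    k=1,
    search_type="script_scoring",
    vector_field="my_vector",
    text_field="custom_text",
)
```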
@@ -370,6 +399,13 @@ class OpenSearchVectorSearch(VectorStore):
and lucene engines recommended for large datasets. Also supports brute force
search through Script Scoring and Painless Scripting.
Optional Args:
vector_field: Document field embeddings are stored in. Defaults to
"vector_field".
text_field: Document field the text of the document is stored in. Defaults
to "text".
Optional Keyword Args for Approximate Search:
engine: "nmslib", "faiss", "hnsw"; default: "nmslib"
@@ -402,6 +438,8 @@ class OpenSearchVectorSearch(VectorStore):
kwargs, "index_name", "OPENSEARCH_INDEX_NAME", default=uuid.uuid4().hex
)
is_appx_search = _get_kwargs_value(kwargs, "is_appx_search", True)
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
text_field = _get_kwargs_value(kwargs, "text_field", "text")
if is_appx_search:
engine = _get_kwargs_value(kwargs, "engine", "nmslib")
space_type = _get_kwargs_value(kwargs, "space_type", "l2")
@@ -410,11 +448,13 @@ class OpenSearchVectorSearch(VectorStore):
m = _get_kwargs_value(kwargs, "m", 16)
mapping = _default_text_mapping(
dim, engine, space_type, ef_search, ef_construction, m
dim, engine, space_type, ef_search, ef_construction, m, vector_field
)
else:
mapping = _default_scripting_text_mapping(dim)
client.indices.create(index=index_name, body=mapping)
_bulk_ingest_embeddings(client, index_name, embeddings, texts, metadatas)
_bulk_ingest_embeddings(
client, index_name, embeddings, texts, metadatas, vector_field, text_field
)
return cls(opensearch_url, index_name, embedding)
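
End-to-end sketch for the approximate-search path of `from_texts`: the custom `vector_field` is threaded into both `_default_text_mapping` and `_bulk_ingest_embeddings`. Engine parameters mirror the notebook's faiss example; the index name is illustrative (it defaults to `uuid4().hex`).

```python
docsearch = OpenSearchVectorSearch.from_texts(
    texts,
    embeddings,
    opensearch_url="http://localhost:9200",
    index_name="faiss-custom-fields",
    engine="faiss",
    space_type="innerproduct",
    ef_construction=256,
    m=48,
    vector_field="my_vector",
    text_field="custom_text",
)
```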

@@ -23,6 +23,30 @@ def test_opensearch() -> None:
assert output == [Document(page_content="foo")]
def test_opensearch_with_custom_field_name() -> None:
"""Test indexing and search using custom vector field and text field name."""
docsearch = OpenSearchVectorSearch.from_texts(
texts,
FakeEmbeddings(),
opensearch_url=DEFAULT_OPENSEARCH_URL,
vector_field="my_vector",
text_field="custom_text",
)
output = docsearch.similarity_search(
"foo", k=1, vector_field="my_vector", text_field="custom_text"
)
assert output == [Document(page_content="foo")]
text_input = ["test", "add", "text", "method"]
OpenSearchVectorSearch.add_texts(
docsearch, text_input, vector_field="my_vector", text_field="custom_text"
)
output = docsearch.similarity_search(
"add", k=1, vector_field="my_vector", text_field="custom_text"
)
assert output == [Document(page_content="foo")]
def test_opensearch_with_metadatas() -> None:
"""Test end to end indexing and search with metadata."""
metadatas = [{"page": i} for i in range(len(texts))]
