Add custom vector fields and text fields for OpenSearch (#2652)

**Description**
Add support for custom vector field and text field names when indexing
into and querying OpenSearch.

**Issues**
https://github.com/hwchase17/langchain/issues/2500

Signed-off-by: Naveen Tatikonda <navtat@amazon.com>
fix_agent_callbacks
Naveen Tatikonda 1 year ago committed by GitHub
parent 023de9a70b
commit 4364d3316e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -55,7 +55,7 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\")\n", "docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\")\n",
"\n", "\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)" "docs = docsearch.similarity_search(query)"
@ -94,7 +94,7 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", engine=\"faiss\", space_type=\"innerproduct\", ef_construction=256, m=48)\n", "docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\", engine=\"faiss\", space_type=\"innerproduct\", ef_construction=256, m=48)\n",
"\n", "\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)" "docs = docsearch.similarity_search(query)"
@ -133,7 +133,7 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n", "docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n",
"\n", "\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", k=1, search_type=\"script_scoring\")" "docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", k=1, search_type=\"script_scoring\")"
@ -172,10 +172,10 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n", "docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n",
"filter = {\"bool\": {\"filter\": {\"term\": {\"text\": \"smuggling\"}}}}\n", "filter = {\"bool\": {\"filter\": {\"term\": {\"text\": \"smuggling\"}}}}\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", search_type=\"painless_scripting\", space_type=\"cosinesimil\", pre_filter=filter)" "docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", search_type=\"painless_scripting\", space_type=\"cosineSimilarity\", pre_filter=filter)"
] ]
}, },
{ {
@ -238,4 +238,4 @@
}, },
"nbformat": 4, "nbformat": 4,
"nbformat_minor": 5 "nbformat_minor": 5
} }

@ -65,6 +65,8 @@ def _bulk_ingest_embeddings(
embeddings: List[List[float]], embeddings: List[List[float]],
texts: Iterable[str], texts: Iterable[str],
metadatas: Optional[List[dict]] = None, metadatas: Optional[List[dict]] = None,
vector_field: str = "vector_field",
text_field: str = "text",
) -> List[str]: ) -> List[str]:
"""Bulk Ingest Embeddings into given index.""" """Bulk Ingest Embeddings into given index."""
bulk = _import_bulk() bulk = _import_bulk()
@ -76,8 +78,8 @@ def _bulk_ingest_embeddings(
request = { request = {
"_op_type": "index", "_op_type": "index",
"_index": index_name, "_index": index_name,
"vector_field": embeddings[i], vector_field: embeddings[i],
"text": text, text_field: text,
"metadata": metadata, "metadata": metadata,
"_id": _id, "_id": _id,
} }
@ -88,12 +90,15 @@ def _bulk_ingest_embeddings(
return ids return ids
def _default_scripting_text_mapping(dim: int) -> Dict: def _default_scripting_text_mapping(
dim: int,
vector_field: str = "vector_field",
) -> Dict:
"""For Painless Scripting or Script Scoring,the default mapping to create index.""" """For Painless Scripting or Script Scoring,the default mapping to create index."""
return { return {
"mappings": { "mappings": {
"properties": { "properties": {
"vector_field": {"type": "knn_vector", "dimension": dim}, vector_field: {"type": "knn_vector", "dimension": dim},
} }
} }
} }
@ -106,13 +111,14 @@ def _default_text_mapping(
ef_search: int = 512, ef_search: int = 512,
ef_construction: int = 512, ef_construction: int = 512,
m: int = 16, m: int = 16,
vector_field: str = "vector_field",
) -> Dict: ) -> Dict:
"""For Approximate k-NN Search, this is the default mapping to create index.""" """For Approximate k-NN Search, this is the default mapping to create index."""
return { return {
"settings": {"index": {"knn": True, "knn.algo_param.ef_search": ef_search}}, "settings": {"index": {"knn": True, "knn.algo_param.ef_search": ef_search}},
"mappings": { "mappings": {
"properties": { "properties": {
"vector_field": { vector_field: {
"type": "knn_vector", "type": "knn_vector",
"dimension": dim, "dimension": dim,
"method": { "method": {
@ -165,10 +171,18 @@ def _default_script_query(
} }
def __get_painless_scripting_source(space_type: str, query_vector: List[float]) -> str: def __get_painless_scripting_source(
space_type: str, query_vector: List[float], vector_field: str = "vector_field"
) -> str:
"""For Painless Scripting, it returns the script source based on space type.""" """For Painless Scripting, it returns the script source based on space type."""
source_value = ( source_value = (
"(1.0 + " + space_type + "(" + str(query_vector) + ", doc['vector_field']))" "(1.0 + "
+ space_type
+ "("
+ str(query_vector)
+ ", doc['"
+ vector_field
+ "']))"
) )
if space_type == "cosineSimilarity": if space_type == "cosineSimilarity":
return source_value return source_value
@ -250,13 +264,26 @@ class OpenSearchVectorSearch(VectorStore):
Returns: Returns:
List of ids from adding the texts into the vectorstore. List of ids from adding the texts into the vectorstore.
Optional Args:
vector_field: Document field embeddings are stored in. Defaults to
"vector_field".
text_field: Document field the text of the document is stored in. Defaults
to "text".
""" """
embeddings = [ embeddings = self.embedding_function.embed_documents(list(texts))
self.embedding_function.embed_documents([text])[0] for text in texts
]
_validate_embeddings_and_bulk_size(len(embeddings), bulk_size) _validate_embeddings_and_bulk_size(len(embeddings), bulk_size)
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
text_field = _get_kwargs_value(kwargs, "text_field", "text")
return _bulk_ingest_embeddings( return _bulk_ingest_embeddings(
self.client, self.index_name, embeddings, texts, metadatas self.client,
self.index_name,
embeddings,
texts,
metadatas,
vector_field,
text_field,
) )
def similarity_search( def similarity_search(
@ -277,14 +304,17 @@ class OpenSearchVectorSearch(VectorStore):
Optional Args: Optional Args:
vector_field: Document field embeddings are stored in. Defaults to vector_field: Document field embeddings are stored in. Defaults to
"vector_field". "vector_field".
text_field: Document field the text of the document is stored in. Defaults text_field: Document field the text of the document is stored in. Defaults
to "text". to "text".
metadata_field: Document field that metadata is stored in. Defaults to metadata_field: Document field that metadata is stored in. Defaults to
"metadata". "metadata".
Can be set to a special value "*" to include the entire document. Can be set to a special value "*" to include the entire document.
Optional Args for Approximate Search: Optional Args for Approximate Search:
search_type: "approximate_search"; default: "approximate_search" search_type: "approximate_search"; default: "approximate_search"
size: number of results the query actually returns; default: 4 size: number of results the query actually returns; default: 4
Optional Args for Script Scoring Search: Optional Args for Script Scoring Search:
@ -298,6 +328,7 @@ class OpenSearchVectorSearch(VectorStore):
Optional Args for Painless Scripting Search: Optional Args for Painless Scripting Search:
search_type: "painless_scripting"; default: "approximate_search" search_type: "painless_scripting"; default: "approximate_search"
space_type: "l2Squared", "l1Norm", "cosineSimilarity"; default: "l2Squared" space_type: "l2Squared", "l1Norm", "cosineSimilarity"; default: "l2Squared"
pre_filter: script_score query to pre-filter documents before identifying pre_filter: script_score query to pre-filter documents before identifying
@ -307,23 +338,21 @@ class OpenSearchVectorSearch(VectorStore):
search_type = _get_kwargs_value(kwargs, "search_type", "approximate_search") search_type = _get_kwargs_value(kwargs, "search_type", "approximate_search")
text_field = _get_kwargs_value(kwargs, "text_field", "text") text_field = _get_kwargs_value(kwargs, "text_field", "text")
metadata_field = _get_kwargs_value(kwargs, "metadata_field", "metadata") metadata_field = _get_kwargs_value(kwargs, "metadata_field", "metadata")
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
if search_type == "approximate_search": if search_type == "approximate_search":
size = _get_kwargs_value(kwargs, "size", 4) size = _get_kwargs_value(kwargs, "size", 4)
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
search_query = _default_approximate_search_query( search_query = _default_approximate_search_query(
embedding, size, k, vector_field embedding, size, k, vector_field
) )
elif search_type == SCRIPT_SCORING_SEARCH: elif search_type == SCRIPT_SCORING_SEARCH:
space_type = _get_kwargs_value(kwargs, "space_type", "l2") space_type = _get_kwargs_value(kwargs, "space_type", "l2")
pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY) pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY)
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
search_query = _default_script_query( search_query = _default_script_query(
embedding, space_type, pre_filter, vector_field embedding, space_type, pre_filter, vector_field
) )
elif search_type == PAINLESS_SCRIPTING_SEARCH: elif search_type == PAINLESS_SCRIPTING_SEARCH:
space_type = _get_kwargs_value(kwargs, "space_type", "l2Squared") space_type = _get_kwargs_value(kwargs, "space_type", "l2Squared")
pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY) pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY)
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
search_query = _default_painless_scripting_query( search_query = _default_painless_scripting_query(
embedding, space_type, pre_filter, vector_field embedding, space_type, pre_filter, vector_field
) )
@ -370,6 +399,13 @@ class OpenSearchVectorSearch(VectorStore):
and lucene engines recommended for large datasets. Also supports brute force and lucene engines recommended for large datasets. Also supports brute force
search through Script Scoring and Painless Scripting. search through Script Scoring and Painless Scripting.
Optional Args:
vector_field: Document field embeddings are stored in. Defaults to
"vector_field".
text_field: Document field the text of the document is stored in. Defaults
to "text".
Optional Keyword Args for Approximate Search: Optional Keyword Args for Approximate Search:
engine: "nmslib", "faiss", "hnsw"; default: "nmslib" engine: "nmslib", "faiss", "hnsw"; default: "nmslib"
@ -402,6 +438,8 @@ class OpenSearchVectorSearch(VectorStore):
kwargs, "index_name", "OPENSEARCH_INDEX_NAME", default=uuid.uuid4().hex kwargs, "index_name", "OPENSEARCH_INDEX_NAME", default=uuid.uuid4().hex
) )
is_appx_search = _get_kwargs_value(kwargs, "is_appx_search", True) is_appx_search = _get_kwargs_value(kwargs, "is_appx_search", True)
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
text_field = _get_kwargs_value(kwargs, "text_field", "text")
if is_appx_search: if is_appx_search:
engine = _get_kwargs_value(kwargs, "engine", "nmslib") engine = _get_kwargs_value(kwargs, "engine", "nmslib")
space_type = _get_kwargs_value(kwargs, "space_type", "l2") space_type = _get_kwargs_value(kwargs, "space_type", "l2")
@ -410,11 +448,13 @@ class OpenSearchVectorSearch(VectorStore):
m = _get_kwargs_value(kwargs, "m", 16) m = _get_kwargs_value(kwargs, "m", 16)
mapping = _default_text_mapping( mapping = _default_text_mapping(
dim, engine, space_type, ef_search, ef_construction, m dim, engine, space_type, ef_search, ef_construction, m, vector_field
) )
else: else:
mapping = _default_scripting_text_mapping(dim) mapping = _default_scripting_text_mapping(dim)
client.indices.create(index=index_name, body=mapping) client.indices.create(index=index_name, body=mapping)
_bulk_ingest_embeddings(client, index_name, embeddings, texts, metadatas) _bulk_ingest_embeddings(
client, index_name, embeddings, texts, metadatas, vector_field, text_field
)
return cls(opensearch_url, index_name, embedding) return cls(opensearch_url, index_name, embedding)

@ -23,6 +23,30 @@ def test_opensearch() -> None:
assert output == [Document(page_content="foo")] assert output == [Document(page_content="foo")]
def test_opensearch_with_custom_field_name() -> None:
"""Test indexing and search using custom vector field and text field name."""
docsearch = OpenSearchVectorSearch.from_texts(
texts,
FakeEmbeddings(),
opensearch_url=DEFAULT_OPENSEARCH_URL,
vector_field="my_vector",
text_field="custom_text",
)
output = docsearch.similarity_search(
"foo", k=1, vector_field="my_vector", text_field="custom_text"
)
assert output == [Document(page_content="foo")]
text_input = ["test", "add", "text", "method"]
OpenSearchVectorSearch.add_texts(
docsearch, text_input, vector_field="my_vector", text_field="custom_text"
)
output = docsearch.similarity_search(
"add", k=1, vector_field="my_vector", text_field="custom_text"
)
assert output == [Document(page_content="foo")]
def test_opensearch_with_metadatas() -> None: def test_opensearch_with_metadatas() -> None:
"""Test end to end indexing and search with metadata.""" """Test end to end indexing and search with metadata."""
metadatas = [{"page": i} for i in range(len(texts))] metadatas = [{"page": i} for i in range(len(texts))]

Loading…
Cancel
Save