From 4364d3316e10e6642db98fbd2a25331d68f374f9 Mon Sep 17 00:00:00 2001 From: Naveen Tatikonda Date: Mon, 10 Apr 2023 23:02:02 -0500 Subject: [PATCH] Add custom vector fields and text fields for OpenSearch (#2652) **Description** Add custom vector field name and text field name while indexing and querying for OpenSearch **Issues** https://github.com/hwchase17/langchain/issues/2500 Signed-off-by: Naveen Tatikonda --- .../vectorstores/examples/opensearch.ipynb | 12 ++-- .../vectorstores/opensearch_vector_search.py | 72 ++++++++++++++----- .../vectorstores/test_opensearch.py | 24 +++++++ 3 files changed, 86 insertions(+), 22 deletions(-) diff --git a/docs/modules/indexes/vectorstores/examples/opensearch.ipynb b/docs/modules/indexes/vectorstores/examples/opensearch.ipynb index 450f35f3..98b300c2 100644 --- a/docs/modules/indexes/vectorstores/examples/opensearch.ipynb +++ b/docs/modules/indexes/vectorstores/examples/opensearch.ipynb @@ -55,7 +55,7 @@ }, "outputs": [], "source": [ - "docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\")\n", + "docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\")\n", "\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n", "docs = docsearch.similarity_search(query)" @@ -94,7 +94,7 @@ }, "outputs": [], "source": [ - "docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", engine=\"faiss\", space_type=\"innerproduct\", ef_construction=256, m=48)\n", + "docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\", engine=\"faiss\", space_type=\"innerproduct\", ef_construction=256, m=48)\n", "\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n", "docs = docsearch.similarity_search(query)" @@ -133,7 +133,7 @@ }, "outputs": [], "source": [ - "docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n", + "docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n", "\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n", "docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", k=1, search_type=\"script_scoring\")" @@ -172,10 +172,10 @@ }, "outputs": [], "source": [ - "docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n", + "docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n", "filter = {\"bool\": {\"filter\": {\"term\": {\"text\": \"smuggling\"}}}}\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n", - "docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", search_type=\"painless_scripting\", space_type=\"cosinesimil\", pre_filter=filter)" + "docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", search_type=\"painless_scripting\", space_type=\"cosineSimilarity\", pre_filter=filter)" ] }, { @@ -238,4 +238,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/langchain/vectorstores/opensearch_vector_search.py b/langchain/vectorstores/opensearch_vector_search.py index 8c38c10e..9f87c156 100644 --- 
a/langchain/vectorstores/opensearch_vector_search.py +++ b/langchain/vectorstores/opensearch_vector_search.py @@ -65,6 +65,8 @@ def _bulk_ingest_embeddings( embeddings: List[List[float]], texts: Iterable[str], metadatas: Optional[List[dict]] = None, + vector_field: str = "vector_field", + text_field: str = "text", ) -> List[str]: """Bulk Ingest Embeddings into given index.""" bulk = _import_bulk() @@ -76,8 +78,8 @@ def _bulk_ingest_embeddings( request = { "_op_type": "index", "_index": index_name, - "vector_field": embeddings[i], - "text": text, + vector_field: embeddings[i], + text_field: text, "metadata": metadata, "_id": _id, } @@ -88,12 +90,15 @@ def _bulk_ingest_embeddings( return ids -def _default_scripting_text_mapping(dim: int) -> Dict: +def _default_scripting_text_mapping( + dim: int, + vector_field: str = "vector_field", +) -> Dict: """For Painless Scripting or Script Scoring,the default mapping to create index.""" return { "mappings": { "properties": { - "vector_field": {"type": "knn_vector", "dimension": dim}, + vector_field: {"type": "knn_vector", "dimension": dim}, } } } @@ -106,13 +111,14 @@ def _default_text_mapping( ef_search: int = 512, ef_construction: int = 512, m: int = 16, + vector_field: str = "vector_field", ) -> Dict: """For Approximate k-NN Search, this is the default mapping to create index.""" return { "settings": {"index": {"knn": True, "knn.algo_param.ef_search": ef_search}}, "mappings": { "properties": { - "vector_field": { + vector_field: { "type": "knn_vector", "dimension": dim, "method": { @@ -165,10 +171,18 @@ def _default_script_query( } -def __get_painless_scripting_source(space_type: str, query_vector: List[float]) -> str: +def __get_painless_scripting_source( + space_type: str, query_vector: List[float], vector_field: str = "vector_field" +) -> str: """For Painless Scripting, it returns the script source based on space type.""" source_value = ( - "(1.0 + " + space_type + "(" + str(query_vector) + ", doc['vector_field']))" + "(1.0 + " + + space_type + + "(" + + str(query_vector) + + ", doc['" + + vector_field + + "']))" ) if space_type == "cosineSimilarity": return source_value @@ -250,13 +264,26 @@ class OpenSearchVectorSearch(VectorStore): Returns: List of ids from adding the texts into the vectorstore. + + Optional Args: + vector_field: Document field embeddings are stored in. Defaults to + "vector_field". + + text_field: Document field the text of the document is stored in. Defaults + to "text". """ - embeddings = [ - self.embedding_function.embed_documents([text])[0] for text in texts - ] + embeddings = self.embedding_function.embed_documents(list(texts)) _validate_embeddings_and_bulk_size(len(embeddings), bulk_size) + vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field") + text_field = _get_kwargs_value(kwargs, "text_field", "text") return _bulk_ingest_embeddings( - self.client, self.index_name, embeddings, texts, metadatas + self.client, + self.index_name, + embeddings, + texts, + metadatas, + vector_field, + text_field, ) def similarity_search( @@ -277,14 +304,17 @@ class OpenSearchVectorSearch(VectorStore): Optional Args: vector_field: Document field embeddings are stored in. Defaults to "vector_field". + text_field: Document field the text of the document is stored in. Defaults to "text". + metadata_field: Document field that metadata is stored in. Defaults to "metadata". Can be set to a special value "*" to include the entire document. 
Optional Args for Approximate Search: search_type: "approximate_search"; default: "approximate_search" + size: number of results the query actually returns; default: 4 Optional Args for Script Scoring Search: @@ -298,6 +328,7 @@ class OpenSearchVectorSearch(VectorStore): Optional Args for Painless Scripting Search: search_type: "painless_scripting"; default: "approximate_search" + space_type: "l2Squared", "l1Norm", "cosineSimilarity"; default: "l2Squared" pre_filter: script_score query to pre-filter documents before identifying @@ -307,23 +338,21 @@ class OpenSearchVectorSearch(VectorStore): search_type = _get_kwargs_value(kwargs, "search_type", "approximate_search") text_field = _get_kwargs_value(kwargs, "text_field", "text") metadata_field = _get_kwargs_value(kwargs, "metadata_field", "metadata") + vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field") if search_type == "approximate_search": size = _get_kwargs_value(kwargs, "size", 4) - vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field") search_query = _default_approximate_search_query( embedding, size, k, vector_field ) elif search_type == SCRIPT_SCORING_SEARCH: space_type = _get_kwargs_value(kwargs, "space_type", "l2") pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY) - vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field") search_query = _default_script_query( embedding, space_type, pre_filter, vector_field ) elif search_type == PAINLESS_SCRIPTING_SEARCH: space_type = _get_kwargs_value(kwargs, "space_type", "l2Squared") pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY) - vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field") search_query = _default_painless_scripting_query( embedding, space_type, pre_filter, vector_field ) @@ -370,6 +399,13 @@ class OpenSearchVectorSearch(VectorStore): and lucene engines recommended for large datasets. Also supports brute force search through Script Scoring and Painless Scripting. + Optional Args: + vector_field: Document field embeddings are stored in. Defaults to + "vector_field". + + text_field: Document field the text of the document is stored in. Defaults + to "text". 
+ Optional Keyword Args for Approximate Search: engine: "nmslib", "faiss", "hnsw"; default: "nmslib" @@ -402,6 +438,8 @@ class OpenSearchVectorSearch(VectorStore): kwargs, "index_name", "OPENSEARCH_INDEX_NAME", default=uuid.uuid4().hex ) is_appx_search = _get_kwargs_value(kwargs, "is_appx_search", True) + vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field") + text_field = _get_kwargs_value(kwargs, "text_field", "text") if is_appx_search: engine = _get_kwargs_value(kwargs, "engine", "nmslib") space_type = _get_kwargs_value(kwargs, "space_type", "l2") @@ -410,11 +448,13 @@ class OpenSearchVectorSearch(VectorStore): m = _get_kwargs_value(kwargs, "m", 16) mapping = _default_text_mapping( - dim, engine, space_type, ef_search, ef_construction, m + dim, engine, space_type, ef_search, ef_construction, m, vector_field ) else: mapping = _default_scripting_text_mapping(dim) client.indices.create(index=index_name, body=mapping) - _bulk_ingest_embeddings(client, index_name, embeddings, texts, metadatas) + _bulk_ingest_embeddings( + client, index_name, embeddings, texts, metadatas, vector_field, text_field + ) return cls(opensearch_url, index_name, embedding) diff --git a/tests/integration_tests/vectorstores/test_opensearch.py b/tests/integration_tests/vectorstores/test_opensearch.py index efa1d9d7..ec132a1e 100644 --- a/tests/integration_tests/vectorstores/test_opensearch.py +++ b/tests/integration_tests/vectorstores/test_opensearch.py @@ -23,6 +23,30 @@ def test_opensearch() -> None: assert output == [Document(page_content="foo")] +def test_opensearch_with_custom_field_name() -> None: + """Test indexing and search using custom vector field and text field name.""" + docsearch = OpenSearchVectorSearch.from_texts( + texts, + FakeEmbeddings(), + opensearch_url=DEFAULT_OPENSEARCH_URL, + vector_field="my_vector", + text_field="custom_text", + ) + output = docsearch.similarity_search( + "foo", k=1, vector_field="my_vector", text_field="custom_text" + ) + assert output == [Document(page_content="foo")] + + text_input = ["test", "add", "text", "method"] + OpenSearchVectorSearch.add_texts( + docsearch, text_input, vector_field="my_vector", text_field="custom_text" + ) + output = docsearch.similarity_search( + "add", k=1, vector_field="my_vector", text_field="custom_text" + ) + assert output == [Document(page_content="foo")] + + def test_opensearch_with_metadatas() -> None: """Test end to end indexing and search with metadata.""" metadatas = [{"page": i} for i in range(len(texts))]
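
Example usage of the new `vector_field` / `text_field` kwargs — a minimal sketch, not part of the diff. It assumes a local OpenSearch instance at http://localhost:9200, uses `OpenAIEmbeddings` only as a stand-in for any `Embeddings` implementation, and borrows the custom field names (`my_vector`, `custom_text`) from the integration test above:

```python
# Minimal usage sketch of the custom field names introduced in this patch.
# Assumptions (not part of the diff): a local OpenSearch instance is reachable
# at http://localhost:9200, and OpenAIEmbeddings stands in for any Embeddings
# implementation. The field names mirror those used in the integration test.
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import OpenSearchVectorSearch

embeddings = OpenAIEmbeddings()
texts = ["foo", "bar", "baz"]

# Index under custom field names instead of the defaults
# ("vector_field" for the embeddings, "text" for the raw text).
docsearch = OpenSearchVectorSearch.from_texts(
    texts,
    embeddings,
    opensearch_url="http://localhost:9200",
    vector_field="my_vector",
    text_field="custom_text",
)

# The same names must be passed at query time so the generated
# k-NN query targets the right fields.
docs = docsearch.similarity_search(
    "foo", k=1, vector_field="my_vector", text_field="custom_text"
)

# add_texts honours the same kwargs when appending to an existing index.
docsearch.add_texts(
    ["qux"], vector_field="my_vector", text_field="custom_text"
)
```

Since the defaults remain "vector_field" and "text", the custom names have to be supplied consistently at both indexing and query time.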