From 3453b7457ca60227430d85e6f6f58a2aafae559d Mon Sep 17 00:00:00 2001
From: Naveen Tatikonda
Date: Mon, 17 Apr 2023 22:26:26 -0500
Subject: [PATCH] OpenSearch: Add Support for Boolean Filter with ANN search (#3038)

### Description
Add Support for Boolean Filter with ANN search
Documentation - https://opensearch.org/docs/latest/search-plugins/knn/filter-search-knn/#boolean-filter-with-ann-search

### Issues Resolved
https://github.com/hwchase17/langchain/issues/2924

Signed-off-by: Naveen Tatikonda
---
 .../vectorstores/opensearch_vector_search.py | 41 +++++++++++++++++--
 .../vectorstores/test_opensearch.py          | 14 +++++++
 2 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/langchain/vectorstores/opensearch_vector_search.py b/langchain/vectorstores/opensearch_vector_search.py
index 9f87c156..9c509d17 100644
--- a/langchain/vectorstores/opensearch_vector_search.py
+++ b/langchain/vectorstores/opensearch_vector_search.py
@@ -146,6 +146,28 @@ def _default_approximate_search_query(
     }
 
 
+def _approximate_search_query_with_boolean_filter(
+    query_vector: List[float],
+    boolean_filter: Dict,
+    size: int = 4,
+    k: int = 4,
+    vector_field: str = "vector_field",
+    subquery_clause: str = "must",
+) -> Dict:
+    """For Approximate k-NN Search, with Boolean Filter."""
+    return {
+        "size": size,
+        "query": {
+            "bool": {
+                "filter": boolean_filter,
+                subquery_clause: [
+                    {"knn": {vector_field: {"vector": query_vector, "k": k}}}
+                ],
+            }
+        },
+    }
+
+
 def _default_script_query(
     query_vector: List[float],
     space_type: str = "l2",
@@ -317,6 +339,11 @@ class OpenSearchVectorSearch(VectorStore):
 
             size: number of results the query actually returns; default: 4
 
+            boolean_filter: A Boolean filter consists of a Boolean query that
+            contains a k-NN query and a filter
+
+            subquery_clause: Query clause on the knn vector field; default: "must"
+
         Optional Args for Script Scoring Search:
             search_type: "script_scoring"; default: "approximate_search"
 
@@ -339,11 +366,19 @@ class OpenSearchVectorSearch(VectorStore):
         text_field = _get_kwargs_value(kwargs, "text_field", "text")
         metadata_field = _get_kwargs_value(kwargs, "metadata_field", "metadata")
         vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
+
         if search_type == "approximate_search":
             size = _get_kwargs_value(kwargs, "size", 4)
-            search_query = _default_approximate_search_query(
-                embedding, size, k, vector_field
-            )
+            boolean_filter = _get_kwargs_value(kwargs, "boolean_filter", {})
+            subquery_clause = _get_kwargs_value(kwargs, "subquery_clause", "must")
+            if boolean_filter != {}:
+                search_query = _approximate_search_query_with_boolean_filter(
+                    embedding, boolean_filter, size, k, vector_field, subquery_clause
+                )
+            else:
+                search_query = _default_approximate_search_query(
+                    embedding, size, k, vector_field
+                )
         elif search_type == SCRIPT_SCORING_SEARCH:
             space_type = _get_kwargs_value(kwargs, "space_type", "l2")
             pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY)
diff --git a/tests/integration_tests/vectorstores/test_opensearch.py b/tests/integration_tests/vectorstores/test_opensearch.py
index ec132a1e..92bfca4b 100644
--- a/tests/integration_tests/vectorstores/test_opensearch.py
+++ b/tests/integration_tests/vectorstores/test_opensearch.py
@@ -150,3 +150,17 @@ def test_opensearch_embedding_size_zero() -> None:
         OpenSearchVectorSearch.from_texts(
             [], FakeEmbeddings(), opensearch_url=DEFAULT_OPENSEARCH_URL
         )
+
+
+def test_appx_search_with_boolean_filter() -> None:
+    """Test Approximate Search with Boolean Filter."""
+    boolean_filter_val = {"bool": {"must": [{"term": {"text": "bar"}}]}}
+    docsearch = OpenSearchVectorSearch.from_texts(
+        texts,
+        FakeEmbeddings(),
+        opensearch_url=DEFAULT_OPENSEARCH_URL,
+    )
+    output = docsearch.similarity_search(
+        "foo", k=3, boolean_filter=boolean_filter_val, subquery_clause="should"
+    )
+    assert output == [Document(page_content="bar")]