Extend opensearch to better support existing instances (#2500) (#2509)

Closes #2500.
2024-11-08 07:10:35 +00:00 · 2023-04-06 15:45:56 -04:00 · 2023-04-06 15:45:56 -04:00 · 2ffb90b161
commit 2ffb90b161
parent ad87584c35
2 changed files with 62 additions and 9 deletions
--- a/docs/modules/indexes/vectorstores/examples/opensearch.ipynb
+++ b/docs/modules/indexes/vectorstores/examples/opensearch.ipynb
@@ -175,7 +175,7 @@
    "docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n",
    "filter = {\"bool\": {\"filter\": {\"term\": {\"text\": \"smuggling\"}}}}\n",
    "query = \"What did the president say about Ketanji Brown Jackson\"\n",
-    "docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", search_type=\"painless_scripting\", space_type=\"cosineSimilarity\", pre_filter=filter)"
+    "docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", search_type=\"painless_scripting\", space_type=\"cosinesimil\", pre_filter=filter)"
   ]
  },
  {
@@ -191,6 +191,30 @@
   "source": [
    "print(docs[0].page_content)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "73264864",
   "metadata": {},
   "source": [
    "#### Using a preexisting OpenSearch instance\n",
    "\n",
    "It's also possible to use a preexisting OpenSearch instance with documents that already have vectors present."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82a23440",
   "metadata": {},
   "outputs": [],
   "source": [
    "# this is just an example, you would need to change these values to point to another opensearch instance\n",
    "docsearch = OpenSearchVectorSearch(index_name=\"index-*\", embedding_function=embeddings, opensearch_url=\"http://localhost:9200\")\n",
    "\n",
    "# you can specify custom field names to match the fields you're using to store your embedding, document text value, and metadata\n",
    "docs = docsearch.similarity_search(\"Who was asking about getting lunch today?\", search_type=\"script_scoring\", space_type=\"cosinesimil\", vector_field=\"message_embedding\", text_field=\"message\", metadata_field=\"message_metadata\")"
   ]
  }
 ],
 "metadata": {
--- a/langchain/vectorstores/opensearch_vector_search.py
+++ b/langchain/vectorstores/opensearch_vector_search.py
@@ -128,12 +128,15 @@ def _default_text_mapping(
 def _default_approximate_search_query(
-    query_vector: List[float], size: int = 4, k: int = 4
+    query_vector: List[float],
    size: int = 4,
    k: int = 4,
    vector_field: str = "vector_field",
 ) -> Dict:
    """For Approximate k-NN Search, this is the default query."""
    return {
        "size": size,
-        "query": {"knn": {"vector_field": {"vector": query_vector, "k": k}}},
+        "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}},
    }
@@ -141,6 +144,7 @@ def _default_script_query(
    query_vector: List[float],
    space_type: str = "l2",
    pre_filter: Dict = MATCH_ALL_QUERY,
    vector_field: str = "vector_field",
 ) -> Dict:
    """For Script Scoring Search, this is the default query."""
    return {
@@ -151,7 +155,7 @@ def _default_script_query(
                    "source": "knn_score",
                    "lang": "knn",
                    "params": {
-                        "field": "vector_field",
+                        "field": vector_field,
                        "query_value": query_vector,
                        "space_type": space_type,
                    },
@@ -176,6 +180,7 @@ def _default_painless_scripting_query(
    query_vector: List[float],
    space_type: str = "l2Squared",
    pre_filter: Dict = MATCH_ALL_QUERY,
    vector_field: str = "vector_field",
 ) -> Dict:
    """For Painless Scripting Search, this is the default query."""
    source = __get_painless_scripting_source(space_type, query_vector)
@@ -186,7 +191,7 @@ def _default_painless_scripting_query(
                "script": {
                    "source": source,
                    "params": {
-                        "field": "vector_field",
+                        "field": vector_field,
                        "query_value": query_vector,
                    },
                },
@@ -269,6 +274,15 @@ class OpenSearchVectorSearch(VectorStore):
        Returns:
            List of Documents most similar to the query.
        Optional Args:
            vector_field: Document field embeddings are stored in. Defaults to
            "vector_field".
            text_field: Document field the text of the document is stored in. Defaults
            to "text".
            metadata_field: Document field that metadata is stored in. Defaults to
            "metadata".
            Can be set to a special value "*" to include the entire document.
        Optional Args for Approximate Search:
            search_type: "approximate_search"; default: "approximate_search"
            size: number of results the query actually returns; default: 4
@@ -291,18 +305,27 @@ class OpenSearchVectorSearch(VectorStore):
        """
        embedding = self.embedding_function.embed_query(query)
        search_type = _get_kwargs_value(kwargs, "search_type", "approximate_search")
        text_field = _get_kwargs_value(kwargs, "text_field", "text")
        metadata_field = _get_kwargs_value(kwargs, "metadata_field", "metadata")
        if search_type == "approximate_search":
            size = _get_kwargs_value(kwargs, "size", 4)
-            search_query = _default_approximate_search_query(embedding, size, k)
+            vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
            search_query = _default_approximate_search_query(
                embedding, size, k, vector_field
            )
        elif search_type == SCRIPT_SCORING_SEARCH:
            space_type = _get_kwargs_value(kwargs, "space_type", "l2")
            pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY)
-            search_query = _default_script_query(embedding, space_type, pre_filter)
+            vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
            search_query = _default_script_query(
                embedding, space_type, pre_filter, vector_field
            )
        elif search_type == PAINLESS_SCRIPTING_SEARCH:
            space_type = _get_kwargs_value(kwargs, "space_type", "l2Squared")
            pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY)
            vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
            search_query = _default_painless_scripting_query(
-                embedding, space_type, pre_filter
+                embedding, space_type, pre_filter, vector_field
            )
        else:
            raise ValueError("Invalid `search_type` provided as an argument")
@@ -310,7 +333,13 @@ class OpenSearchVectorSearch(VectorStore):
        response = self.client.search(index=self.index_name, body=search_query)
        hits = [hit["_source"] for hit in response["hits"]["hits"][:k]]
        documents = [
-            Document(page_content=hit["text"], metadata=hit["metadata"]) for hit in hits
+            Document(
                page_content=hit[text_field],
                metadata=hit
                if metadata_field == "*" or metadata_field not in hit
                else hit[metadata_field],
            )
            for hit in hits
        ]
        return documents