Support add_embeddings for opensearch (#11050)

- **Description:**
  - Make running the integration tests for OpenSearch easy.
  - Provide a way to use different text for embedding: see #11002 for more on the use case and the design decision. A usage sketch follows below.
- **Issue:** N/A
- **Dependencies:** None other than the existing ones.
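
For illustration, a minimal usage sketch of the new `add_embeddings` API (not part of this PR's diff; it assumes an OpenSearch instance at `http://localhost:9200`, `OpenAIEmbeddings` as the embedding model, and made-up question/answer strings):

```python
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import OpenSearchVectorSearch

embeddings = OpenAIEmbeddings()
docsearch = OpenSearchVectorSearch.from_texts(
    ["filler"], embeddings, opensearch_url="http://localhost:9200"
)

# Embed the question text, but store the answer as page_content.
questions = ["What is the capital of France?", "What is the capital of Japan?"]
answers = ["Paris is the capital of France.", "Tokyo is the capital of Japan."]
vectors = embeddings.embed_documents(questions)

docsearch.add_embeddings(list(zip(answers, vectors)))

docs = docsearch.similarity_search("capital of France", k=1)
# docs[0].page_content -> "Paris is the capital of France."
```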
Kenneth Choe committed 17fcbed92c (parent: c586f6dc1b)

@@ -347,33 +347,15 @@ class OpenSearchVectorSearch(VectorStore):
    def embeddings(self) -> Embeddings:
        return self.embedding_function

    def add_texts(
    def __add(
        self,
        texts: Iterable[str],
        embeddings: List[List[float]],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        bulk_size: int = 500,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of ids to associate with the texts.
            bulk_size: Bulk API request count; Default: 500

        Returns:
            List of ids from adding the texts into the vectorstore.

        Optional Args:
            vector_field: Document field embeddings are stored in. Defaults to
            "vector_field".
            text_field: Document field the text of the document is stored in. Defaults
            to "text".
        """
        embeddings = self.embedding_function.embed_documents(list(texts))
        _validate_embeddings_and_bulk_size(len(embeddings), bulk_size)
        index_name = _get_kwargs_value(kwargs, "index_name", self.index_name)
        text_field = _get_kwargs_value(kwargs, "text_field", "text")
@@ -406,6 +388,79 @@ class OpenSearchVectorSearch(VectorStore):
            is_aoss=self.is_aoss,
        )

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        bulk_size: int = 500,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of ids to associate with the texts.
            bulk_size: Bulk API request count; Default: 500

        Returns:
            List of ids from adding the texts into the vectorstore.

        Optional Args:
            vector_field: Document field embeddings are stored in. Defaults to
            "vector_field".
            text_field: Document field the text of the document is stored in. Defaults
            to "text".
        """
        embeddings = self.embedding_function.embed_documents(list(texts))
        return self.__add(
            texts,
            embeddings,
            metadatas=metadatas,
            ids=ids,
            bulk_size=bulk_size,
            kwargs=kwargs,
        )

    def add_embeddings(
        self,
        text_embeddings: Iterable[Tuple[str, List[float]]],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        bulk_size: int = 500,
        **kwargs: Any,
    ) -> List[str]:
        """Add the given texts and embeddings to the vectorstore.

        Args:
            text_embeddings: Iterable pairs of string and embedding to
                add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of ids to associate with the texts.
            bulk_size: Bulk API request count; Default: 500

        Returns:
            List of ids from adding the texts into the vectorstore.

        Optional Args:
            vector_field: Document field embeddings are stored in. Defaults to
            "vector_field".
            text_field: Document field the text of the document is stored in. Defaults
            to "text".
        """
        texts, embeddings = zip(*text_embeddings)
        return self.__add(
            list(texts),
            list(embeddings),
            metadatas=metadatas,
            ids=ids,
            bulk_size=bulk_size,
            kwargs=kwargs,
        )

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:

@@ -71,6 +71,10 @@ cd tests/integration_tests/vectorstores/docker-compose
docker-compose -f elasticsearch.yml up
```
For environments that require more involved preparation, look for `*.sh` scripts. For instance,
`opensearch.sh` builds a required Docker image and then launches OpenSearch.
### Prepare environment variables for local testing:
- copy `tests/.env.example` to `tests/.env`
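
A minimal sketch of that step, assuming it is run from the same directory the README's other commands assume:

```sh
cp tests/.env.example tests/.env
```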

@@ -0,0 +1,8 @@
#!/bin/sh
# references:
# https://github.com/opensearch-project/documentation-website/blob/2.10/assets/examples/docker-compose.yml
# https://opensearch.org/docs/latest/security/configuration/disable/
cd opensearch
docker build --tag=opensearch-dashboards-no-security -f opensearch-dashboards-no-security.Dockerfile .
docker compose -f opensearch.yml up
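
Once the stack is up, the OpenSearch integration tests can be run against it. A minimal sketch, assuming the test module lives at the path implied by the imports in the test diff below and that pytest is installed:

```sh
# run only the OpenSearch vectorstore integration tests (path is an assumption)
pytest tests/integration_tests/vectorstores/test_opensearch.py
```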

@@ -0,0 +1,3 @@
FROM opensearchproject/opensearch-dashboards:2.10.0
RUN /usr/share/opensearch-dashboards/bin/opensearch-dashboards-plugin remove securityDashboards
COPY --chown=opensearch-dashboards:opensearch-dashboards opensearch_dashboards.yml /usr/share/opensearch-dashboards/config/

@@ -0,0 +1,39 @@
version: '3'
services:
  opensearch-node1: # This is also the hostname of the container within the Docker network (i.e. http://opensearch-node1/)
    image: opensearchproject/opensearch:2.10.0
    container_name: opensearch-node1
    environment:
      - node.name=opensearch-node1 # Name the node that will run in this container
      - plugins.security.disabled=true # security has been disabled, so no login or password is required.
      - discovery.type=single-node
      - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # Set min and max JVM heap sizes to at least 50% of system RAM
    volumes:
      - opensearch-data1:/usr/share/opensearch/data # Creates volume called opensearch-data1 and mounts it to the container
    ports:
      - 9200:9200 # REST API
      - 9600:9600 # Performance Analyzer
    networks:
      - opensearch-net # All of the containers will join the same Docker bridge network
  # opensearch-dashboards does not work if OpenSearch cluster is not secure.
  # to use dashboards, build opensearch-dashboards-no-security first by running
  #
  opensearch-dashboards:
    image: opensearch-dashboards-no-security
    container_name: opensearch-dashboards
    ports:
      - 5601:5601 # Map host port 5601 to container port 5601
    expose:
      - "5601" # Expose port 5601 for web access to OpenSearch Dashboards
    environment:
      OPENSEARCH_HOSTS: '["http://opensearch-node1:9200"]' # Define the OpenSearch nodes that OpenSearch Dashboards will query
    networks:
      - opensearch-net
volumes:
  opensearch-data1:
networks:
  opensearch-net:

@@ -0,0 +1,3 @@
server.name: opensearch-dashboards
server.host: "0.0.0.0"
opensearch.hosts: http://localhost:9200

@@ -8,7 +8,10 @@ from langchain.vectorstores.opensearch_vector_search import (
    SCRIPT_SCORING_SEARCH,
    OpenSearchVectorSearch,
)
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
from tests.integration_tests.vectorstores.fake_embeddings import (
    ConsistentFakeEmbeddings,
    FakeEmbeddings,
)
DEFAULT_OPENSEARCH_URL = "http://localhost:9200"
texts = ["foo", "bar", "baz"]
@@ -87,6 +90,31 @@ def test_add_text() -> None:
    assert len(docids) == len(text_input)

def test_add_embeddings() -> None:
    """
    Test add_embeddings, which accepts pre-built embeddings instead of
    using inference for the texts.

    This allows you to separate the embeddings text and the page_content
    for better proximity between user's question and embedded text.
    For example, your embedding text can be a question, whereas page_content
    is the answer.
    """
    embeddings = ConsistentFakeEmbeddings()
    text_input = ["foo1", "foo2", "foo3"]
    metadatas = [{"page": i} for i in range(len(text_input))]

    """In real use case, embedding_input can be questions for each text"""
    embedding_input = ["foo2", "foo3", "foo1"]
    embedding_vectors = embeddings.embed_documents(embedding_input)

    docsearch = OpenSearchVectorSearch.from_texts(
        ["filler"], embeddings, opensearch_url=DEFAULT_OPENSEARCH_URL
    )
    docsearch.add_embeddings(list(zip(text_input, embedding_vectors)), metadatas)
    output = docsearch.similarity_search("foo1", k=1)
    assert output == [Document(page_content="foo3", metadata={"page": 2})]

def test_opensearch_script_scoring() -> None:
    """Test end to end indexing and search using Script Scoring Search."""
    pre_filter_val = {"bool": {"filter": {"term": {"text": "bar"}}}}
