diff --git a/libs/langchain/langchain/vectorstores/opensearch_vector_search.py b/libs/langchain/langchain/vectorstores/opensearch_vector_search.py
index 74437d6d86..12e1928d05 100644
--- a/libs/langchain/langchain/vectorstores/opensearch_vector_search.py
+++ b/libs/langchain/langchain/vectorstores/opensearch_vector_search.py
@@ -347,33 +347,15 @@ class OpenSearchVectorSearch(VectorStore):
     def embeddings(self) -> Embeddings:
         return self.embedding_function
 
-    def add_texts(
+    def __add(
         self,
         texts: Iterable[str],
+        embeddings: List[List[float]],
         metadatas: Optional[List[dict]] = None,
         ids: Optional[List[str]] = None,
         bulk_size: int = 500,
         **kwargs: Any,
     ) -> List[str]:
-        """Run more texts through the embeddings and add to the vectorstore.
-
-        Args:
-            texts: Iterable of strings to add to the vectorstore.
-            metadatas: Optional list of metadatas associated with the texts.
-            ids: Optional list of ids to associate with the texts.
-            bulk_size: Bulk API request count; Default: 500
-
-        Returns:
-            List of ids from adding the texts into the vectorstore.
-
-        Optional Args:
-            vector_field: Document field embeddings are stored in. Defaults to
-            "vector_field".
-
-            text_field: Document field the text of the document is stored in. Defaults
-            to "text".
-        """
-        embeddings = self.embedding_function.embed_documents(list(texts))
         _validate_embeddings_and_bulk_size(len(embeddings), bulk_size)
         index_name = _get_kwargs_value(kwargs, "index_name", self.index_name)
         text_field = _get_kwargs_value(kwargs, "text_field", "text")
@@ -406,6 +388,79 @@ class OpenSearchVectorSearch(VectorStore):
             is_aoss=self.is_aoss,
         )
 
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None,
+        bulk_size: int = 500,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Run more texts through the embeddings and add to the vectorstore.
+
+        Args:
+            texts: Iterable of strings to add to the vectorstore.
+            metadatas: Optional list of metadatas associated with the texts.
+            ids: Optional list of ids to associate with the texts.
+            bulk_size: Bulk API request count; Default: 500
+
+        Returns:
+            List of ids from adding the texts into the vectorstore.
+
+        Optional Args:
+            vector_field: Document field embeddings are stored in. Defaults to
+            "vector_field".
+
+            text_field: Document field the text of the document is stored in. Defaults
+            to "text".
+        """
+        embeddings = self.embedding_function.embed_documents(list(texts))
+        return self.__add(
+            texts,
+            embeddings,
+            metadatas=metadatas,
+            ids=ids,
+            bulk_size=bulk_size,
+            **kwargs,
+        )
+
+    def add_embeddings(
+        self,
+        text_embeddings: Iterable[Tuple[str, List[float]]],
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None,
+        bulk_size: int = 500,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Add the given texts and embeddings to the vectorstore.
+
+        Args:
+            text_embeddings: Iterable pairs of string and embedding to
+                add to the vectorstore.
+            metadatas: Optional list of metadatas associated with the texts.
+            ids: Optional list of ids to associate with the texts.
+            bulk_size: Bulk API request count; Default: 500
+
+        Returns:
+            List of ids from adding the texts into the vectorstore.
+
+        Optional Args:
+            vector_field: Document field embeddings are stored in. Defaults to
+            "vector_field".
+
+            text_field: Document field the text of the document is stored in. Defaults
+            to "text".
+        """
+        texts, embeddings = zip(*text_embeddings)
+        return self.__add(
+            list(texts),
+            list(embeddings),
+            metadatas=metadatas,
+            ids=ids,
+            bulk_size=bulk_size,
+            **kwargs,
+        )
+
     def similarity_search(
         self, query: str, k: int = 4, **kwargs: Any
     ) -> List[Document]:
diff --git a/libs/langchain/tests/README.md b/libs/langchain/tests/README.md
index f360690023..7e98d49e40 100644
--- a/libs/langchain/tests/README.md
+++ b/libs/langchain/tests/README.md
@@ -71,6 +71,10 @@ cd tests/integration_tests/vectorstores/docker-compose
 docker-compose -f elasticsearch.yml up
 ```
 
+For environments that require more involved preparation, look for `*.sh` scripts. For instance,
+`opensearch.sh` builds a required docker image and then launches opensearch.
+
+
 ### Prepare environment variables for local testing:
 
 - copy `tests/.env.example` to `tests/.env`
diff --git a/libs/langchain/tests/integration_tests/vectorstores/docker-compose/opensearch.sh b/libs/langchain/tests/integration_tests/vectorstores/docker-compose/opensearch.sh
new file mode 100755
index 0000000000..baba336969
--- /dev/null
+++ b/libs/langchain/tests/integration_tests/vectorstores/docker-compose/opensearch.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+# references:
+# https://github.com/opensearch-project/documentation-website/blob/2.10/assets/examples/docker-compose.yml
+# https://opensearch.org/docs/latest/security/configuration/disable/
+
+cd "$(dirname "$0")/opensearch" || exit 1
+docker build --tag=opensearch-dashboards-no-security -f opensearch-dashboards-no-security.Dockerfile .
+docker compose -f opensearch.yml up
diff --git a/libs/langchain/tests/integration_tests/vectorstores/docker-compose/opensearch/opensearch-dashboards-no-security.Dockerfile b/libs/langchain/tests/integration_tests/vectorstores/docker-compose/opensearch/opensearch-dashboards-no-security.Dockerfile
new file mode 100644
index 0000000000..f7dccd2206
--- /dev/null
+++ b/libs/langchain/tests/integration_tests/vectorstores/docker-compose/opensearch/opensearch-dashboards-no-security.Dockerfile
@@ -0,0 +1,3 @@
+FROM opensearchproject/opensearch-dashboards:2.10.0
+RUN /usr/share/opensearch-dashboards/bin/opensearch-dashboards-plugin remove securityDashboards
+COPY --chown=opensearch-dashboards:opensearch-dashboards opensearch_dashboards.yml /usr/share/opensearch-dashboards/config/
diff --git a/libs/langchain/tests/integration_tests/vectorstores/docker-compose/opensearch/opensearch.yml b/libs/langchain/tests/integration_tests/vectorstores/docker-compose/opensearch/opensearch.yml
new file mode 100644
index 0000000000..10366e4a07
--- /dev/null
+++ b/libs/langchain/tests/integration_tests/vectorstores/docker-compose/opensearch/opensearch.yml
@@ -0,0 +1,38 @@
+version: '3'
+services:
+  opensearch-node1: # This is also the hostname of the container within the Docker network (i.e. http://opensearch-node1/)
+    image: opensearchproject/opensearch:2.10.0
+    container_name: opensearch-node1
+    environment:
+      - node.name=opensearch-node1 # Name the node that will run in this container
+      - plugins.security.disabled=true # security has been disabled, so no login or password is required.
+      - discovery.type=single-node
+      - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # Set min and max JVM heap sizes to at least 50% of system RAM
+    volumes:
+      - opensearch-data1:/usr/share/opensearch/data # Creates volume called opensearch-data1 and mounts it to the container
+    ports:
+      - 9200:9200 # REST API
+      - 9600:9600 # Performance Analyzer
+    networks:
+      - opensearch-net # All of the containers will join the same Docker bridge network
+
+# opensearch-dashboards does not work if OpenSearch cluster is not secure.
+# to use dashboards, build opensearch-dashboards-no-security first by running
+# ../opensearch.sh (it builds the image and then starts this compose file).
+  opensearch-dashboards:
+    image: opensearch-dashboards-no-security
+    container_name: opensearch-dashboards
+    ports:
+      - 5601:5601 # Map host port 5601 to container port 5601
+    expose:
+      - "5601" # Expose port 5601 for web access to OpenSearch Dashboards
+    environment:
+      OPENSEARCH_HOSTS: '["http://opensearch-node1:9200"]' # Define the OpenSearch nodes that OpenSearch Dashboards will query
+    networks:
+      - opensearch-net
+
+volumes:
+  opensearch-data1:
+
+networks:
+  opensearch-net:
diff --git a/libs/langchain/tests/integration_tests/vectorstores/docker-compose/opensearch/opensearch_dashboards.yml b/libs/langchain/tests/integration_tests/vectorstores/docker-compose/opensearch/opensearch_dashboards.yml
new file mode 100644
index 0000000000..985654b1ce
--- /dev/null
+++ b/libs/langchain/tests/integration_tests/vectorstores/docker-compose/opensearch/opensearch_dashboards.yml
@@ -0,0 +1,3 @@
+server.name: opensearch-dashboards
+server.host: "0.0.0.0"
+opensearch.hosts: http://localhost:9200
diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_opensearch.py b/libs/langchain/tests/integration_tests/vectorstores/test_opensearch.py
index 49752c88a0..76573cdc22 100644
--- a/libs/langchain/tests/integration_tests/vectorstores/test_opensearch.py
+++ b/libs/langchain/tests/integration_tests/vectorstores/test_opensearch.py
@@ -8,7 +8,10 @@ from langchain.vectorstores.opensearch_vector_search import (
     SCRIPT_SCORING_SEARCH,
     OpenSearchVectorSearch,
 )
-from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
+from tests.integration_tests.vectorstores.fake_embeddings import (
+    ConsistentFakeEmbeddings,
+    FakeEmbeddings,
+)
 
 DEFAULT_OPENSEARCH_URL = "http://localhost:9200"
 texts = ["foo", "bar", "baz"]
@@ -87,6 +90,31 @@ def test_add_text() -> None:
     assert len(docids) == len(text_input)
 
 
+def test_add_embeddings() -> None:
+    """
+    Test add_embeddings, which accepts pre-built embeddings instead of
+    using inference for the texts.
+    This allows you to separate the embeddings text and the page_content
+    for better proximity between user's question and embedded text.
+    For example, your embedding text can be a question, whereas page_content
+    is the answer.
+    """
+    embeddings = ConsistentFakeEmbeddings()
+    text_input = ["foo1", "foo2", "foo3"]
+    metadatas = [{"page": i} for i in range(len(text_input))]
+
+    # In real use case, embedding_input can be questions for each text
+    embedding_input = ["foo2", "foo3", "foo1"]
+    embedding_vectors = embeddings.embed_documents(embedding_input)
+
+    docsearch = OpenSearchVectorSearch.from_texts(
+        ["filler"], embeddings, opensearch_url=DEFAULT_OPENSEARCH_URL
+    )
+    docsearch.add_embeddings(list(zip(text_input, embedding_vectors)), metadatas)
+    output = docsearch.similarity_search("foo1", k=1)
+    assert output == [Document(page_content="foo3", metadata={"page": 2})]
+
+
 def test_opensearch_script_scoring() -> None:
     """Test end to end indexing and search using Script Scoring Search."""
     pre_filter_val = {"bool": {"filter": {"term": {"text": "bar"}}}}