Support add_embeddings for opensearch (#11050)

- **Description:**
  - Make running the integration tests for OpenSearch easy.
  - Provide a way to use different text for embedding: see #11002 for more on the use case and the design decision. A usage sketch follows below.
- **Issue:** N/A
- **Dependencies:** None other than the existing ones.
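
For illustration, a minimal usage sketch of the new `add_embeddings` API (not part of this PR's diff; it assumes an OpenSearch instance at `http://localhost:9200`, `OpenAIEmbeddings` as the embedding model, and made-up question/answer strings):

```python
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import OpenSearchVectorSearch

embeddings = OpenAIEmbeddings()
docsearch = OpenSearchVectorSearch.from_texts(
    ["filler"], embeddings, opensearch_url="http://localhost:9200"
)

# Embed the question text, but store the answer as page_content.
questions = ["What is the capital of France?", "What is the capital of Japan?"]
answers = ["Paris is the capital of France.", "Tokyo is the capital of Japan."]
vectors = embeddings.embed_documents(questions)

docsearch.add_embeddings(list(zip(answers, vectors)))

docs = docsearch.similarity_search("capital of France", k=1)
# docs[0].page_content -> "Paris is the capital of France."
```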
Kenneth Choe committed 17fcbed92c (parent: c586f6dc1b)

@@ -347,33 +347,15 @@ class OpenSearchVectorSearch(VectorStore):
    def embeddings(self) -> Embeddings:
        return self.embedding_function

    def add_texts(
    def __add(
        self,
        texts: Iterable[str],
        embeddings: List[List[float]],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        bulk_size: int = 500,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of ids to associate with the texts.
            bulk_size: Bulk API request count; Default: 500

        Returns:
            List of ids from adding the texts into the vectorstore.

        Optional Args:
            vector_field: Document field embeddings are stored in. Defaults to
            "vector_field".
            text_field: Document field the text of the document is stored in. Defaults
            to "text".
        """
        embeddings = self.embedding_function.embed_documents(list(texts))
        _validate_embeddings_and_bulk_size(len(embeddings), bulk_size)
        index_name = _get_kwargs_value(kwargs, "index_name", self.index_name)
        text_field = _get_kwargs_value(kwargs, "text_field", "text")
@@ -406,6 +388,79 @@ class OpenSearchVectorSearch(VectorStore):
            is_aoss=self.is_aoss,
        )

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        bulk_size: int = 500,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of ids to associate with the texts.
            bulk_size: Bulk API request count; Default: 500

        Returns:
            List of ids from adding the texts into the vectorstore.

        Optional Args:
            vector_field: Document field embeddings are stored in. Defaults to
            "vector_field".
            text_field: Document field the text of the document is stored in. Defaults
            to "text".
        """
        embeddings = self.embedding_function.embed_documents(list(texts))
        return self.__add(
            texts,
            embeddings,
            metadatas=metadatas,
            ids=ids,
            bulk_size=bulk_size,
            kwargs=kwargs,
        )

    def add_embeddings(
        self,
        text_embeddings: Iterable[Tuple[str, List[float]]],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        bulk_size: int = 500,
        **kwargs: Any,
    ) -> List[str]:
        """Add the given texts and embeddings to the vectorstore.

        Args:
            text_embeddings: Iterable pairs of string and embedding to
                add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of ids to associate with the texts.
            bulk_size: Bulk API request count; Default: 500

        Returns:
            List of ids from adding the texts into the vectorstore.

        Optional Args:
            vector_field: Document field embeddings are stored in. Defaults to
            "vector_field".
            text_field: Document field the text of the document is stored in. Defaults
            to "text".
        """
        texts, embeddings = zip(*text_embeddings)
        return self.__add(
            list(texts),
            list(embeddings),
            metadatas=metadatas,
            ids=ids,
            bulk_size=bulk_size,
            kwargs=kwargs,
        )

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:

@@ -71,6 +71,10 @@ cd tests/integration_tests/vectorstores/docker-compose
docker-compose -f elasticsearch.yml up
```
For environments that require more involved preparation, look for `*.sh` scripts. For instance,
`opensearch.sh` builds a required Docker image and then launches OpenSearch.
### Prepare environment variables for local testing:
- copy `tests/.env.example` to `tests/.env`
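
A minimal sketch of that step, assuming it is run from the same directory the README's other commands assume:

```sh
cp tests/.env.example tests/.env
```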

@@ -0,0 +1,8 @@
#!/bin/sh
# references:
# https://github.com/opensearch-project/documentation-website/blob/2.10/assets/examples/docker-compose.yml
# https://opensearch.org/docs/latest/security/configuration/disable/
cd opensearch
docker build --tag=opensearch-dashboards-no-security -f opensearch-dashboards-no-security.Dockerfile .
docker compose -f opensearch.yml up
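
Once the stack is up, the OpenSearch integration tests can be run against it. A minimal sketch, assuming the test module lives at the path implied by the imports in the test diff below and that pytest is installed:

```sh
# run only the OpenSearch vectorstore integration tests (path is an assumption)
pytest tests/integration_tests/vectorstores/test_opensearch.py
```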

@@ -0,0 +1,3 @@
FROM opensearchproject/opensearch-dashboards:2.10.0
RUN /usr/share/opensearch-dashboards/bin/opensearch-dashboards-plugin remove securityDashboards
COPY --chown=opensearch-dashboards:opensearch-dashboards opensearch_dashboards.yml /usr/share/opensearch-dashboards/config/

@@ -0,0 +1,39 @@
version: '3'
services:
  opensearch-node1: # This is also the hostname of the container within the Docker network (i.e. http://opensearch-node1/)
    image: opensearchproject/opensearch:2.10.0
    container_name: opensearch-node1
    environment:
      - node.name=opensearch-node1 # Name the node that will run in this container
      - plugins.security.disabled=true # security has been disabled, so no login or password is required.
      - discovery.type=single-node
      - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # Set min and max JVM heap sizes to at least 50% of system RAM
    volumes:
      - opensearch-data1:/usr/share/opensearch/data # Creates volume called opensearch-data1 and mounts it to the container
    ports:
      - 9200:9200 # REST API
      - 9600:9600 # Performance Analyzer
    networks:
      - opensearch-net # All of the containers will join the same Docker bridge network
  # opensearch-dashboards does not work if OpenSearch cluster is not secure.
  # to use dashboards, build opensearch-dashboards-no-security first by running
  #
  opensearch-dashboards:
    image: opensearch-dashboards-no-security
    container_name: opensearch-dashboards
    ports:
      - 5601:5601 # Map host port 5601 to container port 5601
    expose:
      - "5601" # Expose port 5601 for web access to OpenSearch Dashboards
    environment:
      OPENSEARCH_HOSTS: '["http://opensearch-node1:9200"]' # Define the OpenSearch nodes that OpenSearch Dashboards will query
    networks:
      - opensearch-net
volumes:
  opensearch-data1:
networks:
  opensearch-net:

@@ -0,0 +1,3 @@
server.name: opensearch-dashboards
server.host: "0.0.0.0"
opensearch.hosts: http://localhost:9200

@@ -8,7 +8,10 @@ from langchain.vectorstores.opensearch_vector_search import (
    SCRIPT_SCORING_SEARCH,
    OpenSearchVectorSearch,
)
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
from tests.integration_tests.vectorstores.fake_embeddings import (
    ConsistentFakeEmbeddings,
    FakeEmbeddings,
)
DEFAULT_OPENSEARCH_URL = "http://localhost:9200"
texts = ["foo", "bar", "baz"]
@@ -87,6 +90,31 @@ def test_add_text() -> None:
    assert len(docids) == len(text_input)

def test_add_embeddings() -> None:
    """
    Test add_embeddings, which accepts pre-built embeddings instead of
    using inference for the texts.

    This allows you to separate the embeddings text and the page_content
    for better proximity between user's question and embedded text.
    For example, your embedding text can be a question, whereas page_content
    is the answer.
    """
    embeddings = ConsistentFakeEmbeddings()
    text_input = ["foo1", "foo2", "foo3"]
    metadatas = [{"page": i} for i in range(len(text_input))]

    """In real use case, embedding_input can be questions for each text"""
    embedding_input = ["foo2", "foo3", "foo1"]
    embedding_vectors = embeddings.embed_documents(embedding_input)

    docsearch = OpenSearchVectorSearch.from_texts(
        ["filler"], embeddings, opensearch_url=DEFAULT_OPENSEARCH_URL
    )
    docsearch.add_embeddings(list(zip(text_input, embedding_vectors)), metadatas)
    output = docsearch.similarity_search("foo1", k=1)
    assert output == [Document(page_content="foo3", metadata={"page": 2})]

def test_opensearch_script_scoring() -> None:
    """Test end to end indexing and search using Script Scoring Search."""
    pre_filter_val = {"bool": {"filter": {"term": {"text": "bar"}}}}
