# Async constructors of ``OpenSearchVectorSearch``. Inside the class body these
# methods sit one indentation level deeper than shown in this fragment.
@classmethod
async def afrom_texts(
    cls,
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    bulk_size: int = 500,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> OpenSearchVectorSearch:
    """Asynchronously construct OpenSearchVectorSearch wrapper from raw texts.

    The texts are embedded with ``embedding.aembed_documents`` and the
    resulting vectors are handed to ``afrom_embeddings`` together with every
    other argument, unchanged.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import OpenSearchVectorSearch
            from langchain_community.embeddings import OpenAIEmbeddings
            embeddings = OpenAIEmbeddings()
            opensearch_vector_search = await OpenSearchVectorSearch.afrom_texts(
                texts,
                embeddings,
                opensearch_url="http://localhost:9200"
            )

    OpenSearch by default supports Approximate Search powered by nmslib, faiss
    and lucene engines recommended for large datasets. Also supports brute force
    search through Script Scoring and Painless Scripting.

    Args:
        texts: Raw documents to embed and index.
        embedding: Embedding model; its ``aembed_documents`` coroutine is
            awaited on ``texts``.
        metadatas: Optional per-document metadata, forwarded as-is.
        bulk_size: Forwarded to ``afrom_embeddings``; default: 500.
        ids: Optional document ids, forwarded as-is.
        **kwargs: Forwarded to ``afrom_embeddings`` (see below).

    Optional Args:
        vector_field: Document field embeddings are stored in. Defaults to
        "vector_field".

        text_field: Document field the text of the document is stored in. Defaults
        to "text".

    Optional Keyword Args for Approximate Search:
        engine: "nmslib", "faiss", "lucene"; default: "nmslib"

        space_type: "l2", "l1", "cosinesimil", "linf", "innerproduct"; default: "l2"

        ef_search: Size of the dynamic list used during k-NN searches. Higher values
        lead to more accurate but slower searches; default: 512

        ef_construction: Size of the dynamic list used during k-NN graph creation.
        Higher values lead to more accurate graph but slower indexing speed;
        default: 512

        m: Number of bidirectional links created for each new element. Large impact
        on memory consumption. Between 2 and 100; default: 16

    Keyword Args for Script Scoring or Painless Scripting:
        is_appx_search: False

    Returns:
        Whatever ``cls.afrom_embeddings`` returns for the embedded texts.
    """
    embeddings = await embedding.aembed_documents(texts)
    return await cls.afrom_embeddings(
        embeddings,
        texts,
        embedding,
        metadatas=metadatas,
        bulk_size=bulk_size,
        ids=ids,
        **kwargs,
    )


@classmethod
async def afrom_embeddings(
    cls,
    embeddings: List[List[float]],
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    bulk_size: int = 500,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> OpenSearchVectorSearch:
    """Asynchronously construct OpenSearchVectorSearch wrapper from pre-vectorized
    embeddings.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import OpenSearchVectorSearch
            from langchain_community.embeddings import OpenAIEmbeddings
            embedder = OpenAIEmbeddings()
            texts = ["foo", "bar"]
            embeddings = await embedder.aembed_documents(texts)
            opensearch_vector_search = (
                await OpenSearchVectorSearch.afrom_embeddings(
                    embeddings,
                    texts,
                    embedder,
                    opensearch_url="http://localhost:9200",
                )
            )

    OpenSearch by default supports Approximate Search powered by nmslib, faiss
    and lucene engines recommended for large datasets. Also supports brute force
    search through Script Scoring and Painless Scripting.

    Args:
        embeddings: Pre-computed vectors; the length of the first one is used
            as the index dimension.
        texts: Documents the vectors belong to.
        embedding: Embedding model handed to the returned wrapper.
        metadatas: Optional per-document metadata passed to the bulk ingest.
        bulk_size: Checked together with the number of embeddings by
            ``_validate_embeddings_and_bulk_size``; default: 500.
        ids: Optional document ids passed to the bulk ingest.
        **kwargs: ``opensearch_url`` (or env ``OPENSEARCH_URL``), ``index_name``
            (or env ``OPENSEARCH_INDEX_NAME``, else a random UUID hex), the
            options listed below, plus any extra client arguments.

    Optional Args:
        vector_field: Document field embeddings are stored in. Defaults to
        "vector_field".

        text_field: Document field the text of the document is stored in. Defaults
        to "text".

        max_chunk_bytes: Passed to the bulk ingest; default: 1 * 1024 * 1024

    Optional Keyword Args for Approximate Search:
        engine: "nmslib", "faiss", "lucene"; default: "nmslib"

        space_type: "l2", "l1", "cosinesimil", "linf", "innerproduct"; default: "l2"

        ef_search: Size of the dynamic list used during k-NN searches. Higher values
        lead to more accurate but slower searches; default: 512

        ef_construction: Size of the dynamic list used during k-NN graph creation.
        Higher values lead to more accurate graph but slower indexing speed;
        default: 512

        m: Number of bidirectional links created for each new element. Large impact
        on memory consumption. Between 2 and 100; default: 16

    Keyword Args for Script Scoring or Painless Scripting:
        is_appx_search: False

    Returns:
        ``cls(opensearch_url, index_name, embedding, **kwargs)`` where ``kwargs``
        holds the remaining client arguments plus the resolved ``engine``.

    Raises:
        ValueError: If Amazon OpenSearch Serverless is detected and
            ``is_appx_search`` is False. The validation helpers called here
            may raise as well.
    """
    opensearch_url = get_from_dict_or_env(
        kwargs, "opensearch_url", "OPENSEARCH_URL"
    )
    # List of arguments that needs to be removed from kwargs
    # before passing kwargs to get opensearch client
    keys_list = [
        "opensearch_url",
        "index_name",
        "is_appx_search",
        "vector_field",
        "text_field",
        "engine",
        "space_type",
        "ef_search",
        "ef_construction",
        "m",
        "max_chunk_bytes",
        "is_aoss",
    ]
    _validate_embeddings_and_bulk_size(len(embeddings), bulk_size)
    dim = len(embeddings[0])
    # Get the index name from either kwargs or the ENV variable
    # before falling back to random generation
    index_name = get_from_dict_or_env(
        kwargs, "index_name", "OPENSEARCH_INDEX_NAME", default=uuid.uuid4().hex
    )
    is_appx_search = kwargs.get("is_appx_search", True)
    vector_field = kwargs.get("vector_field", "vector_field")
    text_field = kwargs.get("text_field", "text")
    max_chunk_bytes = kwargs.get("max_chunk_bytes", 1 * 1024 * 1024)
    http_auth = kwargs.get("http_auth")
    is_aoss = _is_aoss_enabled(http_auth=http_auth)
    engine = None

    if is_aoss and not is_appx_search:
        raise ValueError(
            "Amazon OpenSearch Service Serverless only "
            "supports `approximate_search`"
        )

    if is_appx_search:
        engine = kwargs.get("engine", "nmslib")
        space_type = kwargs.get("space_type", "l2")
        ef_search = kwargs.get("ef_search", 512)
        ef_construction = kwargs.get("ef_construction", 512)
        m = kwargs.get("m", 16)

        _validate_aoss_with_engines(is_aoss, engine)

        mapping = _default_text_mapping(
            dim, engine, space_type, ef_search, ef_construction, m, vector_field
        )
    else:
        # NOTE(review): a custom ``vector_field`` is not forwarded on this
        # path; confirm whether _default_scripting_text_mapping accepts it.
        mapping = _default_scripting_text_mapping(dim)

    # Strip the options consumed above so only client arguments remain.
    # A plain loop is used because the pops are executed for their side effect.
    for key in keys_list:
        kwargs.pop(key, None)

    # NOTE(review): this ingest-only client is never closed in this method;
    # confirm whether it needs an explicit close once ingestion finishes.
    client = _get_async_opensearch_client(opensearch_url, **kwargs)
    await _abulk_ingest_embeddings(
        client,
        index_name,
        embeddings,
        texts,
        ids=ids,
        metadatas=metadatas,
        vector_field=vector_field,
        text_field=text_field,
        mapping=mapping,
        max_chunk_bytes=max_chunk_bytes,
        is_aoss=is_aoss,
    )
    # The wrapper is told which engine (or None for script scoring) was used.
    kwargs["engine"] = engine
    return cls(opensearch_url, index_name, embedding, **kwargs)