Add from_embeddings for opensearch (#10957)

This commit is contained in:
William FH 2023-09-29 00:00:58 -07:00 committed by GitHub
parent 73693c18fc
commit db05ea2b78
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -736,7 +736,7 @@ class OpenSearchVectorSearch(VectorStore):
ids: Optional[List[str]] = None, ids: Optional[List[str]] = None,
**kwargs: Any, **kwargs: Any,
) -> OpenSearchVectorSearch: ) -> OpenSearchVectorSearch:
"""Construct OpenSearchVectorSearch wrapper from raw documents. """Construct OpenSearchVectorSearch wrapper from raw texts.
Example: Example:
.. code-block:: python .. code-block:: python
@ -754,6 +754,74 @@ class OpenSearchVectorSearch(VectorStore):
and lucene engines recommended for large datasets. Also supports brute force and lucene engines recommended for large datasets. Also supports brute force
search through Script Scoring and Painless Scripting. search through Script Scoring and Painless Scripting.
Optional Args:
vector_field: Document field embeddings are stored in. Defaults to
"vector_field".
text_field: Document field the text of the document is stored in. Defaults
to "text".
Optional Keyword Args for Approximate Search:
engine: "nmslib", "faiss", "lucene"; default: "nmslib"
space_type: "l2", "l1", "cosinesimil", "linf", "innerproduct"; default: "l2"
ef_search: Size of the dynamic list used during k-NN searches. Higher values
lead to more accurate but slower searches; default: 512
ef_construction: Size of the dynamic list used during k-NN graph creation.
Higher values lead to more accurate graph but slower indexing speed;
default: 512
m: Number of bidirectional links created for each new element. Large impact
on memory consumption. Between 2 and 100; default: 16
Keyword Args for Script Scoring or Painless Scripting:
is_appx_search: False
"""
embeddings = embedding.embed_documents(texts)
return cls.from_embeddings(
embeddings,
texts,
embedding,
metadatas=metadatas,
bulk_size=bulk_size,
ids=ids,
**kwargs,
)
@classmethod
def from_embeddings(
cls,
embeddings: List[List[float]],
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
bulk_size: int = 500,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> OpenSearchVectorSearch:
"""Construct OpenSearchVectorSearch wrapper from pre-vectorized embeddings.
Example:
.. code-block:: python
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.embeddings import OpenAIEmbeddings
embedder = OpenAIEmbeddings()
texts = ["foo", "bar"]
embeddings = embedder.embed_documents(texts)
opensearch_vector_search = OpenSearchVectorSearch.from_embeddings(
embeddings,
texts,
embedder,
opensearch_url="http://localhost:9200"
)
By default, OpenSearch supports Approximate Search powered by the nmslib, faiss
and lucene engines, which is recommended for large datasets. It also supports
brute-force search through Script Scoring and Painless Scripting.
Optional Args: Optional Args:
vector_field: Document field embeddings are stored in. Defaults to vector_field: Document field embeddings are stored in. Defaults to
"vector_field". "vector_field".
@ -799,7 +867,6 @@ class OpenSearchVectorSearch(VectorStore):
"max_chunk_bytes", "max_chunk_bytes",
"is_aoss", "is_aoss",
] ]
embeddings = embedding.embed_documents(texts)
_validate_embeddings_and_bulk_size(len(embeddings), bulk_size) _validate_embeddings_and_bulk_size(len(embeddings), bulk_size)
dim = len(embeddings[0]) dim = len(embeddings[0])
# Get the index name from either kwargs or the ENV variable # Get the index name from either kwargs or the ENV variable
@ -843,8 +910,8 @@ class OpenSearchVectorSearch(VectorStore):
index_name, index_name,
embeddings, embeddings,
texts, texts,
metadatas=metadatas,
ids=ids, ids=ids,
metadatas=metadatas,
vector_field=vector_field, vector_field=vector_field,
text_field=text_field, text_field=text_field,
mapping=mapping, mapping=mapping,