Add from_embeddings for opensearch (#10957)

pull/11216/head
William FH 11 months ago committed by GitHub
parent 73693c18fc
commit db05ea2b78
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -736,7 +736,7 @@ class OpenSearchVectorSearch(VectorStore):
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> OpenSearchVectorSearch:
"""Construct OpenSearchVectorSearch wrapper from raw documents.
"""Construct OpenSearchVectorSearch wrapper from raw texts.
Example:
.. code-block:: python
@ -754,6 +754,74 @@ class OpenSearchVectorSearch(VectorStore):
and lucene engines recommended for large datasets. Also supports brute force
search through Script Scoring and Painless Scripting.
Optional Args:
vector_field: Document field embeddings are stored in. Defaults to
"vector_field".
text_field: Document field the text of the document is stored in. Defaults
to "text".
Optional Keyword Args for Approximate Search:
engine: "nmslib", "faiss", "lucene"; default: "nmslib"
space_type: "l2", "l1", "cosinesimil", "linf", "innerproduct"; default: "l2"
ef_search: Size of the dynamic list used during k-NN searches. Higher values
lead to more accurate but slower searches; default: 512
ef_construction: Size of the dynamic list used during k-NN graph creation.
Higher values lead to more accurate graph but slower indexing speed;
default: 512
m: Number of bidirectional links created for each new element. Large impact
on memory consumption. Between 2 and 100; default: 16
Keyword Args for Script Scoring or Painless Scripting:
is_appx_search: False
"""
embeddings = embedding.embed_documents(texts)
return cls.from_embeddings(
embeddings,
texts,
embedding,
metadatas=metadatas,
bulk_size=bulk_size,
ids=ids,
**kwargs,
)
@classmethod
def from_embeddings(
cls,
embeddings: List[List[float]],
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
bulk_size: int = 500,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> OpenSearchVectorSearch:
"""Construct OpenSearchVectorSearch wrapper from pre-vectorized embeddings.
Example:
.. code-block:: python
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.embeddings import OpenAIEmbeddings
embedder = OpenAIEmbeddings()
texts = ["foo", "bar"]
embeddings = embedder.embed_documents(texts)
opensearch_vector_search = OpenSearchVectorSearch.from_embeddings(
embeddings,
texts,
embedder,
opensearch_url="http://localhost:9200"
)
OpenSearch by default supports Approximate Search powered by nmslib, faiss
and lucene engines recommended for large datasets. Also supports brute force
search through Script Scoring and Painless Scripting.
Optional Args:
vector_field: Document field embeddings are stored in. Defaults to
"vector_field".
@ -799,7 +867,6 @@ class OpenSearchVectorSearch(VectorStore):
"max_chunk_bytes",
"is_aoss",
]
embeddings = embedding.embed_documents(texts)
_validate_embeddings_and_bulk_size(len(embeddings), bulk_size)
dim = len(embeddings[0])
# Get the index name from either kwargs or ENV Variable
@ -843,8 +910,8 @@ class OpenSearchVectorSearch(VectorStore):
index_name,
embeddings,
texts,
metadatas=metadatas,
ids=ids,
metadatas=metadatas,
vector_field=vector_field,
text_field=text_field,
mapping=mapping,

Loading…
Cancel
Save