forked from Archives/langchain
Use embedding instead of embedding function in ElasticVectorStore (#1692)
While it might be a bit more restrictive, I find that using the `Embeddings` interface as the input for vector store creation is better than using an embedding function, because it lets us use bulk requests and, if needed, retry logic. I have seen that some vector store implementations take an `Embeddings` object while others take an embedding function, so I don't know what the criterion is for choosing one or the other. In my opinion, they should all just take `Embeddings`, or else take a more capable embedding function that accepts multiple texts at once instead of one at a time. --------- Co-authored-by: Bernat Felip <bernat.felip@rea.ch>
This commit is contained in:
parent
951c158106
commit
262d4cb9a8
@ -2,7 +2,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional
|
||||
from typing import Any, Dict, Iterable, List, Optional
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.embeddings.base import Embeddings
|
||||
@ -19,7 +19,7 @@ def _default_text_mapping(dim: int) -> Dict:
|
||||
}
|
||||
|
||||
|
||||
def _default_script_query(query_vector: List[int]) -> Dict:
|
||||
def _default_script_query(query_vector: List[float]) -> Dict:
|
||||
return {
|
||||
"script_score": {
|
||||
"query": {"match_all": {}},
|
||||
@ -41,14 +41,12 @@ class ElasticVectorSearch(VectorStore):
|
||||
elastic_vector_search = ElasticVectorSearch(
|
||||
"http://localhost:9200",
|
||||
"embeddings",
|
||||
embedding_function
|
||||
embedding
|
||||
)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, elasticsearch_url: str, index_name: str, embedding_function: Callable
|
||||
):
|
||||
def __init__(self, elasticsearch_url: str, index_name: str, embedding: Embeddings):
|
||||
"""Initialize with necessary components."""
|
||||
try:
|
||||
import elasticsearch
|
||||
@ -57,7 +55,7 @@ class ElasticVectorSearch(VectorStore):
|
||||
"Could not import elasticsearch python package. "
|
||||
"Please install it with `pip install elasticsearch`."
|
||||
)
|
||||
self.embedding_function = embedding_function
|
||||
self.embedding = embedding
|
||||
self.index_name = index_name
|
||||
try:
|
||||
es_client = elasticsearch.Elasticsearch(elasticsearch_url) # noqa
|
||||
@ -91,13 +89,14 @@ class ElasticVectorSearch(VectorStore):
|
||||
)
|
||||
requests = []
|
||||
ids = []
|
||||
embeddings = self.embedding.embed_documents(list(texts))
|
||||
for i, text in enumerate(texts):
|
||||
metadata = metadatas[i] if metadatas else {}
|
||||
_id = str(uuid.uuid4())
|
||||
request = {
|
||||
"_op_type": "index",
|
||||
"_index": self.index_name,
|
||||
"vector": self.embedding_function(text),
|
||||
"vector": embeddings[i],
|
||||
"text": text,
|
||||
"metadata": metadata,
|
||||
"_id": _id,
|
||||
@ -121,7 +120,7 @@ class ElasticVectorSearch(VectorStore):
|
||||
Returns:
|
||||
List of Documents most similar to the query.
|
||||
"""
|
||||
embedding = self.embedding_function(query)
|
||||
embedding = self.embedding.embed_query(query)
|
||||
script_query = _default_script_query(embedding)
|
||||
response = self.client.search(index=self.index_name, query=script_query)
|
||||
hits = [hit["_source"] for hit in response["hits"]["hits"][:k]]
|
||||
@ -196,4 +195,4 @@ class ElasticVectorSearch(VectorStore):
|
||||
requests.append(request)
|
||||
bulk(client, requests)
|
||||
client.indices.refresh(index=index_name)
|
||||
return cls(elasticsearch_url, index_name, embedding.embed_query)
|
||||
return cls(elasticsearch_url, index_name, embedding)
|
||||
|
Loading…
Reference in New Issue
Block a user