From 262d4cb9a84f46063304b0cb6007655e37efe773 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bernat=20Felip=20i=20D=C3=ADaz?= Date: Sun, 19 Mar 2023 18:23:38 +0100 Subject: [PATCH] Use embedding instead of embedding function in ElasticVectorStore (#1692) While it might be a bit more restrictive, I find that taking the Embeddings interface as the input for vector store creation is better than taking an embedding function, because we can use bulk requests and possibly the retry logic if needed. I have seen that some vector store implementations use Embeddings while others use an embedding function, so I don't know what the criteria are for choosing one or the other. In my opinion they should all just take Embeddings, or else a more capable embedding function that accepts multiple texts at once instead of one at a time. --------- Co-authored-by: Bernat Felip --- .../vectorstores/elastic_vector_search.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/langchain/vectorstores/elastic_vector_search.py b/langchain/vectorstores/elastic_vector_search.py index 4f37800e..57a73dcd 100644 --- a/langchain/vectorstores/elastic_vector_search.py +++ b/langchain/vectorstores/elastic_vector_search.py @@ -2,7 +2,7 @@ from __future__ import annotations import uuid -from typing import Any, Callable, Dict, Iterable, List, Optional +from typing import Any, Dict, Iterable, List, Optional from langchain.docstore.document import Document from langchain.embeddings.base import Embeddings @@ -19,7 +19,7 @@ def _default_text_mapping(dim: int) -> Dict: } -def _default_script_query(query_vector: List[int]) -> Dict: +def _default_script_query(query_vector: List[float]) -> Dict: return { "script_score": { "query": {"match_all": {}}, @@ -41,14 +41,12 @@ class ElasticVectorSearch(VectorStore): elastic_vector_search = ElasticVectorSearch( "http://localhost:9200", "embeddings", - embedding_function + embedding ) """ - def __init__( - self, elasticsearch_url: str, index_name: str, embedding_function: 
Callable - ): + def __init__(self, elasticsearch_url: str, index_name: str, embedding: Embeddings): """Initialize with necessary components.""" try: import elasticsearch @@ -57,7 +55,7 @@ class ElasticVectorSearch(VectorStore): "Could not import elasticsearch python package. " "Please install it with `pip install elasticsearch`." ) - self.embedding_function = embedding_function + self.embedding = embedding self.index_name = index_name try: es_client = elasticsearch.Elasticsearch(elasticsearch_url) # noqa @@ -91,13 +89,14 @@ class ElasticVectorSearch(VectorStore): ) requests = [] ids = [] + embeddings = self.embedding.embed_documents(list(texts)) for i, text in enumerate(texts): metadata = metadatas[i] if metadatas else {} _id = str(uuid.uuid4()) request = { "_op_type": "index", "_index": self.index_name, - "vector": self.embedding_function(text), + "vector": embeddings[i], "text": text, "metadata": metadata, "_id": _id, @@ -121,7 +120,7 @@ class ElasticVectorSearch(VectorStore): Returns: List of Documents most similar to the query. """ - embedding = self.embedding_function(query) + embedding = self.embedding.embed_query(query) script_query = _default_script_query(embedding) response = self.client.search(index=self.index_name, query=script_query) hits = [hit["_source"] for hit in response["hits"]["hits"][:k]] @@ -196,4 +195,4 @@ class ElasticVectorSearch(VectorStore): requests.append(request) bulk(client, requests) client.indices.refresh(index=index_name) - return cls(elasticsearch_url, index_name, embedding.embed_query) + return cls(elasticsearch_url, index_name, embedding)