From 9e649462cefdcba1ffb9ecb2e7fd0e98a1d79411 Mon Sep 17 00:00:00 2001 From: Shawn91 Date: Tue, 23 May 2023 03:35:52 +0800 Subject: [PATCH] fix: add_texts method of Weaviate vector store creates wrong embeddings (#4933) # fix a bug in the add_texts method of Weaviate vector store that creates wrong embeddings The following is the original code in the `add_texts` method of the Weaviate vector store, from line 131 to 153, which contains a bug. The code here includes some extra explanations in the form of comments and some omissions. ```python for i, doc in enumerate(texts): # some code omitted if self._embedding is not None: # variable texts is a list of strings and doc here is just a string. # list(doc) actually breaks up the string into characters. # so, embeddings[0] is just the embedding of the first character embeddings = self._embedding.embed_documents(list(doc)) batch.add_data_object( data_object=data_properties, class_name=self._index_name, uuid=_id, vector=embeddings[0], ) ``` To fix this bug, each text is now passed to `embed_documents` as a one-element list (`[text]`), so the whole string is embedded instead of its individual characters. Co-authored-by: Shawn91 Co-authored-by: Dev 2049 --- langchain/vectorstores/weaviate.py | 52 +++++++++++++----------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/langchain/vectorstores/weaviate.py b/langchain/vectorstores/weaviate.py index 9d2a168b..a7398ae7 100644 --- a/langchain/vectorstores/weaviate.py +++ b/langchain/vectorstores/weaviate.py @@ -63,6 +63,12 @@ def _default_score_normalizer(val: float) -> float: return 1 - 1 / (1 + np.exp(val)) +def _json_serializable(value: Any) -> Any: + if isinstance(value, datetime.datetime): + return value.isoformat() + return value + + class Weaviate(VectorStore): """Wrapper around Weaviate vector database. 
@@ -121,42 +127,30 @@ class Weaviate(VectorStore): """Upload texts with metadata (properties) to Weaviate.""" from weaviate.util import get_valid_uuid - def json_serializable(value: Any) -> Any: - if isinstance(value, datetime.datetime): - return value.isoformat() - return value - + ids = [] with self._client.batch as batch: - ids = [] - for i, doc in enumerate(texts): - data_properties = { - self._text_key: doc, - } + for i, text in enumerate(texts): + data_properties = {self._text_key: text} if metadatas is not None: - for key in metadatas[i].keys(): - data_properties[key] = json_serializable(metadatas[i][key]) + for key, val in metadatas[i].items(): + data_properties[key] = _json_serializable(val) # If the UUID of one of the objects already exists - # then the existing objectwill be replaced by the new object. - if "uuids" in kwargs: - _id = kwargs["uuids"][i] - else: - _id = get_valid_uuid(uuid4()) + # then the existing object will be replaced by the new object. + _id = ( + kwargs["uuids"][i] if "uuids" in kwargs else get_valid_uuid(uuid4()) + ) if self._embedding is not None: - embeddings = self._embedding.embed_documents(list(doc)) - batch.add_data_object( - data_object=data_properties, - class_name=self._index_name, - uuid=_id, - vector=embeddings[0], - ) + vector = self._embedding.embed_documents([text])[0] else: - batch.add_data_object( - data_object=data_properties, - class_name=self._index_name, - uuid=_id, - ) + vector = None + batch.add_data_object( + data_object=data_properties, + class_name=self._index_name, + uuid=_id, + vector=vector, + ) ids.append(_id) return ids