Weaviate: Batch embed texts (#5903)

When a custom Embeddings object is set, embed all of the given texts in a
single batched embed_documents call instead of embedding them one at a time.
Code that calls add_texts can then size the chunks of texts it passes in to
take full advantage of the hardware it is running on.
This commit is contained in:
Ben Perry 2023-07-13 19:57:58 -05:00 committed by GitHub
parent 574698a5fb
commit 3874bb256e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -135,6 +135,12 @@ class Weaviate(VectorStore):
from weaviate.util import get_valid_uuid from weaviate.util import get_valid_uuid
ids = [] ids = []
embeddings: Optional[List[List[float]]] = None
if self._embedding:
if not isinstance(texts, list):
texts = list(texts)
embeddings = self._embedding.embed_documents(texts)
with self._client.batch as batch: with self._client.batch as batch:
for i, text in enumerate(texts): for i, text in enumerate(texts):
data_properties = {self._text_key: text} data_properties = {self._text_key: text}
@@ -152,15 +158,11 @@ class Weaviate(VectorStore):
elif "ids" in kwargs: elif "ids" in kwargs:
_id = kwargs["ids"][i] _id = kwargs["ids"][i]
if self._embedding is not None:
vector = self._embedding.embed_documents([text])[0]
else:
vector = None
batch.add_data_object( batch.add_data_object(
data_object=data_properties, data_object=data_properties,
class_name=self._index_name, class_name=self._index_name,
uuid=_id, uuid=_id,
vector=vector, vector=embeddings[i] if embeddings else None,
) )
ids.append(_id) ids.append(_id)
return ids return ids