From 79b59a8e06dcaeaad9c1e9b7c1a9c8b22555c057 Mon Sep 17 00:00:00 2001 From: Feras Almannaa Date: Wed, 5 Jul 2023 23:19:42 +0300 Subject: [PATCH] optimize pgvector `add_texts` (#7185) - Description: At the moment, inserting new embeddings into pgvector queries all existing embeddings every time, because the `embeddings` relationship is defined with the default params, which set `lazy="select"`. This change drastically improves performance and adds a few additional cleanups: * remove `collection.embeddings.append`, as it was querying all embeddings on insert, and replace it with the `collection_id` param * centralize the storing logic in the `add_embeddings` function to reduce duplication * remove boilerplate - Issue: No issue was opened. - Dependencies: None. - Tag maintainer: this is a vectorstore update, so I think @rlancemartin, @eyurtsev - Twitter handle: @falmannaa --- langchain/vectorstores/pgvector.py | 50 ++++++++++-------------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/langchain/vectorstores/pgvector.py b/langchain/vectorstores/pgvector.py index 59d1c8f786..d124808eed 100644 --- a/langchain/vectorstores/pgvector.py +++ b/langchain/vectorstores/pgvector.py @@ -207,12 +207,6 @@ class PGVector(VectorStore): pre_delete_collection: bool = False, **kwargs: Any, ) -> PGVector: - if ids is None: - ids = [str(uuid.uuid1()) for _ in texts] - - if not metadatas: - metadatas = [{} for _ in texts] - connection_string = cls.get_connection_string(kwargs) store = cls( @@ -231,12 +225,12 @@ class PGVector(VectorStore): def add_embeddings( self, - texts: List[str], + texts: Iterable[str], embeddings: List[List[float]], - metadatas: List[dict], - ids: List[str], + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, **kwargs: Any, - ) -> None: + ) -> List[str]: """Add embeddings to the vectorstore. Args: @@ -245,6 +239,12 @@ class PGVector(VectorStore): metadatas: List of metadatas associated with the texts. 
kwargs: vectorstore specific parameters """ + if ids is None: + ids = [str(uuid.uuid1()) for _ in texts] + + if not metadatas: + metadatas = [{} for _ in texts] + with Session(self._conn) as session: collection = self.get_collection(session) if not collection: @@ -255,11 +255,13 @@ class PGVector(VectorStore): document=text, cmetadata=metadata, custom_id=id, + collection_id=collection.uuid, ) - collection.embeddings.append(embedding_store) session.add(embedding_store) session.commit() + return ids + def add_texts( self, texts: Iterable[str], @@ -277,30 +279,10 @@ class PGVector(VectorStore): Returns: List of ids from adding the texts into the vectorstore. """ - if ids is None: - ids = [str(uuid.uuid1()) for _ in texts] - embeddings = self.embedding_function.embed_documents(list(texts)) - - if not metadatas: - metadatas = [{} for _ in texts] - - with Session(self._conn) as session: - collection = self.get_collection(session) - if not collection: - raise ValueError("Collection not found") - for text, metadata, embedding, id in zip(texts, metadatas, embeddings, ids): - embedding_store = EmbeddingStore( - embedding=embedding, - document=text, - cmetadata=metadata, - custom_id=id, - ) - collection.embeddings.append(embedding_store) - session.add(embedding_store) - session.commit() - - return ids + return self.add_embeddings( + texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs + ) def similarity_search( self,