From 3613d8a2ad7faee0f3d0a7893179b3eabccebcf9 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Fri, 19 Jan 2024 03:35:39 +0100 Subject: [PATCH] community[patch]: Use SQLAlchemy's `bulk_save_objects` method to improve insert performance (#16244) - **Description:** Improve [pgvector vector store adapter](https://github.com/langchain-ai/langchain/blob/v0.1.1/libs/community/langchain_community/vectorstores/pgvector.py) to save embeddings in batches, to improve its performance. - **Issue:** NA - **Dependencies:** NA - **References:** https://github.com/crate-workbench/langchain/pull/1 Hi again from the CrateDB team. Following up on GH-16243, this is another minor patch to the pgvector vector store adapter. Inserting embeddings in batches, using [SQLAlchemy's `bulk_save_objects`](https://docs.sqlalchemy.org/en/20/orm/session_api.html#sqlalchemy.orm.Session.bulk_save_objects) method, can deliver substantial performance gains. With kind regards, Andreas. NB: As I am just now seeing, this method is a legacy feature of SA 2.0, so it will need to be reworked in a future iteration. However, it is not deprecated yet, and I haven't been able to come up with a different implementation yet. 
--- libs/community/langchain_community/vectorstores/pgvector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libs/community/langchain_community/vectorstores/pgvector.py b/libs/community/langchain_community/vectorstores/pgvector.py index 7fed642c45..eff9fac354 100644 --- a/libs/community/langchain_community/vectorstores/pgvector.py +++ b/libs/community/langchain_community/vectorstores/pgvector.py @@ -379,6 +379,7 @@ class PGVector(VectorStore): collection = self.get_collection(session) if not collection: raise ValueError("Collection not found") + documents = [] for text, metadata, embedding, id in zip(texts, metadatas, embeddings, ids): embedding_store = self.EmbeddingStore( embedding=embedding, @@ -387,7 +388,8 @@ class PGVector(VectorStore): custom_id=id, collection_id=collection.uuid, ) - session.add(embedding_store) + documents.append(embedding_store) + session.bulk_save_objects(documents) session.commit() return ids