From 3613d8a2ad7faee0f3d0a7893179b3eabccebcf9 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Fri, 19 Jan 2024 03:35:39 +0100 Subject: [PATCH] community[patch]: Use SQLAlchemy's `bulk_save_objects` method to improve insert performance (#16244) - **Description:** Improve [pgvector vector store adapter](https://github.com/langchain-ai/langchain/blob/v0.1.1/libs/community/langchain_community/vectorstores/pgvector.py) to save embeddings in batches, to improve its performance. - **Issue:** NA - **Dependencies:** NA - **References:** https://github.com/crate-workbench/langchain/pull/1 Hi again from the CrateDB team. Following up on GH-16243, this is another minor patch to the pgvector vector store adapter. Inserting embeddings in batches, using [SQLAlchemy's `bulk_save_objects`](https://docs.sqlalchemy.org/en/20/orm/session_api.html#sqlalchemy.orm.Session.bulk_save_objects) method, can deliver substantial performance gains. With kind regards, Andreas. NB: As I am just now seeing, this method is a legacy feature of SA 2.0, so it will need to be reworked in a future iteration. However, it is not deprecated yet, and I haven't been able to come up with a different implementation yet. 
--- libs/community/langchain_community/vectorstores/pgvector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libs/community/langchain_community/vectorstores/pgvector.py b/libs/community/langchain_community/vectorstores/pgvector.py index 7fed642c45..eff9fac354 100644 --- a/libs/community/langchain_community/vectorstores/pgvector.py +++ b/libs/community/langchain_community/vectorstores/pgvector.py @@ -379,6 +379,7 @@ class PGVector(VectorStore): collection = self.get_collection(session) if not collection: raise ValueError("Collection not found") + documents = [] for text, metadata, embedding, id in zip(texts, metadatas, embeddings, ids): embedding_store = self.EmbeddingStore( embedding=embedding, @@ -387,7 +388,8 @@ class PGVector(VectorStore): custom_id=id, collection_id=collection.uuid, ) - session.add(embedding_store) + documents.append(embedding_store) + session.bulk_save_objects(documents) session.commit() return ids