mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
optimize pgvector add_texts
(#7185)
- Description: At the moment, inserting new embeddings into pgvector queries all existing embeddings on every insert, because the `embeddings` relationship is defined with the default parameters, which set `lazy="select"`. This change drastically improves insert performance and adds a few additional cleanups:
  * remove `collection.embeddings.append`, as it was querying all embeddings on insert, and replace it with the `collection_id` param
  * centralize the storing logic in the `add_embeddings` function to reduce duplication
  * remove boilerplate
- Issue: No issue was opened.
- Dependencies: None.
- Tag maintainer: this is a vectorstore update, so I think @rlancemartin, @eyurtsev
- Twitter handle: @falmannaa
This commit is contained in:
parent
6711854e30
commit
79b59a8e06
@ -207,12 +207,6 @@ class PGVector(VectorStore):
|
||||
pre_delete_collection: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> PGVector:
|
||||
if ids is None:
|
||||
ids = [str(uuid.uuid1()) for _ in texts]
|
||||
|
||||
if not metadatas:
|
||||
metadatas = [{} for _ in texts]
|
||||
|
||||
connection_string = cls.get_connection_string(kwargs)
|
||||
|
||||
store = cls(
|
||||
@ -231,12 +225,12 @@ class PGVector(VectorStore):
|
||||
|
||||
def add_embeddings(
|
||||
self,
|
||||
texts: List[str],
|
||||
texts: Iterable[str],
|
||||
embeddings: List[List[float]],
|
||||
metadatas: List[dict],
|
||||
ids: List[str],
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
ids: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
) -> List[str]:
|
||||
"""Add embeddings to the vectorstore.
|
||||
|
||||
Args:
|
||||
@ -245,6 +239,12 @@ class PGVector(VectorStore):
|
||||
metadatas: List of metadatas associated with the texts.
|
||||
kwargs: vectorstore specific parameters
|
||||
"""
|
||||
if ids is None:
|
||||
ids = [str(uuid.uuid1()) for _ in texts]
|
||||
|
||||
if not metadatas:
|
||||
metadatas = [{} for _ in texts]
|
||||
|
||||
with Session(self._conn) as session:
|
||||
collection = self.get_collection(session)
|
||||
if not collection:
|
||||
@ -255,11 +255,13 @@ class PGVector(VectorStore):
|
||||
document=text,
|
||||
cmetadata=metadata,
|
||||
custom_id=id,
|
||||
collection_id=collection.uuid,
|
||||
)
|
||||
collection.embeddings.append(embedding_store)
|
||||
session.add(embedding_store)
|
||||
session.commit()
|
||||
|
||||
return ids
|
||||
|
||||
def add_texts(
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
@ -277,30 +279,10 @@ class PGVector(VectorStore):
|
||||
Returns:
|
||||
List of ids from adding the texts into the vectorstore.
|
||||
"""
|
||||
if ids is None:
|
||||
ids = [str(uuid.uuid1()) for _ in texts]
|
||||
|
||||
embeddings = self.embedding_function.embed_documents(list(texts))
|
||||
|
||||
if not metadatas:
|
||||
metadatas = [{} for _ in texts]
|
||||
|
||||
with Session(self._conn) as session:
|
||||
collection = self.get_collection(session)
|
||||
if not collection:
|
||||
raise ValueError("Collection not found")
|
||||
for text, metadata, embedding, id in zip(texts, metadatas, embeddings, ids):
|
||||
embedding_store = EmbeddingStore(
|
||||
embedding=embedding,
|
||||
document=text,
|
||||
cmetadata=metadata,
|
||||
custom_id=id,
|
||||
)
|
||||
collection.embeddings.append(embedding_store)
|
||||
session.add(embedding_store)
|
||||
session.commit()
|
||||
|
||||
return ids
|
||||
return self.add_embeddings(
|
||||
texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs
|
||||
)
|
||||
|
||||
def similarity_search(
|
||||
self,
|
||||
|
Loading…
Reference in New Issue
Block a user