From d006be60ecc856c53bb07eb6db13422ca4387063 Mon Sep 17 00:00:00 2001
From: savoiepe <47090050+savoiepe@users.noreply.github.com>
Date: Mon, 1 Jan 2024 19:01:22 -0500
Subject: [PATCH] Added more filtering options to pgvector vectorstore (#14852)

- **Description:** Using PGVector vector store, it was only possible to
filter for values equals, in or not in metadata. Extended this feature
to work with the following keywords : IN, NIN, BETWEEN, GT, LT, NE, EQ,
LIKE, CONTAINS, OR, AND

---------

Co-authored-by: Harrison Chase
---
Reviewer amendment: OR/AND sub-clause lists are now unpacked into
sqlalchemy.or_()/and_(), which take clauses as positional arguments;
operator lookup reuses the lowercased dict; docstring added.

 .../vectorstores/pgvector.py                  | 95 +++++++++++++++++---
 1 file changed, 81 insertions(+), 14 deletions(-)

diff --git a/libs/community/langchain_community/vectorstores/pgvector.py b/libs/community/langchain_community/vectorstores/pgvector.py
index 8a75297ba0..8e3186b6b8 100644
--- a/libs/community/langchain_community/vectorstores/pgvector.py
+++ b/libs/community/langchain_community/vectorstores/pgvector.py
@@ -485,6 +485,84 @@ class PGVector(VectorStore):
         ]
         return docs
 
+    def _create_filter_clause(self, key, value):
+        """Build a SQLAlchemy filter clause for a single metadata key.
+
+        Operands are compared against the metadata value as text. If
+        several operators are given, only the first one found (in the
+        order listed below) is applied.
+
+        Args:
+            key: Metadata key to look up in ``EmbeddingStore.cmetadata``.
+            value: Dict mapping an operator name (case-insensitive) to its
+                operand. Supported operators: ``in``, ``nin``, ``between``,
+                ``gt``, ``lt``, ``ne``, ``eq``, ``like``, ``contains``,
+                ``or`` and ``and``. ``or``/``and`` take a list of such
+                dicts, each applied to the same ``key``.
+
+        Returns:
+            The filter clause, or ``None`` if no supported operator is found.
+        """
+        IN, NIN, BETWEEN, GT, LT, NE = "in", "nin", "between", "gt", "lt", "ne"
+        EQ, LIKE, CONTAINS, OR, AND = "eq", "like", "contains", "or", "and"
+
+        value_case_insensitive = {k.lower(): v for k, v in value.items()}
+        if IN in value_case_insensitive:
+            filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext.in_(
+                value_case_insensitive[IN]
+            )
+        elif NIN in value_case_insensitive:
+            filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext.not_in(
+                value_case_insensitive[NIN]
+            )
+        elif BETWEEN in value_case_insensitive:
+            filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext.between(
+                str(value_case_insensitive[BETWEEN][0]),
+                str(value_case_insensitive[BETWEEN][1]),
+            )
+        elif GT in value_case_insensitive:
+            filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext > str(
+                value_case_insensitive[GT]
+            )
+        elif LT in value_case_insensitive:
+            filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext < str(
+                value_case_insensitive[LT]
+            )
+        elif NE in value_case_insensitive:
+            filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext != str(
+                value_case_insensitive[NE]
+            )
+        elif EQ in value_case_insensitive:
+            filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext == str(
+                value_case_insensitive[EQ]
+            )
+        elif LIKE in value_case_insensitive:
+            filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext.like(
+                value_case_insensitive[LIKE]
+            )
+        elif CONTAINS in value_case_insensitive:
+            filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext.contains(
+                value_case_insensitive[CONTAINS]
+            )
+        elif OR in value_case_insensitive:
+            or_clauses = [
+                self._create_filter_clause(key, sub_value)
+                for sub_value in value_case_insensitive[OR]
+            ]
+            # or_()/and_() take their clauses as positional arguments.
+            filter_by_metadata = sqlalchemy.or_(*or_clauses)
+        elif AND in value_case_insensitive:
+            and_clauses = [
+                self._create_filter_clause(key, sub_value)
+                for sub_value in value_case_insensitive[AND]
+            ]
+            filter_by_metadata = sqlalchemy.and_(*and_clauses)
+
+        else:
+            filter_by_metadata = None
+
+        return filter_by_metadata
+
     def __query_collection(
         self,
         embedding: List[float],
@@ -501,22 +579,11 @@ class PGVector(VectorStore):
 
         if filter is not None:
             filter_clauses = []
-            IN, NIN = "in", "nin"
+
             for key, value in filter.items():
                 if isinstance(value, dict):
-                    value_case_insensitive = {
-                        k.lower(): v for k, v in value.items()
-                    }
-                    if IN in map(str.lower, value):
-                        filter_by_metadata = self.EmbeddingStore.cmetadata[
-                            key
-                        ].astext.in_(value_case_insensitive[IN])
-                    elif NIN in map(str.lower, value):
-                        filter_by_metadata = self.EmbeddingStore.cmetadata[
-                            key
-                        ].astext.not_in(value_case_insensitive[NIN])
-                    else:
-                        filter_by_metadata = None
+                    filter_by_metadata = self._create_filter_clause(key, value)
+
                     if filter_by_metadata is not None:
                         filter_clauses.append(filter_by_metadata)
                 else: