From e93240f02375ca49d95feb00d1ec8d28c54c4c07 Mon Sep 17 00:00:00 2001 From: Jeremy Lai <53891374+PTTrazavi@users.noreply.github.com> Date: Sat, 9 Sep 2023 07:50:30 +0800 Subject: [PATCH] add where_document filter for chroma (#10214) - Description: add where_document filter parameter in Chroma - Issue: [10082](https://github.com/langchain-ai/langchain/issues/10082) - Dependencies: no - Tag maintainer: for a quicker response, tag the relevant maintainer (see below), - Twitter handle: no @hwchase17 --------- Co-authored-by: Jeremy Lai Co-authored-by: Bagatur --- .../langchain/vectorstores/chroma.py | 37 ++++++++++++++++--- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/libs/langchain/langchain/vectorstores/chroma.py b/libs/langchain/langchain/vectorstores/chroma.py index 76469357a2..706588202b 100644 --- a/libs/langchain/langchain/vectorstores/chroma.py +++ b/libs/langchain/langchain/vectorstores/chroma.py @@ -142,6 +142,7 @@ class Chroma(VectorStore): query_embeddings: Optional[List[List[float]]] = None, n_results: int = 4, where: Optional[Dict[str, str]] = None, + where_document: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> List[Document]: """Query the chroma collection.""" @@ -157,6 +158,7 @@ class Chroma(VectorStore): query_embeddings=query_embeddings, n_results=n_results, where=where, + where_document=where_document, **kwargs, ) @@ -264,6 +266,7 @@ class Chroma(VectorStore): embedding: List[float], k: int = DEFAULT_K, filter: Optional[Dict[str, str]] = None, + where_document: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> List[Document]: """Return docs most similar to embedding vector. @@ -275,7 +278,10 @@ class Chroma(VectorStore): List of Documents most similar to the query vector. 
""" results = self.__query_collection( - query_embeddings=embedding, n_results=k, where=filter + query_embeddings=embedding, + n_results=k, + where=filter, + where_document=where_document, ) return _results_to_docs(results) @@ -284,6 +290,7 @@ class Chroma(VectorStore): embedding: List[float], k: int = DEFAULT_K, filter: Optional[Dict[str, str]] = None, + where_document: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> List[Tuple[Document, float]]: """ @@ -300,7 +307,10 @@ class Chroma(VectorStore): Lower score represents more similarity. """ results = self.__query_collection( - query_embeddings=embedding, n_results=k, where=filter + query_embeddings=embedding, + n_results=k, + where=filter, + where_document=where_document, ) return _results_to_docs_and_scores(results) @@ -309,6 +319,7 @@ class Chroma(VectorStore): query: str, k: int = DEFAULT_K, filter: Optional[Dict[str, str]] = None, + where_document: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> List[Tuple[Document, float]]: """Run similarity search with Chroma with distance. @@ -325,12 +336,18 @@ class Chroma(VectorStore): """ if self._embedding_function is None: results = self.__query_collection( - query_texts=[query], n_results=k, where=filter + query_texts=[query], + n_results=k, + where=filter, + where_document=where_document, ) else: query_embedding = self._embedding_function.embed_query(query) results = self.__query_collection( - query_embeddings=[query_embedding], n_results=k, where=filter + query_embeddings=[query_embedding], + n_results=k, + where=filter, + where_document=where_document, ) return _results_to_docs_and_scores(results) @@ -374,6 +391,7 @@ class Chroma(VectorStore): fetch_k: int = 20, lambda_mult: float = 0.5, filter: Optional[Dict[str, str]] = None, + where_document: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance. 
@@ -398,6 +416,7 @@ class Chroma(VectorStore): query_embeddings=embedding, n_results=fetch_k, where=filter, + where_document=where_document, include=["metadatas", "documents", "distances", "embeddings"], ) mmr_selected = maximal_marginal_relevance( @@ -419,6 +438,7 @@ class Chroma(VectorStore): fetch_k: int = 20, lambda_mult: float = 0.5, filter: Optional[Dict[str, str]] = None, + where_document: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance. @@ -445,7 +465,12 @@ class Chroma(VectorStore): embedding = self._embedding_function.embed_query(query) docs = self.max_marginal_relevance_search_by_vector( - embedding, k, fetch_k, lambda_mult=lambda_mult, filter=filter + embedding, + k, + fetch_k, + lambda_mult=lambda_mult, + filter=filter, + where_document=where_document, ) return docs @@ -472,7 +497,7 @@ class Chroma(VectorStore): offset: The offset to start returning results from. Useful for paging results with limit. Optional. where_document: A WhereDocument type dict used to filter by the documents. - E.g. `{$contains: {"text": "hello"}}`. Optional. + E.g. `{"$contains": "hello"}`. Optional. include: A list of what to include in the results. Can contain `"embeddings"`, `"metadatas"`, `"documents"`. Ids are always included.