[Community]: Added Metadata filter support for DocumentDB Vector Store (#22777)

- **Description:** As pointed out in issue #22770, DocumentDB `similarity_search` does not support filtering by metadata. This PR adds that support through a new `filter` parameter. It also fixes a minor documentation error. - **Issue:** #22770 --------- Co-authored-by: Erick Friis <erickfriis@gmail.com> Co-authored-by: Erick Friis <erick@langchain.dev>
1 month ago · 36cad5d25c
parent 912751e268
commit 36cad5d25c
1 changed files with 15 additions and 8 deletions
--- a/libs/community/langchain_community/vectorstores/documentdb.py
+++ b/libs/community/langchain_community/vectorstores/documentdb.py
@ -175,6 +175,10 @@ class DocumentDBVectorSearch(VectorStore):
                The maximum number of supported dimensions is 2000

            similarity: Similarity algorithm to use with the HNSW index.
+                 Possible options are:
+                    - DocumentDBSimilarityType.COS (cosine distance),
+                    - DocumentDBSimilarityType.EUC (Euclidean distance), and
+                    - DocumentDBSimilarityType.DOT (dot product).

            m: Specifies the max number of connections for an HNSW index.
                Large impact on memory consumption.
@ -183,10 +187,6 @@ class DocumentDBVectorSearch(VectorStore):
                for constructing the graph for HNSW index. Higher values lead
                to more accurate results but slower indexing speed.

-                Possible options are:
-                    - DocumentDBSimilarityType.COS (cosine distance),
-                    - DocumentDBSimilarityType.EUC (Euclidean distance), and
-                    - DocumentDBSimilarityType.DOT (dot product).

        Returns:
            An object describing the created index
@ -309,7 +309,11 @@ class DocumentDBVectorSearch(VectorStore):
        self._collection.delete_one({"_id": ObjectId(document_id)})

    def _similarity_search_without_score(
-        self, embeddings: List[float], k: int = 4, ef_search: int = 40
+        self,
+        embeddings: List[float],
+        k: int = 4,
+        ef_search: int = 40,
+        filter: Optional[Dict[str, Any]] = None,
    ) -> List[Document]:
        """Returns a list of documents.

@ -319,12 +323,13 @@ class DocumentDBVectorSearch(VectorStore):
            ef_search: Specifies the size of the dynamic candidate list
                that HNSW index uses during search. A higher value of
                efSearch provides better recall at cost of speed.
-
+            filter (Optional[Dict[str, Any]]): Filter by metadata. Defaults to None.
        Returns:
            A list of documents closest to the query vector
        """
        pipeline: List[dict[str, Any]] = [
            {
+                "$match": filter,
                "$search": {
                    "vectorSearch": {
                        "vector": embeddings,
@ -333,7 +338,7 @@ class DocumentDBVectorSearch(VectorStore):
                        "k": k,
                        "efSearch": ef_search,
                    }
-                }
+                },
            }
        ]

@ -352,10 +357,12 @@ class DocumentDBVectorSearch(VectorStore):
        query: str,
        k: int = 4,
        ef_search: int = 40,
+        *,
+        filter: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        embeddings = self._embedding.embed_query(query)
        docs = self._similarity_search_without_score(
-            embeddings=embeddings, k=k, ef_search=ef_search
+            embeddings=embeddings, k=k, ef_search=ef_search, filter=filter
        )
        return [doc for doc in docs]