From 0d5a90f30af185278a0334b11838c042d29d4ef9 Mon Sep 17 00:00:00 2001
From: Bagatur <22008038+baskaryan@users.noreply.github.com>
Date: Fri, 4 Aug 2023 08:13:32 -0700
Subject: [PATCH] Revert "add filter to sklearn vector store functions (#8113)"
 (#8760)

---
 .../integrations/vectorstores/sklearn.ipynb   | 37 ++-------
 .../langchain/vectorstores/sklearn.py         | 80 ++++---------------
 .../unit_tests/vectorstores/test_sklearn.py   | 40 ++--------
 3 files changed, 26 insertions(+), 131 deletions(-)

diff --git a/docs/extras/integrations/vectorstores/sklearn.ipynb b/docs/extras/integrations/vectorstores/sklearn.ipynb
index ea86d68da1..b93c734a74 100644
--- a/docs/extras/integrations/vectorstores/sklearn.ipynb
+++ b/docs/extras/integrations/vectorstores/sklearn.ipynb
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -56,7 +56,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -65,7 +65,7 @@
     "from langchain.vectorstores import SKLearnVectorStore\n",
     "from langchain.document_loaders import TextLoader\n",
     "\n",
-    "loader = TextLoader(\"../../../extras/modules/state_of_the_union.txt\")\n",
+    "loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
     "documents = loader.load()\n",
     "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
     "docs = text_splitter.split_documents(documents)\n",
@@ -81,7 +81,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -100,7 +100,6 @@
    ],
    "source": [
     "import tempfile\n",
-    "import os\n",
     "\n",
     "persist_path = os.path.join(tempfile.gettempdir(), \"union.parquet\")\n",
     "\n",
@@ -185,32 +184,6 @@
     "print(docs[0].page_content)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Filter"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1\n"
-     ]
-    }
-   ],
-   "source": [
-    "_filter = {\"id\": \"c53e6eac-0070-403c-8435-a9e528539610\"}\n",
-    "docs = vector_store.similarity_search(query, filter=_filter)\n",
-    "print(len(docs))"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -244,7 +217,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.1"
+   "version": "3.10.6"
   }
  },
  "nbformat": 4,
diff --git a/libs/langchain/langchain/vectorstores/sklearn.py b/libs/langchain/langchain/vectorstores/sklearn.py
index 69845a2c70..dcc6237c25 100644
--- a/libs/langchain/langchain/vectorstores/sklearn.py
+++ b/libs/langchain/langchain/vectorstores/sklearn.py
@@ -233,66 +233,33 @@ class SKLearnVectorStore(VectorStore):
         return list(zip(neigh_idxs[0], neigh_dists[0]))
 
     def similarity_search_with_score(
-        self,
-        query: str,
-        *,
-        k: int = DEFAULT_K,
-        fetch_k: int = DEFAULT_FETCH_K,
-        filter: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
+        self, query: str, *, k: int = DEFAULT_K, **kwargs: Any
     ) -> List[Tuple[Document, float]]:
         query_embedding = self._embedding_function.embed_query(query)
         indices_dists = self._similarity_index_search_with_score(
-            query_embedding, k=fetch_k, **kwargs
+            query_embedding, k=k, **kwargs
         )
-
-        docs: List[Tuple[Document, float]] = []
-        for idx, dist in indices_dists:
-            doc = (
+        return [
+            (
                 Document(
                     page_content=self._texts[idx],
                     metadata={"id": self._ids[idx], **self._metadatas[idx]},
                 ),
                 dist,
             )
-
-            if filter is None:
-                docs.append(doc)
-            else:
-                filter = {
-                    key: [value] if not isinstance(value, list) else value
-                    for key, value in filter.items()
-                }
-                if all(
-                    doc[0].metadata.get(key) in value for key, value in filter.items()
-                ):
-                    docs.append(doc)
-        return docs[:k]
+            for idx, dist in indices_dists
+        ]
 
     def similarity_search(
-        self,
-        query: str,
-        k: int = DEFAULT_K,
-        fetch_k: int = DEFAULT_FETCH_K,
-        filter: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
+        self, query: str, k: int = DEFAULT_K, **kwargs: Any
     ) -> List[Document]:
-        docs_scores = self.similarity_search_with_score(
-            query, k=k, fetch_k=fetch_k, filter=filter, **kwargs
-        )
+        docs_scores = self.similarity_search_with_score(query, k=k, **kwargs)
         return [doc for doc, _ in docs_scores]
 
     def _similarity_search_with_relevance_scores(
-        self,
-        query: str,
-        k: int = DEFAULT_K,
-        fetch_k: int = DEFAULT_FETCH_K,
-        filter: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
+        self, query: str, k: int = DEFAULT_K, **kwargs: Any
     ) -> List[Tuple[Document, float]]:
-        docs_dists = self.similarity_search_with_score(
-            query, k=k, fetch_k=fetch_k, filter=filter, **kwargs
-        )
+        docs_dists = self.similarity_search_with_score(query, k=k, **kwargs)
         docs, dists = zip(*docs_dists)
         scores = [1 / math.exp(dist) for dist in dists]
         return list(zip(list(docs), scores))
@@ -303,7 +270,6 @@ class SKLearnVectorStore(VectorStore):
         k: int = DEFAULT_K,
         fetch_k: int = DEFAULT_FETCH_K,
         lambda_mult: float = 0.5,
-        filter: Optional[Dict[str, Any]] = None,
         **kwargs: Any,
     ) -> List[Document]:
         """Return docs selected using the maximal marginal relevance.
@@ -317,7 +283,6 @@ class SKLearnVectorStore(VectorStore):
                         of diversity among the results with 0 corresponding
                         to maximum diversity and 1 to minimum diversity.
                         Defaults to 0.5.
-            filter: (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
         Returns:
             List of Documents selected by maximal marginal relevance.
         """
@@ -329,28 +294,17 @@ class SKLearnVectorStore(VectorStore):
         mmr_selected = maximal_marginal_relevance(
             self._np.array(embedding, dtype=self._np.float32),
             result_embeddings,
-            k=fetch_k,
+            k=k,
             lambda_mult=lambda_mult,
         )
         mmr_indices = [indices[i] for i in mmr_selected]
-
-        docs = []
-        for idx in mmr_indices:
-            doc = Document(
+        return [
+            Document(
                 page_content=self._texts[idx],
                 metadata={"id": self._ids[idx], **self._metadatas[idx]},
             )
-            if filter is None:
-                docs.append(doc)
-            else:
-                filter = {
-                    key: [value] if not isinstance(value, list) else value
-                    for key, value in filter.items()
-                }
-                if all(doc.metadata.get(key) in value for key, value in filter.items()):
-                    docs.append(doc)
-
-        return docs[:k]
+            for idx in mmr_indices
+        ]
 
     def max_marginal_relevance_search(
         self,
@@ -358,7 +312,6 @@ class SKLearnVectorStore(VectorStore):
         k: int = DEFAULT_K,
         fetch_k: int = DEFAULT_FETCH_K,
         lambda_mult: float = 0.5,
-        filter: Optional[Dict[str, Any]] = None,
         **kwargs: Any,
     ) -> List[Document]:
         """Return docs selected using the maximal marginal relevance.
@@ -372,7 +325,6 @@ class SKLearnVectorStore(VectorStore):
                         of diversity among the results with 0 corresponding
                         to maximum diversity and 1 to minimum diversity.
                         Defaults to 0.5.
-            filter: (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
         Returns:
             List of Documents selected by maximal marginal relevance.
         """
@@ -383,7 +335,7 @@ class SKLearnVectorStore(VectorStore):
 
         embedding = self._embedding_function.embed_query(query)
         docs = self.max_marginal_relevance_search_by_vector(
-            embedding, k, fetch_k, lambda_mul=lambda_mult, filter=filter, **kwargs
+            embedding, k, fetch_k, lambda_mul=lambda_mult
         )
         return docs
 
diff --git a/libs/langchain/tests/unit_tests/vectorstores/test_sklearn.py b/libs/langchain/tests/unit_tests/vectorstores/test_sklearn.py
index b14d3b2db6..36bfca1e02 100644
--- a/libs/langchain/tests/unit_tests/vectorstores/test_sklearn.py
+++ b/libs/langchain/tests/unit_tests/vectorstores/test_sklearn.py
@@ -12,7 +12,7 @@ def test_sklearn() -> None:
     """Test end to end construction and search."""
     texts = ["foo", "bar", "baz"]
     docsearch = SKLearnVectorStore.from_texts(texts, FakeEmbeddings())
-    output = docsearch.similarity_search("foo", k=1, fetch_k=3)
+    output = docsearch.similarity_search("foo", k=1)
     assert len(output) == 1
     assert output[0].page_content == "foo"
 
@@ -27,24 +27,10 @@ def test_sklearn_with_metadatas() -> None:
         FakeEmbeddings(),
         metadatas=metadatas,
     )
-    output = docsearch.similarity_search("foo", k=1, fetch_k=3)
+    output = docsearch.similarity_search("foo", k=1)
     assert output[0].metadata["page"] == "0"
 
 
-@pytest.mark.requires("numpy", "sklearn")
-def test_sklearn_with_metadatas_and_filter() -> None:
-    """Test end to end construction and search."""
-    texts = ["foo", "foo", "bar", "baz"]
-    metadatas = [{"page": str(i)} for i in range(len(texts))]
-    docsearch = SKLearnVectorStore.from_texts(
-        texts,
-        FakeEmbeddings(),
-        metadatas=metadatas,
-    )
-    output = docsearch.similarity_search("foo", k=1, fetch_k=4, filter={"page": "1"})
-    assert output[0].metadata["page"] == "1"
-
-
 @pytest.mark.requires("numpy", "sklearn")
 def test_sklearn_with_metadatas_with_scores() -> None:
     """Test end to end construction and scored search."""
@@ -55,7 +41,7 @@ def test_sklearn_with_metadatas_with_scores() -> None:
         FakeEmbeddings(),
         metadatas=metadatas,
     )
-    output = docsearch.similarity_search_with_relevance_scores("foo", k=1, fetch_k=3)
+    output = docsearch.similarity_search_with_relevance_scores("foo", k=1)
     assert len(output) == 1
     doc, score = output[0]
     assert doc.page_content == "foo"
@@ -75,7 +61,7 @@ def test_sklearn_with_persistence(tmpdir: Path) -> None:
         serializer="json",
     )
 
-    output = docsearch.similarity_search("foo", k=1, fetch_k=3)
+    output = docsearch.similarity_search("foo", k=1)
     assert len(output) == 1
     assert output[0].page_content == "foo"
 
@@ -85,7 +71,7 @@ def test_sklearn_with_persistence(tmpdir: Path) -> None:
     docsearch = SKLearnVectorStore(
         FakeEmbeddings(), persist_path=str(persist_path), serializer="json"
     )
-    output = docsearch.similarity_search("foo", k=1, fetch_k=3)
+    output = docsearch.similarity_search("foo", k=1)
     assert len(output) == 1
     assert output[0].page_content == "foo"
 
@@ -112,19 +98,3 @@ def test_sklearn_mmr_by_vector() -> None:
     )
     assert len(output) == 1
     assert output[0].page_content == "foo"
-
-
-@pytest.mark.requires("numpy", "sklearn")
-def test_sklearn_mmr_with_metadata_and_filter() -> None:
-    """Test end to end construction and search."""
-    texts = ["foo", "foo", "bar", "baz"]
-    metadatas = [{"page": str(i)} for i in range(len(texts))]
-    docsearch = SKLearnVectorStore.from_texts(
-        texts, FakeEmbeddings(), metadatas=metadatas
-    )
-    output = docsearch.max_marginal_relevance_search(
-        "foo", k=1, fetch_k=4, filter={"page": "1"}
-    )
-    assert len(output) == 1
-    assert output[0].page_content == "foo"
-    assert output[0].metadata["page"] == "1"