Support similarity search by vector (in FAISS) (#961)

Alternate implementation to PR #960. Again, only FAISS is implemented.
If accepted, should this be added to the other vectorstores, or left
raising NotImplementedError for them? Suggestions welcome...
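
For reviewers, a minimal usage sketch of the new surface (mirroring the notebook cell and the added test; `OpenAIEmbeddings` is only an example and needs an API key, any `Embeddings` implementation works):

```python
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

embeddings = OpenAIEmbeddings()
docsearch = FAISS.from_texts(["foo", "bar", "baz"], embeddings)

# Embed the query yourself, then search with the raw vector instead of a string.
embedding_vector = embeddings.embed_query("foo")
docs = docsearch.similarity_search_by_vector(embedding_vector, k=2)

# A score-returning variant is exposed as well.
docs_and_scores = docsearch.similarity_search_with_score_by_vector(embedding_vector, k=2)
```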
seanaedmiston 2023-02-16 17:50:00 +11:00 committed by GitHub
parent 05ad399abe
commit f0a258555b
4 changed files with 144 additions and 23 deletions


@@ -297,6 +297,26 @@
"docs_and_scores[0]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "d5170563",
"metadata": {},
"source": [
"It is also possible to do a search for documents similar to a given embedding vector using `similarity_search_by_vector` which accepts an embedding vector as a parameter instead of a string."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7675b0aa",
"metadata": {},
"outputs": [],
"source": [
"embedding_vector = embeddings.embed_query(query)\n",
"docs_and_scores = docsearch.similarity_search_by_vector(embedding_vector)"
]
},
{
"cell_type": "markdown",
"id": "b386dbb8",


@@ -31,6 +31,20 @@ class VectorStore(ABC):
) -> List[Document]:
"""Return docs most similar to query."""
def similarity_search_by_vector(
self, embedding: List[float], k: int = 4, **kwargs: Any
) -> List[Document]:
"""Return docs most similar to embedding vector.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
Returns:
List of Documents most similar to the query vector.
"""
raise NotImplementedError
def max_marginal_relevance_search(
self, query: str, k: int = 4, fetch_k: int = 20
) -> List[Document]:
@@ -49,6 +63,24 @@ class VectorStore(ABC):
"""
raise NotImplementedError
def max_marginal_relevance_search_by_vector(
self, embedding: List[float], k: int = 4, fetch_k: int = 20
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
Returns:
List of Documents selected by maximal marginal relevance.
"""
raise NotImplementedError
@classmethod
def from_documents(
cls,

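If other vectorstores later pick this up instead of inheriting the `NotImplementedError` stub above, an override could look roughly like this (a hypothetical sketch, not part of this PR; `MyStore` and its `self._backend.query(...)` call stand in for whatever index a store wraps):

```python
from typing import Any, List

from langchain.docstore.document import Document
from langchain.vectorstores.base import VectorStore


class MyStore(VectorStore):
    """Illustrative only; assumes the store keeps a queryable index in self._backend."""

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        # Hand the caller-supplied vector straight to the underlying index,
        # skipping the embed_query step used by the string-based search.
        hits = self._backend.query(vector=embedding, top_k=k)
        return [Document(page_content=hit.text, metadata=hit.metadata) for hit in hits]
```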

@@ -92,6 +92,31 @@ class FAISS(VectorStore):
self.index_to_docstore_id.update(index_to_id)
return [_id for _, _id, _ in full_info]
def similarity_search_with_score_by_vector(
self, embedding: List[float], k: int = 4
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
Returns:
List of Documents most similar to the query and score for each
"""
scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
docs = []
for j, i in enumerate(indices[0]):
if i == -1:
# This happens when not enough docs are returned.
continue
_id = self.index_to_docstore_id[i]
doc = self.docstore.search(_id)
if not isinstance(doc, Document):
raise ValueError(f"Could not find document for id {_id}, got {doc}")
docs.append((doc, scores[0][j]))
return docs
def similarity_search_with_score(
self, query: str, k: int = 4
) -> List[Tuple[Document, float]]:
@@ -105,19 +130,24 @@ class FAISS(VectorStore):
List of Documents most similar to the query and score for each
"""
embedding = self.embedding_function(query)
scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
docs = []
for j, i in enumerate(indices[0]):
if i == -1:
# This happens when not enough docs are returned.
continue
_id = self.index_to_docstore_id[i]
doc = self.docstore.search(_id)
if not isinstance(doc, Document):
raise ValueError(f"Could not find document for id {_id}, got {doc}")
docs.append((doc, scores[0][j]))
docs = self.similarity_search_with_score_by_vector(embedding, k)
return docs
def similarity_search_by_vector(
self, embedding: List[float], k: int = 4, **kwargs: Any
) -> List[Document]:
"""Return docs most similar to embedding vector.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
Returns:
List of Documents most similar to the embedding.
"""
docs_and_scores = self.similarity_search_with_score_by_vector(embedding, k)
return [doc for doc, _ in docs_and_scores]
def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
@@ -133,6 +163,38 @@ class FAISS(VectorStore):
docs_and_scores = self.similarity_search_with_score(query, k)
return [doc for doc, _ in docs_and_scores]
def max_marginal_relevance_search_by_vector(
self, embedding: List[float], k: int = 4, fetch_k: int = 20
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
Returns:
List of Documents selected by maximal marginal relevance.
"""
_, indices = self.index.search(np.array([embedding], dtype=np.float32), fetch_k)
# -1 happens when not enough docs are returned.
embeddings = [self.index.reconstruct(int(i)) for i in indices[0] if i != -1]
mmr_selected = maximal_marginal_relevance(
np.array([embedding], dtype=np.float32), embeddings, k=k
)
selected_indices = [indices[0][i] for i in mmr_selected]
docs = []
for i in selected_indices:
_id = self.index_to_docstore_id[i]
doc = self.docstore.search(_id)
if not isinstance(doc, Document):
raise ValueError(f"Could not find document for id {_id}, got {doc}")
docs.append(doc)
return docs
def max_marginal_relevance_search(
self, query: str, k: int = 4, fetch_k: int = 20
) -> List[Document]:
@@ -150,18 +212,7 @@ class FAISS(VectorStore):
List of Documents selected by maximal marginal relevance.
"""
embedding = self.embedding_function(query)
_, indices = self.index.search(np.array([embedding], dtype=np.float32), fetch_k)
# -1 happens when not enough docs are returned.
embeddings = [self.index.reconstruct(int(i)) for i in indices[0] if i != -1]
mmr_selected = maximal_marginal_relevance(embedding, embeddings, k=k)
selected_indices = [indices[0][i] for i in mmr_selected]
docs = []
for i in selected_indices:
_id = self.index_to_docstore_id[i]
doc = self.docstore.search(_id)
if not isinstance(doc, Document):
raise ValueError(f"Could not find document for id {_id}, got {doc}")
docs.append(doc)
docs = self.max_marginal_relevance_search_by_vector(embedding, k, fetch_k)
return docs
@classmethod

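Net effect of the FAISS changes above: the string-based entry points now embed the query and delegate to the new by-vector methods, so with a deterministic embedding function both call styles return the same documents. A small sketch, reusing the `embeddings` and `docsearch` objects from the example in the description (with a live API embedder the vectors, and therefore the results, could differ slightly between calls):

```python
query = "foo"
embedding_vector = embeddings.embed_query(query)

# similarity_search(query) now embeds the query and calls similarity_search_by_vector.
assert docsearch.similarity_search(query, k=1) == \
    docsearch.similarity_search_by_vector(embedding_vector, k=1)

# max_marginal_relevance_search(query) delegates the same way.
assert docsearch.max_marginal_relevance_search(query, k=1, fetch_k=3) == \
    docsearch.max_marginal_relevance_search_by_vector(embedding_vector, k=1, fetch_k=3)
```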

@@ -27,6 +27,24 @@ def test_faiss() -> None:
assert output == [Document(page_content="foo")]
def test_faiss_vector_sim() -> None:
"""Test vector similarity."""
texts = ["foo", "bar", "baz"]
docsearch = FAISS.from_texts(texts, FakeEmbeddings())
index_to_id = docsearch.index_to_docstore_id
expected_docstore = InMemoryDocstore(
{
index_to_id[0]: Document(page_content="foo"),
index_to_id[1]: Document(page_content="bar"),
index_to_id[2]: Document(page_content="baz"),
}
)
assert docsearch.docstore.__dict__ == expected_docstore.__dict__
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.similarity_search_by_vector(query_vec, k=1)
assert output == [Document(page_content="foo")]
def test_faiss_with_metadatas() -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]