Support similarity search by vector (in FAISS) (#961)

Alternate implementation to PR #960 Again - only FAISS is implemented. If accepted can add this to other vectorstores or leave as NotImplemented? Suggestions welcome...
2024-11-08 07:10:35 +00:00 · 2023-02-16 17:50:00 +11:00 · 2023-02-16 17:50:00 +11:00 · f0a258555b
commit f0a258555b
parent 05ad399abe
4 changed files with 144 additions and 23 deletions
--- a/docs/modules/utils/combine_docs_examples/vectorstores.ipynb
+++ b/docs/modules/utils/combine_docs_examples/vectorstores.ipynb
@ -297,6 +297,26 @@
    "docs_and_scores[0]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "d5170563",
   "metadata": {},
   "source": [
    "It is also possible to do a search for documents similar to a given embedding vector using `similarity_search_by_vector` which accepts an embedding vector as a parameter instead of a string."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7675b0aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "embedding_vector = embeddings.embed_query(query)\n",
    "docs_and_scores = docsearch.similarity_search_by_vector(embedding_vector)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b386dbb8",
--- a/langchain/vectorstores/base.py
+++ b/langchain/vectorstores/base.py
@ -31,6 +31,20 @@ class VectorStore(ABC):
    ) -> List[Document]:
        """Return docs most similar to query."""
    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to embedding vector.
        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
        Returns:
            List of Documents most similar to the query vector.
        """
        raise NotImplementedError
    def max_marginal_relevance_search(
        self, query: str, k: int = 4, fetch_k: int = 20
    ) -> List[Document]:
@ -49,6 +63,24 @@ class VectorStore(ABC):
        """
        raise NotImplementedError
    def max_marginal_relevance_search_by_vector(
        self, embedding: List[float], k: int = 4, fetch_k: int = 20
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.
        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.
        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        raise NotImplementedError
    @classmethod
    def from_documents(
        cls,
--- a/langchain/vectorstores/faiss.py
+++ b/langchain/vectorstores/faiss.py
@ -92,6 +92,31 @@ class FAISS(VectorStore):
        self.index_to_docstore_id.update(index_to_id)
        return [_id for _, _id, _ in full_info]
    def similarity_search_with_score_by_vector(
        self, embedding: List[float], k: int = 4
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.
        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
        Returns:
            List of Documents most similar to the query and score for each
        """
        scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
        docs = []
        for j, i in enumerate(indices[0]):
            if i == -1:
                # This happens when not enough docs are returned.
                continue
            _id = self.index_to_docstore_id[i]
            doc = self.docstore.search(_id)
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {_id}, got {doc}")
            docs.append((doc, scores[0][j]))
        return docs
    def similarity_search_with_score(
        self, query: str, k: int = 4
    ) -> List[Tuple[Document, float]]:
@ -105,19 +130,24 @@ class FAISS(VectorStore):
            List of Documents most similar to the query and score for each
        """
        embedding = self.embedding_function(query)
-        scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
+        docs = self.similarity_search_with_score_by_vector(embedding, k)
        docs = []
        for j, i in enumerate(indices[0]):
            if i == -1:
                # This happens when not enough docs are returned.
                continue
            _id = self.index_to_docstore_id[i]
            doc = self.docstore.search(_id)
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {_id}, got {doc}")
            docs.append((doc, scores[0][j]))
        return docs
    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to embedding vector.
        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
        Returns:
            List of Documents most similar to the embedding.
        """
        docs_and_scores = self.similarity_search_with_score_by_vector(embedding, k)
        return [doc for doc, _ in docs_and_scores]
    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
@ -133,6 +163,38 @@ class FAISS(VectorStore):
        docs_and_scores = self.similarity_search_with_score(query, k)
        return [doc for doc, _ in docs_and_scores]
    def max_marginal_relevance_search_by_vector(
        self, embedding: List[float], k: int = 4, fetch_k: int = 20
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.
        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.
        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        _, indices = self.index.search(np.array([embedding], dtype=np.float32), fetch_k)
        # -1 happens when not enough docs are returned.
        embeddings = [self.index.reconstruct(int(i)) for i in indices[0] if i != -1]
        mmr_selected = maximal_marginal_relevance(
            np.array([embedding], dtype=np.float32), embeddings, k=k
        )
        selected_indices = [indices[0][i] for i in mmr_selected]
        docs = []
        for i in selected_indices:
            _id = self.index_to_docstore_id[i]
            doc = self.docstore.search(_id)
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {_id}, got {doc}")
            docs.append(doc)
        return docs
    def max_marginal_relevance_search(
        self, query: str, k: int = 4, fetch_k: int = 20
    ) -> List[Document]:
@ -150,18 +212,7 @@ class FAISS(VectorStore):
            List of Documents selected by maximal marginal relevance.
        """
        embedding = self.embedding_function(query)
-        _, indices = self.index.search(np.array([embedding], dtype=np.float32), fetch_k)
+        docs = self.max_marginal_relevance_search_by_vector(embedding, k, fetch_k)
        # -1 happens when not enough docs are returned.
        embeddings = [self.index.reconstruct(int(i)) for i in indices[0] if i != -1]
        mmr_selected = maximal_marginal_relevance(embedding, embeddings, k=k)
        selected_indices = [indices[0][i] for i in mmr_selected]
        docs = []
        for i in selected_indices:
            _id = self.index_to_docstore_id[i]
            doc = self.docstore.search(_id)
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {_id}, got {doc}")
            docs.append(doc)
        return docs
    @classmethod
--- a/tests/integration_tests/vectorstores/test_faiss.py
+++ b/tests/integration_tests/vectorstores/test_faiss.py
@ -27,6 +27,24 @@ def test_faiss() -> None:
    assert output == [Document(page_content="foo")]
 def test_faiss_vector_sim() -> None:
    """Test vector similarity."""
    texts = ["foo", "bar", "baz"]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings())
    index_to_id = docsearch.index_to_docstore_id
    expected_docstore = InMemoryDocstore(
        {
            index_to_id[0]: Document(page_content="foo"),
            index_to_id[1]: Document(page_content="bar"),
            index_to_id[2]: Document(page_content="baz"),
        }
    )
    assert docsearch.docstore.__dict__ == expected_docstore.__dict__
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.similarity_search_by_vector(query_vec, k=1)
    assert output == [Document(page_content="foo")]
 def test_faiss_with_metadatas() -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]