Bagatur/refac faiss (#9076)

Code cleanup, plus a bug fix in `delete`: FAISS compacts the index when vectors are removed, so `index_to_docstore_id` is now re-enumerated after deletion instead of being left with stale positions.
Bagatur authored 11 months ago, committed via GitHub
parent 3eccd72382
commit 358562769a

@@ -7,7 +7,16 @@ import pickle
import uuid
import warnings
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
from typing import (
Any,
Callable,
Dict,
Iterable,
List,
Optional,
Sized,
Tuple,
)
import numpy as np
@@ -46,16 +55,29 @@ def dependable_faiss_import(no_avx2: Optional[bool] = None) -> Any:
return faiss
def _len_check_if_sized(x: Any, y: Any, x_name: str, y_name: str) -> None:
if isinstance(x, Sized) and isinstance(y, Sized) and len(x) != len(y):
raise ValueError(
f"{x_name} and {y_name} expected to be equal length but "
f"len({x_name})={len(x)} and len({y_name})={len(y)}"
)
return
class FAISS(VectorStore):
"""Wrapper around FAISS vector database.
To use, you should have the ``faiss`` python package installed.
To use, you must have the ``faiss`` python package installed.
Example:
.. code-block:: python
from langchain import FAISS
faiss = FAISS(embedding_function, index, docstore, index_to_docstore_id)
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
embeddings = OpenAIEmbeddings()
texts = ["FAISS is an important library", "LangChain supports FAISS"]
faiss = FAISS.from_texts(texts, embeddings)
"""
@@ -87,44 +109,43 @@ class FAISS(VectorStore):
)
)
@property
def embeddings(self) -> Optional[Embeddings]:
# TODO: Accept embeddings object directly
return None
def __add(
self,
texts: Iterable[str],
embeddings: Iterable[List[float]],
metadatas: Optional[List[dict]] = None,
metadatas: Optional[Iterable[dict]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
faiss = dependable_faiss_import()
if not isinstance(self.docstore, AddableMixin):
raise ValueError(
"If trying to add texts, the underlying docstore should support "
f"adding items, which {self.docstore} does not"
)
documents = []
for i, text in enumerate(texts):
metadata = metadatas[i] if metadatas else {}
documents.append(Document(page_content=text, metadata=metadata))
if ids is None:
ids = [str(uuid.uuid4()) for _ in texts]
# Add to the index, the index_to_id mapping, and the docstore.
starting_len = len(self.index_to_docstore_id)
faiss = dependable_faiss_import()
_len_check_if_sized(texts, metadatas, "texts", "metadatas")
_metadatas = metadatas or ({} for _ in texts)
documents = [
Document(page_content=t, metadata=m) for t, m in zip(texts, _metadatas)
]
_len_check_if_sized(documents, embeddings, "documents", "embeddings")
_len_check_if_sized(documents, ids, "documents", "ids")
# Add to the index.
vector = np.array(embeddings, dtype=np.float32)
if self._normalize_L2:
faiss.normalize_L2(vector)
self.index.add(vector)
# Get list of index, id, and docs.
full_info = [(starting_len + i, ids[i], doc) for i, doc in enumerate(documents)]
# Add information to docstore and index.
self.docstore.add({_id: doc for _, _id, doc in full_info})
index_to_id = {index: _id for index, _id, _ in full_info}
ids = ids or [str(uuid.uuid4()) for _ in texts]
self.docstore.add({id_: doc for id_, doc in zip(ids, documents)})
starting_len = len(self.index_to_docstore_id)
index_to_id = {starting_len + j: id_ for j, id_ in enumerate(ids)}
self.index_to_docstore_id.update(index_to_id)
return [_id for _, _id, _ in full_info]
return ids
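
Because the `_len_check_if_sized` calls run before anything is written to the index or docstore, a mismatched `metadatas` argument now fails with an explicit `ValueError` naming both lengths instead of an opaque `IndexError` from `metadatas[i]`. A hedged sketch using the fake embeddings from the test file below (the import paths are an assumption about the test layout):

import pytest
from langchain.vectorstores.faiss import FAISS
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings

store = FAISS.from_texts(["foo", "bar"], FakeEmbeddings())
with pytest.raises(ValueError):
    # Two texts but one metadata dict: _len_check_if_sized raises before
    # the index or docstore is mutated.
    store.add_texts(["baz", "qux"], metadatas=[{"page": 0}])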
def add_texts(
self,
@@ -143,14 +164,8 @@ class FAISS(VectorStore):
Returns:
List of ids from adding the texts into the vectorstore.
"""
if not isinstance(self.docstore, AddableMixin):
raise ValueError(
"If trying to add texts, the underlying docstore should support "
f"adding items, which {self.docstore} does not"
)
# Embed and create the documents.
embeddings = [self.embedding_function(text) for text in texts]
return self.__add(texts, embeddings, metadatas=metadatas, ids=ids, **kwargs)
return self.__add(texts, embeddings, metadatas=metadatas, ids=ids)
def add_embeddings(
self,
@@ -170,15 +185,9 @@ class FAISS(VectorStore):
Returns:
List of ids from adding the texts into the vectorstore.
"""
if not isinstance(self.docstore, AddableMixin):
raise ValueError(
"If trying to add texts, the underlying docstore should support "
f"adding items, which {self.docstore} does not"
)
# Embed and create the documents.
texts, embeddings = zip(*text_embeddings)
return self.__add(texts, embeddings, metadatas=metadatas, ids=ids, **kwargs)
return self.__add(texts, embeddings, metadatas=metadatas, ids=ids)
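
Both `add_texts` and `add_embeddings` now funnel through `__add`, which holds the single `AddableMixin` docstore check, and the `Iterable` signatures mean a lazy stream of pairs is accepted. A small sketch, again assuming the fake embeddings from the tests:

from langchain.vectorstores.faiss import FAISS
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings

emb = FakeEmbeddings()
store = FAISS.from_texts(["seed"], emb)
texts = ["foo", "bar"]
# A lazy zip of pairs is fine: add_embeddings unpacks it into tuples first.
ids = store.add_embeddings(zip(texts, emb.embed_documents(texts)))
assert len(ids) == 2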
def similarity_search_with_score_by_vector(
self,
@@ -480,22 +489,26 @@ class FAISS(VectorStore):
"""
if ids is None:
raise ValueError("No ids provided to delete.")
missing_ids = set(ids).difference(self.index_to_docstore_id.values())
if missing_ids:
raise ValueError(
f"Some specified ids do not exist in the current store. Ids not found: "
f"{missing_ids}"
)
overlapping = set(ids).intersection(self.index_to_docstore_id.values())
if not overlapping:
raise ValueError("ids do not exist in the current object")
_reversed_index = {v: k for k, v in self.index_to_docstore_id.items()}
index_to_delete = [_reversed_index[i] for i in ids]
reversed_index = {id_: idx for idx, id_ in self.index_to_docstore_id.items()}
index_to_delete = [reversed_index[id_] for id_ in ids]
# Removing ids from index.
self.index.remove_ids(np.array(index_to_delete, dtype=np.int64))
for _id in index_to_delete:
del self.index_to_docstore_id[_id]
# Remove items from docstore.
self.docstore.delete(ids)
remaining_ids = [
id_
for i, id_ in sorted(self.index_to_docstore_id.items())
if i not in index_to_delete
]
self.index_to_docstore_id = {i: id_ for i, id_ in enumerate(remaining_ids)}
return True
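
This is the deletion bug fix: `IndexFlat.remove_ids` compacts the remaining vectors downward, so the old code's `del self.index_to_docstore_id[_id]` left every surviving entry pointing at a stale position. Re-enumerating `index_to_docstore_id` keeps the mapping aligned. A sketch of the underlying FAISS behavior (assumes faiss-cpu is installed):

import faiss
import numpy as np

index = faiss.IndexFlatL2(2)
index.add(np.array([[0, 0], [1, 1], [2, 2]], dtype=np.float32))
index.remove_ids(np.array([1], dtype=np.int64))
print(index.ntotal)          # 2: the index compacted
print(index.reconstruct(1))  # [2. 2.]: the old position-2 vector moved down,
                             # so a mapping of {0: "a", 2: "c"} would now dangle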
def merge_from(self, target: FAISS) -> None:
@@ -533,50 +546,32 @@ class FAISS(VectorStore):
@classmethod
def __from(
cls,
texts: List[str],
texts: Iterable[str],
embeddings: List[List[float]],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
metadatas: Optional[Iterable[dict]] = None,
ids: Optional[List[str]] = None,
normalize_L2: bool = False,
distance_strategy: DistanceStrategy = DistanceStrategy.EUCLIDEAN_DISTANCE,
**kwargs: Any,
) -> FAISS:
faiss = dependable_faiss_import()
distance_strategy = kwargs.get(
"distance_strategy", DistanceStrategy.EUCLIDEAN_DISTANCE
)
if distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT:
index = faiss.IndexFlatIP(len(embeddings[0]))
else:
# Default to L2, currently other metric types not initialized.
index = faiss.IndexFlatL2(len(embeddings[0]))
vector = np.array(embeddings, dtype=np.float32)
if normalize_L2 and distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
faiss.normalize_L2(vector)
index.add(vector)
documents = []
if ids is None:
ids = [str(uuid.uuid4()) for _ in texts]
for i, text in enumerate(texts):
metadata = metadatas[i] if metadatas else {}
documents.append(Document(page_content=text, metadata=metadata))
index_to_id = dict(enumerate(ids))
if len(index_to_id) != len(documents):
raise Exception(
f"{len(index_to_id)} ids provided for {len(documents)} documents."
" Each document should have an id."
)
docstore = InMemoryDocstore(dict(zip(index_to_id.values(), documents)))
return cls(
vecstore = cls(
embedding.embed_query,
index,
docstore,
index_to_id,
InMemoryDocstore(),
{},
normalize_L2=normalize_L2,
distance_strategy=distance_strategy,
**kwargs,
)
vecstore.__add(texts, embeddings, metadatas=metadatas, ids=ids)
return vecstore
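
`__from` now builds an empty index plus an empty `InMemoryDocstore` and delegates insertion to `__add`, so every constructor path gets the same length validation: the old post-hoc `Exception` ("Each document should have an id.") becomes an upfront `ValueError`. A sketch, with the fake embeddings as above:

from langchain.vectorstores.faiss import FAISS
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings

try:
    FAISS.from_texts(["a", "b", "c"], FakeEmbeddings(), ids=["x", "y"])
except ValueError as err:
    print(err)  # documents and ids expected to be equal length but ...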
@classmethod
def from_texts(
@@ -601,6 +596,7 @@ class FAISS(VectorStore):
from langchain import FAISS
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
faiss = FAISS.from_texts(texts, embeddings)
"""
@@ -617,9 +613,9 @@ class FAISS(VectorStore):
@classmethod
def from_embeddings(
cls,
text_embeddings: List[Tuple[str, List[float]]],
text_embeddings: Iterable[Tuple[str, List[float]]],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
metadatas: Optional[Iterable[dict]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> FAISS:
@@ -637,9 +633,10 @@ class FAISS(VectorStore):
from langchain import FAISS
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
text_embeddings = embeddings.embed_documents(texts)
text_embedding_pairs = list(zip(texts, text_embeddings))
text_embedding_pairs = zip(texts, text_embeddings)
faiss = FAISS.from_embeddings(text_embedding_pairs, embeddings)
"""
texts = [t[0] for t in text_embeddings]

@@ -10477,7 +10477,7 @@ clarifai = ["clarifai"]
cohere = ["cohere"]
docarray = ["docarray"]
embeddings = ["sentence-transformers"]
extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"]
extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"]
javascript = ["esprima"]
llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"]
openai = ["openai", "tiktoken"]
@@ -10487,4 +10487,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "fb53fa05a5258de15427c0f69f2070265842bd530f139ed4e0ed71cd3b70ad36"
content-hash = "6e85bdaca0b4a62bace541dd914266b49a4d7f90c7be2030fab639bf7efc23c6"

@@ -334,6 +334,7 @@ extended_testing = [
"feedparser",
"xata",
"xmltodict",
"faiss-cpu",
]
[tool.ruff]

@@ -7,11 +7,12 @@ import pytest
from langchain.docstore.document import Document
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.docstore.wikipedia import Wikipedia
from langchain.vectorstores.faiss import FAISS
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
from tests.unit_tests.agents.test_react import FakeDocstore
@pytest.mark.requires("faiss")
def test_faiss() -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
@@ -29,6 +30,7 @@ def test_faiss() -> None:
assert output == [Document(page_content="foo")]
@pytest.mark.requires("faiss")
def test_faiss_vector_sim() -> None:
"""Test vector similarity."""
texts = ["foo", "bar", "baz"]
@@ -47,6 +49,7 @@ def test_faiss_vector_sim() -> None:
assert output == [Document(page_content="foo")]
@pytest.mark.requires("faiss")
def test_faiss_vector_sim_with_score_threshold() -> None:
"""Test vector similarity."""
texts = ["foo", "bar", "baz"]
@@ -65,6 +68,7 @@ def test_faiss_vector_sim_with_score_threshold() -> None:
assert output == [Document(page_content="foo")]
@pytest.mark.requires("faiss")
def test_similarity_search_with_score_by_vector() -> None:
"""Test vector similarity with score by vector."""
texts = ["foo", "bar", "baz"]
@@ -84,6 +88,7 @@ def test_similarity_search_with_score_by_vector() -> None:
assert output[0][0] == Document(page_content="foo")
@pytest.mark.requires("faiss")
def test_similarity_search_with_score_by_vector_with_score_threshold() -> None:
"""Test vector similarity with score by vector."""
texts = ["foo", "bar", "baz"]
@@ -108,6 +113,7 @@ def test_similarity_search_with_score_by_vector_with_score_threshold() -> None:
assert output[0][1] < 0.2
@pytest.mark.requires("faiss")
def test_faiss_mmr() -> None:
texts = ["foo", "foo", "fou", "foy"]
docsearch = FAISS.from_texts(texts, FakeEmbeddings())
@@ -122,6 +128,7 @@ def test_faiss_mmr() -> None:
assert output[1][0] != Document(page_content="foo")
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
@@ -136,6 +143,7 @@ def test_faiss_mmr_with_metadatas() -> None:
assert output[1][0] != Document(page_content="foo", metadata={"page": 0})
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_filter() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
@@ -149,6 +157,7 @@ def test_faiss_mmr_with_metadatas_and_filter() -> None:
assert output[0][1] == 0.0
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_list_filter() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} if i <= 3 else {"page": 3} for i in range(len(texts))]
@@ -163,6 +172,7 @@ def test_faiss_mmr_with_metadatas_and_list_filter() -> None:
assert output[1][0] != Document(page_content="foo", metadata={"page": 0})
@pytest.mark.requires("faiss")
def test_faiss_with_metadatas() -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
@@ -186,6 +196,7 @@ def test_faiss_with_metadatas() -> None:
assert output == [Document(page_content="foo", metadata={"page": 0})]
@pytest.mark.requires("faiss")
def test_faiss_with_metadatas_and_filter() -> None:
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
@@ -208,6 +219,7 @@ def test_faiss_with_metadatas_and_filter() -> None:
assert output == [Document(page_content="bar", metadata={"page": 1})]
@pytest.mark.requires("faiss")
def test_faiss_with_metadatas_and_list_filter() -> None:
texts = ["foo", "bar", "baz", "foo", "qux"]
metadatas = [{"page": i} if i <= 3 else {"page": 3} for i in range(len(texts))]
@@ -236,6 +248,7 @@ def test_faiss_with_metadatas_and_list_filter() -> None:
assert output == [Document(page_content="foo", metadata={"page": 0})]
@pytest.mark.requires("faiss")
def test_faiss_search_not_found() -> None:
"""Test what happens when document is not found."""
texts = ["foo", "bar", "baz"]
@@ -246,6 +259,7 @@ def test_faiss_search_not_found() -> None:
docsearch.similarity_search("foo")
@pytest.mark.requires("faiss")
def test_faiss_add_texts() -> None:
"""Test end to end adding of texts."""
# Create initial doc store.
@@ -257,13 +271,15 @@ def test_faiss_add_texts() -> None:
assert output == [Document(page_content="foo"), Document(page_content="foo")]
@pytest.mark.requires("faiss")
def test_faiss_add_texts_not_supported() -> None:
"""Test adding of texts to a docstore that doesn't support it."""
docsearch = FAISS(FakeEmbeddings().embed_query, None, Wikipedia(), {})
docsearch = FAISS(FakeEmbeddings().embed_query, None, FakeDocstore(), {})
with pytest.raises(ValueError):
docsearch.add_texts(["foo"])
@pytest.mark.requires("faiss")
def test_faiss_local_save_load() -> None:
"""Test end to end serialization."""
texts = ["foo", "bar", "baz"]
@@ -275,6 +291,7 @@ def test_faiss_local_save_load() -> None:
assert new_docsearch.index is not None
@pytest.mark.requires("faiss")
def test_faiss_similarity_search_with_relevance_scores() -> None:
"""Test the similarity search with normalized similarities."""
texts = ["foo", "bar", "baz"]
@@ -289,6 +306,7 @@ def test_faiss_similarity_search_with_relevance_scores() -> None:
assert score == 1.0
@pytest.mark.requires("faiss")
def test_faiss_similarity_search_with_relevance_scores_with_threshold() -> None:
"""Test the similarity search with normalized similarities with score threshold."""
texts = ["foo", "bar", "baz"]
@@ -306,6 +324,7 @@ def test_faiss_similarity_search_with_relevance_scores_with_threshold() -> None:
assert score == 1.0
@pytest.mark.requires("faiss")
def test_faiss_invalid_normalize_fn() -> None:
"""Test the similarity search with normalized similarities."""
texts = ["foo", "bar", "baz"]
@@ -316,12 +335,22 @@ def test_faiss_invalid_normalize_fn() -> None:
docsearch.similarity_search_with_relevance_scores("foo", k=1)
@pytest.mark.requires("faiss")
def test_missing_normalize_score_fn() -> None:
"""Test doesn't perform similarity search without a valid distance strategy."""
texts = ["foo", "bar", "baz"]
faiss_instance = FAISS.from_texts(texts, FakeEmbeddings(), distance_strategy="fake")
with pytest.raises(ValueError):
texts = ["foo", "bar", "baz"]
faiss_instance = FAISS.from_texts(
texts, FakeEmbeddings(), distance_strategy="fake"
)
faiss_instance.similarity_search_with_relevance_scores("foo", k=2)
@pytest.mark.requires("faiss")
def test_delete() -> None:
"""Test the similarity search with normalized similarities."""
ids = ["a", "b", "c"]
docsearch = FAISS.from_texts(["foo", "bar", "baz"], FakeEmbeddings(), ids=ids)
docsearch.delete(ids[1:2])
result = docsearch.similarity_search("bar", k=2)
assert sorted([d.page_content for d in result]) == ["baz", "foo"]
assert docsearch.index_to_docstore_id == {0: ids[0], 1: ids[2]}
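
A companion test, not part of this diff, sketching the new missing-id error path in `delete` (the test name is hypothetical; the `match` string follows the ValueError added above):

import pytest
from langchain.vectorstores.faiss import FAISS
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings

@pytest.mark.requires("faiss")
def test_delete_missing_id() -> None:
    """Hypothetical: unknown ids are reported explicitly and nothing is removed."""
    docsearch = FAISS.from_texts(["foo"], FakeEmbeddings(), ids=["a"])
    with pytest.raises(ValueError, match="Ids not found"):
        docsearch.delete(["b"])
    assert docsearch.index_to_docstore_id == {0: "a"}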