From 33e77a1007f344831783858559a3501a33e4ff66 Mon Sep 17 00:00:00 2001 From: Pham Vu Thai Minh Date: Tue, 31 Oct 2023 05:08:53 +0700 Subject: [PATCH] Async support for FAISS (#11333) Following this tutorial about using OpenAI Embeddings with FAISS https://python.langchain.com/docs/integrations/vectorstores/faiss ```python from langchain.embeddings.openai import OpenAIEmbeddings from langchain.text_splitter import CharacterTextSplitter from langchain.vectorstores import FAISS from langchain.document_loaders import TextLoader from langchain.document_loaders import TextLoader loader = TextLoader("../../../extras/modules/state_of_the_union.txt") documents = loader.load() text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) docs = text_splitter.split_documents(documents) embeddings = OpenAIEmbeddings() ``` This works fine ```python db = FAISS.from_documents(docs, embeddings) query = "What did the president say about Ketanji Brown Jackson" docs = db.similarity_search(query) ``` But the async version does not ```python db = await FAISS.afrom_documents(docs, embeddings) # NotImplementedError query = "What did the president say about Ketanji Brown Jackson" docs = await db.asimilarity_search(query) # this will use await asyncio.get_event_loop().run_in_executor under the hood and will not call OpenAIEmbeddings.aembed_query but call OpenAIEmbeddings.embed_query ``` So this PR adds async/await support for FAISS --------- Co-authored-by: Eugene Yurtsev --- .../vectorstores/async_faiss.ipynb | 604 ++++++++++++++++++ .../langchain/langchain/schema/vectorstore.py | 88 ++- .../langchain/langchain/vectorstores/faiss.py | 366 ++++++++++- .../unit_tests/vectorstores/test_faiss.py | 373 ++++++++++- 4 files changed, 1421 insertions(+), 10 deletions(-) create mode 100644 docs/docs/integrations/vectorstores/async_faiss.ipynb diff --git a/docs/docs/integrations/vectorstores/async_faiss.ipynb b/docs/docs/integrations/vectorstores/async_faiss.ipynb new file mode 100644 index 
0000000000..969c3b3130 --- /dev/null +++ b/docs/docs/integrations/vectorstores/async_faiss.ipynb @@ -0,0 +1,604 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "683953b3", + "metadata": {}, + "source": [ + "# Faiss\n", + "\n", + ">[Facebook AI Similarity Search (Faiss)](https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/) is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also contains supporting code for evaluation and parameter tuning.\n", + "\n", + "[Faiss documentation](https://faiss.ai/).\n", + "\n", + "This notebook shows how to use functionality related to the `FAISS` vector database using asyncio.\n", + "\n", + "See synchronous version [here](https://python.langchain.com/docs/integrations/vectorstores/faiss)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "497fcd89-e832-46a7-a74a-c71199666206", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install faiss-gpu # For CUDA 7.5+ Supported GPU's.\n", + "# OR\n", + "!pip install faiss-cpu # For CPU Installation" + ] + }, + { + "cell_type": "markdown", + "id": "38237514-b3fa-44a4-9cff-30cd6bf50073", + "metadata": {}, + "source": [ + "We want to use OpenAIEmbeddings so we have to get the OpenAI API Key. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "47f9b495-88f1-4286-8d5d-1416103931a7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "import getpass\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")\n", + "\n", + "# Uncomment the following line if you need to initialize FAISS with no AVX2 optimization\n", + "# os.environ['FAISS_NO_AVX2'] = '1'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "aac9563e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import FAISS\n", + "from langchain.document_loaders import TextLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a3c3999a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import TextLoader\n", + "\n", + "loader = TextLoader(\"../../../extras/modules/state_of_the_union.txt\")\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "\n", + "embeddings = OpenAIEmbeddings()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5eabdb75", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "db = await FAISS.afrom_documents(docs, embeddings)\n", + "\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = await db.asimilarity_search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4b172de8", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. 
And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" + ] + } + ], + "source": [ + "print(docs[0].page_content)" + ] + }, + { + "cell_type": "markdown", + "id": "f13473b5", + "metadata": {}, + "source": [ + "## Similarity Search with score\n", + "There are some FAISS specific methods. One of them is `similarity_search_with_score`, which allows you to return not only the documents but also the distance score of the query to them. The returned distance score is L2 distance. Therefore, a lower score is better." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "186ee1d8", + "metadata": {}, + "outputs": [], + "source": [ + "docs_and_scores = await db.asimilarity_search_with_score(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "284e04b5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. 
Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': './state_of_the_union.txt'}),\n", + " 0.36871302)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs_and_scores[0]" + ] + }, + { + "cell_type": "markdown", + "id": "f34420cf", + "metadata": {}, + "source": [ + "It is also possible to do a search for documents similar to a given embedding vector using `similarity_search_by_vector` which accepts an embedding vector as a parameter instead of a string." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b558ebb7", + "metadata": {}, + "outputs": [], + "source": [ + "embedding_vector = await embeddings.aembed_query(query)\n", + "docs_and_scores = await db.asimilarity_search_by_vector(embedding_vector)" + ] + }, + { + "cell_type": "markdown", + "id": "31bda7fd", + "metadata": {}, + "source": [ + "## Saving and loading\n", + "You can also save and load a FAISS index. This is useful so you don't have to recreate it everytime you use it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "428a6816", + "metadata": {}, + "outputs": [], + "source": [ + "db.save_local(\"faiss_index\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "56d1841c", + "metadata": {}, + "outputs": [], + "source": [ + "new_db = FAISS.load_local(\"faiss_index\", embeddings, asynchronous=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "39055525", + "metadata": {}, + "outputs": [], + "source": [ + "docs = await new_db.asimilarity_search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "98378c4e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': './state_of_the_union.txt'})" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0]" + ] + }, + { + "cell_type": "markdown", + "id": "30c8f57b", + "metadata": {}, + "source": [ + "# Serializing and De-Serializing to bytes\n", + "\n", + "you can pickle the FAISS Index by these functions. 
If you use an embeddings model that is 90 MB in size (sentence-transformers/all-MiniLM-L6-v2 or any other model), the resulting pickle would be larger than 90 MB, because the size of the model is also included in the overall size. To overcome this, use the functions below. These functions serialize only the FAISS index, so the size is much smaller. This can be helpful if you wish to store the index in a database such as SQL." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d8faead5", + "metadata": {}, + "outputs": [], + "source": [ + "pkl = db.serialize_to_bytes() # serializes the faiss index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb083247", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e36e220b", + "metadata": {}, + "outputs": [], + "source": [ + "db = FAISS.deserialize_from_bytes(\n", + " embeddings=embeddings, serialized=pkl, asynchronous=True\n", + ") # Load the index" + ] + }, + { + "cell_type": "markdown", + "id": "57da60d4", + "metadata": {}, + "source": [ + "## Merging\n", + "You can also merge two FAISS vectorstores" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "6dfd2b78", + "metadata": {}, + "outputs": [], + "source": [ + "db1 = await FAISS.afrom_texts([\"foo\"], embeddings)\n", + "db2 = await FAISS.afrom_texts([\"bar\"], embeddings)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "29960da7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'8164a453-9643-4959-87f7-9ba79f9e8fb0': Document(page_content='foo')}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db1.docstore._dict" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "83392605", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"{'4fbcf8a2-e80f-4f65-9308-2f4cb27cb6e7': Document(page_content='bar')}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db2.docstore._dict" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "a3fcc1c7", + "metadata": {}, + "outputs": [], + "source": [ + "db1.merge_from(db2)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "41c51f89", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'8164a453-9643-4959-87f7-9ba79f9e8fb0': Document(page_content='foo'),\n", + " '4fbcf8a2-e80f-4f65-9308-2f4cb27cb6e7': Document(page_content='bar')}" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db1.docstore._dict" + ] + }, + { + "cell_type": "markdown", + "id": "f4294b96", + "metadata": {}, + "source": [ + "## Similarity Search with filtering\n", + "FAISS vectorstore can also support filtering, since the FAISS does not natively support filtering we have to do it manually. This is done by first fetching more results than `k` and then filtering them. You can filter the documents based on metadata. You can also set the `fetch_k` parameter when calling any search method to set how many documents you want to fetch before filtering. 
Here is a small example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6740107a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Content: foo, Metadata: {'page': 1}, Score: 5.159960813797904e-15\n", + "Content: foo, Metadata: {'page': 2}, Score: 5.159960813797904e-15\n", + "Content: foo, Metadata: {'page': 3}, Score: 5.159960813797904e-15\n", + "Content: foo, Metadata: {'page': 4}, Score: 5.159960813797904e-15\n" + ] + } + ], + "source": [ + "from langchain.schema import Document\n", + "\n", + "list_of_documents = [\n", + " Document(page_content=\"foo\", metadata=dict(page=1)),\n", + " Document(page_content=\"bar\", metadata=dict(page=1)),\n", + " Document(page_content=\"foo\", metadata=dict(page=2)),\n", + " Document(page_content=\"barbar\", metadata=dict(page=2)),\n", + " Document(page_content=\"foo\", metadata=dict(page=3)),\n", + " Document(page_content=\"bar burr\", metadata=dict(page=3)),\n", + " Document(page_content=\"foo\", metadata=dict(page=4)),\n", + " Document(page_content=\"bar bruh\", metadata=dict(page=4)),\n", + "]\n", + "db = FAISS.from_documents(list_of_documents, embeddings)\n", + "results_with_scores = db.similarity_search_with_score(\"foo\")\n", + "for doc, score in results_with_scores:\n", + " print(f\"Content: {doc.page_content}, Metadata: {doc.metadata}, Score: {score}\")" + ] + }, + { + "cell_type": "markdown", + "id": "3d33c126", + "metadata": {}, + "source": [ + "Now we make the same query call but we filter for only `page = 1` " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "83159330", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Content: foo, Metadata: {'page': 1}, Score: 5.159960813797904e-15\n", + "Content: bar, Metadata: {'page': 1}, Score: 0.3131446838378906\n" + ] + } + ], + "source": [ + "results_with_scores = await db.asimilarity_search_with_score(\"foo\", 
filter=dict(page=1))\n", + "for doc, score in results_with_scores:\n", + " print(f\"Content: {doc.page_content}, Metadata: {doc.metadata}, Score: {score}\")" + ] + }, + { + "cell_type": "markdown", + "id": "0be136e0", + "metadata": {}, + "source": [ + "Same thing can be done with the `max_marginal_relevance_search` as well." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "432c6980", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Content: foo, Metadata: {'page': 1}\n", + "Content: bar, Metadata: {'page': 1}\n" + ] + } + ], + "source": [ + "results = await db.amax_marginal_relevance_search(\"foo\", filter=dict(page=1))\n", + "for doc in results:\n", + " print(f\"Content: {doc.page_content}, Metadata: {doc.metadata}\")" + ] + }, + { + "cell_type": "markdown", + "id": "1b4ecd86", + "metadata": {}, + "source": [ + "Here is an example of how to set `fetch_k` parameter when calling `similarity_search`. Usually you would want the `fetch_k` parameter >> `k` parameter. This is because the `fetch_k` parameter is the number of documents that will be fetched before filtering. If you set `fetch_k` to a low number, you might not get enough documents to filter from." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1fd60fd1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Content: foo, Metadata: {'page': 1}\n" + ] + } + ], + "source": [ + "results = await db.asimilarity_search(\"foo\", filter=dict(page=1), k=1, fetch_k=4)\n", + "for doc in results:\n", + " print(f\"Content: {doc.page_content}, Metadata: {doc.metadata}\")" + ] + }, + { + "cell_type": "markdown", + "id": "1becca53", + "metadata": {}, + "source": [ + "## Delete\n", + "\n", + "You can also delete ids. Note that the ids to delete should be the ids in the docstore." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1408b870", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.delete([db.index_to_docstore_id[0]])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d13daf33", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Is now missing\n", + "0 in db.index_to_docstore_id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30ace43e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/langchain/langchain/schema/vectorstore.py b/libs/langchain/langchain/schema/vectorstore.py index 23db4473ce..e99015d903 100644 --- a/libs/langchain/langchain/schema/vectorstore.py +++ b/libs/langchain/langchain/schema/vectorstore.py @@ -216,6 +216,17 @@ class VectorStore(ABC): """Run similarity search with distance.""" raise NotImplementedError + async def asimilarity_search_with_score( + self, *args: Any, **kwargs: Any + ) -> List[Tuple[Document, float]]: + """Run similarity search with distance asynchronously.""" + + # This is a temporary workaround to make the similarity search + # asynchronous. The proper solution is to make the similarity search + # asynchronous in the vector store implementations. 
+ func = partial(self.similarity_search_with_score, *args, **kwargs) + return await asyncio.get_event_loop().run_in_executor(None, func) + def _similarity_search_with_relevance_scores( self, query: str, @@ -243,6 +254,33 @@ class VectorStore(ABC): docs_and_scores = self.similarity_search_with_score(query, k, **kwargs) return [(doc, relevance_score_fn(score)) for doc, score in docs_and_scores] + async def _asimilarity_search_with_relevance_scores( + self, + query: str, + k: int = 4, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """ + Default async similarity search with relevance scores. Modify if necessary + in subclass. + Return docs and relevance scores in the range [0, 1]. + + 0 is dissimilar, 1 is most similar. + + Args: + query: input text + k: Number of Documents to return. Defaults to 4. + **kwargs: kwargs to be passed to similarity search. Should include: + score_threshold: Optional, a floating point value between 0 to 1 to + filter the resulting set of retrieved docs + + Returns: + List of Tuples of (doc, similarity_score) + """ + relevance_score_fn = self._select_relevance_score_fn() + docs_and_scores = await self.asimilarity_search_with_score(query, k, **kwargs) + return [(doc, relevance_score_fn(score)) for doc, score in docs_and_scores] + def similarity_search_with_relevance_scores( self, query: str, @@ -291,17 +329,51 @@ class VectorStore(ABC): return docs_and_similarities async def asimilarity_search_with_relevance_scores( - self, query: str, k: int = 4, **kwargs: Any + self, + query: str, + k: int = 4, + **kwargs: Any, ) -> List[Tuple[Document, float]]: - """Return docs most similar to query.""" + """Return docs and relevance scores in the range [0, 1], asynchronously. - # This is a temporary workaround to make the similarity search - # asynchronous. The proper solution is to make the similarity search - # asynchronous in the vector store implementations. 
- func = partial( - self.similarity_search_with_relevance_scores, query, k=k, **kwargs + 0 is dissimilar, 1 is most similar. + + Args: + query: input text + k: Number of Documents to return. Defaults to 4. + **kwargs: kwargs to be passed to similarity search. Should include: + score_threshold: Optional, a floating point value between 0 to 1 to + filter the resulting set of retrieved docs + + Returns: + List of Tuples of (doc, similarity_score) + """ + score_threshold = kwargs.pop("score_threshold", None) + + docs_and_similarities = await self._asimilarity_search_with_relevance_scores( + query, k=k, **kwargs ) - return await asyncio.get_event_loop().run_in_executor(None, func) + if any( + similarity < 0.0 or similarity > 1.0 + for _, similarity in docs_and_similarities + ): + warnings.warn( + "Relevance scores must be between" + f" 0 and 1, got {docs_and_similarities}" + ) + + if score_threshold is not None: + docs_and_similarities = [ + (doc, similarity) + for doc, similarity in docs_and_similarities + if similarity >= score_threshold + ] + if len(docs_and_similarities) == 0: + warnings.warn( + "No relevant docs were retrieved using the relevance score" + f" threshold {score_threshold}" + ) + return docs_and_similarities async def asimilarity_search( self, query: str, k: int = 4, **kwargs: Any diff --git a/libs/langchain/langchain/vectorstores/faiss.py b/libs/langchain/langchain/vectorstores/faiss.py index 53b2e8e31a..887f379753 100644 --- a/libs/langchain/langchain/vectorstores/faiss.py +++ b/libs/langchain/langchain/vectorstores/faiss.py @@ -1,11 +1,13 @@ from __future__ import annotations +import asyncio import logging import operator import os import pickle import uuid import warnings +from functools import partial from pathlib import Path from typing import ( Any, @@ -86,7 +88,10 @@ class FAISS(VectorStore): def __init__( self, - embedding_function: Union[Callable, Embeddings], + embedding_function: Union[ + Callable[[str], List[float]], + Embeddings, + ], 
index: Any, docstore: Docstore, index_to_docstore_id: Dict[int, str], @@ -131,12 +136,34 @@ class FAISS(VectorStore): else: return [self.embedding_function(text) for text in texts] + async def _aembed_documents(self, texts: List[str]) -> List[List[float]]: + if isinstance(self.embedding_function, Embeddings): + return await self.embedding_function.aembed_documents(texts) + else: + # return await asyncio.gather( + # [self.embedding_function(text) for text in texts] + # ) + raise Exception( + "`embedding_function` is expected to be an Embeddings object, support " + "for passing in a function will soon be removed." + ) + def _embed_query(self, text: str) -> List[float]: if isinstance(self.embedding_function, Embeddings): return self.embedding_function.embed_query(text) else: return self.embedding_function(text) + async def _aembed_query(self, text: str) -> List[float]: + if isinstance(self.embedding_function, Embeddings): + return await self.embedding_function.aembed_query(text) + else: + # return await self.embedding_function(text) + raise Exception( + "`embedding_function` is expected to be an Embeddings object, support " + "for passing in a function will soon be removed." + ) + def __add( self, texts: Iterable[str], @@ -196,6 +223,28 @@ class FAISS(VectorStore): embeddings = self._embed_documents(texts) return self.__add(texts, embeddings, metadatas=metadatas, ids=ids) + async def aadd_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore + asynchronously. + + Args: + texts: Iterable of strings to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + ids: Optional list of unique IDs. + + Returns: + List of ids from adding the texts into the vectorstore. 
+ """ + texts = list(texts) + embeddings = await self._aembed_documents(texts) + return self.__add(texts, embeddings, metadatas=metadatas, ids=ids) + def add_embeddings( self, text_embeddings: Iterable[Tuple[str, List[float]]], @@ -281,6 +330,42 @@ class FAISS(VectorStore): ] return docs[:k] + async def asimilarity_search_with_score_by_vector( + self, + embedding: List[float], + k: int = 4, + filter: Optional[Dict[str, Any]] = None, + fetch_k: int = 20, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query asynchronously. + + Args: + embedding: Embedding vector to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter (Optional[Dict[str, Any]]): Filter by metadata. Defaults to None. + fetch_k: (Optional[int]) Number of Documents to fetch before filtering. + Defaults to 20. + **kwargs: kwargs to be passed to similarity search. Can include: + score_threshold: Optional, a floating point value between 0 to 1 to + filter the resulting set of retrieved docs + + Returns: + List of documents most similar to the query text and L2 distance + in float for each. Lower score represents more similarity. + """ + + # This is a temporary workaround to make the similarity search asynchronous. + func = partial( + self.similarity_search_with_score_by_vector, + embedding, + k=k, + filter=filter, + fetch_k=fetch_k, + **kwargs, + ) + return await asyncio.get_event_loop().run_in_executor(None, func) + def similarity_search_with_score( self, query: str, @@ -312,6 +397,37 @@ class FAISS(VectorStore): ) return docs + async def asimilarity_search_with_score( + self, + query: str, + k: int = 4, + filter: Optional[Dict[str, Any]] = None, + fetch_k: int = 20, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query asynchronously. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. 
+ filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + fetch_k: (Optional[int]) Number of Documents to fetch before filtering. + Defaults to 20. + + Returns: + List of documents most similar to the query text with + L2 distance in float. Lower score represents more similarity. + """ + embedding = await self._aembed_query(query) + docs = await self.asimilarity_search_with_score_by_vector( + embedding, + k, + filter=filter, + fetch_k=fetch_k, + **kwargs, + ) + return docs + def similarity_search_by_vector( self, embedding: List[float], @@ -341,6 +457,35 @@ class FAISS(VectorStore): ) return [doc for doc, _ in docs_and_scores] + async def asimilarity_search_by_vector( + self, + embedding: List[float], + k: int = 4, + filter: Optional[Dict[str, Any]] = None, + fetch_k: int = 20, + **kwargs: Any, + ) -> List[Document]: + """Return docs most similar to embedding vector asynchronously. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + fetch_k: (Optional[int]) Number of Documents to fetch before filtering. + Defaults to 20. + + Returns: + List of Documents most similar to the embedding. + """ + docs_and_scores = await self.asimilarity_search_with_score_by_vector( + embedding, + k, + filter=filter, + fetch_k=fetch_k, + **kwargs, + ) + return [doc for doc, _ in docs_and_scores] + def similarity_search( self, query: str, @@ -366,6 +511,31 @@ class FAISS(VectorStore): ) return [doc for doc, _ in docs_and_scores] + async def asimilarity_search( + self, + query: str, + k: int = 4, + filter: Optional[Dict[str, Any]] = None, + fetch_k: int = 20, + **kwargs: Any, + ) -> List[Document]: + """Return docs most similar to query asynchronously. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter: (Optional[Dict[str, str]]): Filter by metadata. 
Defaults to None. + fetch_k: (Optional[int]) Number of Documents to fetch before filtering. + Defaults to 20. + + Returns: + List of Documents most similar to the query. + """ + docs_and_scores = await self.asimilarity_search_with_score( + query, k, filter=filter, fetch_k=fetch_k, **kwargs + ) + return [doc for doc, _ in docs_and_scores] + def max_marginal_relevance_search_with_score_by_vector( self, embedding: List[float], @@ -438,6 +608,45 @@ class FAISS(VectorStore): docs_and_scores.append((doc, score)) return docs_and_scores + async def amax_marginal_relevance_search_with_score_by_vector( + self, + embedding: List[float], + *, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + filter: Optional[Dict[str, Any]] = None, + ) -> List[Tuple[Document, float]]: + """Return docs and their similarity scores selected using the maximal marginal + relevance asynchronously. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch before filtering to + pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + Returns: + List of Documents and similarity scores selected by maximal marginal + relevance and score for each. + """ + # This is a temporary workaround to make the similarity search asynchronous. 
+ func = partial( + self.max_marginal_relevance_search_with_score_by_vector, + embedding, + k=k, + fetch_k=fetch_k, + lambda_mult=lambda_mult, + filter=filter, + ) + return await asyncio.get_event_loop().run_in_executor(None, func) + def max_marginal_relevance_search_by_vector( self, embedding: List[float], @@ -469,6 +678,39 @@ class FAISS(VectorStore): ) return [doc for doc, _ in docs_and_scores] + async def amax_marginal_relevance_search_by_vector( + self, + embedding: List[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + filter: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance asynchronously. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch before filtering to + pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + Returns: + List of Documents selected by maximal marginal relevance. + """ + docs_and_scores = ( + await self.amax_marginal_relevance_search_with_score_by_vector( + embedding, k=k, fetch_k=fetch_k, lambda_mult=lambda_mult, filter=filter + ) + ) + return [doc for doc, _ in docs_and_scores] + def max_marginal_relevance_search( self, query: str, @@ -506,6 +748,43 @@ class FAISS(VectorStore): ) return docs + async def amax_marginal_relevance_search( + self, + query: str, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + filter: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance asynchronously. 
+ + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch before filtering (if needed) to + pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + Returns: + List of Documents selected by maximal marginal relevance. + """ + embedding = await self._aembed_query(query) + docs = await self.amax_marginal_relevance_search_by_vector( + embedding, + k=k, + fetch_k=fetch_k, + lambda_mult=lambda_mult, + filter=filter, + **kwargs, + ) + return docs + def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]: """Delete by ID. These are the IDs in the vectorstore. @@ -639,6 +918,43 @@ class FAISS(VectorStore): **kwargs, ) + @classmethod + async def afrom_texts( + cls, + texts: list[str], + embedding: Embeddings, + metadatas: List[dict] | None = None, + ids: List[str] | None = None, + **kwargs: Any, + ) -> FAISS: + """Construct FAISS wrapper from raw documents asynchronously. + + This is a user friendly interface that: + 1. Embeds documents. + 2. Creates an in memory docstore + 3. Initializes the FAISS database + + This is intended to be a quick way to get started. + + Example: + .. 
code-block:: python + + from langchain.vectorstores import FAISS + from langchain.embeddings import OpenAIEmbeddings + + embeddings = OpenAIEmbeddings() + faiss = await FAISS.afrom_texts(texts, embeddings) + """ + embeddings = await embedding.aembed_documents(texts) + return cls.__from( + texts, + embeddings, + embedding, + metadatas=metadatas, + ids=ids, + **kwargs, + ) + @classmethod def from_embeddings( cls, @@ -679,6 +995,24 @@ class FAISS(VectorStore): **kwargs, ) + @classmethod + async def afrom_embeddings( + cls, + text_embeddings: Iterable[Tuple[str, List[float]]], + embedding: Embeddings, + metadatas: Optional[Iterable[dict]] = None, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> FAISS: + """Construct FAISS wrapper from raw documents asynchronously.""" + return cls.from_embeddings( + text_embeddings, + embedding, + metadatas=metadatas, + ids=ids, + **kwargs, + ) + def save_local(self, folder_path: str, index_name: str = "index") -> None: """Save FAISS index, docstore, and index_to_docstore_id to disk. @@ -715,6 +1049,7 @@ class FAISS(VectorStore): and index_to_docstore_id from. embeddings: Embeddings to use when generating queries index_name: for saving with a specific index file name + asynchronous: whether to use async version or not """ path = Path(folder_path) # load index separately since it is not picklable @@ -798,3 +1133,32 @@ class FAISS(VectorStore): (doc, relevance_score_fn(score)) for doc, score in docs_and_scores ] return docs_and_rel_scores + + async def _asimilarity_search_with_relevance_scores( + self, + query: str, + k: int = 4, + filter: Optional[Dict[str, Any]] = None, + fetch_k: int = 20, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs and their similarity scores on a scale from 0 to 1.""" + # Pop score threshold so that only relevancy scores, not raw scores, are + # filtered. 
+ relevance_score_fn = self._select_relevance_score_fn() + if relevance_score_fn is None: + raise ValueError( + "normalize_score_fn must be provided to" + " FAISS constructor to normalize scores" + ) + docs_and_scores = await self.asimilarity_search_with_score( + query, + k=k, + filter=filter, + fetch_k=fetch_k, + **kwargs, + ) + docs_and_rel_scores = [ + (doc, relevance_score_fn(score)) for doc, score in docs_and_scores + ] + return docs_and_rel_scores diff --git a/libs/langchain/tests/unit_tests/vectorstores/test_faiss.py b/libs/langchain/tests/unit_tests/vectorstores/test_faiss.py index ff74dea302..592db0c3c5 100644 --- a/libs/langchain/tests/unit_tests/vectorstores/test_faiss.py +++ b/libs/langchain/tests/unit_tests/vectorstores/test_faiss.py @@ -30,6 +30,25 @@ def test_faiss() -> None: assert output == [Document(page_content="foo")] +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_faiss_afrom_texts() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = await FAISS.afrom_texts(texts, FakeEmbeddings()) + index_to_id = docsearch.index_to_docstore_id + expected_docstore = InMemoryDocstore( + { + index_to_id[0]: Document(page_content="foo"), + index_to_id[1]: Document(page_content="bar"), + index_to_id[2]: Document(page_content="baz"), + } + ) + assert docsearch.docstore.__dict__ == expected_docstore.__dict__ + output = await docsearch.asimilarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + @pytest.mark.requires("faiss") def test_faiss_vector_sim() -> None: """Test vector similarity.""" @@ -49,6 +68,26 @@ def test_faiss_vector_sim() -> None: assert output == [Document(page_content="foo")] +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_faiss_async_vector_sim() -> None: + """Test vector similarity.""" + texts = ["foo", "bar", "baz"] + docsearch = await FAISS.afrom_texts(texts, FakeEmbeddings()) + index_to_id = docsearch.index_to_docstore_id + 
expected_docstore = InMemoryDocstore( + { + index_to_id[0]: Document(page_content="foo"), + index_to_id[1]: Document(page_content="bar"), + index_to_id[2]: Document(page_content="baz"), + } + ) + assert docsearch.docstore.__dict__ == expected_docstore.__dict__ + query_vec = await FakeEmbeddings().aembed_query(text="foo") + output = await docsearch.asimilarity_search_by_vector(query_vec, k=1) + assert output == [Document(page_content="foo")] + + @pytest.mark.requires("faiss") def test_faiss_vector_sim_with_score_threshold() -> None: """Test vector similarity.""" @@ -68,6 +107,28 @@ def test_faiss_vector_sim_with_score_threshold() -> None: assert output == [Document(page_content="foo")] +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_faiss_vector_async_sim_with_score_threshold() -> None: + """Test vector similarity.""" + texts = ["foo", "bar", "baz"] + docsearch = await FAISS.afrom_texts(texts, FakeEmbeddings()) + index_to_id = docsearch.index_to_docstore_id + expected_docstore = InMemoryDocstore( + { + index_to_id[0]: Document(page_content="foo"), + index_to_id[1]: Document(page_content="bar"), + index_to_id[2]: Document(page_content="baz"), + } + ) + assert docsearch.docstore.__dict__ == expected_docstore.__dict__ + query_vec = await FakeEmbeddings().aembed_query(text="foo") + output = await docsearch.asimilarity_search_by_vector( + query_vec, k=2, score_threshold=0.2 + ) + assert output == [Document(page_content="foo")] + + @pytest.mark.requires("faiss") def test_similarity_search_with_score_by_vector() -> None: """Test vector similarity with score by vector.""" @@ -88,6 +149,27 @@ def test_similarity_search_with_score_by_vector() -> None: assert output[0][0] == Document(page_content="foo") +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_similarity_async_search_with_score_by_vector() -> None: + """Test vector similarity with score by vector.""" + texts = ["foo", "bar", "baz"] + docsearch = await FAISS.afrom_texts(texts, 
FakeEmbeddings()) + index_to_id = docsearch.index_to_docstore_id + expected_docstore = InMemoryDocstore( + { + index_to_id[0]: Document(page_content="foo"), + index_to_id[1]: Document(page_content="bar"), + index_to_id[2]: Document(page_content="baz"), + } + ) + assert docsearch.docstore.__dict__ == expected_docstore.__dict__ + query_vec = await FakeEmbeddings().aembed_query(text="foo") + output = await docsearch.asimilarity_search_with_score_by_vector(query_vec, k=1) + assert len(output) == 1 + assert output[0][0] == Document(page_content="foo") + + @pytest.mark.requires("faiss") def test_similarity_search_with_score_by_vector_with_score_threshold() -> None: """Test vector similarity with score by vector.""" @@ -113,6 +195,32 @@ def test_similarity_search_with_score_by_vector_with_score_threshold() -> None: assert output[0][1] < 0.2 +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_sim_asearch_with_score_by_vector_with_score_threshold() -> None: + """Test vector similarity with score by vector.""" + texts = ["foo", "bar", "baz"] + docsearch = await FAISS.afrom_texts(texts, FakeEmbeddings()) + index_to_id = docsearch.index_to_docstore_id + expected_docstore = InMemoryDocstore( + { + index_to_id[0]: Document(page_content="foo"), + index_to_id[1]: Document(page_content="bar"), + index_to_id[2]: Document(page_content="baz"), + } + ) + assert docsearch.docstore.__dict__ == expected_docstore.__dict__ + query_vec = await FakeEmbeddings().aembed_query(text="foo") + output = await docsearch.asimilarity_search_with_score_by_vector( + query_vec, + k=2, + score_threshold=0.2, + ) + assert len(output) == 1 + assert output[0][0] == Document(page_content="foo") + assert output[0][1] < 0.2 + + @pytest.mark.requires("faiss") def test_faiss_mmr() -> None: texts = ["foo", "foo", "fou", "foy"] @@ -128,6 +236,22 @@ def test_faiss_mmr() -> None: assert output[1][0] != Document(page_content="foo") +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def 
test_faiss_async_mmr() -> None: + texts = ["foo", "foo", "fou", "foy"] + docsearch = await FAISS.afrom_texts(texts, FakeEmbeddings()) + query_vec = await FakeEmbeddings().aembed_query(text="foo") + # make sure we can have k > docstore size + output = await docsearch.amax_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1 + ) + assert len(output) == len(texts) + assert output[0][0] == Document(page_content="foo") + assert output[0][1] == 0.0 + assert output[1][0] != Document(page_content="foo") + + @pytest.mark.requires("faiss") def test_faiss_mmr_with_metadatas() -> None: texts = ["foo", "foo", "fou", "foy"] @@ -143,6 +267,22 @@ def test_faiss_mmr_with_metadatas() -> None: assert output[1][0] != Document(page_content="foo", metadata={"page": 0}) +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_faiss_async_mmr_with_metadatas() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = await FAISS.afrom_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = await FakeEmbeddings().aembed_query(text="foo") + output = await docsearch.amax_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1 + ) + assert len(output) == len(texts) + assert output[0][0] == Document(page_content="foo", metadata={"page": 0}) + assert output[0][1] == 0.0 + assert output[1][0] != Document(page_content="foo", metadata={"page": 0}) + + @pytest.mark.requires("faiss") def test_faiss_mmr_with_metadatas_and_filter() -> None: texts = ["foo", "foo", "fou", "foy"] @@ -157,6 +297,21 @@ def test_faiss_mmr_with_metadatas_and_filter() -> None: assert output[0][1] == 0.0 +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_faiss_async_mmr_with_metadatas_and_filter() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = await FAISS.afrom_texts(texts, FakeEmbeddings(), 
metadatas=metadatas) + query_vec = await FakeEmbeddings().aembed_query(text="foo") + output = await docsearch.amax_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter={"page": 1} + ) + assert len(output) == 1 + assert output[0][0] == Document(page_content="foo", metadata={"page": 1}) + assert output[0][1] == 0.0 + + @pytest.mark.requires("faiss") def test_faiss_mmr_with_metadatas_and_list_filter() -> None: texts = ["foo", "foo", "fou", "foy"] @@ -172,6 +327,22 @@ def test_faiss_mmr_with_metadatas_and_list_filter() -> None: assert output[1][0] != Document(page_content="foo", metadata={"page": 0}) +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_faiss_async_mmr_with_metadatas_and_list_filter() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} if i <= 3 else {"page": 3} for i in range(len(texts))] + docsearch = await FAISS.afrom_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = await FakeEmbeddings().aembed_query(text="foo") + output = await docsearch.amax_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter={"page": [0, 1, 2]} + ) + assert len(output) == 3 + assert output[0][0] == Document(page_content="foo", metadata={"page": 0}) + assert output[0][1] == 0.0 + assert output[1][0] != Document(page_content="foo", metadata={"page": 0}) + + @pytest.mark.requires("faiss") def test_faiss_with_metadatas() -> None: """Test end to end construction and search.""" @@ -196,6 +367,31 @@ def test_faiss_with_metadatas() -> None: assert output == [Document(page_content="foo", metadata={"page": 0})] +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_faiss_async_with_metadatas() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = await FAISS.afrom_texts(texts, FakeEmbeddings(), metadatas=metadatas) + expected_docstore = 
InMemoryDocstore( + { + docsearch.index_to_docstore_id[0]: Document( + page_content="foo", metadata={"page": 0} + ), + docsearch.index_to_docstore_id[1]: Document( + page_content="bar", metadata={"page": 1} + ), + docsearch.index_to_docstore_id[2]: Document( + page_content="baz", metadata={"page": 2} + ), + } + ) + assert docsearch.docstore.__dict__ == expected_docstore.__dict__ + output = await docsearch.asimilarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"page": 0})] + + @pytest.mark.requires("faiss") def test_faiss_with_metadatas_and_filter() -> None: texts = ["foo", "bar", "baz"] @@ -219,6 +415,30 @@ def test_faiss_with_metadatas_and_filter() -> None: assert output == [Document(page_content="bar", metadata={"page": 1})] +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_faiss_async_with_metadatas_and_filter() -> None: + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = await FAISS.afrom_texts(texts, FakeEmbeddings(), metadatas=metadatas) + expected_docstore = InMemoryDocstore( + { + docsearch.index_to_docstore_id[0]: Document( + page_content="foo", metadata={"page": 0} + ), + docsearch.index_to_docstore_id[1]: Document( + page_content="bar", metadata={"page": 1} + ), + docsearch.index_to_docstore_id[2]: Document( + page_content="baz", metadata={"page": 2} + ), + } + ) + assert docsearch.docstore.__dict__ == expected_docstore.__dict__ + output = await docsearch.asimilarity_search("foo", k=1, filter={"page": 1}) + assert output == [Document(page_content="bar", metadata={"page": 1})] + + @pytest.mark.requires("faiss") def test_faiss_with_metadatas_and_list_filter() -> None: texts = ["foo", "bar", "baz", "foo", "qux"] @@ -248,6 +468,36 @@ def test_faiss_with_metadatas_and_list_filter() -> None: assert output == [Document(page_content="foo", metadata={"page": 0})] +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def 
test_faiss_async_with_metadatas_and_list_filter() -> None: + texts = ["foo", "bar", "baz", "foo", "qux"] + metadatas = [{"page": i} if i <= 3 else {"page": 3} for i in range(len(texts))] + docsearch = await FAISS.afrom_texts(texts, FakeEmbeddings(), metadatas=metadatas) + expected_docstore = InMemoryDocstore( + { + docsearch.index_to_docstore_id[0]: Document( + page_content="foo", metadata={"page": 0} + ), + docsearch.index_to_docstore_id[1]: Document( + page_content="bar", metadata={"page": 1} + ), + docsearch.index_to_docstore_id[2]: Document( + page_content="baz", metadata={"page": 2} + ), + docsearch.index_to_docstore_id[3]: Document( + page_content="foo", metadata={"page": 3} + ), + docsearch.index_to_docstore_id[4]: Document( + page_content="qux", metadata={"page": 3} + ), + } + ) + assert docsearch.docstore.__dict__ == expected_docstore.__dict__ + output = await docsearch.asimilarity_search("foor", k=1, filter={"page": [0, 1, 2]}) + assert output == [Document(page_content="foo", metadata={"page": 0})] + + @pytest.mark.requires("faiss") def test_faiss_search_not_found() -> None: """Test what happens when document is not found.""" @@ -259,6 +509,18 @@ def test_faiss_search_not_found() -> None: docsearch.similarity_search("foo") +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_faiss_async_search_not_found() -> None: + """Test what happens when document is not found.""" + texts = ["foo", "bar", "baz"] + docsearch = await FAISS.afrom_texts(texts, FakeEmbeddings()) + # Get rid of the docstore to purposefully induce errors. 
+ docsearch.docstore = InMemoryDocstore({}) + with pytest.raises(ValueError): + await docsearch.asimilarity_search("foo") + + @pytest.mark.requires("faiss") def test_faiss_add_texts() -> None: """Test end to end adding of texts.""" @@ -271,14 +533,36 @@ def test_faiss_add_texts() -> None: assert output == [Document(page_content="foo"), Document(page_content="foo")] +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_faiss_async_add_texts() -> None: + """Test end to end adding of texts.""" + # Create initial doc store. + texts = ["foo", "bar", "baz"] + docsearch = await FAISS.afrom_texts(texts, FakeEmbeddings()) + # Test adding a similar document as before. + await docsearch.aadd_texts(["foo"]) + output = await docsearch.asimilarity_search("foo", k=2) + assert output == [Document(page_content="foo"), Document(page_content="foo")] + + @pytest.mark.requires("faiss") def test_faiss_add_texts_not_supported() -> None: """Test adding of texts to a docstore that doesn't support it.""" - docsearch = FAISS(FakeEmbeddings().embed_query, None, FakeDocstore(), {}) + docsearch = FAISS(FakeEmbeddings(), None, FakeDocstore(), {}) with pytest.raises(ValueError): docsearch.add_texts(["foo"]) +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_faiss_async_add_texts_not_supported() -> None: + """Test adding of texts to a docstore that doesn't support it.""" + docsearch = FAISS(FakeEmbeddings(), None, FakeDocstore(), {}) + with pytest.raises(ValueError): + await docsearch.aadd_texts(["foo"]) + + @pytest.mark.requires("faiss") def test_faiss_local_save_load() -> None: """Test end to end serialization.""" @@ -291,6 +575,19 @@ def test_faiss_local_save_load() -> None: assert new_docsearch.index is not None +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_faiss_async_local_save_load() -> None: + """Test end to end serialization.""" + texts = ["foo", "bar", "baz"] + docsearch = await FAISS.afrom_texts(texts, FakeEmbeddings()) + 
temp_timestamp = datetime.datetime.utcnow().strftime("%Y%m%d-%H%M%S") + with tempfile.TemporaryDirectory(suffix="_" + temp_timestamp + "/") as temp_folder: + docsearch.save_local(temp_folder) + new_docsearch = FAISS.load_local(temp_folder, FakeEmbeddings()) + assert new_docsearch.index is not None + + @pytest.mark.requires("faiss") def test_faiss_similarity_search_with_relevance_scores() -> None: """Test the similarity search with normalized similarities.""" @@ -306,6 +603,22 @@ def test_faiss_similarity_search_with_relevance_scores() -> None: assert score == 1.0 +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_faiss_async_similarity_search_with_relevance_scores() -> None: + """Test the similarity search with normalized similarities.""" + texts = ["foo", "bar", "baz"] + docsearch = await FAISS.afrom_texts( + texts, + FakeEmbeddings(), + relevance_score_fn=lambda score: 1.0 - score / math.sqrt(2), + ) + outputs = await docsearch.asimilarity_search_with_relevance_scores("foo", k=1) + output, score = outputs[0] + assert output == Document(page_content="foo") + assert score == 1.0 + + @pytest.mark.requires("faiss") def test_faiss_similarity_search_with_relevance_scores_with_threshold() -> None: """Test the similarity search with normalized similarities with score threshold.""" @@ -324,6 +637,25 @@ def test_faiss_similarity_search_with_relevance_scores_with_threshold() -> None: assert score == 1.0 +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_faiss_asimilarity_search_with_relevance_scores_with_threshold() -> None: + """Test the similarity search with normalized similarities with score threshold.""" + texts = ["foo", "bar", "baz"] + docsearch = await FAISS.afrom_texts( + texts, + FakeEmbeddings(), + relevance_score_fn=lambda score: 1.0 - score / math.sqrt(2), + ) + outputs = await docsearch.asimilarity_search_with_relevance_scores( + "foo", k=2, score_threshold=0.5 + ) + assert len(outputs) == 1 + output, score = outputs[0] + 
assert output == Document(page_content="foo") + assert score == 1.0 + + @pytest.mark.requires("faiss") def test_faiss_invalid_normalize_fn() -> None: """Test the similarity search with normalized similarities.""" @@ -335,6 +667,18 @@ def test_faiss_invalid_normalize_fn() -> None: docsearch.similarity_search_with_relevance_scores("foo", k=1) +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_faiss_async_invalid_normalize_fn() -> None: + """Test the similarity search with normalized similarities.""" + texts = ["foo", "bar", "baz"] + docsearch = await FAISS.afrom_texts( + texts, FakeEmbeddings(), relevance_score_fn=lambda _: 2.0 + ) + with pytest.warns(Warning, match="scores must be between"): + await docsearch.asimilarity_search_with_relevance_scores("foo", k=1) + + @pytest.mark.requires("faiss") def test_missing_normalize_score_fn() -> None: """Test doesn't perform similarity search without a valid distance strategy.""" @@ -344,6 +688,18 @@ def test_missing_normalize_score_fn() -> None: faiss_instance.similarity_search_with_relevance_scores("foo", k=2) +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_async_missing_normalize_score_fn() -> None: + """Test doesn't perform similarity search without a valid distance strategy.""" + texts = ["foo", "bar", "baz"] + faiss_instance = await FAISS.afrom_texts( + texts, FakeEmbeddings(), distance_strategy="fake" + ) + with pytest.raises(ValueError): + await faiss_instance.asimilarity_search_with_relevance_scores("foo", k=2) + + @pytest.mark.requires("faiss") def test_delete() -> None: """Test the similarity search with normalized similarities.""" @@ -354,3 +710,18 @@ def test_delete() -> None: result = docsearch.similarity_search("bar", k=2) assert sorted([d.page_content for d in result]) == ["baz", "foo"] assert docsearch.index_to_docstore_id == {0: ids[0], 1: ids[2]} + + +@pytest.mark.requires("faiss") +@pytest.mark.asyncio +async def test_async_delete() -> None: + """Test the similarity 
search with normalized similarities.""" + ids = ["a", "b", "c"] + docsearch = await FAISS.afrom_texts( + ["foo", "bar", "baz"], FakeEmbeddings(), ids=ids + ) + docsearch.delete(ids[1:2]) + + result = await docsearch.asimilarity_search("bar", k=2) + assert sorted([d.page_content for d in result]) == ["baz", "foo"] + assert docsearch.index_to_docstore_id == {0: ids[0], 1: ids[2]}