From 1d2981114fadea995c374567d3807a9f6a7eb612 Mon Sep 17 00:00:00 2001 From: Leonid Ganeline Date: Thu, 16 Nov 2023 17:41:26 -0800 Subject: [PATCH] DOCS updated `async-faiss` example (#13434) The original notebook has the `faiss` title which is duplicated in the `faiss.ipynb`. As a result, we have two `faiss` items in the vectorstore ToC. And the first item breaks the alphabetical order (it is placed between `A...` items). - I updated the title to `Faiss (Async)`. --- .../integrations/vectorstores/faiss.ipynb | 116 ++--------- .../{async_faiss.ipynb => faiss_async.ipynb} | 186 +++--------------- docs/vercel.json | 4 + 3 files changed, 56 insertions(+), 250 deletions(-) rename docs/docs/integrations/vectorstores/{async_faiss.ipynb => faiss_async.ipynb} (71%) diff --git a/docs/docs/integrations/vectorstores/faiss.ipynb b/docs/docs/integrations/vectorstores/faiss.ipynb index 5931d1a6cc..b3a013b844 100644 --- a/docs/docs/integrations/vectorstores/faiss.ipynb +++ b/docs/docs/integrations/vectorstores/faiss.ipynb @@ -38,8 +38,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "47f9b495-88f1-4286-8d5d-1416103931a7", + "execution_count": null, + "id": "dc37144c-208d-4ab3-9f3a-0407a69fe052", "metadata": { "tags": [] }, @@ -51,33 +51,13 @@ "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")\n", "\n", "# Uncomment the following line if you need to initialize FAISS with no AVX2 optimization\n", - "# os.environ['FAISS_NO_AVX2'] = '1'" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "aac9563e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ + "# os.environ['FAISS_NO_AVX2'] = '1'\n", + "\n", "from langchain.document_loaders import TextLoader\n", "from langchain.embeddings.openai import OpenAIEmbeddings\n", "from langchain.text_splitter import CharacterTextSplitter\n", - "from langchain.vectorstores import FAISS" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a3c3999a", - "metadata": { -
"tags": [] - }, - "outputs": [], - "source": [ + "from langchain.vectorstores import FAISS\n", + "\n", "from langchain.document_loaders import TextLoader\n", "\n", "loader = TextLoader(\"../../../extras/modules/state_of_the_union.txt\")\n", @@ -200,31 +180,15 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "428a6816", - "metadata": {}, - "outputs": [], - "source": [ - "db.save_local(\"faiss_index\")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "56d1841c", - "metadata": {}, - "outputs": [], - "source": [ - "new_db = FAISS.load_local(\"faiss_index\", embeddings)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "39055525", + "execution_count": null, + "id": "1b31fe27-e0b3-42c6-b17c-8270b517ee1f", "metadata": {}, "outputs": [], "source": [ + "db.save_local(\"faiss_index\")\n", + "\n", + "new_db = FAISS.load_local(\"faiss_index\", embeddings)\n", + "\n", "docs = new_db.similarity_search(query)" ] }, @@ -266,30 +230,11 @@ "metadata": {}, "outputs": [], "source": [ - "pkl = db.serialize_to_bytes() # serializes the faiss index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eb083247", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "embeddings = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e36e220b", - "metadata": {}, - "outputs": [], - "source": [ + "from langchain.embeddings.huggingface import HuggingFaceEmbeddings\n", + "\n", + "pkl = db.serialize_to_bytes() # serializes the faiss\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n", + "\n", "db = FAISS.deserialize_from_bytes(\n", " embeddings=embeddings, serialized=pkl\n", ") # Load the index" @@ -306,33 +251,14 @@ }, { "cell_type": "code", - "execution_count": 20, - "id": "6dfd2b78", + "execution_count": null, + "id": "9b8f5e31-3f40-4e94-8d97-5883125efba7", "metadata": {}, "outputs": 
[], "source": [ "db1 = FAISS.from_texts([\"foo\"], embeddings)\n", - "db2 = FAISS.from_texts([\"bar\"], embeddings)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "29960da7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'068c473b-d420-487a-806b-fb0ccea7f711': Document(page_content='foo', metadata={})}" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ + "db2 = FAISS.from_texts([\"bar\"], embeddings)\n", + "\n", "db1.docstore._dict" ] }, diff --git a/docs/docs/integrations/vectorstores/async_faiss.ipynb b/docs/docs/integrations/vectorstores/faiss_async.ipynb similarity index 71% rename from docs/docs/integrations/vectorstores/async_faiss.ipynb rename to docs/docs/integrations/vectorstores/faiss_async.ipynb index abec0d806a..638c4dbd0b 100644 --- a/docs/docs/integrations/vectorstores/async_faiss.ipynb +++ b/docs/docs/integrations/vectorstores/faiss_async.ipynb @@ -5,15 +5,16 @@ "id": "683953b3", "metadata": {}, "source": [ - "# Faiss\n", + "# Faiss (Async)\n", "\n", ">[Facebook AI Similarity Search (Faiss)](https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/) is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also contains supporting code for evaluation and parameter tuning.\n", "\n", "[Faiss documentation](https://faiss.ai/).\n", "\n", - "This notebook shows how to use functionality related to the `FAISS` vector database using asyncio.\n", + "This notebook shows how to use functionality related to the `FAISS` vector database using `asyncio`.\n", + "LangChain implemented the synchronous and asynchronous vector store functions.\n", "\n", - "See synchronous version [here](https://python.langchain.com/docs/integrations/vectorstores/faiss)." 
+ "See `synchronous` version [here](https://python.langchain.com/docs/integrations/vectorstores/faiss)." ] }, { @@ -40,8 +41,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "47f9b495-88f1-4286-8d5d-1416103931a7", + "execution_count": null, + "id": "971a172a-2d87-4eec-be92-87aa174fec30", "metadata": { "tags": [] }, @@ -53,33 +54,13 @@ "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")\n", "\n", "# Uncomment the following line if you need to initialize FAISS with no AVX2 optimization\n", - "# os.environ['FAISS_NO_AVX2'] = '1'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "aac9563e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ + "# os.environ['FAISS_NO_AVX2'] = '1'\n", + "\n", "from langchain.document_loaders import TextLoader\n", "from langchain.embeddings.openai import OpenAIEmbeddings\n", "from langchain.text_splitter import CharacterTextSplitter\n", - "from langchain.vectorstores import FAISS" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "a3c3999a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ + "from langchain.vectorstores import FAISS\n", + "\n", "from langchain.document_loaders import TextLoader\n", "\n", "loader = TextLoader(\"../../../extras/modules/state_of_the_union.txt\")\n", @@ -87,47 +68,13 @@ "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", "docs = text_splitter.split_documents(documents)\n", "\n", - "embeddings = OpenAIEmbeddings()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "5eabdb75", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ + "embeddings = OpenAIEmbeddings()\n", + "\n", "db = await FAISS.afrom_documents(docs, embeddings)\n", "\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n", - "docs = await db.asimilarity_search(query)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "4b172de8", - "metadata": { - "tags": [] 
- }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", - "\n", - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", - "\n", - "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", - "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" - ] - } - ], - "source": [ + "docs = await db.asimilarity_search(query)\n", + "\n", "print(docs[0].page_content)" ] }, @@ -142,33 +89,13 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "186ee1d8", + "execution_count": null, + "id": "30bf7c85-a273-45dc-ae9e-f138e330b42e", "metadata": {}, "outputs": [], "source": [ - "docs_and_scores = await db.asimilarity_search_with_score(query)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "284e04b5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 
\\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': './state_of_the_union.txt'}),\n", - " 0.36871302)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ + "docs_and_scores = await db.asimilarity_search_with_score(query)\n", + "\n", "docs_and_scores[0]" ] }, @@ -202,52 +129,17 @@ }, { "cell_type": "code", - "execution_count": 11, - "id": "428a6816", - "metadata": {}, - "outputs": [], - "source": [ - "db.save_local(\"faiss_index\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "56d1841c", - "metadata": {}, - "outputs": [], - "source": [ - "new_db = FAISS.load_local(\"faiss_index\", embeddings, asynchronous=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "39055525", + "execution_count": null, + "id": "88e11f08-1ac8-45aa-8bc0-56439ef87256", "metadata": {}, "outputs": [], "source": [ - "docs = await new_db.asimilarity_search(query)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "98378c4e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 
\\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': './state_of_the_union.txt'})" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ + "db.save_local(\"faiss_index\")\n", + "\n", + "new_db = FAISS.load_local(\"faiss_index\", embeddings, asynchronous=True)\n", + "\n", + "docs = await new_db.asimilarity_search(query)\n", + "\n", "docs[0]" ] }, @@ -261,26 +153,6 @@ "you can pickle the FAISS Index by these functions. If you use embeddings model which is of 90 mb (sentence-transformers/all-MiniLM-L6-v2 or any other model), the resultant pickle size would be more than 90 mb. the size of the model is also included in the overall size. To overcome this, use the below functions. These functions only serializes FAISS index and size would be much lesser. this can be helpful if you wish to store the index in database like sql." 
] }, - { - "cell_type": "code", - "execution_count": 17, - "id": "d8faead5", - "metadata": {}, - "outputs": [], - "source": [ - "pkl = db.serialize_to_bytes() # serializes the faiss index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eb083247", - "metadata": {}, - "outputs": [], - "source": [ - "embeddings = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -288,6 +160,10 @@ "metadata": {}, "outputs": [], "source": [ + "from langchain.embeddings.huggingface import HuggingFaceEmbeddings\n", + "\n", + "pkl = db.serialize_to_bytes() # serializes the faiss index\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n", "db = FAISS.deserialize_from_bytes(\n", " embeddings=embeddings, serialized=pkl, asynchronous=True\n", ") # Load the index" @@ -596,7 +472,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/docs/vercel.json b/docs/vercel.json index 81730f46ce..87566d64cc 100644 --- a/docs/vercel.json +++ b/docs/vercel.json @@ -500,6 +500,10 @@ "source": "/docs/integrations/vectorstores/cassandra", "destination": "/docs/integrations/vectorstores/astradb" }, + { + "source": "/docs/integrations/vectorstores/async_faiss", + "destination": "/docs/integrations/vectorstores/faiss_async" + }, { "source": "/docs/integrations/cerebriumai", "destination": "/docs/integrations/providers/cerebriumai"