From a9310a3e8b6781bdc8f64a379eb844f8c8154584 Mon Sep 17 00:00:00 2001 From: Jan Backes Date: Sun, 16 Apr 2023 22:44:04 +0200 Subject: [PATCH] Add Annoy as VectorStore (#2939) Adds Annoy (https://github.com/spotify/annoy) as vector Store. RESOLVES hwchase17/langchain#2842 discord ref: https://discord.com/channels/1038097195422978059/1051632794427723827/1096089994168377354 --------- Co-authored-by: Harrison Chase Co-authored-by: vowelparrot <130414180+vowelparrot@users.noreply.github.com> --- .../indexes/vectorstores/examples/annoy.ipynb | 572 ++++++++++++++++++ langchain/vectorstores/__init__.py | 2 + langchain/vectorstores/annoy.py | 427 +++++++++++++ .../vectorstores/test_annoy.py | 123 ++++ 4 files changed, 1124 insertions(+) create mode 100644 docs/modules/indexes/vectorstores/examples/annoy.ipynb create mode 100644 langchain/vectorstores/annoy.py create mode 100644 tests/integration_tests/vectorstores/test_annoy.py diff --git a/docs/modules/indexes/vectorstores/examples/annoy.ipynb b/docs/modules/indexes/vectorstores/examples/annoy.ipynb new file mode 100644 index 00000000..67dfb5e7 --- /dev/null +++ b/docs/modules/indexes/vectorstores/examples/annoy.ipynb @@ -0,0 +1,572 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "683953b3", + "metadata": {}, + "source": [ + "# Annoy\n", + "\n", + "This notebook shows how to use functionality related to the Annoy vector database.\n", + "\n", + "> \"Annoy (Approximate Nearest Neighbors Oh Yeah) is a C++ library with Python bindings to search for points in space that are close to a given query point. 
It also creates large read-only file-based data structures that are mmapped into memory so that many processes may share the same data.\"\n", + "\n", + "via [Annoy](https://github.com/spotify/annoy) \n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3b450bdc", + "metadata": {}, + "source": [ + "```{note}\n", + "Annoy is read-only - once the index is built you cannot add any more embeddings!\n", + "If you want to progressively add to your VectorStore then better choose an alternative!\n", + "```" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6613d222", + "metadata": {}, + "source": [ + "## Create VectorStore from texts" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "dc7351b5", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "from langchain.vectorstores import Annoy\n", + "\n", + "embeddings_func = HuggingFaceEmbeddings()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d2cb5f7d", + "metadata": {}, + "outputs": [], + "source": [ + "texts = [\"pizza is great\", \"I love salad\", \"my car\", \"a dog\"]\n", + "\n", + "# default metric is angular\n", + "vector_store = Annoy.from_texts(texts, embeddings_func)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a856b2d1", + "metadata": {}, + "outputs": [], + "source": [ + "# allows for custom annoy parameters, defaults are n_trees=100, n_jobs=-1, metric=\"angular\"\n", + "vector_store_v2 = Annoy.from_texts(\n", + " texts, embeddings_func, metric=\"dot\", n_trees=100, n_jobs=1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8ada534a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='pizza is great', metadata={}),\n", + " Document(page_content='I love salad', metadata={}),\n", + " Document(page_content='my car', metadata={})]" + ] + }, + "execution_count": 5, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "vector_store.similarity_search(\"food\", k=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0470c5c8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(Document(page_content='pizza is great', metadata={}), 1.0944390296936035),\n", + " (Document(page_content='I love salad', metadata={}), 1.1273186206817627),\n", + " (Document(page_content='my car', metadata={}), 1.1580758094787598)]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# the score is a distance metric, so lower is better\n", + "vector_store.similarity_search_with_score(\"food\", k=3)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "4583b231", + "metadata": {}, + "source": [ + "## Create VectorStore from docs" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fbe898a8", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import TextLoader\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "\n", + "loader = TextLoader(\"../../../state_of_the_union.txt\")\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "51ea6b5c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \\n\\nLast year COVID-19 kept us apart. This year we are finally together again. \\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \\n\\nWith a duty to one another to the American people to the Constitution. 
\\n\\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \\n\\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \\n\\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \\n\\nHe met the Ukrainian people. \\n\\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.', metadata={'source': '../../../state_of_the_union.txt'}),\n", + " Document(page_content='Groups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland. \\n\\nIn this struggle as President Zelenskyy said in his speech to the European Parliament “Light will win over darkness.” The Ukrainian Ambassador to the United States is here tonight. \\n\\nLet each of us here tonight in this Chamber send an unmistakable signal to Ukraine and to the world. \\n\\nPlease rise if you are able and show that, Yes, we the United States of America stand with the Ukrainian people. \\n\\nThroughout our history we’ve learned this lesson when dictators do not pay a price for their aggression they cause more chaos. \\n\\nThey keep moving. \\n\\nAnd the costs and the threats to America and the world keep rising. \\n\\nThat’s why the NATO Alliance was created to secure peace and stability in Europe after World War 2. \\n\\nThe United States is a member along with 29 other nations. \\n\\nIt matters. American diplomacy matters. American resolve matters.', metadata={'source': '../../../state_of_the_union.txt'}),\n", + " Document(page_content='Putin’s latest attack on Ukraine was premeditated and unprovoked. \\n\\nHe rejected repeated efforts at diplomacy. \\n\\nHe thought the West and NATO wouldn’t respond. And he thought he could divide us at home. Putin was wrong. We were ready. 
Here is what we did. \\n\\nWe prepared extensively and carefully. \\n\\nWe spent months building a coalition of other freedom-loving nations from Europe and the Americas to Asia and Africa to confront Putin. \\n\\nI spent countless hours unifying our European allies. We shared with the world in advance what we knew Putin was planning and precisely how he would try to falsely justify his aggression. \\n\\nWe countered Russia’s lies with truth. \\n\\nAnd now that he has acted the free world is holding him accountable. \\n\\nAlong with twenty-seven members of the European Union including France, Germany, Italy, as well as countries like the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.', metadata={'source': '../../../state_of_the_union.txt'}),\n", + " Document(page_content='We are inflicting pain on Russia and supporting the people of Ukraine. Putin is now isolated from the world more than ever. \\n\\nTogether with our allies –we are right now enforcing powerful economic sanctions. \\n\\nWe are cutting off Russia’s largest banks from the international financial system. \\n\\nPreventing Russia’s central bank from defending the Russian Ruble making Putin’s $630 Billion “war fund” worthless. \\n\\nWe are choking off Russia’s access to technology that will sap its economic strength and weaken its military for years to come. \\n\\nTonight I say to the Russian oligarchs and corrupt leaders who have bilked billions of dollars off this violent regime no more. \\n\\nThe U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs. \\n\\nWe are joining with our European allies to find and seize your yachts your luxury apartments your private jets. 
We are coming for your ill-begotten gains.', metadata={'source': '../../../state_of_the_union.txt'}),\n", + " Document(page_content='And tonight I am announcing that we will join our allies in closing off American air space to all Russian flights – further isolating Russia – and adding an additional squeeze –on their economy. The Ruble has lost 30% of its value. \\n\\nThe Russian stock market has lost 40% of its value and trading remains suspended. Russia’s economy is reeling and Putin alone is to blame. \\n\\nTogether with our allies we are providing support to the Ukrainians in their fight for freedom. Military assistance. Economic assistance. Humanitarian assistance. \\n\\nWe are giving more than $1 Billion in direct assistance to Ukraine. \\n\\nAnd we will continue to aid the Ukrainian people as they defend their country and to help ease their suffering. \\n\\nLet me be clear, our forces are not engaged and will not engage in conflict with Russian forces in Ukraine. \\n\\nOur forces are not going to Europe to fight in Ukraine, but to defend our NATO Allies – in the event that Putin decides to keep moving west.', metadata={'source': '../../../state_of_the_union.txt'})]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d080985b", + "metadata": {}, + "outputs": [], + "source": [ + "vector_store_from_docs = Annoy.from_documents(docs, embeddings_func)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4931cb99", + "metadata": {}, + "outputs": [], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = vector_store_from_docs.similarity_search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "97969d5b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tonight. 
I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Ac\n" + ] + } + ], + "source": [ + "print(docs[0].page_content[:100])" + ] + }, + { + "cell_type": "markdown", + "id": "79628542", + "metadata": {}, + "source": [ + "## Create VectorStore via existing embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3432eddb", + "metadata": {}, + "outputs": [], + "source": [ + "embs = embeddings_func.embed_documents(texts)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b69f8408", + "metadata": {}, + "outputs": [], + "source": [ + "data = list(zip(texts, embs))\n", + "\n", + "vector_store_from_embeddings = Annoy.from_embeddings(data, embeddings_func)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e260758d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(Document(page_content='pizza is great', metadata={}), 1.0944390296936035),\n", + " (Document(page_content='I love salad', metadata={}), 1.1273186206817627),\n", + " (Document(page_content='my car', metadata={}), 1.1580758094787598)]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vector_store_from_embeddings.similarity_search_with_score(\"food\", k=3)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "341390c2", + "metadata": {}, + "source": [ + "## Search via embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "b9bce06d", + "metadata": {}, + "outputs": [], + "source": [ + "motorbike_emb = embeddings_func.embed_query(\"motorbike\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "af2552c9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='my car', metadata={}),\n", + " Document(page_content='a dog', metadata={}),\n", + " Document(page_content='pizza is great', metadata={})]" + ] + }, + "execution_count": 16, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vector_store.similarity_search_by_vector(motorbike_emb, k=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "c7a1a924", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(Document(page_content='my car', metadata={}), 1.0870471000671387),\n", + " (Document(page_content='a dog', metadata={}), 1.2095637321472168),\n", + " (Document(page_content='pizza is great', metadata={}), 1.3254905939102173)]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vector_store.similarity_search_with_score_by_vector(motorbike_emb, k=3)" + ] + }, + { + "cell_type": "markdown", + "id": "4b77be77", + "metadata": {}, + "source": [ + "## Search via docstore id" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "bbd971f0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: '2d1498a8-a37c-4798-acb9-0016504ed798',\n", + " 1: '2d30aecc-88e0-4469-9d51-0ef7e9858e6d',\n", + " 2: '927f1120-985b-4691-b577-ad5cb42e011c',\n", + " 3: '3056ddcf-a62f-48c8-bd98-b9e57a3dfcae'}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vector_store.index_to_docstore_id" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "6dbf3365", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='pizza is great', metadata={})" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "some_docstore_id = 0 # texts[0]\n", + "\n", + "vector_store.docstore._dict[vector_store.index_to_docstore_id[some_docstore_id]]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "98b27172", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(Document(page_content='pizza is great', metadata={}), 0.0),\n", + " 
(Document(page_content='I love salad', metadata={}), 1.0734446048736572),\n", + " (Document(page_content='my car', metadata={}), 1.2895267009735107)]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# same document has distance 0\n", + "vector_store.similarity_search_with_score_by_index(some_docstore_id, k=3)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6f570f69", + "metadata": {}, + "source": [ + "## Save and load" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "ef91cc69", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "saving config\n" + ] + } + ], + "source": [ + "vector_store.save_local(\"my_annoy_index_and_docstore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "7a9d2fce", + "metadata": {}, + "outputs": [], + "source": [ + "loaded_vector_store = Annoy.load_local(\n", + " \"my_annoy_index_and_docstore\", embeddings=embeddings_func\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "bba77cae", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(Document(page_content='pizza is great', metadata={}), 0.0),\n", + " (Document(page_content='I love salad', metadata={}), 1.0734446048736572),\n", + " (Document(page_content='my car', metadata={}), 1.2895267009735107)]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# same document has distance 0\n", + "loaded_vector_store.similarity_search_with_score_by_index(some_docstore_id, k=3)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "df4beb83", + "metadata": {}, + "source": [ + "## Construct from scratch" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "26fcf742", + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "from annoy import AnnoyIndex\n", + "from 
langchain.docstore.document import Document\n", + "from langchain.docstore.in_memory import InMemoryDocstore\n", + "\n", + "metadatas = [{\"x\": \"food\"}, {\"x\": \"food\"}, {\"x\": \"stuff\"}, {\"x\": \"animal\"}]\n", + "\n", + "# embeddings\n", + "embeddings = embeddings_func.embed_documents(texts)\n", + "\n", + "# embedding dim\n", + "f = len(embeddings[0])\n", + "\n", + "# index\n", + "metric = \"angular\"\n", + "index = AnnoyIndex(f, metric=metric)\n", + "for i, emb in enumerate(embeddings):\n", + " index.add_item(i, emb)\n", + "index.build(10)\n", + "\n", + "# docstore\n", + "documents = []\n", + "for i, text in enumerate(texts):\n", + " metadata = metadatas[i] if metadatas else {}\n", + " documents.append(Document(page_content=text, metadata=metadata))\n", + "index_to_docstore_id = {i: str(uuid.uuid4()) for i in range(len(documents))}\n", + "docstore = InMemoryDocstore(\n", + " {index_to_docstore_id[i]: doc for i, doc in enumerate(documents)}\n", + ")\n", + "\n", + "db_manually = Annoy(\n", + " embeddings_func.embed_query, index, metric, docstore, index_to_docstore_id\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "2b3f6f5c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(Document(page_content='pizza is great', metadata={'x': 'food'}),\n", + " 1.1314140558242798),\n", + " (Document(page_content='I love salad', metadata={'x': 'food'}),\n", + " 1.1668788194656372),\n", + " (Document(page_content='my car', metadata={'x': 'stuff'}), 1.226445198059082)]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db_manually.similarity_search_with_score(\"eating!\", k=3)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + 
"name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/vectorstores/__init__.py b/langchain/vectorstores/__init__.py index 4f86ab6e..1cfbdfeb 100644 --- a/langchain/vectorstores/__init__.py +++ b/langchain/vectorstores/__init__.py @@ -1,4 +1,5 @@ """Wrappers on top of vector stores.""" +from langchain.vectorstores.annoy import Annoy from langchain.vectorstores.atlas import AtlasDB from langchain.vectorstores.base import VectorStore from langchain.vectorstores.chroma import Chroma @@ -23,4 +24,5 @@ __all__ = [ "OpenSearchVectorSearch", "AtlasDB", "DeepLake", + "Annoy", ] diff --git a/langchain/vectorstores/annoy.py b/langchain/vectorstores/annoy.py new file mode 100644 index 00000000..05d5a5d0 --- /dev/null +++ b/langchain/vectorstores/annoy.py @@ -0,0 +1,427 @@ +"""Wrapper around Annoy vector database.""" +from __future__ import annotations + +import os +import pickle +import uuid +from configparser import ConfigParser +from pathlib import Path +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple + +import numpy as np + +from langchain.docstore.base import Docstore +from langchain.docstore.document import Document +from langchain.docstore.in_memory import InMemoryDocstore +from langchain.embeddings.base import Embeddings +from langchain.vectorstores.base import VectorStore +from langchain.vectorstores.utils import maximal_marginal_relevance + +INDEX_METRICS = frozenset(["angular", "euclidean", "manhattan", "hamming", "dot"]) +DEFAULT_METRIC = "angular" + + +def dependable_annoy_import() -> Any: + """Import annoy if available, otherwise raise error.""" + try: + import annoy + except ImportError: + raise ValueError( + "Could not import annoy python package. " + "Please install it with `pip install --user annoy` " + ) + return annoy + + +class Annoy(VectorStore): + """Wrapper around Annoy vector database. 
+ + To use, you should have the ``annoy`` python package installed. + + Example: + .. code-block:: python + + from langchain import Annoy + db = Annoy(embedding_function, index, docstore, index_to_docstore_id) + + """ + + def __init__( + self, + embedding_function: Callable, + index: Any, + metric: str, + docstore: Docstore, + index_to_docstore_id: Dict[int, str], + ): + """Initialize with necessary components.""" + self.embedding_function = embedding_function + self.index = index + self.metric = metric + self.docstore = docstore + self.index_to_docstore_id = index_to_docstore_id + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> List[str]: + raise NotImplementedError( + "Annoy does not allow to add new data once the index is build." + ) + + def process_index_results( + self, idxs: List[int], dists: List[float] + ) -> List[Tuple[Document, float]]: + """Turns annoy results into a list of documents and scores. + + Args: + idxs: List of indices of the documents in the index. + dists: List of distances of the documents in the index. + Returns: + List of Documents and scores. + """ + docs = [] + for idx, dist in zip(idxs, dists): + _id = self.index_to_docstore_id[idx] + doc = self.docstore.search(_id) + if not isinstance(doc, Document): + raise ValueError(f"Could not find document for id {_id}, got {doc}") + docs.append((doc, dist)) + return docs + + def similarity_search_with_score_by_vector( + self, embedding: List[float], k: int = 4, search_k: int = -1 + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. 
+ search_k: inspect up to search_k nodes which defaults + to n_trees * n if not provided + Returns: + List of Documents most similar to the query and score for each + """ + idxs, dists = self.index.get_nns_by_vector( + embedding, k, search_k=search_k, include_distances=True + ) + return self.process_index_results(idxs, dists) + + def similarity_search_with_score_by_index( + self, docstore_index: int, k: int = 4, search_k: int = -1 + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + search_k: inspect up to search_k nodes which defaults + to n_trees * n if not provided + Returns: + List of Documents most similar to the query and score for each + """ + idxs, dists = self.index.get_nns_by_item( + docstore_index, k, search_k=search_k, include_distances=True + ) + return self.process_index_results(idxs, dists) + + def similarity_search_with_score( + self, query: str, k: int = 4, search_k: int = -1 + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + search_k: inspect up to search_k nodes which defaults + to n_trees * n if not provided + + Returns: + List of Documents most similar to the query and score for each + """ + embedding = self.embedding_function(query) + docs = self.similarity_search_with_score_by_vector(embedding, k, search_k) + return docs + + def similarity_search_by_vector( + self, embedding: List[float], k: int = 4, search_k: int = -1, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to embedding vector. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + search_k: inspect up to search_k nodes which defaults + to n_trees * n if not provided + + Returns: + List of Documents most similar to the embedding. 
+ """ + docs_and_scores = self.similarity_search_with_score_by_vector( + embedding, k, search_k + ) + return [doc for doc, _ in docs_and_scores] + + def similarity_search_by_index( + self, docstore_index: int, k: int = 4, search_k: int = -1, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to docstore_index. + + Args: + docstore_index: Index of document in docstore + k: Number of Documents to return. Defaults to 4. + search_k: inspect up to search_k nodes which defaults + to n_trees * n if not provided + + Returns: + List of Documents most similar to the embedding. + """ + docs_and_scores = self.similarity_search_with_score_by_index( + docstore_index, k, search_k + ) + return [doc for doc, _ in docs_and_scores] + + def similarity_search( + self, query: str, k: int = 4, search_k: int = -1, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to query. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + search_k: inspect up to search_k nodes which defaults + to n_trees * n if not provided + + Returns: + List of Documents most similar to the query. + """ + docs_and_scores = self.similarity_search_with_score(query, k, search_k) + return [doc for doc, _ in docs_and_scores] + + def max_marginal_relevance_search_by_vector( + self, embedding: List[float], k: int = 4, fetch_k: int = 20, **kwargs: Any + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + embedding: Embedding to look up documents similar to. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + k: Number of Documents to return. Defaults to 4. + + Returns: + List of Documents selected by maximal marginal relevance. 
+ """ + idxs = self.index.get_nns_by_vector( + embedding, fetch_k, search_k=-1, include_distances=False + ) + embeddings = [self.index.get_item_vector(i) for i in idxs] + mmr_selected = maximal_marginal_relevance( + np.array([embedding], dtype=np.float32), embeddings, k=k + ) + # ignore the -1's if not enough docs are returned/indexed + selected_indices = [idxs[i] for i in mmr_selected if i != -1] + + docs = [] + for i in selected_indices: + _id = self.index_to_docstore_id[i] + doc = self.docstore.search(_id) + if not isinstance(doc, Document): + raise ValueError(f"Could not find document for id {_id}, got {doc}") + docs.append(doc) + return docs + + def max_marginal_relevance_search( + self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + + Returns: + List of Documents selected by maximal marginal relevance. + """ + embedding = self.embedding_function(query) + docs = self.max_marginal_relevance_search_by_vector(embedding, k, fetch_k) + return docs + + @classmethod + def __from( + cls, + texts: List[str], + embeddings: List[List[float]], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + metric: str = DEFAULT_METRIC, + trees: int = 100, + n_jobs: int = -1, + **kwargs: Any, + ) -> Annoy: + if metric not in INDEX_METRICS: + raise ValueError( + ( + f"Unsupported distance metric: {metric}. 
" + f"Expected one of {list(INDEX_METRICS)}" + ) + ) + annoy = dependable_annoy_import() + if not embeddings: + raise ValueError("embeddings must be provided to build AnnoyIndex") + f = len(embeddings[0]) + index = annoy.AnnoyIndex(f, metric=metric) + for i, emb in enumerate(embeddings): + index.add_item(i, emb) + index.build(trees, n_jobs=n_jobs) + + documents = [] + for i, text in enumerate(texts): + metadata = metadatas[i] if metadatas else {} + documents.append(Document(page_content=text, metadata=metadata)) + index_to_id = {i: str(uuid.uuid4()) for i in range(len(documents))} + docstore = InMemoryDocstore( + {index_to_id[i]: doc for i, doc in enumerate(documents)} + ) + return cls(embedding.embed_query, index, metric, docstore, index_to_id) + + @classmethod + def from_texts( + cls, + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + metric: str = DEFAULT_METRIC, + trees: int = 100, + n_jobs: int = -1, + **kwargs: Any, + ) -> Annoy: + """Construct Annoy wrapper from raw documents. + + Args: + texts: List of documents to index. + embedding: Embedding function to use. + metadatas: List of metadata dictionaries to associate with documents. + metric: Metric to use for indexing. Defaults to "angular". + trees: Number of trees to use for indexing. Defaults to 100. + n_jobs: Number of jobs to use for indexing. Defaults to -1. + + This is a user friendly interface that: + 1. Embeds documents. + 2. Creates an in memory docstore + 3. Initializes the Annoy database + + This is intended to be a quick way to get started. + + Example: + .. 
code-block:: python + + from langchain import Annoy + from langchain.embeddings import OpenAIEmbeddings + embeddings = OpenAIEmbeddings() + index = Annoy.from_texts(texts, embeddings) + """ + embeddings = embedding.embed_documents(texts) + return cls.__from( + texts, embeddings, embedding, metadatas, metric, trees, n_jobs, **kwargs + ) + + @classmethod + def from_embeddings( + cls, + text_embeddings: List[Tuple[str, List[float]]], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + metric: str = DEFAULT_METRIC, + trees: int = 100, + n_jobs: int = -1, + **kwargs: Any, + ) -> Annoy: + """Construct Annoy wrapper from embeddings. + + Args: + text_embeddings: List of tuples of (text, embedding) + embedding: Embedding function to use. + metadatas: List of metadata dictionaries to associate with documents. + metric: Metric to use for indexing. Defaults to "angular". + trees: Number of trees to use for indexing. Defaults to 100. + n_jobs: Number of jobs to use for indexing. Defaults to -1 + + This is a user friendly interface that: + 1. Creates an in memory docstore with provided embeddings + 2. Initializes the Annoy database + + This is intended to be a quick way to get started. + + Example: + .. code-block:: python + + from langchain import Annoy + from langchain.embeddings import OpenAIEmbeddings + embeddings = OpenAIEmbeddings() + db = Annoy.from_texts(texts, embeddings) + """ + texts = [t[0] for t in text_embeddings] + embeddings = [t[1] for t in text_embeddings] + + return cls.__from( + texts, embeddings, embedding, metadatas, metric, trees, n_jobs, **kwargs + ) + + def save_local(self, folder_path: str, prefault: bool = False) -> None: + """Save Annoy index, docstore, and index_to_docstore_id to disk. + + Args: + folder_path: folder path to save index, docstore, + and index_to_docstore_id to. + prefault: Whether to pre-load the index into memory. 
+ """ + path = Path(folder_path) + os.makedirs(path, exist_ok=True) + # save index, index config, docstore and index_to_docstore_id + config_object = ConfigParser() + config_object["ANNOY"] = { + "f": self.index.f, + "metric": self.metric, + } + self.index.save(str(path / "index.annoy"), prefault=prefault) + with open(path / "index.pkl", "wb") as file: + pickle.dump((self.docstore, self.index_to_docstore_id, config_object), file) + + @classmethod + def load_local( + cls, + folder_path: str, + embeddings: Embeddings, + ) -> Annoy: + """Load Annoy index, docstore, and index_to_docstore_id to disk. + + Args: + folder_path: folder path to load index, docstore, + and index_to_docstore_id from. + embeddings: Embeddings to use when generating queries. + """ + path = Path(folder_path) + # load index separately since it is not picklable + annoy = dependable_annoy_import() + # load docstore and index_to_docstore_id + with open(path / "index.pkl", "rb") as file: + docstore, index_to_docstore_id, config_object = pickle.load(file) + + f = int(config_object["ANNOY"]["f"]) + metric = config_object["ANNOY"]["metric"] + + index = annoy.AnnoyIndex(f, metric=metric) + index.load(str(path / "index.annoy")) + + return cls( + embeddings.embed_query, index, metric, docstore, index_to_docstore_id + ) diff --git a/tests/integration_tests/vectorstores/test_annoy.py b/tests/integration_tests/vectorstores/test_annoy.py new file mode 100644 index 00000000..ff7131bc --- /dev/null +++ b/tests/integration_tests/vectorstores/test_annoy.py @@ -0,0 +1,123 @@ +"""Test Annoy functionality.""" +import tempfile + +import pytest + +from langchain.docstore.document import Document +from langchain.docstore.in_memory import InMemoryDocstore +from langchain.vectorstores.annoy import Annoy +from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings + + +def test_annoy() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = 
def test_annoy_vector_sim() -> None:
    """Test vector similarity."""
    texts = ["foo", "bar", "baz"]
    docsearch = Annoy.from_texts(texts, FakeEmbeddings())
    index_to_id = docsearch.index_to_docstore_id
    # Expected docstore mirrors the input texts keyed by their index ids.
    expected_docstore = InMemoryDocstore(
        {index_to_id[i]: Document(page_content=text) for i, text in enumerate(texts)}
    )
    assert docsearch.docstore.__dict__ == expected_docstore.__dict__
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.similarity_search_by_vector(query_vec, k=1)
    assert output == [Document(page_content="foo")]

    # make sure we can have k > docstore size
    output = docsearch.max_marginal_relevance_search_by_vector(query_vec, k=10)
    assert len(output) == len(texts)


def test_annoy_vector_sim_by_index() -> None:
    """Test vector similarity."""
    texts = ["foo", "bar", "baz"]
    docsearch = Annoy.from_texts(texts, FakeEmbeddings())
    index_to_id = docsearch.index_to_docstore_id
    # Expected docstore mirrors the input texts keyed by their index ids.
    expected_docstore = InMemoryDocstore(
        {index_to_id[i]: Document(page_content=text) for i, text in enumerate(texts)}
    )
    assert docsearch.docstore.__dict__ == expected_docstore.__dict__
    output = docsearch.similarity_search_by_index(2, k=1)
    assert output == [Document(page_content="baz")]
def test_annoy_search_not_found() -> None:
    """Test what happens when document is not found."""
    texts = ["foo", "bar", "baz"]
    docsearch = Annoy.from_texts(texts, FakeEmbeddings())
    # Get rid of the docstore to purposefully induce errors.
    docsearch.docstore = InMemoryDocstore({})

    with pytest.raises(ValueError):
        docsearch.similarity_search("foo")


def test_annoy_add_texts() -> None:
    """Test end to end adding of texts."""
    # Create initial doc store.
    texts = ["foo", "bar", "baz"]
    docsearch = Annoy.from_texts(texts, FakeEmbeddings())
    # Annoy indexes are read-only once built, so add_texts must refuse.
    with pytest.raises(NotImplementedError):
        docsearch.add_texts(["foo"])


def test_annoy_local_save_load() -> None:
    """Test end to end serialization."""
    texts = ["foo", "bar", "baz"]
    docsearch = Annoy.from_texts(texts, FakeEmbeddings())

    # Use a context manager so the temporary directory is always removed,
    # instead of relying on garbage collection of the TemporaryDirectory object.
    with tempfile.TemporaryDirectory() as temp_dir:
        docsearch.save_local(temp_dir)
        loaded_docsearch = Annoy.load_local(temp_dir, FakeEmbeddings())

        assert docsearch.index_to_docstore_id == loaded_docsearch.index_to_docstore_id
        assert docsearch.docstore.__dict__ == loaded_docsearch.docstore.__dict__
        assert loaded_docsearch.index is not None