mirror of https://github.com/hwchase17/langchain
Add Annoy as VectorStore (#2939)
Adds Annoy (https://github.com/spotify/annoy) as vector Store. RESOLVES hwchase17/langchain#2842 discord ref: https://discord.com/channels/1038097195422978059/1051632794427723827/1096089994168377354 --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: vowelparrot <130414180+vowelparrot@users.noreply.github.com>pull/2965/head^2
parent
e12e00df12
commit
a9310a3e8b
@ -0,0 +1,427 @@
|
||||
"""Wrapper around Annoy vector database."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import uuid
|
||||
from configparser import ConfigParser
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from langchain.docstore.base import Docstore
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.docstore.in_memory import InMemoryDocstore
|
||||
from langchain.embeddings.base import Embeddings
|
||||
from langchain.vectorstores.base import VectorStore
|
||||
from langchain.vectorstores.utils import maximal_marginal_relevance
|
||||
|
||||
INDEX_METRICS = frozenset(["angular", "euclidean", "manhattan", "hamming", "dot"])
|
||||
DEFAULT_METRIC = "angular"
|
||||
|
||||
|
||||
def dependable_annoy_import() -> Any:
    """Import and return the ``annoy`` module, raising a helpful error if absent."""
    try:
        import annoy
    except ImportError:
        # Keep the install hint in the error so users know how to fix it.
        message = (
            "Could not import annoy python package. "
            "Please install it with `pip install --user annoy` "
        )
        raise ValueError(message)
    return annoy
|
||||
|
||||
|
||||
class Annoy(VectorStore):
    """Wrapper around Annoy vector database.

    To use, you should have the ``annoy`` python package installed.

    Example:
        .. code-block:: python

            from langchain import Annoy
            db = Annoy(embedding_function, index, docstore, index_to_docstore_id)

    """

    def __init__(
        self,
        embedding_function: Callable,
        index: Any,
        metric: str,
        docstore: Docstore,
        index_to_docstore_id: Dict[int, str],
    ):
        """Initialize with necessary components.

        Args:
            embedding_function: Callable that embeds a query string into a vector.
            index: A built ``annoy.AnnoyIndex`` instance.
            metric: Distance metric the index was built with (one of INDEX_METRICS).
            docstore: Docstore mapping string ids to Documents.
            index_to_docstore_id: Mapping from Annoy item index to docstore id.
        """
        self.embedding_function = embedding_function
        self.index = index
        self.metric = metric
        self.docstore = docstore
        self.index_to_docstore_id = index_to_docstore_id

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Not supported: an Annoy index is immutable once built."""
        # Fixed grammar of the user-facing message ("is build" -> "is built").
        raise NotImplementedError(
            "Annoy does not allow to add new data once the index is built."
        )

    def process_index_results(
        self, idxs: List[int], dists: List[float]
    ) -> List[Tuple[Document, float]]:
        """Turns annoy results into a list of documents and scores.

        Args:
            idxs: List of indices of the documents in the index.
            dists: List of distances of the documents in the index.
        Returns:
            List of Documents and scores.

        Raises:
            ValueError: If an id resolved from the index is missing in the docstore.
        """
        docs = []
        for idx, dist in zip(idxs, dists):
            _id = self.index_to_docstore_id[idx]
            doc = self.docstore.search(_id)
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {_id}, got {doc}")
            docs.append((doc, dist))
        return docs

    def similarity_search_with_score_by_vector(
        self, embedding: List[float], k: int = 4, search_k: int = -1
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to the embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            search_k: inspect up to search_k nodes which defaults
                to n_trees * n if not provided

        Returns:
            List of Documents most similar to the query and score for each
        """
        idxs, dists = self.index.get_nns_by_vector(
            embedding, k, search_k=search_k, include_distances=True
        )
        return self.process_index_results(idxs, dists)

    def similarity_search_with_score_by_index(
        self, docstore_index: int, k: int = 4, search_k: int = -1
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to the document already stored at an index.

        Args:
            docstore_index: Annoy item index of the document to use as the query.
            k: Number of Documents to return. Defaults to 4.
            search_k: inspect up to search_k nodes which defaults
                to n_trees * n if not provided

        Returns:
            List of Documents most similar to the query and score for each
        """
        idxs, dists = self.index.get_nns_by_item(
            docstore_index, k, search_k=search_k, include_distances=True
        )
        return self.process_index_results(idxs, dists)

    def similarity_search_with_score(
        self, query: str, k: int = 4, search_k: int = -1
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            search_k: inspect up to search_k nodes which defaults
                to n_trees * n if not provided

        Returns:
            List of Documents most similar to the query and score for each
        """
        embedding = self.embedding_function(query)
        docs = self.similarity_search_with_score_by_vector(embedding, k, search_k)
        return docs

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, search_k: int = -1, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            search_k: inspect up to search_k nodes which defaults
                to n_trees * n if not provided

        Returns:
            List of Documents most similar to the embedding.
        """
        docs_and_scores = self.similarity_search_with_score_by_vector(
            embedding, k, search_k
        )
        return [doc for doc, _ in docs_and_scores]

    def similarity_search_by_index(
        self, docstore_index: int, k: int = 4, search_k: int = -1, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to docstore_index.

        Args:
            docstore_index: Index of document in docstore
            k: Number of Documents to return. Defaults to 4.
            search_k: inspect up to search_k nodes which defaults
                to n_trees * n if not provided

        Returns:
            List of Documents most similar to the embedding.
        """
        docs_and_scores = self.similarity_search_with_score_by_index(
            docstore_index, k, search_k
        )
        return [doc for doc, _ in docs_and_scores]

    def similarity_search(
        self, query: str, k: int = 4, search_k: int = -1, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            search_k: inspect up to search_k nodes which defaults
                to n_trees * n if not provided

        Returns:
            List of Documents most similar to the query.
        """
        docs_and_scores = self.similarity_search_with_score(query, k, search_k)
        return [doc for doc, _ in docs_and_scores]

    def max_marginal_relevance_search_by_vector(
        self, embedding: List[float], k: int = 4, fetch_k: int = 20, **kwargs: Any
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.

        Returns:
            List of Documents selected by maximal marginal relevance.

        Raises:
            ValueError: If a selected id is missing in the docstore.
        """
        idxs = self.index.get_nns_by_vector(
            embedding, fetch_k, search_k=-1, include_distances=False
        )
        embeddings = [self.index.get_item_vector(i) for i in idxs]
        mmr_selected = maximal_marginal_relevance(
            np.array([embedding], dtype=np.float32), embeddings, k=k
        )
        # ignore the -1's if not enough docs are returned/indexed
        selected_indices = [idxs[i] for i in mmr_selected if i != -1]

        docs = []
        for i in selected_indices:
            _id = self.index_to_docstore_id[i]
            doc = self.docstore.search(_id)
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {_id}, got {doc}")
            docs.append(doc)
        return docs

    def max_marginal_relevance_search(
        self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.

        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        embedding = self.embedding_function(query)
        docs = self.max_marginal_relevance_search_by_vector(embedding, k, fetch_k)
        return docs

    @classmethod
    def __from(
        cls,
        texts: List[str],
        embeddings: List[List[float]],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        metric: str = DEFAULT_METRIC,
        trees: int = 100,
        n_jobs: int = -1,
        **kwargs: Any,
    ) -> Annoy:
        """Build an Annoy wrapper from precomputed embeddings (shared helper).

        Raises:
            ValueError: If ``metric`` is unsupported or ``embeddings`` is empty.
        """
        if metric not in INDEX_METRICS:
            raise ValueError(
                (
                    f"Unsupported distance metric: {metric}. "
                    f"Expected one of {list(INDEX_METRICS)}"
                )
            )
        annoy = dependable_annoy_import()
        if not embeddings:
            raise ValueError("embeddings must be provided to build AnnoyIndex")
        # Annoy requires the vector dimensionality up front.
        f = len(embeddings[0])
        index = annoy.AnnoyIndex(f, metric=metric)
        for i, emb in enumerate(embeddings):
            index.add_item(i, emb)
        index.build(trees, n_jobs=n_jobs)

        documents = []
        for i, text in enumerate(texts):
            metadata = metadatas[i] if metadatas else {}
            documents.append(Document(page_content=text, metadata=metadata))
        # Random UUIDs decouple docstore ids from Annoy item positions.
        index_to_id = {i: str(uuid.uuid4()) for i in range(len(documents))}
        docstore = InMemoryDocstore(
            {index_to_id[i]: doc for i, doc in enumerate(documents)}
        )
        return cls(embedding.embed_query, index, metric, docstore, index_to_id)

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        metric: str = DEFAULT_METRIC,
        trees: int = 100,
        n_jobs: int = -1,
        **kwargs: Any,
    ) -> Annoy:
        """Construct Annoy wrapper from raw documents.

        Args:
            texts: List of documents to index.
            embedding: Embedding function to use.
            metadatas: List of metadata dictionaries to associate with documents.
            metric: Metric to use for indexing. Defaults to "angular".
            trees: Number of trees to use for indexing. Defaults to 100.
            n_jobs: Number of jobs to use for indexing. Defaults to -1.

        This is a user friendly interface that:
            1. Embeds documents.
            2. Creates an in memory docstore
            3. Initializes the Annoy database

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain import Annoy
                from langchain.embeddings import OpenAIEmbeddings
                embeddings = OpenAIEmbeddings()
                index = Annoy.from_texts(texts, embeddings)
        """
        embeddings = embedding.embed_documents(texts)
        return cls.__from(
            texts, embeddings, embedding, metadatas, metric, trees, n_jobs, **kwargs
        )

    @classmethod
    def from_embeddings(
        cls,
        text_embeddings: List[Tuple[str, List[float]]],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        metric: str = DEFAULT_METRIC,
        trees: int = 100,
        n_jobs: int = -1,
        **kwargs: Any,
    ) -> Annoy:
        """Construct Annoy wrapper from embeddings.

        Args:
            text_embeddings: List of tuples of (text, embedding)
            embedding: Embedding function to use.
            metadatas: List of metadata dictionaries to associate with documents.
            metric: Metric to use for indexing. Defaults to "angular".
            trees: Number of trees to use for indexing. Defaults to 100.
            n_jobs: Number of jobs to use for indexing. Defaults to -1

        This is a user friendly interface that:
            1. Creates an in memory docstore with provided embeddings
            2. Initializes the Annoy database

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain import Annoy
                from langchain.embeddings import OpenAIEmbeddings
                embeddings = OpenAIEmbeddings()
                db = Annoy.from_embeddings(text_embeddings, embeddings)
        """
        texts = [t[0] for t in text_embeddings]
        embeddings = [t[1] for t in text_embeddings]

        return cls.__from(
            texts, embeddings, embedding, metadatas, metric, trees, n_jobs, **kwargs
        )

    def save_local(self, folder_path: str, prefault: bool = False) -> None:
        """Save Annoy index, docstore, and index_to_docstore_id to disk.

        Args:
            folder_path: folder path to save index, docstore,
                and index_to_docstore_id to.
            prefault: Whether to pre-load the index into memory.
        """
        path = Path(folder_path)
        os.makedirs(path, exist_ok=True)
        # save index, index config, docstore and index_to_docstore_id
        config_object = ConfigParser()
        config_object["ANNOY"] = {
            "f": self.index.f,
            "metric": self.metric,
        }
        self.index.save(str(path / "index.annoy"), prefault=prefault)
        with open(path / "index.pkl", "wb") as file:
            pickle.dump((self.docstore, self.index_to_docstore_id, config_object), file)

    @classmethod
    def load_local(
        cls,
        folder_path: str,
        embeddings: Embeddings,
    ) -> Annoy:
        """Load Annoy index, docstore, and index_to_docstore_id from disk.

        Args:
            folder_path: folder path to load index, docstore,
                and index_to_docstore_id from.
            embeddings: Embeddings to use when generating queries.
        """
        path = Path(folder_path)
        # load index separately since it is not picklable
        annoy = dependable_annoy_import()
        # load docstore and index_to_docstore_id
        # SECURITY NOTE: pickle.load executes arbitrary code; only load
        # index folders from trusted sources.
        with open(path / "index.pkl", "rb") as file:
            docstore, index_to_docstore_id, config_object = pickle.load(file)

        f = int(config_object["ANNOY"]["f"])
        metric = config_object["ANNOY"]["metric"]

        index = annoy.AnnoyIndex(f, metric=metric)
        index.load(str(path / "index.annoy"))

        return cls(
            embeddings.embed_query, index, metric, docstore, index_to_docstore_id
        )
|
@ -0,0 +1,123 @@
|
||||
"""Test Annoy functionality."""
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.docstore.in_memory import InMemoryDocstore
|
||||
from langchain.vectorstores.annoy import Annoy
|
||||
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
||||
|
||||
|
||||
def test_annoy() -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    docsearch = Annoy.from_texts(texts, FakeEmbeddings())
    index_to_id = docsearch.index_to_docstore_id
    # Build the expected docstore from the generated id mapping.
    expected_docstore = InMemoryDocstore(
        {index_to_id[i]: Document(page_content=text) for i, text in enumerate(texts)}
    )
    assert docsearch.docstore.__dict__ == expected_docstore.__dict__
    assert docsearch.similarity_search("foo", k=1) == [Document(page_content="foo")]
|
||||
|
||||
|
||||
def test_annoy_vector_sim() -> None:
    """Test vector similarity."""
    texts = ["foo", "bar", "baz"]
    store = Annoy.from_texts(texts, FakeEmbeddings())
    mapping = store.index_to_docstore_id
    # Expected docstore mirrors the generated index -> id mapping.
    expected_docstore = InMemoryDocstore(
        {mapping[i]: Document(page_content=text) for i, text in enumerate(texts)}
    )
    assert store.docstore.__dict__ == expected_docstore.__dict__
    query_vec = FakeEmbeddings().embed_query(text="foo")
    assert store.similarity_search_by_vector(query_vec, k=1) == [
        Document(page_content="foo")
    ]

    # make sure we can have k > docstore size
    mmr_output = store.max_marginal_relevance_search_by_vector(query_vec, k=10)
    assert len(mmr_output) == len(texts)
|
||||
|
||||
|
||||
def test_annoy_vector_sim_by_index() -> None:
    """Test vector similarity."""
    texts = ["foo", "bar", "baz"]
    store = Annoy.from_texts(texts, FakeEmbeddings())
    mapping = store.index_to_docstore_id
    # Expected docstore mirrors the generated index -> id mapping.
    expected_docstore = InMemoryDocstore(
        {mapping[i]: Document(page_content=text) for i, text in enumerate(texts)}
    )
    assert store.docstore.__dict__ == expected_docstore.__dict__
    assert store.similarity_search_by_index(2, k=1) == [Document(page_content="baz")]
|
||||
|
||||
|
||||
def test_annoy_with_metadatas() -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": i} for i in range(len(texts))]
    store = Annoy.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    # Each expected document carries its positional metadata.
    expected_docstore = InMemoryDocstore(
        {
            store.index_to_docstore_id[i]: Document(
                page_content=text, metadata={"page": i}
            )
            for i, text in enumerate(texts)
        }
    )
    assert store.docstore.__dict__ == expected_docstore.__dict__
    assert store.similarity_search("foo", k=1) == [
        Document(page_content="foo", metadata={"page": 0})
    ]
|
||||
|
||||
|
||||
def test_annoy_search_not_found() -> None:
    """Test what happens when document is not found."""
    store = Annoy.from_texts(["foo", "bar", "baz"], FakeEmbeddings())
    # Get rid of the docstore to purposefully induce errors.
    store.docstore = InMemoryDocstore({})

    with pytest.raises(ValueError):
        store.similarity_search("foo")
|
||||
|
||||
|
||||
def test_annoy_add_texts() -> None:
    """Test end to end adding of texts."""
    # Create initial doc store.
    store = Annoy.from_texts(["foo", "bar", "baz"], FakeEmbeddings())
    # Adding after the index is built must fail: Annoy indexes are immutable.
    with pytest.raises(NotImplementedError):
        store.add_texts(["foo"])
|
||||
|
||||
|
||||
def test_annoy_local_save_load() -> None:
    """Test end to end serialization.

    Round-trips an Annoy store through ``save_local`` / ``load_local`` and
    checks the docstore and id mapping survive.
    """
    texts = ["foo", "bar", "baz"]
    docsearch = Annoy.from_texts(texts, FakeEmbeddings())

    # Use a context manager so the directory is reliably removed, instead of
    # relying on the TemporaryDirectory finalizer to clean up.
    with tempfile.TemporaryDirectory() as temp_dir:
        docsearch.save_local(temp_dir)
        loaded_docsearch = Annoy.load_local(temp_dir, FakeEmbeddings())

        assert docsearch.index_to_docstore_id == loaded_docsearch.index_to_docstore_id
        assert docsearch.docstore.__dict__ == loaded_docsearch.docstore.__dict__
        assert loaded_docsearch.index is not None
|
Loading…
Reference in New Issue