langchain/libs/community/langchain_community/vectorstores/annoy.py

456 lines
16 KiB
Python
Raw Normal View History

community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463) Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes
2023-12-11 21:53:30 +00:00
from __future__ import annotations
import os
import pickle
import uuid
from configparser import ConfigParser
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore
from langchain_community.docstore.base import Docstore
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores.utils import maximal_marginal_relevance
INDEX_METRICS = frozenset(["angular", "euclidean", "manhattan", "hamming", "dot"])
DEFAULT_METRIC = "angular"
def dependable_annoy_import() -> Any:
"""Import annoy if available, otherwise raise error."""
try:
import annoy
except ImportError:
raise ImportError(
"Could not import annoy python package. "
"Please install it with `pip install --user annoy` "
)
return annoy
class Annoy(VectorStore):
"""`Annoy` vector store.
To use, you should have the ``annoy`` python package installed.
Example:
.. code-block:: python
from langchain_community.vectorstores import Annoy
db = Annoy(embedding_function, index, docstore, index_to_docstore_id)
"""
def __init__(
self,
embedding_function: Callable,
index: Any,
metric: str,
docstore: Docstore,
index_to_docstore_id: Dict[int, str],
):
"""Initialize with necessary components."""
self.embedding_function = embedding_function
self.index = index
self.metric = metric
self.docstore = docstore
self.index_to_docstore_id = index_to_docstore_id
@property
def embeddings(self) -> Optional[Embeddings]:
# TODO: Accept embedding object directly
return None
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
) -> List[str]:
raise NotImplementedError(
"Annoy does not allow to add new data once the index is build."
)
def process_index_results(
self, idxs: List[int], dists: List[float]
) -> List[Tuple[Document, float]]:
"""Turns annoy results into a list of documents and scores.
Args:
idxs: List of indices of the documents in the index.
dists: List of distances of the documents in the index.
Returns:
List of Documents and scores.
"""
docs = []
for idx, dist in zip(idxs, dists):
_id = self.index_to_docstore_id[idx]
doc = self.docstore.search(_id)
if not isinstance(doc, Document):
raise ValueError(f"Could not find document for id {_id}, got {doc}")
docs.append((doc, dist))
return docs
def similarity_search_with_score_by_vector(
self, embedding: List[float], k: int = 4, search_k: int = -1
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
search_k: inspect up to search_k nodes which defaults
to n_trees * n if not provided
Returns:
List of Documents most similar to the query and score for each
"""
idxs, dists = self.index.get_nns_by_vector(
embedding, k, search_k=search_k, include_distances=True
)
return self.process_index_results(idxs, dists)
def similarity_search_with_score_by_index(
self, docstore_index: int, k: int = 4, search_k: int = -1
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
search_k: inspect up to search_k nodes which defaults
to n_trees * n if not provided
Returns:
List of Documents most similar to the query and score for each
"""
idxs, dists = self.index.get_nns_by_item(
docstore_index, k, search_k=search_k, include_distances=True
)
return self.process_index_results(idxs, dists)
def similarity_search_with_score(
self, query: str, k: int = 4, search_k: int = -1
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
search_k: inspect up to search_k nodes which defaults
to n_trees * n if not provided
Returns:
List of Documents most similar to the query and score for each
"""
embedding = self.embedding_function(query)
docs = self.similarity_search_with_score_by_vector(embedding, k, search_k)
return docs
def similarity_search_by_vector(
self, embedding: List[float], k: int = 4, search_k: int = -1, **kwargs: Any
) -> List[Document]:
"""Return docs most similar to embedding vector.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
search_k: inspect up to search_k nodes which defaults
to n_trees * n if not provided
Returns:
List of Documents most similar to the embedding.
"""
docs_and_scores = self.similarity_search_with_score_by_vector(
embedding, k, search_k
)
return [doc for doc, _ in docs_and_scores]
def similarity_search_by_index(
self, docstore_index: int, k: int = 4, search_k: int = -1, **kwargs: Any
) -> List[Document]:
"""Return docs most similar to docstore_index.
Args:
docstore_index: Index of document in docstore
k: Number of Documents to return. Defaults to 4.
search_k: inspect up to search_k nodes which defaults
to n_trees * n if not provided
Returns:
List of Documents most similar to the embedding.
"""
docs_and_scores = self.similarity_search_with_score_by_index(
docstore_index, k, search_k
)
return [doc for doc, _ in docs_and_scores]
def similarity_search(
self, query: str, k: int = 4, search_k: int = -1, **kwargs: Any
) -> List[Document]:
"""Return docs most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
search_k: inspect up to search_k nodes which defaults
to n_trees * n if not provided
Returns:
List of Documents most similar to the query.
"""
docs_and_scores = self.similarity_search_with_score(query, k, search_k)
return [doc for doc, _ in docs_and_scores]
def max_marginal_relevance_search_by_vector(
self,
embedding: List[float],
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
embedding: Embedding to look up documents similar to.
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
k: Number of Documents to return. Defaults to 4.
lambda_mult: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
Returns:
List of Documents selected by maximal marginal relevance.
"""
idxs = self.index.get_nns_by_vector(
embedding, fetch_k, search_k=-1, include_distances=False
)
embeddings = [self.index.get_item_vector(i) for i in idxs]
mmr_selected = maximal_marginal_relevance(
np.array([embedding], dtype=np.float32),
embeddings,
k=k,
lambda_mult=lambda_mult,
)
# ignore the -1's if not enough docs are returned/indexed
selected_indices = [idxs[i] for i in mmr_selected if i != -1]
docs = []
for i in selected_indices:
_id = self.index_to_docstore_id[i]
doc = self.docstore.search(_id)
if not isinstance(doc, Document):
raise ValueError(f"Could not find document for id {_id}, got {doc}")
docs.append(doc)
return docs
def max_marginal_relevance_search(
self,
query: str,
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
lambda_mult: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
Returns:
List of Documents selected by maximal marginal relevance.
"""
embedding = self.embedding_function(query)
docs = self.max_marginal_relevance_search_by_vector(
embedding, k, fetch_k, lambda_mult=lambda_mult
)
return docs
@classmethod
def __from(
cls,
texts: List[str],
embeddings: List[List[float]],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
metric: str = DEFAULT_METRIC,
trees: int = 100,
n_jobs: int = -1,
**kwargs: Any,
) -> Annoy:
if metric not in INDEX_METRICS:
raise ValueError(
(
f"Unsupported distance metric: {metric}. "
f"Expected one of {list(INDEX_METRICS)}"
)
)
annoy = dependable_annoy_import()
if not embeddings:
raise ValueError("embeddings must be provided to build AnnoyIndex")
f = len(embeddings[0])
index = annoy.AnnoyIndex(f, metric=metric)
for i, emb in enumerate(embeddings):
index.add_item(i, emb)
index.build(trees, n_jobs=n_jobs)
documents = []
for i, text in enumerate(texts):
metadata = metadatas[i] if metadatas else {}
documents.append(Document(page_content=text, metadata=metadata))
index_to_id = {i: str(uuid.uuid4()) for i in range(len(documents))}
docstore = InMemoryDocstore(
{index_to_id[i]: doc for i, doc in enumerate(documents)}
)
return cls(embedding.embed_query, index, metric, docstore, index_to_id)
@classmethod
def from_texts(
cls,
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
metric: str = DEFAULT_METRIC,
trees: int = 100,
n_jobs: int = -1,
**kwargs: Any,
) -> Annoy:
"""Construct Annoy wrapper from raw documents.
Args:
texts: List of documents to index.
embedding: Embedding function to use.
metadatas: List of metadata dictionaries to associate with documents.
metric: Metric to use for indexing. Defaults to "angular".
trees: Number of trees to use for indexing. Defaults to 100.
n_jobs: Number of jobs to use for indexing. Defaults to -1.
This is a user friendly interface that:
1. Embeds documents.
2. Creates an in memory docstore
3. Initializes the Annoy database
This is intended to be a quick way to get started.
Example:
.. code-block:: python
from langchain_community.vectorstores import Annoy
from langchain_community.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
index = Annoy.from_texts(texts, embeddings)
"""
embeddings = embedding.embed_documents(texts)
return cls.__from(
texts, embeddings, embedding, metadatas, metric, trees, n_jobs, **kwargs
)
@classmethod
def from_embeddings(
cls,
text_embeddings: List[Tuple[str, List[float]]],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
metric: str = DEFAULT_METRIC,
trees: int = 100,
n_jobs: int = -1,
**kwargs: Any,
) -> Annoy:
"""Construct Annoy wrapper from embeddings.
Args:
text_embeddings: List of tuples of (text, embedding)
embedding: Embedding function to use.
metadatas: List of metadata dictionaries to associate with documents.
metric: Metric to use for indexing. Defaults to "angular".
trees: Number of trees to use for indexing. Defaults to 100.
n_jobs: Number of jobs to use for indexing. Defaults to -1
This is a user friendly interface that:
1. Creates an in memory docstore with provided embeddings
2. Initializes the Annoy database
This is intended to be a quick way to get started.
Example:
.. code-block:: python
from langchain_community.vectorstores import Annoy
from langchain_community.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
text_embeddings = embeddings.embed_documents(texts)
text_embedding_pairs = list(zip(texts, text_embeddings))
db = Annoy.from_embeddings(text_embedding_pairs, embeddings)
"""
texts = [t[0] for t in text_embeddings]
embeddings = [t[1] for t in text_embeddings]
return cls.__from(
texts, embeddings, embedding, metadatas, metric, trees, n_jobs, **kwargs
)
def save_local(self, folder_path: str, prefault: bool = False) -> None:
"""Save Annoy index, docstore, and index_to_docstore_id to disk.
Args:
folder_path: folder path to save index, docstore,
and index_to_docstore_id to.
prefault: Whether to pre-load the index into memory.
"""
path = Path(folder_path)
os.makedirs(path, exist_ok=True)
# save index, index config, docstore and index_to_docstore_id
config_object = ConfigParser()
config_object["ANNOY"] = {
"f": self.index.f,
"metric": self.metric,
}
self.index.save(str(path / "index.annoy"), prefault=prefault)
with open(path / "index.pkl", "wb") as file:
pickle.dump((self.docstore, self.index_to_docstore_id, config_object), file)
@classmethod
def load_local(
cls,
folder_path: str,
embeddings: Embeddings,
) -> Annoy:
"""Load Annoy index, docstore, and index_to_docstore_id to disk.
Args:
folder_path: folder path to load index, docstore,
and index_to_docstore_id from.
embeddings: Embeddings to use when generating queries.
"""
path = Path(folder_path)
# load index separately since it is not picklable
annoy = dependable_annoy_import()
# load docstore and index_to_docstore_id
with open(path / "index.pkl", "rb") as file:
docstore, index_to_docstore_id, config_object = pickle.load(file)
f = int(config_object["ANNOY"]["f"])
metric = config_object["ANNOY"]["metric"]
index = annoy.AnnoyIndex(f, metric=metric)
index.load(str(path / "index.annoy"))
return cls(
embeddings.embed_query, index, metric, docstore, index_to_docstore_id
)