import json
import uuid
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple

import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.load import dumpd, load
from langchain_core.vectorstores import VectorStore

from langchain_community.utils.math import cosine_similarity
from langchain_community.vectorstores.utils import maximal_marginal_relevance


class InMemoryVectorStore(VectorStore):
    """In-memory implementation of VectorStore using a dictionary.

    Records are kept in ``self.store``, a dict keyed by document id whose
    values are dicts with the keys ``"id"``, ``"vector"``, ``"text"`` and
    ``"metadata"``. Search scores every candidate record against the query
    vector with ``cosine_similarity``.

    Args:
        embedding: embedding function to use.
    """

    def __init__(self, embedding: Embeddings) -> None:
        """Create an empty store bound to ``embedding``."""
        self.store: Dict[str, Dict[str, Any]] = {}
        self.embedding = embedding

    @property
    def embeddings(self) -> Embeddings:
        """Return the embedding object this store was constructed with."""
        return self.embedding

    def delete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Remove the records with the given ids.

        Ids that are not present are ignored. Passing ``None`` or an empty
        sequence is a no-op.
        """
        if ids:
            for _id in ids:
                self.store.pop(_id, None)

    async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Async wrapper that calls :meth:`delete` synchronously."""
        self.delete(ids)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[Sequence[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed ``texts`` and add them to the store.

        Args:
            texts: Texts to embed and store. Any iterable is accepted,
                including one-shot iterators such as generators.
            metadatas: Optional metadata dicts, indexed in parallel with
                ``texts``. When omitted (or empty) each record gets ``{}``.
            ids: Optional ids, indexed in parallel with ``texts``. When
                omitted (or empty) a random UUID4 string is generated per
                text. An existing record with the same id is overwritten.

        Returns:
            The ids of the stored records, in input order.
        """
        # Materialize exactly once: ``texts`` may be a generator, and it is
        # needed both for embedding and for the storage loop below. Iterating
        # it twice would silently store nothing for one-shot iterators.
        text_list = list(texts)
        vectors = self.embedding.embed_documents(text_list)

        added_ids: List[str] = []
        for i, text in enumerate(text_list):
            doc_id = ids[i] if ids else str(uuid.uuid4())
            added_ids.append(doc_id)
            self.store[doc_id] = {
                "id": doc_id,
                "vector": vectors[i],
                "text": text,
                "metadata": metadatas[i] if metadatas else {},
            }
        return added_ids

    async def aadd_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Async wrapper that calls :meth:`add_texts` synchronously.

        Extra keyword arguments (for example ``ids``) are forwarded.
        """
        return self.add_texts(texts, metadatas, **kwargs)

    def _similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Callable[[Document], bool]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float, List[float]]]:
        """Return the ``k`` best-scoring records for a query vector.

        Args:
            embedding: Query vector.
            k: Maximum number of results to return.
            filter: Optional predicate on the candidate ``Document``; only
                documents for which it returns a truthy value are kept.

        Returns:
            ``(document, similarity, stored_vector)`` tuples sorted by
            similarity, highest first.
        """
        result: List[Tuple[Document, float, List[float]]] = []
        for record in self.store.values():
            document = Document(
                page_content=record["text"], metadata=record["metadata"]
            )
            # Filter before scoring so rejected records cost no similarity
            # computation. The sort below is stable, so the final ordering
            # matches filtering after the sort.
            if filter is not None and not filter(document):
                continue
            vector = record["vector"]
            similarity = float(cosine_similarity([embedding], [vector]).item(0))
            result.append((document, similarity, vector))
        result.sort(key=lambda hit: hit[1], reverse=True)
        return result[:k]

    def similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Callable[[Document], bool]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return up to ``k`` ``(document, similarity)`` pairs for a vector.

        Results are ordered by similarity, highest first. ``filter`` is an
        optional predicate applied to each candidate document.
        """
        hits = self._similarity_search_with_score_by_vector(
            embedding=embedding, k=k, filter=filter, **kwargs
        )
        return [(doc, similarity) for doc, similarity, _ in hits]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Embed ``query`` and return up to ``k`` ``(document, score)`` pairs.

        Extra keyword arguments (for example ``filter``) are forwarded to
        :meth:`similarity_search_with_score_by_vector`.
        """
        embedding = self.embedding.embed_query(query)
        return self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )

    async def asimilarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Async wrapper that calls :meth:`similarity_search_with_score`."""
        return self.similarity_search_with_score(query, k, **kwargs)

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return up to ``k`` documents most similar to ``embedding``."""
        docs_and_scores = self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )
        return [doc for doc, _ in docs_and_scores]

    async def asimilarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async wrapper that calls :meth:`similarity_search_by_vector`."""
        return self.similarity_search_by_vector(embedding, k, **kwargs)

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Embed ``query`` and return up to ``k`` most similar documents."""
        return [doc for doc, _ in self.similarity_search_with_score(query, k, **kwargs)]

    async def asimilarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async wrapper that calls :meth:`similarity_search`."""
        return self.similarity_search(query, k, **kwargs)

    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Select documents with ``maximal_marginal_relevance``.

        The ``fetch_k`` best-scoring records are fetched first; their stored
        vectors are then passed, together with the query vector, ``k`` and
        ``lambda_mult``, to ``maximal_marginal_relevance``, which returns the
        indices of the hits to keep.

        Args:
            embedding: Query vector.
            k: Number of documents requested from the MMR selection.
            fetch_k: Number of candidates fetched before MMR selection.
            lambda_mult: Forwarded unchanged to
                ``maximal_marginal_relevance``.

        Returns:
            The selected documents, in the order the indices were returned.
        """
        prefetch_hits = self._similarity_search_with_score_by_vector(
            embedding=embedding,
            k=fetch_k,
            **kwargs,
        )
        mmr_chosen_indices = maximal_marginal_relevance(
            np.array(embedding, dtype=np.float32),
            [vector for _, _, vector in prefetch_hits],
            k=k,
            lambda_mult=lambda_mult,
        )
        return [prefetch_hits[idx][0] for idx in mmr_chosen_indices]

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Embed ``query`` and run
        :meth:`max_marginal_relevance_search_by_vector` on the result."""
        embedding_vector = self.embedding.embed_query(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding_vector,
            k,
            fetch_k,
            lambda_mult=lambda_mult,
            **kwargs,
        )

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Build a new store from ``texts``.

        Extra keyword arguments (for example ``ids``) are forwarded to
        :meth:`add_texts`.
        """
        store = cls(
            embedding=embedding,
        )
        store.add_texts(texts=texts, metadatas=metadatas, **kwargs)
        return store

    @classmethod
    async def afrom_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Async wrapper that calls :meth:`from_texts` synchronously."""
        return cls.from_texts(texts, embedding, metadatas, **kwargs)

    @classmethod
    def load(
        cls, path: str, embedding: Embeddings, **kwargs: Any
    ) -> "InMemoryVectorStore":
        """Load a store previously written by :meth:`dump`.

        Args:
            path: Path of the JSON file to read.
            embedding: Embedding object for the new store.
            **kwargs: Forwarded to the class constructor.

        Returns:
            A new store whose ``store`` dict is the deserialized content.
        """
        _path: Path = Path(path)
        with _path.open("r", encoding="utf-8") as f:
            # NOTE(review): ``langchain_core.load.load`` revives serialized
            # objects from the JSON payload; presumably this should only be
            # pointed at trusted files -- confirm against its documentation.
            store = load(json.load(f))
        vectorstore = cls(embedding=embedding, **kwargs)
        vectorstore.store = store
        return vectorstore

    def dump(self, path: str) -> None:
        """Write the store's records to ``path`` as indented JSON.

        Missing parent directories are created.
        """
        _path: Path = Path(path)
        _path.parent.mkdir(exist_ok=True, parents=True)
        with _path.open("w", encoding="utf-8") as f:
            json.dump(dumpd(self.store), f, indent=2)