2024-06-05 14:40:34 +00:00
|
|
|
import json
|
2024-03-20 14:21:07 +00:00
|
|
|
import uuid
|
2024-06-05 14:40:34 +00:00
|
|
|
from pathlib import Path
|
|
|
|
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
|
2024-03-20 14:21:07 +00:00
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_core.embeddings import Embeddings
|
2024-06-05 14:40:34 +00:00
|
|
|
from langchain_core.load import dumpd, load
|
2024-03-20 14:21:07 +00:00
|
|
|
from langchain_core.vectorstores import VectorStore
|
|
|
|
|
|
|
|
from langchain_community.utils.math import cosine_similarity
|
|
|
|
from langchain_community.vectorstores.utils import maximal_marginal_relevance
|
|
|
|
|
|
|
|
|
|
|
|
class InMemoryVectorStore(VectorStore):
    """In-memory implementation of VectorStore using a dictionary.

    Uses numpy to compute cosine similarity for search.

    Every entry lives in ``self.store``, keyed by document id, as a dict with
    the keys ``"id"``, ``"vector"``, ``"text"`` and ``"metadata"``.

    Args:
        embedding: embedding function to use.
    """

    def __init__(self, embedding: Embeddings) -> None:
        """Create an empty store that embeds texts with ``embedding``."""
        self.store: Dict[str, Dict[str, Any]] = {}
        self.embedding = embedding

    @property
    def embeddings(self) -> Embeddings:
        """Return the embedding function this store was built with."""
        return self.embedding

    def delete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Remove the entries with the given ids.

        Unknown ids are ignored. When ``ids`` is ``None`` or empty nothing is
        deleted. Extra keyword arguments are accepted but unused.
        """
        if ids:
            for _id in ids:
                self.store.pop(_id, None)

    async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Async variant of :meth:`delete`; delegates to the sync method."""
        self.delete(ids)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[Sequence[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed ``texts`` and add them to the store.

        Args:
            texts: Texts to add. Any iterable is accepted, including
                single-pass iterators such as generators.
            metadatas: Optional metadata dicts, indexed in step with
                ``texts``. When omitted or empty each entry gets ``{}``.
            ids: Optional ids, indexed in step with ``texts``. When omitted
                or empty a random UUID4 string is generated per text. An
                existing entry with the same id is overwritten.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The ids of the stored entries, in input order.
        """
        # Materialise once: ``texts`` may be a one-shot iterator, and it is
        # needed both for embedding and for the loop below. Iterating the
        # original object twice would silently store nothing for generators.
        text_list = list(texts)
        vectors = self.embedding.embed_documents(text_list)
        ids_ = []

        for i, text in enumerate(text_list):
            doc_id = ids[i] if ids else str(uuid.uuid4())
            ids_.append(doc_id)
            self.store[doc_id] = {
                "id": doc_id,
                "vector": vectors[i],
                "text": text,
                "metadata": metadatas[i] if metadatas else {},
            }
        return ids_

    async def aadd_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Async variant of :meth:`add_texts`; delegates to the sync method.

        ``ids`` may be supplied through ``**kwargs``.
        """
        return self.add_texts(texts, metadatas, **kwargs)

    def _similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Callable[[Document], bool]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float, List[float]]]:
        """Score every stored entry against ``embedding`` and return the top ``k``.

        Args:
            embedding: Query vector.
            k: Maximum number of hits to return.
            filter: Optional predicate; only documents for which it returns a
                truthy value are kept. Applied before truncating to ``k``.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            ``(document, cosine_similarity, stored_vector)`` tuples sorted by
            similarity, highest first.
        """
        result = []
        for doc in self.store.values():
            vector = doc["vector"]
            similarity = float(cosine_similarity([embedding], [vector]).item(0))
            result.append(
                (
                    Document(page_content=doc["text"], metadata=doc["metadata"]),
                    similarity,
                    vector,
                )
            )
        result.sort(key=lambda x: x[1], reverse=True)
        if filter is not None:
            result = [r for r in result if filter(r[0])]
        return result[:k]

    def similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Callable[[Document], bool]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return the top ``k`` ``(document, similarity)`` pairs for a vector.

        Same ranking and filtering as
        :meth:`_similarity_search_with_score_by_vector`, with the stored
        vectors dropped from the result.
        """
        return [
            (doc, similarity)
            for doc, similarity, _ in self._similarity_search_with_score_by_vector(
                embedding=embedding, k=k, filter=filter, **kwargs
            )
        ]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Embed ``query`` and return the top ``k`` ``(document, similarity)`` pairs.

        Extra keyword arguments (for example ``filter``) are forwarded to
        :meth:`similarity_search_with_score_by_vector`.
        """
        embedding = self.embedding.embed_query(query)
        docs = self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )
        return docs

    async def asimilarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Async variant of :meth:`similarity_search_with_score` (runs sync)."""
        return self.similarity_search_with_score(query, k, **kwargs)

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return the top ``k`` documents for a query vector, without scores."""
        docs_and_scores = self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )
        return [doc for doc, _ in docs_and_scores]

    async def asimilarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async variant of :meth:`similarity_search_by_vector` (runs sync)."""
        return self.similarity_search_by_vector(embedding, k, **kwargs)

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Embed ``query`` and return the top ``k`` documents, without scores."""
        return [doc for doc, _ in self.similarity_search_with_score(query, k, **kwargs)]

    async def asimilarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async variant of :meth:`similarity_search` (runs sync)."""
        return self.similarity_search(query, k, **kwargs)

    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Select documents for a query vector using maximal marginal relevance.

        Args:
            embedding: Query vector.
            k: Number of documents requested from the MMR selection.
            fetch_k: Number of top-similarity candidates fetched before the
                MMR selection is applied.
            lambda_mult: Passed through to ``maximal_marginal_relevance``.
            **kwargs: Forwarded to the candidate search (for example
                ``filter``).

        Returns:
            The candidate documents picked by ``maximal_marginal_relevance``,
            in the order it returned their indices.
        """
        prefetch_hits = self._similarity_search_with_score_by_vector(
            embedding=embedding,
            k=fetch_k,
            **kwargs,
        )

        mmr_chosen_indices = maximal_marginal_relevance(
            np.array(embedding, dtype=np.float32),
            [vector for _, _, vector in prefetch_hits],
            k=k,
            lambda_mult=lambda_mult,
        )
        return [prefetch_hits[idx][0] for idx in mmr_chosen_indices]

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Embed ``query`` and run :meth:`max_marginal_relevance_search_by_vector`."""
        embedding_vector = self.embedding.embed_query(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding_vector,
            k,
            fetch_k,
            lambda_mult=lambda_mult,
            **kwargs,
        )

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Build a new store from ``texts``.

        Extra keyword arguments (for example ``ids``) are forwarded to
        :meth:`add_texts`.
        """
        store = cls(
            embedding=embedding,
        )
        store.add_texts(texts=texts, metadatas=metadatas, **kwargs)
        return store

    @classmethod
    async def afrom_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Async variant of :meth:`from_texts`; delegates to the sync method."""
        return cls.from_texts(texts, embedding, metadatas, **kwargs)

    @classmethod
    def load(
        cls, path: str, embedding: Embeddings, **kwargs: Any
    ) -> "InMemoryVectorStore":
        """Create a store from a JSON file written by :meth:`dump`.

        Args:
            path: Location of the JSON file.
            embedding: Embedding function for the new store.
            **kwargs: Forwarded to the class constructor.

        Returns:
            A new store whose ``store`` mapping is the deserialized content.
        """
        _path: Path = Path(path)
        with _path.open("r") as f:
            # ``load`` here is the module-level ``langchain_core.load.load``:
            # class-level names are not visible from inside a method body.
            store = load(json.load(f))
        vectorstore = cls(embedding=embedding, **kwargs)
        vectorstore.store = store
        return vectorstore

    def dump(self, path: str) -> None:
        """Write ``self.store`` to ``path`` as indented JSON.

        Missing parent directories are created. The mapping is passed through
        ``dumpd`` before being handed to ``json.dump``.
        """
        _path: Path = Path(path)
        _path.parent.mkdir(exist_ok=True, parents=True)
        with _path.open("w") as f:
            json.dump(dumpd(self.store), f, indent=2)
|