community[minor]: Add InMemoryVectorStore (#19326)

This is a basic VectorStore implementation using an in-memory dict to
store the documents.
It doesn't need any extra or optional dependency, since it uses numpy, which
is already a dependency of langchain.
This is useful for quick testing, demos, and examples.
It also makes it possible to write vendor-neutral tutorials, guides, etc.
pull/19342/head
Christophe Bornet 3 months ago committed by GitHub
parent 3c4529ac69
commit 00614f332a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -0,0 +1,199 @@
import uuid
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore
from langchain_community.utils.math import cosine_similarity
from langchain_community.vectorstores.utils import maximal_marginal_relevance
class InMemoryVectorStore(VectorStore):
    """In-memory implementation of VectorStore using a dictionary.

    Uses numpy to compute cosine similarity for search.

    Args:
        embedding: embedding function to use.
    """

    def __init__(self, embedding: Embeddings) -> None:
        # Maps a generated document id to its record, a dict with the keys
        # "id", "vector", "text" and "metadata" (see ``add_texts``).
        self.store: Dict[str, Dict[str, Any]] = {}
        self.embedding = embedding

    @property
    def embeddings(self) -> Embeddings:
        """Return the embedding function this store was created with."""
        return self.embedding

    def delete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Remove the records with the given ids from the store.

        Args:
            ids: ids to remove. ``None`` or an empty sequence is a no-op;
                ids that are not in the store are silently ignored.
            **kwargs: accepted for interface compatibility; unused.
        """
        if ids:
            for _id in ids:
                self.store.pop(_id, None)

    async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Async variant of ``delete``; calls the synchronous method directly."""
        self.delete(ids, **kwargs)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed the texts and store them under freshly generated UUID ids.

        Args:
            texts: texts to add. Any iterable is accepted, including
                single-pass iterators such as generators.
            metadatas: optional per-text metadata, indexed in step with
                ``texts``. When omitted (or empty) each record gets ``{}``.
            **kwargs: accepted for interface compatibility; unused.

        Returns:
            The generated ids, in the same order as ``texts``.
        """
        # Materialize once: ``texts`` may be a one-shot iterator, and it is
        # needed twice (to embed, then to build the records). Iterating the
        # raw iterable a second time would silently store nothing.
        text_list = list(texts)
        vectors = self.embedding.embed_documents(text_list)

        ids = []
        for i, text in enumerate(text_list):
            doc_id = str(uuid.uuid4())
            ids.append(doc_id)
            self.store[doc_id] = {
                "id": doc_id,
                "vector": vectors[i],
                "text": text,
                "metadata": metadatas[i] if metadatas else {},
            }
        return ids

    async def aadd_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Async variant of ``add_texts``; calls the synchronous method directly."""
        return self.add_texts(texts, metadatas, **kwargs)

    def _ranked_records(
        self, embedding: List[float]
    ) -> List[Tuple[Dict[str, Any], float]]:
        """Score every stored record against ``embedding``, best match first.

        Returns:
            ``(record, similarity)`` pairs sorted by descending cosine
            similarity. The sort is stable, so ties keep insertion order.
        """
        scored = [
            (
                record,
                float(cosine_similarity([embedding], [record["vector"]]).item(0)),
            )
            for record in self.store.values()
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored

    def similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
    ) -> List[Tuple[Document, float]]:
        """Return the ``k`` stored documents most similar to ``embedding``.

        Args:
            embedding: query vector.
            k: maximum number of results to return.

        Returns:
            ``(Document, cosine similarity)`` pairs, most similar first.
        """
        return [
            (
                Document(page_content=record["text"], metadata=record["metadata"]),
                similarity,
            )
            for record, similarity in self._ranked_records(embedding)[:k]
        ]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Embed ``query`` and return the ``k`` best matches with their scores.

        Args:
            query: text to embed with the store's embedding function.
            k: maximum number of results to return.
            **kwargs: accepted for interface compatibility; unused.

        Returns:
            ``(Document, cosine similarity)`` pairs, most similar first.
        """
        embedding = self.embedding.embed_query(query)
        docs = self.similarity_search_with_score_by_vector(
            embedding,
            k,
        )
        return docs

    async def asimilarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Async variant of ``similarity_search_with_score`` (direct sync call)."""
        return self.similarity_search_with_score(query, k, **kwargs)

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return the ``k`` documents most similar to ``embedding``, without scores.

        Args:
            embedding: query vector.
            k: maximum number of results to return.
            **kwargs: accepted for interface compatibility; unused.
        """
        docs_and_scores = self.similarity_search_with_score_by_vector(
            embedding,
            k,
        )
        return [doc for doc, _ in docs_and_scores]

    async def asimilarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async variant of ``similarity_search_by_vector`` (direct sync call)."""
        return self.similarity_search_by_vector(embedding, k, **kwargs)

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Embed ``query`` and return the ``k`` most similar documents.

        Args:
            query: text to embed with the store's embedding function.
            k: maximum number of results to return.
            **kwargs: forwarded to ``similarity_search_with_score``.
        """
        return [doc for doc, _ in self.similarity_search_with_score(query, k, **kwargs)]

    async def asimilarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async variant of ``similarity_search`` (direct sync call)."""
        return self.similarity_search(query, k, **kwargs)

    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Select documents with ``maximal_marginal_relevance`` for a query vector.

        The ``fetch_k`` records with the highest cosine similarity are
        prefetched, then ``maximal_marginal_relevance`` picks up to ``k`` of
        them.

        Args:
            embedding: query vector.
            k: number of documents to select.
            fetch_k: number of top-similarity candidates handed to MMR.
            lambda_mult: passed through to ``maximal_marginal_relevance``.
            **kwargs: accepted for interface compatibility; unused.

        Returns:
            The selected documents, in the order MMR chose them.
        """
        prefetch_hits = self._ranked_records(embedding)[:fetch_k]
        # The returned indices refer to positions within ``prefetch_hits``.
        mmr_chosen_indices = maximal_marginal_relevance(
            np.array(embedding, dtype=np.float32),
            [record["vector"] for record, _ in prefetch_hits],
            k=k,
            lambda_mult=lambda_mult,
        )
        return [
            Document(
                page_content=prefetch_hits[idx][0]["text"],
                metadata=prefetch_hits[idx][0]["metadata"],
            )
            for idx in mmr_chosen_indices
        ]

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Embed ``query`` and run ``max_marginal_relevance_search_by_vector``.

        Args:
            query: text to embed with the store's embedding function.
            k: number of documents to select.
            fetch_k: number of top-similarity candidates handed to MMR.
            lambda_mult: passed through to ``maximal_marginal_relevance``.
            **kwargs: accepted for interface compatibility; unused.
        """
        embedding_vector = self.embedding.embed_query(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding_vector,
            k,
            fetch_k,
            lambda_mult=lambda_mult,
        )

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Create a store and populate it with the given texts.

        Args:
            texts: texts to embed and store.
            embedding: embedding function to use.
            metadatas: optional per-text metadata, indexed in step with
                ``texts``.
            **kwargs: accepted for interface compatibility; unused.

        Returns:
            The newly created, populated store.
        """
        store = cls(
            embedding=embedding,
        )
        store.add_texts(texts=texts, metadatas=metadatas)
        return store

    @classmethod
    async def afrom_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Async variant of ``from_texts``; calls the synchronous method directly."""
        return cls.from_texts(texts, embedding, metadatas, **kwargs)

@ -0,0 +1,33 @@
from langchain_core.documents import Document
from langchain_community.vectorstores.inmemory import InMemoryVectorStore
from tests.integration_tests.vectorstores.fake_embeddings import (
ConsistentFakeEmbeddings,
)
async def test_inmemory() -> None:
    """Test end to end construction and search."""
    vector_store = await InMemoryVectorStore.afrom_texts(
        ["foo", "bar", "baz"], ConsistentFakeEmbeddings()
    )

    # A single-result query returns the exact text that was queried.
    top_hit = await vector_store.asimilarity_search("foo", k=1)
    assert top_hit == [Document(page_content="foo")]

    # A two-result query returns the exact match first, then "baz".
    top_two = await vector_store.asimilarity_search("bar", k=2)
    expected_top_two = [Document(page_content="bar"), Document(page_content="baz")]
    assert top_two == expected_top_two

    # Scores come back in strictly decreasing order for this query.
    scored_hits = await vector_store.asimilarity_search_with_score("bar", k=2)
    best_score = scored_hits[0][1]
    runner_up_score = scored_hits[1][1]
    assert best_score > runner_up_score
async def test_inmemory_mmr() -> None:
    """Test MMR search when ``k`` is larger than the number of stored texts."""
    corpus = ["foo", "foo", "fou", "foy"]
    vector_store = await InMemoryVectorStore.afrom_texts(
        corpus, ConsistentFakeEmbeddings()
    )

    # Request more results than there are stored texts (k > store size).
    results = await vector_store.amax_marginal_relevance_search(
        "foo", k=10, lambda_mult=0.1
    )

    assert len(results) == len(corpus)
    # The first two picks are checked in order: "foo", then "foy".
    for position, expected_text in enumerate(("foo", "foy")):
        assert results[position] == Document(page_content=expected_text)
Loading…
Cancel
Save