diff --git a/libs/community/langchain_community/vectorstores/inmemory.py b/libs/community/langchain_community/vectorstores/inmemory.py index e440dc9d93..ce3f2ddeb5 100644 --- a/libs/community/langchain_community/vectorstores/inmemory.py +++ b/libs/community/langchain_community/vectorstores/inmemory.py @@ -1,9 +1,12 @@ +import json import uuid -from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple +from pathlib import Path +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple import numpy as np from langchain_core.documents import Document from langchain_core.embeddings import Embeddings +from langchain_core.load import dumpd, load from langchain_core.vectorstores import VectorStore from langchain_community.utils.math import cosine_similarity @@ -64,22 +67,42 @@ class InMemoryVectorStore(VectorStore): ) -> List[str]: return self.add_texts(texts, metadatas, **kwargs) - def similarity_search_with_score_by_vector( + def _similarity_search_with_score_by_vector( self, embedding: List[float], k: int = 4, - ) -> List[Tuple[Document, float]]: - docs_with_similarity = [] + filter: Optional[Callable[[Document], bool]] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float, List[float]]]: + result = [] for doc in self.store.values(): - similarity = float(cosine_similarity([embedding], [doc["vector"]]).item(0)) - docs_with_similarity.append( + vector = doc["vector"] + similarity = float(cosine_similarity([embedding], [vector]).item(0)) + result.append( ( Document(page_content=doc["text"], metadata=doc["metadata"]), similarity, + vector, ) ) - docs_with_similarity.sort(key=lambda x: x[1], reverse=True) - return docs_with_similarity[:k] + result.sort(key=lambda x: x[1], reverse=True) + if filter is not None: + result = [r for r in result if filter(r[0])] + return result[:k] + + def similarity_search_with_score_by_vector( + self, + embedding: List[float], + k: int = 4, + filter: Optional[Callable[[Document], bool]] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + return [ + (doc, similarity) + for doc, similarity, _ in self._similarity_search_with_score_by_vector( + embedding=embedding, k=k, filter=filter, **kwargs + ) + ] def similarity_search_with_score( self, @@ -91,6 +114,7 @@ class InMemoryVectorStore(VectorStore): docs = self.similarity_search_with_score_by_vector( embedding, k, + **kwargs, ) return docs @@ -108,6 +132,7 @@ class InMemoryVectorStore(VectorStore): docs_and_scores = self.similarity_search_with_score_by_vector( embedding, k, + **kwargs, ) return [doc for doc, _ in docs_and_scores] @@ -134,31 +159,19 @@ class InMemoryVectorStore(VectorStore): lambda_mult: float = 0.5, **kwargs: Any, ) -> List[Document]: - docs_with_similarity = [] - for doc in self.store.values(): - similarity = float(cosine_similarity([embedding], [doc["vector"]]).item(0)) - docs_with_similarity.append( - ( - doc, - similarity, - ) - ) - docs_with_similarity.sort(key=lambda x: x[1], reverse=True) - prefetch_hits = docs_with_similarity[:fetch_k] + prefetch_hits = self._similarity_search_with_score_by_vector( + embedding=embedding, + k=fetch_k, + **kwargs, + ) mmr_chosen_indices = maximal_marginal_relevance( np.array(embedding, dtype=np.float32), - [doc["vector"] for doc, _ in prefetch_hits], + [vector for _, _, vector in prefetch_hits], k=k, lambda_mult=lambda_mult, ) - return [ - Document( - page_content=prefetch_hits[idx][0]["text"], - metadata=prefetch_hits[idx][0]["metadata"], - ) - for idx in mmr_chosen_indices - ] + return [prefetch_hits[idx][0] for idx in mmr_chosen_indices] def max_marginal_relevance_search( self, @@ -174,6 +187,7 @@ class InMemoryVectorStore(VectorStore): k, fetch_k, lambda_mult=lambda_mult, + **kwargs, ) @classmethod @@ -199,3 +213,20 @@ class InMemoryVectorStore(VectorStore): **kwargs: Any, ) -> "InMemoryVectorStore": return cls.from_texts(texts, embedding, metadatas, **kwargs) + + @classmethod + def load( + cls, path: str, embedding: Embeddings, **kwargs: Any + ) -> "InMemoryVectorStore": + _path: Path = Path(path) + with _path.open("r") as f: + store = load(json.load(f)) + vectorstore = cls(embedding=embedding, **kwargs) + vectorstore.store = store + return vectorstore + + def dump(self, path: str) -> None: + _path: Path = Path(path) + _path.parent.mkdir(exist_ok=True, parents=True) + with _path.open("w") as f: + json.dump(dumpd(self.store), f, indent=2) diff --git a/libs/community/tests/unit_tests/vectorstores/test_inmemory.py b/libs/community/tests/unit_tests/vectorstores/test_inmemory.py index e7e5008227..72743b562f 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_inmemory.py +++ b/libs/community/tests/unit_tests/vectorstores/test_inmemory.py @@ -1,3 +1,5 @@ +from pathlib import Path + from langchain_core.documents import Document from langchain_community.vectorstores.inmemory import InMemoryVectorStore @@ -44,3 +46,31 @@ async def test_inmemory_mmr() -> None: assert len(output) == len(texts) assert output[0] == Document(page_content="foo") assert output[1] == Document(page_content="foy") + + +async def test_inmemory_dump_load(tmp_path: Path) -> None: + """Test end to end construction and search.""" + embedding = ConsistentFakeEmbeddings() + store = await InMemoryVectorStore.afrom_texts(["foo", "bar", "baz"], embedding) + output = await store.asimilarity_search("foo", k=1) + + test_file = str(tmp_path / "test.json") + store.dump(test_file) + + loaded_store = InMemoryVectorStore.load(test_file, embedding) + loaded_output = await loaded_store.asimilarity_search("foo", k=1) + + assert output == loaded_output + + +async def test_inmemory_filter() -> None: + """Test end to end construction and search.""" + store = await InMemoryVectorStore.afrom_texts( + ["foo", "bar"], + ConsistentFakeEmbeddings(), + [{"id": 1}, {"id": 2}], + ) + output = await store.asimilarity_search( + "baz", filter=lambda doc: doc.metadata["id"] == 1 + ) + assert output == [Document(page_content="foo", metadata={"id": 1})]