core[patch]: Clean up indexing test code (#24139)

Refactor the code to use the existing InMemoryVectorStore.

This change is needed for another PR that moves some of the imports
around (which breaks the mock.patch usage in this file).
Eugene Yurtsev 2024-07-11 14:54:46 -04:00 committed by GitHub
parent 457677c1b7
commit 4ba14adec6
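
For context, here is a minimal sketch of the pattern the refactor adopts, inferred from the diff below (assuming langchain_core exports InMemoryVectorStore, DeterministicFakeEmbedding, and get_by_ids as used there; the size-5 embedding matches the fixtures):

# Sketch only: the pattern the refactored tests rely on, per the diff below.
from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.vectorstores import InMemoryVectorStore

embeddings = DeterministicFakeEmbedding(size=5)  # fake embedding, deterministic per input text
vector_store = InMemoryVectorStore(embeddings)
vector_store.add_documents([Document(page_content="hello")], ids=["doc-1"])
assert vector_store.get_by_ids(["doc-1"])[0].page_content == "hello"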


@@ -2,25 +2,21 @@ from datetime import datetime
 from typing import (
     Any,
     AsyncIterator,
-    Dict,
     Iterable,
     Iterator,
     List,
     Optional,
     Sequence,
-    Type,
 )
-from unittest.mock import patch
+from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
 import pytest_asyncio
 
 from langchain_core.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
-from langchain_core.embeddings import Embeddings
+from langchain_core.embeddings import DeterministicFakeEmbedding
 from langchain_core.indexing import InMemoryRecordManager, aindex, index
 from langchain_core.indexing.api import _abatch, _HashedDocument
-from langchain_core.vectorstores import VST, VectorStore
+from langchain_core.vectorstores import InMemoryVectorStore, VectorStore
class ToyLoader(BaseLoader):
@@ -42,101 +38,6 @@ class ToyLoader(BaseLoader):
             yield document
 
 
-class InMemoryVectorStore(VectorStore):
-    """In-memory implementation of VectorStore using a dictionary."""
-
-    def __init__(self, permit_upserts: bool = False) -> None:
-        """Vector store interface for testing things in memory."""
-        self.store: Dict[str, Document] = {}
-        self.permit_upserts = permit_upserts
-
-    def delete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
-        """Delete the given documents from the store using their IDs."""
-        if ids:
-            for _id in ids:
-                self.store.pop(_id, None)
-
-    async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
-        """Delete the given documents from the store using their IDs."""
-        if ids:
-            for _id in ids:
-                self.store.pop(_id, None)
-
-    def add_documents(  # type: ignore
-        self,
-        documents: Sequence[Document],
-        *,
-        ids: Optional[Sequence[str]] = None,
-        **kwargs: Any,
-    ) -> List[str]:
-        """Add the given documents to the store (insert behavior)."""
-        if ids and len(ids) != len(documents):
-            raise ValueError(
-                f"Expected {len(ids)} ids, got {len(documents)} documents."
-            )
-        if not ids:
-            raise NotImplementedError("This is not implemented yet.")
-        for _id, document in zip(ids, documents):
-            if _id in self.store and not self.permit_upserts:
-                raise ValueError(
-                    f"Document with uid {_id} already exists in the store."
-                )
-            self.store[_id] = document
-        return list(ids)
-
-    async def aadd_documents(
-        self,
-        documents: Sequence[Document],
-        *,
-        ids: Optional[Sequence[str]] = None,
-        **kwargs: Any,
-    ) -> List[str]:
-        if ids and len(ids) != len(documents):
-            raise ValueError(
-                f"Expected {len(ids)} ids, got {len(documents)} documents."
-            )
-        if not ids:
-            raise NotImplementedError("This is not implemented yet.")
-        for _id, document in zip(ids, documents):
-            if _id in self.store and not self.permit_upserts:
-                raise ValueError(
-                    f"Document with uid {_id} already exists in the store."
-                )
-            self.store[_id] = document
-        return list(ids)
-
-    def add_texts(
-        self,
-        texts: Iterable[str],
-        metadatas: Optional[List[Dict[Any, Any]]] = None,
-        **kwargs: Any,
-    ) -> List[str]:
-        """Add the given texts to the store (insert behavior)."""
-        raise NotImplementedError()
-
-    @classmethod
-    def from_texts(
-        cls: Type[VST],
-        texts: List[str],
-        embedding: Embeddings,
-        metadatas: Optional[List[Dict[Any, Any]]] = None,
-        **kwargs: Any,
-    ) -> VST:
-        """Create a vector store from a list of texts."""
-        raise NotImplementedError()
-
-    def similarity_search(
-        self, query: str, k: int = 4, **kwargs: Any
-    ) -> List[Document]:
-        """Find the most similar documents to the given query."""
-        raise NotImplementedError()
-
-
 @pytest.fixture
 def record_manager() -> InMemoryRecordManager:
     """Timestamped set fixture."""
@@ -156,13 +57,15 @@ async def arecord_manager() -> InMemoryRecordManager:
 @pytest.fixture
 def vector_store() -> InMemoryVectorStore:
     """Vector store fixture."""
-    return InMemoryVectorStore()
+    embeddings = DeterministicFakeEmbedding(size=5)
+    return InMemoryVectorStore(embeddings)
 
 
 @pytest.fixture
 def upserting_vector_store() -> InMemoryVectorStore:
     """Vector store fixture."""
-    return InMemoryVectorStore(permit_upserts=True)
+    embeddings = DeterministicFakeEmbedding(size=5)
+    return InMemoryVectorStore(embeddings)
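
Why DeterministicFakeEmbedding rather than a real model: my understanding of the fake is that it derives each vector from a hash of the input text, so the same text always embeds identically and the fixtures stay fast and reproducible. A sketch:

emb = DeterministicFakeEmbedding(size=5)
assert emb.embed_query("foo") == emb.embed_query("foo")  # stable across calls
assert emb.embed_query("foo") != emb.embed_query("bar")  # distinct per text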
def test_indexing_same_content(
@@ -286,7 +189,7 @@ def test_index_simple_delete_full(
     doc_texts = set(
         # Ignoring type since doc should be in the store and not a None
-        vector_store.store.get(uid).page_content  # type: ignore
+        vector_store.get_by_ids([uid])[0].page_content  # type: ignore
         for uid in vector_store.store
     )
     assert doc_texts == {"mutated document 1", "This is another document."}
@@ -368,7 +271,7 @@ async def test_aindex_simple_delete_full(
     doc_texts = set(
         # Ignoring type since doc should be in the store and not a None
-        vector_store.store.get(uid).page_content  # type: ignore
+        vector_store.get_by_ids([uid])[0].page_content  # type: ignore
         for uid in vector_store.store
     )
     assert doc_texts == {"mutated document 1", "This is another document."}
@@ -659,7 +562,7 @@ def test_incremental_delete(
     doc_texts = set(
         # Ignoring type since doc should be in the store and not a None
-        vector_store.store.get(uid).page_content  # type: ignore
+        vector_store.get_by_ids([uid])[0].page_content  # type: ignore
         for uid in vector_store.store
     )
     assert doc_texts == {"This is another document.", "This is a test document."}
@@ -718,7 +621,7 @@ def test_incremental_delete(
     doc_texts = set(
         # Ignoring type since doc should be in the store and not a None
-        vector_store.store.get(uid).page_content  # type: ignore
+        vector_store.get_by_ids([uid])[0].page_content  # type: ignore
         for uid in vector_store.store
     )
     assert doc_texts == {
@@ -786,7 +689,7 @@ def test_incremental_indexing_with_batch_size(
     doc_texts = set(
         # Ignoring type since doc should be in the store and not a None
-        vector_store.store.get(uid).page_content  # type: ignore
+        vector_store.get_by_ids([uid])[0].page_content  # type: ignore
         for uid in vector_store.store
     )
     assert doc_texts == {"1", "2", "3", "4"}
@@ -836,7 +739,7 @@ def test_incremental_delete_with_batch_size(
     doc_texts = set(
         # Ignoring type since doc should be in the store and not a None
-        vector_store.store.get(uid).page_content  # type: ignore
+        vector_store.get_by_ids([uid])[0].page_content  # type: ignore
         for uid in vector_store.store
     )
     assert doc_texts == {"1", "2", "3", "4"}
@@ -981,7 +884,7 @@ async def test_aincremental_delete(
     doc_texts = set(
         # Ignoring type since doc should be in the store and not a None
-        vector_store.store.get(uid).page_content  # type: ignore
+        vector_store.get_by_ids([uid])[0].page_content  # type: ignore
         for uid in vector_store.store
     )
     assert doc_texts == {"This is another document.", "This is a test document."}
@@ -1040,7 +943,7 @@ async def test_aincremental_delete(
     doc_texts = set(
         # Ignoring type since doc should be in the store and not a None
-        vector_store.store.get(uid).page_content  # type: ignore
+        vector_store.get_by_ids([uid])[0].page_content  # type: ignore
         for uid in vector_store.store
     )
     assert doc_texts == {
@@ -1232,8 +1135,10 @@ def test_deduplication_v2(
     # using in memory implementation here
     assert isinstance(vector_store, InMemoryVectorStore)
+    ids = list(vector_store.store.keys())
     contents = sorted(
-        [document.page_content for document in vector_store.store.values()]
+        [document.page_content for document in vector_store.get_by_ids(ids)]
     )
     assert contents == ["1", "2", "3"]
@@ -1370,11 +1275,19 @@ def test_indexing_custom_batch_size(
     ids = [_HashedDocument.from_document(doc).uid for doc in docs]
     batch_size = 1
-    with patch.object(vector_store, "add_documents") as mock_add_documents:
+    original = vector_store.add_documents
+    try:
+        mock_add_documents = MagicMock()
+        vector_store.add_documents = mock_add_documents  # type: ignore
         index(docs, record_manager, vector_store, batch_size=batch_size)
         args, kwargs = mock_add_documents.call_args
         assert args == (docs,)
         assert kwargs == {"ids": ids, "batch_size": batch_size}
+    finally:
+        vector_store.add_documents = original  # type: ignore
 
 
 async def test_aindexing_custom_batch_size(
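
The with patch.object(...) context manager is replaced by manual assign-and-restore, presumably to stop depending on mock-patching behavior that the import reshuffle mentioned in the commit message would disturb. The same stub-and-restore pattern in isolation (a self-contained sketch with a stand-in class, not the test's real store):

from unittest.mock import MagicMock

class Store:  # stand-in for the vector store under test
    def add_documents(self, docs, **kwargs):
        raise RuntimeError("should not run in this test")

store = Store()
original = store.add_documents
try:
    store.add_documents = MagicMock(return_value=["id-1"])  # stub out the real method
    store.add_documents(["doc"], ids=["id-1"])              # hits the mock
    args, kwargs = store.add_documents.call_args            # inspect the recorded call
    assert args == (["doc"],) and kwargs == {"ids": ["id-1"]}
finally:
    store.add_documents = original  # always restore, even if an assertion fails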
@@ -1390,8 +1303,9 @@ async def test_aindexing_custom_batch_size(
     ids = [_HashedDocument.from_document(doc).uid for doc in docs]
     batch_size = 1
-    with patch.object(vector_store, "aadd_documents") as mock_add_documents:
-        await aindex(docs, arecord_manager, vector_store, batch_size=batch_size)
-        args, kwargs = mock_add_documents.call_args
-        assert args == (docs,)
-        assert kwargs == {"ids": ids, "batch_size": batch_size}
+    mock_add_documents = AsyncMock()
+    vector_store.aadd_documents = mock_add_documents  # type: ignore
+    await aindex(docs, arecord_manager, vector_store, batch_size=batch_size)
+    args, kwargs = mock_add_documents.call_args
+    assert args == (docs,)
+    assert kwargs == {"ids": ids, "batch_size": batch_size}