mirror of
https://github.com/hwchase17/langchain
synced 2024-11-10 01:10:59 +00:00
core[patch]: Clean up indexing test code (#24139)
Refactor the code to use the existing InMemoryVectorStore. This change is needed for another PR that moves some of the imports around (and messes up the mock.patch in this file).
This commit is contained in:
parent
457677c1b7
commit
4ba14adec6
@ -2,25 +2,21 @@ from datetime import datetime
|
||||
from typing import (
|
||||
Any,
|
||||
AsyncIterator,
|
||||
Dict,
|
||||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Type,
|
||||
)
|
||||
from unittest.mock import patch
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from langchain_core.document_loaders.base import BaseLoader
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.embeddings import DeterministicFakeEmbedding
|
||||
from langchain_core.indexing import InMemoryRecordManager, aindex, index
|
||||
from langchain_core.indexing.api import _abatch, _HashedDocument
|
||||
from langchain_core.vectorstores import VST, VectorStore
|
||||
from langchain_core.vectorstores import InMemoryVectorStore, VectorStore
|
||||
|
||||
|
||||
class ToyLoader(BaseLoader):
|
||||
@ -42,101 +38,6 @@ class ToyLoader(BaseLoader):
|
||||
yield document
|
||||
|
||||
|
||||
class InMemoryVectorStore(VectorStore):
|
||||
"""In-memory implementation of VectorStore using a dictionary."""
|
||||
|
||||
def __init__(self, permit_upserts: bool = False) -> None:
|
||||
"""Vector store interface for testing things in memory."""
|
||||
self.store: Dict[str, Document] = {}
|
||||
self.permit_upserts = permit_upserts
|
||||
|
||||
def delete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
|
||||
"""Delete the given documents from the store using their IDs."""
|
||||
if ids:
|
||||
for _id in ids:
|
||||
self.store.pop(_id, None)
|
||||
|
||||
async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
|
||||
"""Delete the given documents from the store using their IDs."""
|
||||
if ids:
|
||||
for _id in ids:
|
||||
self.store.pop(_id, None)
|
||||
|
||||
def add_documents( # type: ignore
|
||||
self,
|
||||
documents: Sequence[Document],
|
||||
*,
|
||||
ids: Optional[Sequence[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[str]:
|
||||
"""Add the given documents to the store (insert behavior)."""
|
||||
if ids and len(ids) != len(documents):
|
||||
raise ValueError(
|
||||
f"Expected {len(ids)} ids, got {len(documents)} documents."
|
||||
)
|
||||
|
||||
if not ids:
|
||||
raise NotImplementedError("This is not implemented yet.")
|
||||
|
||||
for _id, document in zip(ids, documents):
|
||||
if _id in self.store and not self.permit_upserts:
|
||||
raise ValueError(
|
||||
f"Document with uid {_id} already exists in the store."
|
||||
)
|
||||
self.store[_id] = document
|
||||
|
||||
return list(ids)
|
||||
|
||||
async def aadd_documents(
|
||||
self,
|
||||
documents: Sequence[Document],
|
||||
*,
|
||||
ids: Optional[Sequence[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[str]:
|
||||
if ids and len(ids) != len(documents):
|
||||
raise ValueError(
|
||||
f"Expected {len(ids)} ids, got {len(documents)} documents."
|
||||
)
|
||||
|
||||
if not ids:
|
||||
raise NotImplementedError("This is not implemented yet.")
|
||||
|
||||
for _id, document in zip(ids, documents):
|
||||
if _id in self.store and not self.permit_upserts:
|
||||
raise ValueError(
|
||||
f"Document with uid {_id} already exists in the store."
|
||||
)
|
||||
self.store[_id] = document
|
||||
return list(ids)
|
||||
|
||||
def add_texts(
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[List[Dict[Any, Any]]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[str]:
|
||||
"""Add the given texts to the store (insert behavior)."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@classmethod
|
||||
def from_texts(
|
||||
cls: Type[VST],
|
||||
texts: List[str],
|
||||
embedding: Embeddings,
|
||||
metadatas: Optional[List[Dict[Any, Any]]] = None,
|
||||
**kwargs: Any,
|
||||
) -> VST:
|
||||
"""Create a vector store from a list of texts."""
|
||||
raise NotImplementedError()
|
||||
|
||||
def similarity_search(
|
||||
self, query: str, k: int = 4, **kwargs: Any
|
||||
) -> List[Document]:
|
||||
"""Find the most similar documents to the given query."""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def record_manager() -> InMemoryRecordManager:
|
||||
"""Timestamped set fixture."""
|
||||
@ -156,13 +57,15 @@ async def arecord_manager() -> InMemoryRecordManager:
|
||||
@pytest.fixture
|
||||
def vector_store() -> InMemoryVectorStore:
|
||||
"""Vector store fixture."""
|
||||
return InMemoryVectorStore()
|
||||
embeddings = DeterministicFakeEmbedding(size=5)
|
||||
return InMemoryVectorStore(embeddings)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def upserting_vector_store() -> InMemoryVectorStore:
|
||||
"""Vector store fixture."""
|
||||
return InMemoryVectorStore(permit_upserts=True)
|
||||
embeddings = DeterministicFakeEmbedding(size=5)
|
||||
return InMemoryVectorStore(embeddings)
|
||||
|
||||
|
||||
def test_indexing_same_content(
|
||||
@ -286,7 +189,7 @@ def test_index_simple_delete_full(
|
||||
|
||||
doc_texts = set(
|
||||
# Ignoring type since doc should be in the store and not a None
|
||||
vector_store.store.get(uid).page_content # type: ignore
|
||||
vector_store.get_by_ids([uid])[0].page_content # type: ignore
|
||||
for uid in vector_store.store
|
||||
)
|
||||
assert doc_texts == {"mutated document 1", "This is another document."}
|
||||
@ -368,7 +271,7 @@ async def test_aindex_simple_delete_full(
|
||||
|
||||
doc_texts = set(
|
||||
# Ignoring type since doc should be in the store and not a None
|
||||
vector_store.store.get(uid).page_content # type: ignore
|
||||
vector_store.get_by_ids([uid])[0].page_content # type: ignore
|
||||
for uid in vector_store.store
|
||||
)
|
||||
assert doc_texts == {"mutated document 1", "This is another document."}
|
||||
@ -659,7 +562,7 @@ def test_incremental_delete(
|
||||
|
||||
doc_texts = set(
|
||||
# Ignoring type since doc should be in the store and not a None
|
||||
vector_store.store.get(uid).page_content # type: ignore
|
||||
vector_store.get_by_ids([uid])[0].page_content # type: ignore
|
||||
for uid in vector_store.store
|
||||
)
|
||||
assert doc_texts == {"This is another document.", "This is a test document."}
|
||||
@ -718,7 +621,7 @@ def test_incremental_delete(
|
||||
|
||||
doc_texts = set(
|
||||
# Ignoring type since doc should be in the store and not a None
|
||||
vector_store.store.get(uid).page_content # type: ignore
|
||||
vector_store.get_by_ids([uid])[0].page_content # type: ignore
|
||||
for uid in vector_store.store
|
||||
)
|
||||
assert doc_texts == {
|
||||
@ -786,7 +689,7 @@ def test_incremental_indexing_with_batch_size(
|
||||
|
||||
doc_texts = set(
|
||||
# Ignoring type since doc should be in the store and not a None
|
||||
vector_store.store.get(uid).page_content # type: ignore
|
||||
vector_store.get_by_ids([uid])[0].page_content # type: ignore
|
||||
for uid in vector_store.store
|
||||
)
|
||||
assert doc_texts == {"1", "2", "3", "4"}
|
||||
@ -836,7 +739,7 @@ def test_incremental_delete_with_batch_size(
|
||||
|
||||
doc_texts = set(
|
||||
# Ignoring type since doc should be in the store and not a None
|
||||
vector_store.store.get(uid).page_content # type: ignore
|
||||
vector_store.get_by_ids([uid])[0].page_content # type: ignore
|
||||
for uid in vector_store.store
|
||||
)
|
||||
assert doc_texts == {"1", "2", "3", "4"}
|
||||
@ -981,7 +884,7 @@ async def test_aincremental_delete(
|
||||
|
||||
doc_texts = set(
|
||||
# Ignoring type since doc should be in the store and not a None
|
||||
vector_store.store.get(uid).page_content # type: ignore
|
||||
vector_store.get_by_ids([uid])[0].page_content # type: ignore
|
||||
for uid in vector_store.store
|
||||
)
|
||||
assert doc_texts == {"This is another document.", "This is a test document."}
|
||||
@ -1040,7 +943,7 @@ async def test_aincremental_delete(
|
||||
|
||||
doc_texts = set(
|
||||
# Ignoring type since doc should be in the store and not a None
|
||||
vector_store.store.get(uid).page_content # type: ignore
|
||||
vector_store.get_by_ids([uid])[0].page_content # type: ignore
|
||||
for uid in vector_store.store
|
||||
)
|
||||
assert doc_texts == {
|
||||
@ -1232,8 +1135,10 @@ def test_deduplication_v2(
|
||||
|
||||
# using in memory implementation here
|
||||
assert isinstance(vector_store, InMemoryVectorStore)
|
||||
|
||||
ids = list(vector_store.store.keys())
|
||||
contents = sorted(
|
||||
[document.page_content for document in vector_store.store.values()]
|
||||
[document.page_content for document in vector_store.get_by_ids(ids)]
|
||||
)
|
||||
assert contents == ["1", "2", "3"]
|
||||
|
||||
@ -1370,11 +1275,19 @@ def test_indexing_custom_batch_size(
|
||||
ids = [_HashedDocument.from_document(doc).uid for doc in docs]
|
||||
|
||||
batch_size = 1
|
||||
with patch.object(vector_store, "add_documents") as mock_add_documents:
|
||||
|
||||
original = vector_store.add_documents
|
||||
|
||||
try:
|
||||
mock_add_documents = MagicMock()
|
||||
vector_store.add_documents = mock_add_documents # type: ignore
|
||||
|
||||
index(docs, record_manager, vector_store, batch_size=batch_size)
|
||||
args, kwargs = mock_add_documents.call_args
|
||||
assert args == (docs,)
|
||||
assert kwargs == {"ids": ids, "batch_size": batch_size}
|
||||
finally:
|
||||
vector_store.add_documents = original # type: ignore
|
||||
|
||||
|
||||
async def test_aindexing_custom_batch_size(
|
||||
@ -1390,8 +1303,9 @@ async def test_aindexing_custom_batch_size(
|
||||
ids = [_HashedDocument.from_document(doc).uid for doc in docs]
|
||||
|
||||
batch_size = 1
|
||||
with patch.object(vector_store, "aadd_documents") as mock_add_documents:
|
||||
await aindex(docs, arecord_manager, vector_store, batch_size=batch_size)
|
||||
args, kwargs = mock_add_documents.call_args
|
||||
assert args == (docs,)
|
||||
assert kwargs == {"ids": ids, "batch_size": batch_size}
|
||||
mock_add_documents = AsyncMock()
|
||||
vector_store.aadd_documents = mock_add_documents # type: ignore
|
||||
await aindex(docs, arecord_manager, vector_store, batch_size=batch_size)
|
||||
args, kwargs = mock_add_documents.call_args
|
||||
assert args == (docs,)
|
||||
assert kwargs == {"ids": ids, "batch_size": batch_size}
|
||||
|
Loading…
Reference in New Issue
Block a user