@ -1,14 +1,26 @@
from datetime import datetime
from typing import Any , Dict , Iterable , Iterator , List , Optional , Sequence , Type
from typing import (
Any ,
AsyncIterator ,
Dict ,
Iterable ,
Iterator ,
List ,
Optional ,
Sequence ,
Type ,
)
from unittest . mock import patch
import pytest
import pytest_asyncio
from langchain . document_loaders . base import BaseLoader
from langchain . indexes import index
from langchain . embeddings . base import Embeddings
from langchain . indexes import aindex , index
from langchain . indexes . _api import _abatch
from langchain . indexes . _sql_record_manager import SQLRecordManager
from langchain . schema import Document
from langchain . schema . embeddings import Embeddings
from langchain . schema . vectorstore import VST , VectorStore
@ -28,6 +40,19 @@ class ToyLoader(BaseLoader):
""" Load the documents from the source. """
return list ( self . lazy_load ( ) )
async def alazy_load(
    self,
) -> AsyncIterator[Document]:
    """Return an async iterator over the loader's documents."""

    async def _iter_docs() -> AsyncIterator[Document]:
        # Re-yield each stored document so callers consume them lazily.
        for doc in self.documents:
            yield doc

    return _iter_docs()
async def aload(self) -> List[Document]:
    """Load the documents from the source."""
    docs: List[Document] = []
    async for doc in await self.alazy_load():
        docs.append(doc)
    return docs
class InMemoryVectorStore ( VectorStore ) :
""" In-memory implementation of VectorStore using a dictionary. """
@ -42,6 +67,12 @@ class InMemoryVectorStore(VectorStore):
for _id in ids :
self . store . pop ( _id , None )
async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
    """Delete the given documents from the store using their IDs."""
    # Missing IDs are ignored silently; a None/empty sequence is a no-op.
    for doc_id in ids or []:
        self.store.pop(doc_id, None)
def add_documents ( # type: ignore
self ,
documents : Sequence [ Document ] ,
@ -65,10 +96,33 @@ class InMemoryVectorStore(VectorStore):
)
self . store [ _id ] = document
async def aadd_documents(
    self,
    documents: Sequence[Document],
    *,
    ids: Optional[Sequence[str]] = None,
    **kwargs: Any,
) -> List[str]:
    """Insert documents into the store keyed by the provided IDs.

    Raises:
        ValueError: if the lengths of ``ids`` and ``documents`` disagree,
            or if a uid already exists in the store.
        NotImplementedError: if no IDs were provided at all.
    """
    if ids and len(ids) != len(documents):
        raise ValueError(
            f"Expected {len(ids)} ids, got {len(documents)} documents."
        )

    if not ids:
        raise NotImplementedError("This is not implemented yet.")

    for uid, doc in zip(ids, documents):
        # Insert-only semantics: refuse to overwrite an existing uid.
        if uid in self.store:
            raise ValueError(f"Document with uid {uid} already exists in the store.")
        self.store[uid] = doc

    return list(ids)
def add_texts (
self ,
texts : Iterable [ str ] ,
metadatas : Optional [ List [ dict ] ] = None ,
metadatas : Optional [ List [ Dict [ Any , Any ] ] ] = None ,
* * kwargs : Any ,
) - > List [ str ] :
""" Add the given texts to the store (insert behavior). """
@ -79,7 +133,7 @@ class InMemoryVectorStore(VectorStore):
cls : Type [ VST ] ,
texts : List [ str ] ,
embedding : Embeddings ,
metadatas : Optional [ List [ dict ] ] = None ,
metadatas : Optional [ List [ Dict [ Any , Any ] ] ] = None ,
* * kwargs : Any ,
) - > VST :
""" Create a vector store from a list of texts. """
@ -100,6 +154,19 @@ def record_manager() -> SQLRecordManager:
return record_manager
@pytest_asyncio.fixture  # type: ignore
@pytest.mark.requires("aiosqlite")
async def arecord_manager() -> SQLRecordManager:
    """Timestamped set fixture backed by an in-memory async SQLite DB."""
    manager = SQLRecordManager(
        "kittens",
        db_url="sqlite+aiosqlite:///:memory:",
        async_mode=True,
    )
    # Schema must exist before any test touches the record manager.
    await manager.acreate_schema()
    return manager
@pytest.fixture
def vector_store ( ) - > InMemoryVectorStore :
""" Vector store fixture. """
@ -140,6 +207,44 @@ def test_indexing_same_content(
}
@pytest.mark.asyncio
@pytest.mark.requires("aiosqlite")
async def test_aindexing_same_content(
    arecord_manager: SQLRecordManager, vector_store: InMemoryVectorStore
) -> None:
    """Indexing some content to confirm it gets added only once."""
    loader = ToyLoader(
        documents=[
            Document(page_content="This is a test document."),
            Document(page_content="This is another document."),
        ]
    )

    first_pass = await aindex(await loader.alazy_load(), arecord_manager, vector_store)
    assert first_pass == {
        "num_added": 2,
        "num_deleted": 0,
        "num_skipped": 0,
        "num_updated": 0,
    }
    assert len(list(vector_store.store)) == 2

    # Re-indexing identical content must be a no-op, however often we repeat it.
    for _attempt in range(2):
        rerun = await aindex(await loader.alazy_load(), arecord_manager, vector_store)
        assert rerun == {
            "num_added": 0,
            "num_deleted": 0,
            "num_skipped": 2,
            "num_updated": 0,
        }
def test_index_simple_delete_full (
record_manager : SQLRecordManager , vector_store : InMemoryVectorStore
) - > None :
@ -215,6 +320,91 @@ def test_index_simple_delete_full(
}
@pytest.mark.asyncio
@pytest.mark.requires("aiosqlite")
async def test_aindex_simple_delete_full(
    arecord_manager: SQLRecordManager, vector_store: InMemoryVectorStore
) -> None:
    """Indexing some content to confirm it gets added only once."""
    loader = ToyLoader(
        documents=[
            Document(page_content="This is a test document."),
            Document(page_content="This is another document."),
        ]
    )

    # First pass at t=2021-01-01: both documents are new.
    with patch.object(
        arecord_manager, "aget_time", return_value=datetime(2021, 1, 1).timestamp()
    ):
        result = await aindex(
            await loader.alazy_load(), arecord_manager, vector_store, cleanup="full"
        )
        assert result == {
            "num_added": 2,
            "num_deleted": 0,
            "num_skipped": 0,
            "num_updated": 0,
        }

    # Same content at the same time: everything is skipped.
    with patch.object(
        arecord_manager, "aget_time", return_value=datetime(2021, 1, 1).timestamp()
    ):
        result = await aindex(
            await loader.alazy_load(), arecord_manager, vector_store, cleanup="full"
        )
        assert result == {
            "num_added": 0,
            "num_deleted": 0,
            "num_skipped": 2,
            "num_updated": 0,
        }

    loader = ToyLoader(
        documents=[
            Document(page_content="mutated document 1"),
            Document(page_content="This is another document."),  # <-- Same as original
        ]
    )

    # "full" cleanup drops the stale document and adds the mutated one.
    with patch.object(
        arecord_manager, "aget_time", return_value=datetime(2021, 1, 2).timestamp()
    ):
        result = await aindex(
            await loader.alazy_load(), arecord_manager, vector_store, cleanup="full"
        )
        assert result == {
            "num_added": 1,
            "num_deleted": 1,
            "num_skipped": 1,
            "num_updated": 0,
        }

    # Ignoring type since doc should be in the store and not a None
    doc_texts = {
        vector_store.store.get(uid).page_content  # type: ignore
        for uid in vector_store.store
    }
    assert doc_texts == {"mutated document 1", "This is another document."}

    # Attempt to index again verify that nothing changes
    with patch.object(
        arecord_manager, "aget_time", return_value=datetime(2021, 1, 2).timestamp()
    ):
        result = await aindex(
            await loader.alazy_load(), arecord_manager, vector_store, cleanup="full"
        )
        assert result == {
            "num_added": 0,
            "num_deleted": 0,
            "num_skipped": 2,
            "num_updated": 0,
        }
def test_incremental_fails_with_bad_source_ids (
record_manager : SQLRecordManager , vector_store : InMemoryVectorStore
) - > None :
@ -251,6 +441,49 @@ def test_incremental_fails_with_bad_source_ids(
)
@pytest.mark.asyncio
@pytest.mark.requires("aiosqlite")
async def test_aincremental_fails_with_bad_source_ids(
    arecord_manager: SQLRecordManager, vector_store: InMemoryVectorStore
) -> None:
    """Test indexing with incremental deletion strategy."""
    loader = ToyLoader(
        documents=[
            Document(
                page_content="This is a test document.",
                metadata={"source": "1"},
            ),
            Document(
                page_content="This is another document.",
                metadata={"source": "2"},
            ),
            Document(
                page_content="This is yet another document.",
                metadata={"source": None},
            ),
        ]
    )

    # Incremental cleanup without any source id function must be rejected.
    with pytest.raises(ValueError):
        await aindex(
            await loader.alazy_load(),
            arecord_manager,
            vector_store,
            cleanup="incremental",
        )

    # Even with a source_id_key, a document whose source is None must fail.
    with pytest.raises(ValueError):
        await aindex(
            await loader.alazy_load(),
            arecord_manager,
            vector_store,
            cleanup="incremental",
            source_id_key="source",
        )
def test_no_delete (
record_manager : SQLRecordManager , vector_store : InMemoryVectorStore
) - > None :
@ -332,6 +565,89 @@ def test_no_delete(
}
@pytest.mark.asyncio
@pytest.mark.requires("aiosqlite")
async def test_ano_delete(
    arecord_manager: SQLRecordManager, vector_store: InMemoryVectorStore
) -> None:
    """Test indexing without a deletion strategy."""
    loader = ToyLoader(
        documents=[
            Document(
                page_content="This is a test document.",
                metadata={"source": "1"},
            ),
            Document(
                page_content="This is another document.",
                metadata={"source": "2"},
            ),
        ]
    )

    with patch.object(
        arecord_manager, "aget_time", return_value=datetime(2021, 1, 2).timestamp()
    ):
        result = await aindex(
            await loader.alazy_load(),
            arecord_manager,
            vector_store,
            cleanup=None,
            source_id_key="source",
        )
        assert result == {
            "num_added": 2,
            "num_deleted": 0,
            "num_skipped": 0,
            "num_updated": 0,
        }

    # If we add the same content twice it should be skipped
    with patch.object(
        arecord_manager, "aget_time", return_value=datetime(2021, 1, 2).timestamp()
    ):
        result = await aindex(
            await loader.alazy_load(),
            arecord_manager,
            vector_store,
            cleanup=None,
            source_id_key="source",
        )
        assert result == {
            "num_added": 0,
            "num_deleted": 0,
            "num_skipped": 2,
            "num_updated": 0,
        }

    loader = ToyLoader(
        documents=[
            Document(
                page_content="mutated content",
                metadata={"source": "1"},
            ),
            Document(
                page_content="This is another document.",
                metadata={"source": "2"},
            ),
        ]
    )

    # Should result in no updates or deletions!
    with patch.object(
        arecord_manager, "aget_time", return_value=datetime(2021, 1, 2).timestamp()
    ):
        result = await aindex(
            await loader.alazy_load(),
            arecord_manager,
            vector_store,
            cleanup=None,
            source_id_key="source",
        )
        assert result == {
            "num_added": 1,
            "num_deleted": 0,
            "num_skipped": 1,
            "num_updated": 0,
        }
def test_incremental_delete (
record_manager : SQLRecordManager , vector_store : InMemoryVectorStore
) - > None :
@ -436,6 +752,112 @@ def test_incremental_delete(
}
@pytest.mark.asyncio
@pytest.mark.requires("aiosqlite")
async def test_aincremental_delete(
    arecord_manager: SQLRecordManager, vector_store: InMemoryVectorStore
) -> None:
    """Test indexing with incremental deletion strategy."""
    loader = ToyLoader(
        documents=[
            Document(
                page_content="This is a test document.",
                metadata={"source": "1"},
            ),
            Document(
                page_content="This is another document.",
                metadata={"source": "2"},
            ),
        ]
    )

    with patch.object(
        arecord_manager, "aget_time", return_value=datetime(2021, 1, 2).timestamp()
    ):
        result = await aindex(
            await loader.alazy_load(),
            arecord_manager,
            vector_store,
            cleanup="incremental",
            source_id_key="source",
        )
        assert result == {
            "num_added": 2,
            "num_deleted": 0,
            "num_skipped": 0,
            "num_updated": 0,
        }

    # Ignoring type since doc should be in the store and not a None
    doc_texts = {
        vector_store.store.get(uid).page_content  # type: ignore
        for uid in vector_store.store
    }
    assert doc_texts == {"This is another document.", "This is a test document."}

    # Attempt to index again verify that nothing changes
    with patch.object(
        arecord_manager, "aget_time", return_value=datetime(2021, 1, 2).timestamp()
    ):
        result = await aindex(
            await loader.alazy_load(),
            arecord_manager,
            vector_store,
            cleanup="incremental",
            source_id_key="source",
        )
        assert result == {
            "num_added": 0,
            "num_deleted": 0,
            "num_skipped": 2,
            "num_updated": 0,
        }

    # Create 2 documents from the same source all with mutated content
    loader = ToyLoader(
        documents=[
            Document(
                page_content="mutated document 1",
                metadata={"source": "1"},
            ),
            Document(
                page_content="mutated document 2",
                metadata={"source": "1"},
            ),
            Document(
                page_content="This is another document.",  # <-- Same as original
                metadata={"source": "2"},
            ),
        ]
    )

    # Source "1" is rewritten: its old document goes away, two new ones arrive.
    with patch.object(
        arecord_manager, "aget_time", return_value=datetime(2021, 1, 3).timestamp()
    ):
        result = await aindex(
            await loader.alazy_load(),
            arecord_manager,
            vector_store,
            cleanup="incremental",
            source_id_key="source",
        )
        assert result == {
            "num_added": 2,
            "num_deleted": 1,
            "num_skipped": 1,
            "num_updated": 0,
        }

    # Ignoring type since doc should be in the store and not a None
    doc_texts = {
        vector_store.store.get(uid).page_content  # type: ignore
        for uid in vector_store.store
    }
    assert doc_texts == {
        "mutated document 1",
        "mutated document 2",
        "This is another document.",
    }
def test_indexing_with_no_docs (
record_manager : SQLRecordManager , vector_store : VectorStore
) - > None :
@ -450,6 +872,24 @@ def test_indexing_with_no_docs(
}
@pytest.mark.asyncio
@pytest.mark.requires("aiosqlite")
async def test_aindexing_with_no_docs(
    arecord_manager: SQLRecordManager, vector_store: VectorStore
) -> None:
    """Check edge case when loader returns no new docs."""
    empty_loader = ToyLoader(documents=[])
    result = await aindex(
        await empty_loader.alazy_load(), arecord_manager, vector_store, cleanup="full"
    )
    assert result == {
        "num_added": 0,
        "num_deleted": 0,
        "num_skipped": 0,
        "num_updated": 0,
    }
def test_deduplication (
record_manager : SQLRecordManager , vector_store : VectorStore
) - > None :
@ -474,6 +914,32 @@ def test_deduplication(
}
@pytest.mark.asyncio
@pytest.mark.requires("aiosqlite")
async def test_adeduplication(
    arecord_manager: SQLRecordManager, vector_store: VectorStore
) -> None:
    """Exact duplicate documents should be deduplicated before indexing."""
    docs = [
        Document(
            page_content="This is a test document.",
            metadata={"source": "1"},
        ),
        Document(
            page_content="This is a test document.",
            metadata={"source": "1"},
        ),
    ]

    # Should result in only a single document being added
    assert await aindex(docs, arecord_manager, vector_store, cleanup="full") == {
        "num_added": 1,
        "num_deleted": 0,
        "num_skipped": 0,
        "num_updated": 0,
    }
def test_cleanup_with_different_batchsize (
record_manager : SQLRecordManager , vector_store : VectorStore
) - > None :
@ -511,6 +977,45 @@ def test_cleanup_with_different_batchsize(
}
@pytest.mark.asyncio
@pytest.mark.requires("aiosqlite")
async def test_async_cleanup_with_different_batchsize(
    arecord_manager: SQLRecordManager, vector_store: InMemoryVectorStore
) -> None:
    """Check that we can clean up with different batch size."""
    docs = [
        Document(
            page_content="This is a test document.",
            metadata={"source": str(d)},
        )
        for d in range(1000)
    ]

    assert await aindex(docs, arecord_manager, vector_store, cleanup="full") == {
        "num_added": 1000,
        "num_deleted": 0,
        "num_skipped": 0,
        "num_updated": 0,
    }

    docs = [
        Document(
            page_content="Different doc",
            metadata={"source": str(d)},
        )
        for d in range(1001)
    ]

    # A non-default cleanup batch size must still delete all 1000 stale docs.
    assert await aindex(
        docs, arecord_manager, vector_store, cleanup="full", cleanup_batch_size=17
    ) == {
        "num_added": 1001,
        "num_deleted": 1000,
        "num_skipped": 0,
        "num_updated": 0,
    }
def test_deduplication_v2 (
record_manager : SQLRecordManager , vector_store : VectorStore
) - > None :
@ -547,3 +1052,29 @@ def test_deduplication_v2(
[ document . page_content for document in vector_store . store . values ( ) ]
)
assert contents == [ " 1 " , " 2 " , " 3 " ]
async def _to_async_iter ( it : Iterable [ Any ] ) - > AsyncIterator [ Any ] :
""" Convert an iterable to an async iterator. """
for i in it :
yield i
@pytest.mark.asyncio
async def test_abatch() -> None:
    """Test the abatch function."""
    # (batch size, input length, expected batches)
    cases = [
        (5, 12, [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11]]),
        (1, 3, [[0], [1], [2]]),
        (2, 5, [[0, 1], [2, 3], [4]]),
    ]
    for size, total, expected in cases:
        batches = _abatch(size, _to_async_iter(range(total)))
        assert isinstance(batches, AsyncIterator)
        assert [batch async for batch in batches] == expected