diff --git a/libs/langchain/langchain/indexes/_api.py b/libs/langchain/langchain/indexes/_api.py index 6f2ebf9828..c471d07d9c 100644 --- a/libs/langchain/langchain/indexes/_api.py +++ b/libs/langchain/langchain/indexes/_api.py @@ -282,14 +282,14 @@ def index( # Filter out documents that already exist in the record store. uids = [] docs_to_index = [] - for doc, hashed_doc, doc_exists in zip(doc_batch, hashed_docs, exists_batch): + for hashed_doc, doc_exists in zip(hashed_docs, exists_batch): if doc_exists: # Must be updated to refresh timestamp. record_manager.update([hashed_doc.uid], time_at_least=index_start_dt) num_skipped += 1 continue uids.append(hashed_doc.uid) - docs_to_index.append(doc) + docs_to_index.append(hashed_doc.to_document()) # Be pessimistic and assume that all vector store write will fail. # First write to vector store diff --git a/libs/langchain/tests/unit_tests/indexes/test_indexing.py b/libs/langchain/tests/unit_tests/indexes/test_indexing.py index 7783f41f07..3567dd648c 100644 --- a/libs/langchain/tests/unit_tests/indexes/test_indexing.py +++ b/libs/langchain/tests/unit_tests/indexes/test_indexing.py @@ -472,3 +472,42 @@ def test_deduplication( "num_skipped": 0, "num_updated": 0, } + + +def test_deduplication_v2( + record_manager: SQLRecordManager, vector_store: VectorStore +) -> None: + """Check that duplicate input documents are indexed only once.""" + docs = [ + Document( + page_content="1", + metadata={"source": "1"}, + ), + Document( + page_content="1", + metadata={"source": "1"}, + ), + Document( + page_content="2", + metadata={"source": "2"}, + ), + Document( + page_content="3", + metadata={"source": "3"}, + ), + ] + + # The duplicated document is indexed once, so only 3 of the 4 docs are added + assert index(docs, record_manager, vector_store, cleanup="full") == { + "num_added": 3, + "num_deleted": 0, + "num_skipped": 0, + "num_updated": 0, + } + + # using in memory implementation here + assert isinstance(vector_store, InMemoryVectorStore) + contents = sorted( 
[document.page_content for document in vector_store.store.values()] + ) + assert contents == ["1", "2", "3"]