mirror of
https://github.com/hwchase17/langchain
synced 2024-11-10 01:10:59 +00:00
core[minor]: Introduce DocumentIndex abstraction (#25062)
This PR adds a minimal document indexer abstraction. The goal of this abstraction is to allow developers to create custom retrievers that also have a standard indexing API and allow updating the document content in them. The abstraction comes with a test suite that can verify that the indexer implements the correct semantics. This is an iteration on a previous PR (https://github.com/langchain-ai/langchain/pull/24364). The main difference is that we're sub-classing from BaseRetriever in this iteration and so have consolidated the sync and async interfaces. The main problem with the current design is that run-time search configuration has to be specified at init rather than provided at run time. We will likely resolve this issue in one of two ways: (1) Define a method (`get_retriever`) that will allow creating a retriever at run time with a specific configuration. If we do this, we will likely break the subclass on BaseRetriever. (2) Generalize base retriever so it can support structured queries. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
e7b95e0802
commit
41dfad5104
@ -7,6 +7,7 @@ if it's unchanged.
|
||||
|
||||
from langchain_core.indexing.api import IndexingResult, aindex, index
|
||||
from langchain_core.indexing.base import (
|
||||
DocumentIndex,
|
||||
InMemoryRecordManager,
|
||||
RecordManager,
|
||||
UpsertResponse,
|
||||
@ -14,6 +15,7 @@ from langchain_core.indexing.base import (
|
||||
|
||||
__all__ = [
|
||||
"aindex",
|
||||
"DocumentIndex",
|
||||
"index",
|
||||
"IndexingResult",
|
||||
"InMemoryRecordManager",
|
||||
|
@ -1,8 +1,14 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import abc
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, List, Optional, Sequence, TypedDict
|
||||
from typing import Any, Dict, List, Optional, Sequence, TypedDict
|
||||
|
||||
from langchain_core._api import beta
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.retrievers import BaseRetriever
|
||||
from langchain_core.runnables import run_in_executor
|
||||
|
||||
|
||||
class RecordManager(ABC):
|
||||
@ -447,3 +453,209 @@ class UpsertResponse(TypedDict):
|
||||
"""The IDs that were successfully indexed."""
|
||||
failed: List[str]
|
||||
"""The IDs that failed to index."""
|
||||
|
||||
|
||||
class DeleteResponse(TypedDict, total=False):
    """A generic response for a delete operation.

    The TypedDict is declared with ``total=False``, so every field is optional.
    Whether a given index implementation returns a particular field or not is
    up to that implementation; callers should check for a key before reading it.
    """

    num_deleted: int
    """The number of items that were successfully deleted.

    If returned, this should only include *actual* deletions.

    If the ID did not exist to begin with,
    it should not be included in this count.
    """

    succeeded: Sequence[str]
    """The IDs that were successfully deleted.

    If returned, this should only include *actual* deletions.

    If the ID did not exist to begin with,
    it should not be included in this list.
    """

    failed: Sequence[str]
    """The IDs that failed to be deleted.

    Please note that deleting an ID that
    does not exist is **NOT** considered a failure.
    """

    num_failed: int
    """The number of items that failed to be deleted."""
|
||||
|
||||
|
||||
@beta(message="Added in 0.2.29. The abstraction is subject to change.")
class DocumentIndex(BaseRetriever):
    """A document retriever that supports indexing operations.

    This indexing interface is designed to be a generic abstraction for storing and
    querying documents that have an ID and metadata associated with them.

    The interface is designed to be agnostic to the underlying implementation of the
    indexing system.

    The interface is designed to support the following operations:

    1. Storing documents in the index (``upsert`` / ``aupsert``).
    2. Fetching documents by ID (``get`` / ``aget``).
    3. Deleting documents by ID (``delete`` / ``adelete``).
    4. Searching for documents using a query, through the retriever interface
       of the ``BaseRetriever`` base class.

    Subclasses must implement the abstract ``upsert``, ``delete`` and ``get``
    methods. The async variants defined here delegate to the corresponding
    synchronous method through ``run_in_executor``.

    .. versionadded:: 0.2.29
    """

    @abc.abstractmethod
    def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
        """Upsert documents into the index.

        The upsert functionality should utilize the ID field of the document
        if it is provided. If the ID is not provided, the upsert method is free
        to generate an ID for the document.

        When an ID is specified and the document already exists in the index,
        the upsert method should update the document with the new data. If the
        document does not exist, the upsert method should add it to the index.

        Args:
            items: Sequence of documents to add to the index (positional-only).
            **kwargs: Additional keyword arguments.

        Returns:
            UpsertResponse: A response object that contains the list of IDs that were
            successfully added or updated in the index and the list of IDs that
            failed to be added or updated.
        """

    async def aupsert(
        self, items: Sequence[Document], /, **kwargs: Any
    ) -> UpsertResponse:
        """Add or update documents in the index. Async version of upsert.

        This default implementation delegates to the synchronous ``upsert``
        through ``run_in_executor``.

        The upsert functionality should utilize the ID field of the item
        if it is provided. If the ID is not provided, the upsert method is free
        to generate an ID for the item.

        When an ID is specified and the item already exists in the index,
        the upsert method should update the item with the new data. If the item
        does not exist, the upsert method should add the item to the index.

        Args:
            items: Sequence of documents to add to the index (positional-only).
            **kwargs: Additional keyword arguments, forwarded to ``upsert``.

        Returns:
            UpsertResponse: A response object that contains the list of IDs that were
            successfully added or updated in the index and the list of IDs that
            failed to be added or updated.
        """
        return await run_in_executor(
            None,
            self.upsert,
            items,
            **kwargs,
        )

    @abc.abstractmethod
    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> DeleteResponse:
        """Delete by IDs or other criteria.

        Calling delete without any input parameters should raise a ValueError!

        Args:
            ids: List of ids to delete.
            **kwargs: Additional keyword arguments. This is up to the implementation.
                For example, can include an option to delete the entire index,
                or else issue a non-blocking delete etc.

        Returns:
            DeleteResponse: A response object that contains the list of IDs that were
            successfully deleted and the list of IDs that failed to be deleted.
        """

    async def adelete(
        self, ids: Optional[List[str]] = None, **kwargs: Any
    ) -> DeleteResponse:
        """Delete by IDs or other criteria. Async variant.

        This default implementation delegates to the synchronous ``delete``
        through ``run_in_executor``.

        Calling adelete without any input parameters should raise a ValueError!

        Args:
            ids: List of ids to delete.
            **kwargs: Additional keyword arguments. This is up to the implementation.
                For example, can include an option to delete the entire index.

        Returns:
            DeleteResponse: A response object that contains the list of IDs that were
            successfully deleted and the list of IDs that failed to be deleted.
        """
        return await run_in_executor(
            None,
            self.delete,
            ids,
            **kwargs,
        )

    @abc.abstractmethod
    def get(
        self,
        ids: Sequence[str],
        /,
        **kwargs: Any,
    ) -> List[Document]:
        """Get documents by id.

        Fewer documents may be returned than requested if some IDs are not found or
        if there are duplicated IDs.

        Users should not assume that the order of the returned documents matches
        the order of the input IDs. Instead, users should rely on the ID field of the
        returned documents.

        This method should **NOT** raise exceptions if no documents are found for
        some IDs.

        Args:
            ids: List of IDs to get (positional-only).
            **kwargs: Additional keyword arguments. These are up to the
                implementation.

        Returns:
            List[Document]: List of documents that were found.
        """

    async def aget(
        self,
        ids: Sequence[str],
        /,
        **kwargs: Any,
    ) -> List[Document]:
        """Get documents by id. Async variant of get.

        This default implementation delegates to the synchronous ``get``
        through ``run_in_executor``.

        Fewer documents may be returned than requested if some IDs are not found or
        if there are duplicated IDs.

        Users should not assume that the order of the returned documents matches
        the order of the input IDs. Instead, users should rely on the ID field of the
        returned documents.

        This method should **NOT** raise exceptions if no documents are found for
        some IDs.

        Args:
            ids: List of IDs to get (positional-only).
            **kwargs: Additional keyword arguments. These are up to the
                implementation.

        Returns:
            List[Document]: List of documents that were found.
        """
        return await run_in_executor(
            None,
            self.get,
            ids,
            **kwargs,
        )
|
||||
|
81
libs/core/langchain_core/indexing/in_memory.py
Normal file
81
libs/core/langchain_core/indexing/in_memory.py
Normal file
@ -0,0 +1,81 @@
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional, Sequence, cast
|
||||
|
||||
from langchain_core._api import beta
|
||||
from langchain_core.callbacks import CallbackManagerForRetrieverRun
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.indexing import UpsertResponse
|
||||
from langchain_core.indexing.base import DeleteResponse, DocumentIndex
|
||||
from langchain_core.pydantic_v1 import Field
|
||||
|
||||
|
||||
@beta(message="Introduced in version 0.2.29. Underlying abstraction subject to change.")
class InMemoryDocumentIndex(DocumentIndex):
    """Document index backed by a plain in-process dictionary.

    Documents are kept in ``store``, keyed by their ID. Retrieval ranks the
    stored documents by how many times the query string occurs in their
    ``page_content`` and returns at most ``top_k`` of them.

    .. versionadded:: 0.2.29
    """

    store: Dict[str, Document] = Field(default_factory=dict)
    top_k: int = 4

    def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
        """Insert or overwrite documents, generating IDs where they are missing.

        Args:
            items: Documents to store. A document without an ID is copied and
                the copy is given a fresh UUID4 string as its ID; a document
                that already has an ID is stored as-is under that ID.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            UpsertResponse listing every stored ID under ``succeeded`` and an
            empty ``failed`` list.
        """
        stored_ids: List[str] = []

        for document in items:
            if document.id is not None:
                # Caller supplied the ID: keep the document object itself.
                key = document.id
                entry = document
            else:
                # No ID: store a copy so the caller's object is left untouched.
                key = str(uuid.uuid4())
                entry = document.copy()
                entry.id = key

            self.store[key] = entry
            stored_ids.append(cast(str, entry.id))

        return UpsertResponse(succeeded=stored_ids, failed=[])

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> DeleteResponse:
        """Remove the documents with the given IDs from the store.

        Args:
            ids: IDs to delete. IDs that are not in the store are skipped.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            DeleteResponse reporting the IDs that were actually removed, their
            count, and zero failures.

        Raises:
            ValueError: If ``ids`` is None.
        """
        if ids is None:
            raise ValueError("IDs must be provided for deletion")

        removed: List[str] = []

        for key in ids:
            if key not in self.store:
                continue
            del self.store[key]
            removed.append(key)

        return DeleteResponse(
            succeeded=removed, num_deleted=len(removed), num_failed=0, failed=[]
        )

    def get(self, ids: Sequence[str], /, **kwargs: Any) -> List[Document]:
        """Return the stored documents for the given IDs, skipping unknown IDs.

        Results follow the order of ``ids``; a repeated ID yields its document
        once per occurrence.
        """
        return [self.store[key] for key in ids if key in self.store]

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Rank stored documents by occurrences of ``query`` in their content.

        Returns copies of the ``top_k`` highest-scoring documents; ties keep
        the store's iteration order because the sort is stable.
        """
        scored = [
            (document, document.page_content.count(query))
            for document in self.store.values()
        ]
        ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
        return [document.copy() for document, _ in ranked[: self.top_k]]
|
@ -60,7 +60,7 @@ if TYPE_CHECKING:
|
||||
CallbackManagerForRetrieverRun,
|
||||
)
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.indexing.base import UpsertResponse
|
||||
from langchain_core.indexing import UpsertResponse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -0,0 +1,50 @@
|
||||
"""Test in memory indexer"""
|
||||
|
||||
from typing import AsyncGenerator, Generator
|
||||
|
||||
import pytest
|
||||
from langchain_standard_tests.integration_tests.indexer import (
|
||||
AsyncDocumentIndexTestSuite,
|
||||
DocumentIndexerTestSuite,
|
||||
)
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.indexing.base import DocumentIndex
|
||||
from langchain_core.indexing.in_memory import (
|
||||
InMemoryDocumentIndex,
|
||||
)
|
||||
|
||||
|
||||
class TestDocumentIndexerTestSuite(DocumentIndexerTestSuite):
    """Run the standard sync document-index tests against InMemoryDocumentIndex."""

    @pytest.fixture()
    def index(self) -> Generator[DocumentIndex, None, None]:
        """Yield a newly constructed InMemoryDocumentIndex for each test."""
        yield InMemoryDocumentIndex()
|
||||
|
||||
|
||||
class TestAsyncDocumentIndexerTestSuite(AsyncDocumentIndexTestSuite):
    """Run the standard async document-index tests against InMemoryDocumentIndex."""

    # Something funky is going on with mypy and async pytest fixture
    @pytest.fixture()
    async def index(self) -> AsyncGenerator[DocumentIndex, None]:  # type: ignore
        """Yield a newly constructed InMemoryDocumentIndex for each test."""
        yield InMemoryDocumentIndex()
|
||||
|
||||
|
||||
def test_sync_retriever() -> None:
    """Retrieval ranks the document containing the query ahead of the other."""
    hello_doc = Document(id="1", page_content="hello world")
    cat_doc = Document(id="2", page_content="goodbye cat")

    index = InMemoryDocumentIndex()
    index.upsert([hello_doc, cat_doc])

    # Both documents fit within top_k, so only their relative order changes.
    hello_hits = index.invoke("hello")
    assert hello_hits == [hello_doc, cat_doc]

    cat_hits = index.invoke("cat")
    assert cat_hits == [cat_doc, hello_doc]
|
||||
|
||||
|
||||
async def test_async_retriever() -> None:
    """Async retrieval ranks the document containing the query first."""
    hello_doc = Document(id="1", page_content="hello world")
    cat_doc = Document(id="2", page_content="goodbye cat")

    index = InMemoryDocumentIndex()
    await index.aupsert([hello_doc, cat_doc])

    # Both documents fit within top_k, so only their relative order changes.
    hello_hits = await index.ainvoke("hello")
    assert hello_hits == [hello_doc, cat_doc]

    cat_hits = await index.ainvoke("cat")
    assert cat_hits == [cat_doc, hello_doc]
|
@ -4,11 +4,12 @@ from langchain_core.indexing import __all__
|
||||
def test_all() -> None:
|
||||
"""Use to catch obvious breaking changes."""
|
||||
assert __all__ == sorted(__all__, key=str.lower)
|
||||
assert __all__ == [
|
||||
assert set(__all__) == {
|
||||
"aindex",
|
||||
"DocumentIndex",
|
||||
"index",
|
||||
"IndexingResult",
|
||||
"InMemoryRecordManager",
|
||||
"RecordManager",
|
||||
"UpsertResponse",
|
||||
]
|
||||
}
|
||||
|
@ -7,7 +7,7 @@ from typing_extensions import TypedDict
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.indexing.base import UpsertResponse
|
||||
from langchain_core.indexing import UpsertResponse
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
|
||||
|
||||
|
@ -0,0 +1,392 @@
|
||||
"""Test suite to check index implementations."""
|
||||
|
||||
import inspect
|
||||
import uuid
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import AsyncGenerator, Generator
|
||||
|
||||
import pytest
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.indexing.base import DocumentIndex
|
||||
|
||||
|
||||
class DocumentIndexerTestSuite(ABC):
    """Test suite for checking the read-write of a document index.

    Implementers should subclass this test suite and provide a fixture
    that returns an empty index for each test.

    The tests exercise the synchronous ``upsert``, ``get`` and ``delete``
    methods of the index.
    """

    @abstractmethod
    @pytest.fixture
    def index(self) -> Generator[DocumentIndex, None, None]:
        """Get the index."""

    def test_upsert_documents_has_no_ids(self, index: DocumentIndex) -> None:
        """Verify that there is no parameter called ids in upsert."""
        signature = inspect.signature(index.upsert)
        assert "ids" not in signature.parameters

    def test_upsert_no_ids(self, index: DocumentIndex) -> None:
        """Upsert works with documents that do not have IDs.

        At the moment, the ID field in documents is optional.
        """
        documents = [
            Document(page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
        response = index.upsert(documents)
        ids = sorted(response["succeeded"])

        # Ordering is not guaranteed, need to test carefully
        documents = index.get(ids)
        sorted_documents = sorted(documents, key=lambda x: x.id)  # type: ignore

        # The IDs are generated by the index, so which document received the
        # smaller ID is not known in advance; accept either assignment.
        if sorted_documents[0].page_content == "bar":
            assert sorted_documents[0] == Document(
                page_content="bar", metadata={"id": 2}, id=ids[0]
            )
            assert sorted_documents[1] == Document(
                page_content="foo", metadata={"id": 1}, id=ids[1]
            )
        else:
            assert sorted_documents[0] == Document(
                page_content="foo", metadata={"id": 1}, id=ids[0]
            )
            assert sorted_documents[1] == Document(
                page_content="bar", metadata={"id": 2}, id=ids[1]
            )

    def test_upsert_some_ids(self, index: DocumentIndex) -> None:
        """Test an upsert where some docs have ids and some don't."""
        foo_uuid = str(uuid.UUID(int=7))
        documents = [
            Document(id=foo_uuid, page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
        response = index.upsert(documents)
        ids = response["succeeded"]
        # The ID the index generated for the document that had none.
        other_id = list(set(ids) - {foo_uuid})[0]
        assert response["failed"] == []
        assert foo_uuid in ids
        # Ordering is not guaranteed, so accept the two documents in either order.
        documents = index.get(ids)
        first_doc = documents[0]
        if first_doc.id == foo_uuid:
            assert documents == [
                Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
                Document(page_content="bar", metadata={"id": 2}, id=other_id),
            ]
        else:
            assert documents == [
                Document(page_content="bar", metadata={"id": 2}, id=other_id),
                Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
            ]

    def test_upsert_overwrites(self, index: DocumentIndex) -> None:
        """Test that upsert overwrites existing content."""
        foo_uuid = str(uuid.UUID(int=7))
        documents = [
            Document(id=foo_uuid, page_content="foo", metadata={"bar": 1}),
        ]
        response = index.upsert(documents)
        ids = response["succeeded"]
        assert response["failed"] == []

        assert index.get(ids) == [
            Document(page_content="foo", metadata={"bar": 1}, id=foo_uuid),
        ]

        # Now let's overwrite foo
        index.upsert([Document(id=foo_uuid, page_content="foo2", metadata={"meow": 2})])
        documents = index.get([foo_uuid])
        assert documents == [
            Document(page_content="foo2", metadata={"meow": 2}, id=foo_uuid)
        ]

    def test_delete_missing_docs(self, index: DocumentIndex) -> None:
        """Verify that we can delete docs that aren't there."""
        assert index.get(["1"]) == []  # Should be empty.

        # Every DeleteResponse field is optional, so only check the keys that
        # the implementation chose to return.
        delete_response = index.delete(["1"])
        if "num_deleted" in delete_response:
            assert delete_response["num_deleted"] == 0

        if "num_failed" in delete_response:
            # Deleting a missing ID is **not** a failure!!
            assert delete_response["num_failed"] == 0

        if "succeeded" in delete_response:
            # There was nothing to delete!
            assert delete_response["succeeded"] == []

        if "failed" in delete_response:
            # Nothing should have failed
            assert delete_response["failed"] == []

    def test_delete_semantics(self, index: DocumentIndex) -> None:
        """Test deletion of content has appropriate semantics."""
        # Let's index a document first.
        foo_uuid = str(uuid.UUID(int=7))
        upsert_response = index.upsert(
            [Document(id=foo_uuid, page_content="foo", metadata={})]
        )
        assert upsert_response == {"succeeded": [foo_uuid], "failed": []}

        # Delete one ID that does not exist together with the one that does.
        delete_response = index.delete(["missing_id", foo_uuid])

        if "num_deleted" in delete_response:
            assert delete_response["num_deleted"] == 1

        if "num_failed" in delete_response:
            # Deleting a missing ID is **not** a failure!!
            assert delete_response["num_failed"] == 0

        if "succeeded" in delete_response:
            # Only the ID that actually existed counts as deleted.
            assert delete_response["succeeded"] == [foo_uuid]

        if "failed" in delete_response:
            # Nothing should have failed
            assert delete_response["failed"] == []

    def test_bulk_delete(self, index: DocumentIndex) -> None:
        """Test that we can delete several documents at once."""
        documents = [
            Document(id="1", page_content="foo", metadata={"id": 1}),
            Document(id="2", page_content="bar", metadata={"id": 2}),
            Document(id="3", page_content="baz", metadata={"id": 3}),
        ]

        index.upsert(documents)
        index.delete(["1", "2"])
        assert index.get(["1", "2", "3"]) == [
            Document(page_content="baz", metadata={"id": 3}, id="3")
        ]

    def test_delete_no_args(self, index: DocumentIndex) -> None:
        """Test delete with no args raises ValueError."""
        with pytest.raises(ValueError):
            index.delete()

    def test_delete_missing_content(self, index: DocumentIndex) -> None:
        """Deleting missing content should not raise an exception."""
        index.delete(["1"])
        index.delete(["1", "2", "3"])

    def test_get_with_missing_ids(self, index: DocumentIndex) -> None:
        """Test get with missing IDs."""
        documents = [
            Document(id="1", page_content="foo", metadata={"id": 1}),
            Document(id="2", page_content="bar", metadata={"id": 2}),
        ]
        upsert_response = index.upsert(documents)
        assert upsert_response == {
            "succeeded": ["1", "2"],
            "failed": [],
        }
        retrieved_documents = index.get(["1", "2", "3", "4"])
        # The ordering is not guaranteed, so sort by ID before comparing.
        assert sorted(retrieved_documents, key=lambda x: x.id) == [  # type: ignore
            Document(page_content="foo", metadata={"id": 1}, id="1"),
            Document(page_content="bar", metadata={"id": 2}, id="2"),
        ]

    def test_get_missing(self, index: DocumentIndex) -> None:
        """Test get by IDs with missing IDs."""
        # This should not raise an exception
        documents = index.get(["1", "2", "3"])
        assert documents == []
|
||||
|
||||
|
||||
class AsyncDocumentIndexTestSuite(ABC):
    """Test suite for checking the read-write of a document index.

    Implementers should subclass this test suite and provide a fixture
    that returns an empty index for each test.

    The tests exercise the asynchronous ``aupsert``, ``aget`` and ``adelete``
    methods of the index.
    """

    @abstractmethod
    @pytest.fixture
    async def index(self) -> AsyncGenerator[DocumentIndex, None]:
        """Get the index."""

    async def test_upsert_documents_has_no_ids(self, index: DocumentIndex) -> None:
        """Verify that there is no parameter called ids in upsert."""
        # NOTE(review): this inspects the sync `upsert` signature, not `aupsert`
        # -- confirm that this is intended for the async suite.
        signature = inspect.signature(index.upsert)
        assert "ids" not in signature.parameters

    async def test_upsert_no_ids(self, index: DocumentIndex) -> None:
        """Upsert works with documents that do not have IDs.

        At the moment, the ID field in documents is optional.
        """
        documents = [
            Document(page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
        response = await index.aupsert(documents)
        ids = sorted(response["succeeded"])

        # Ordering is not guaranteed, need to test carefully
        documents = await index.aget(ids)
        sorted_documents = sorted(documents, key=lambda x: x.id)  # type: ignore

        # The IDs are generated by the index, so which document received the
        # smaller ID is not known in advance; accept either assignment.
        if sorted_documents[0].page_content == "bar":
            assert sorted_documents[0] == Document(
                page_content="bar", metadata={"id": 2}, id=ids[0]
            )
            assert sorted_documents[1] == Document(
                page_content="foo", metadata={"id": 1}, id=ids[1]
            )
        else:
            assert sorted_documents[0] == Document(
                page_content="foo", metadata={"id": 1}, id=ids[0]
            )
            assert sorted_documents[1] == Document(
                page_content="bar", metadata={"id": 2}, id=ids[1]
            )

    async def test_upsert_some_ids(self, index: DocumentIndex) -> None:
        """Test an upsert where some docs have ids and some don't."""
        foo_uuid = str(uuid.UUID(int=7))
        documents = [
            Document(id=foo_uuid, page_content="foo", metadata={"id": 1}),
            Document(page_content="bar", metadata={"id": 2}),
        ]
        response = await index.aupsert(documents)
        ids = response["succeeded"]
        # The ID the index generated for the document that had none.
        other_id = list(set(ids) - {foo_uuid})[0]
        assert response["failed"] == []
        assert foo_uuid in ids
        # Ordering is not guaranteed, so accept the two documents in either order.
        documents = await index.aget(ids)
        first_doc = documents[0]
        if first_doc.id == foo_uuid:
            assert documents == [
                Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
                Document(page_content="bar", metadata={"id": 2}, id=other_id),
            ]
        else:
            assert documents == [
                Document(page_content="bar", metadata={"id": 2}, id=other_id),
                Document(page_content="foo", metadata={"id": 1}, id=foo_uuid),
            ]

    async def test_upsert_overwrites(self, index: DocumentIndex) -> None:
        """Test that upsert overwrites existing content."""
        foo_uuid = str(uuid.UUID(int=7))
        documents = [
            Document(id=foo_uuid, page_content="foo", metadata={"bar": 1}),
        ]
        response = await index.aupsert(documents)
        ids = response["succeeded"]
        assert response["failed"] == []

        assert await index.aget(ids) == [
            Document(page_content="foo", metadata={"bar": 1}, id=foo_uuid),
        ]

        # Now let's overwrite foo
        await index.aupsert(
            [Document(id=foo_uuid, page_content="foo2", metadata={"meow": 2})]
        )
        documents = await index.aget([foo_uuid])
        assert documents == [
            Document(page_content="foo2", metadata={"meow": 2}, id=foo_uuid)
        ]

    async def test_delete_missing_docs(self, index: DocumentIndex) -> None:
        """Verify that we can delete docs that aren't there."""
        assert await index.aget(["1"]) == []  # Should be empty.

        # Every DeleteResponse field is optional, so only check the keys that
        # the implementation chose to return.
        delete_response = await index.adelete(["1"])
        if "num_deleted" in delete_response:
            assert delete_response["num_deleted"] == 0

        if "num_failed" in delete_response:
            # Deleting a missing ID is **not** a failure!!
            assert delete_response["num_failed"] == 0

        if "succeeded" in delete_response:
            # There was nothing to delete!
            assert delete_response["succeeded"] == []

        if "failed" in delete_response:
            # Nothing should have failed
            assert delete_response["failed"] == []

    async def test_delete_semantics(self, index: DocumentIndex) -> None:
        """Test deletion of content has appropriate semantics."""
        # Let's index a document first.
        foo_uuid = str(uuid.UUID(int=7))
        upsert_response = await index.aupsert(
            [Document(id=foo_uuid, page_content="foo", metadata={})]
        )
        assert upsert_response == {"succeeded": [foo_uuid], "failed": []}

        # Delete one ID that does not exist together with the one that does.
        delete_response = await index.adelete(["missing_id", foo_uuid])

        if "num_deleted" in delete_response:
            assert delete_response["num_deleted"] == 1

        if "num_failed" in delete_response:
            # Deleting a missing ID is **not** a failure!!
            assert delete_response["num_failed"] == 0

        if "succeeded" in delete_response:
            # Only the ID that actually existed counts as deleted.
            assert delete_response["succeeded"] == [foo_uuid]

        if "failed" in delete_response:
            # Nothing should have failed
            assert delete_response["failed"] == []

    async def test_bulk_delete(self, index: DocumentIndex) -> None:
        """Test that we can delete several documents at once."""
        documents = [
            Document(id="1", page_content="foo", metadata={"id": 1}),
            Document(id="2", page_content="bar", metadata={"id": 2}),
            Document(id="3", page_content="baz", metadata={"id": 3}),
        ]

        await index.aupsert(documents)
        await index.adelete(["1", "2"])
        assert await index.aget(["1", "2", "3"]) == [
            Document(page_content="baz", metadata={"id": 3}, id="3")
        ]

    async def test_delete_no_args(self, index: DocumentIndex) -> None:
        """Test delete with no args raises ValueError."""
        with pytest.raises(ValueError):
            await index.adelete()

    async def test_delete_missing_content(self, index: DocumentIndex) -> None:
        """Deleting missing content should not raise an exception."""
        await index.adelete(["1"])
        await index.adelete(["1", "2", "3"])

    async def test_get_with_missing_ids(self, index: DocumentIndex) -> None:
        """Test get with missing IDs."""
        documents = [
            Document(id="1", page_content="foo", metadata={"id": 1}),
            Document(id="2", page_content="bar", metadata={"id": 2}),
        ]
        upsert_response = await index.aupsert(documents)
        assert upsert_response == {
            "succeeded": ["1", "2"],
            "failed": [],
        }
        retrieved_documents = await index.aget(["1", "2", "3", "4"])
        # The ordering is not guaranteed, so sort by ID before comparing.
        assert sorted(retrieved_documents, key=lambda x: x.id) == [  # type: ignore
            Document(page_content="foo", metadata={"id": 1}, id="1"),
            Document(page_content="bar", metadata={"id": 2}, id="2"),
        ]

    async def test_get_missing(self, index: DocumentIndex) -> None:
        """Test get by IDs with missing IDs."""
        # This should not raise an exception
        documents = await index.aget(["1", "2", "3"])
        assert documents == []
|
Loading…
Reference in New Issue
Block a user