core[minor]: Add get_by_ids to vectorstore interface (#23594)

This PR adds a part of the indexing API proposed in this RFC
https://github.com/langchain-ai/langchain/pull/23544/files.

It allows rolling out `get_by_ids`, which should be uncontroversial, to
existing vectorstores without introducing new abstractions.

The semantics of this method depend on being able to identify
returned documents using the new optional ID field on documents:
https://github.com/langchain-ai/langchain/pull/23411

Alternatives are:

1. Relax the sequence requirement

```python
def get_by_ids(self, ids: Iterable[str], /) -> Iterable[Document]:
```

Rejected:
- implementations are more likely to start batching with bad defaults
- users would need to call list() or we'd need to introduce another
convenience method

2. Support more kwargs

```python

def get_by_ids(self, ids: Sequence[str], /, **kwargs) -> List[Document]:
...
```

Rejected: 
- No need for `batch` parameter since IDs is a sequence
- Output cannot be customized since `Document` is fixed. (e.g.,
parameters could be useful to grab extra metadata like the vector that
was indexed with the Document or to project a part of the document)
This commit is contained in:
Eugene Yurtsev 2024-07-01 13:04:33 -04:00 committed by GitHub
parent bf402f902e
commit 4f1821db3e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -34,6 +34,7 @@ from typing import (
Iterable, Iterable,
List, List,
Optional, Optional,
Sequence,
Tuple, Tuple,
Type, Type,
TypeVar, TypeVar,
@ -99,6 +100,57 @@ class VectorStore(ABC):
raise NotImplementedError("delete method must be implemented by subclass.") raise NotImplementedError("delete method must be implemented by subclass.")
def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
    """Look up documents by their IDs.

    This base implementation always raises ``NotImplementedError``; a vector
    store that supports lookup by ID is expected to override it.

    Contract for overriding implementations:

    * Each returned document should have its ID field set to the ID of the
      document in the vector store.
    * Fewer documents may be returned than were requested, either because
      some IDs were not found or because the input contained duplicated IDs.
    * Callers should not assume the returned documents are in the same order
      as the input IDs; they should match results up via the ID field of
      the returned documents instead.
    * The method should **NOT** raise an exception merely because no
      documents were found for some IDs.

    Args:
        ids: List of ids to retrieve.

    Returns:
        List of Documents.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    # Name the concrete class so the error points at the store in use.
    store_name = self.__class__.__name__
    raise NotImplementedError(f"{store_name} does not yet support get_by_ids.")
# Implementations should override this method to provide an async native version.
async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]:
    """Asynchronously look up documents by their IDs.

    This default hands the synchronous ``get_by_ids`` to ``run_in_executor``
    (called with ``None`` as its first argument) and awaits the result.

    Contract:

    * Each returned document is expected to have its ID field set to the ID
      of the document in the vector store.
    * Fewer documents may be returned than were requested, either because
      some IDs were not found or because the input contained duplicated IDs.
    * Callers should not assume the returned documents are in the same order
      as the input IDs; they should match results up via the ID field of
      the returned documents instead.
    * The method should **NOT** raise an exception merely because no
      documents were found for some IDs.

    Args:
        ids: List of ids to retrieve.

    Returns:
        List of Documents.
    """
    sync_lookup = self.get_by_ids
    return await run_in_executor(None, sync_lookup, ids)
async def adelete( async def adelete(
self, ids: Optional[List[str]] = None, **kwargs: Any self, ids: Optional[List[str]] = None, **kwargs: Any
) -> Optional[bool]: ) -> Optional[bool]: