From 4f1821db3e6019dfbe144375a68c08e6b4f6b7e4 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Mon, 1 Jul 2024 13:04:33 -0400 Subject: [PATCH] core[minor]: Add get_by_ids to vectorstore interface (#23594) This PR adds a part of the indexing API proposed in this RFC https://github.com/langchain-ai/langchain/pull/23544/files. It allows rolling out `get_by_ids` which should be uncontroversial to existing vectorstores without introducing new abstractions. The semantics for this method depend on the ability to identify returned documents using the new optional ID field on documents: https://github.com/langchain-ai/langchain/pull/23411 Alternatives are: 1. Relax the sequence requirement ```python def get_by_ids(self, ids: Iterable[str], /) -> Iterable[Document]: ``` Rejected: - implementations are more likely to start batching with bad defaults - users would need to call list() or we'd need to introduce another convenience method 2. Support more kwargs ```python def get_by_ids(self, ids: Sequence[str], /, **kwargs) -> List[Document]: ... ``` Rejected: - No need for `batch` parameter since IDs is a sequence - Output cannot be customized since `Document` is fixed. (e.g., parameters could be useful to grab extra metadata like the vector that was indexed with the Document or to project a part of the document) --- libs/core/langchain_core/vectorstores.py | 52 ++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/libs/core/langchain_core/vectorstores.py b/libs/core/langchain_core/vectorstores.py index 9098538590..d846500b5b 100644 --- a/libs/core/langchain_core/vectorstores.py +++ b/libs/core/langchain_core/vectorstores.py @@ -34,6 +34,7 @@ from typing import ( Iterable, List, Optional, + Sequence, Tuple, Type, TypeVar, @@ -99,6 +100,57 @@ class VectorStore(ABC): raise NotImplementedError("delete method must be implemented by subclass.") + def get_by_ids(self, ids: Sequence[str], /) -> List[Document]: + """Get documents by their IDs. 
+ + The returned documents are expected to have the ID field set to the ID of the + document in the vector store. + + Fewer documents may be returned than requested if some IDs are not found or + if there are duplicated IDs. + + Users should not assume that the order of the returned documents matches + the order of the input IDs. Instead, users should rely on the ID field of the + returned documents. + + This method should **NOT** raise exceptions if no documents are found for + some IDs. + + Args: + ids: List of ids to retrieve. + + Returns: + List of Documents. + """ + raise NotImplementedError( + f"{self.__class__.__name__} does not yet support get_by_ids." + ) + + # Implementations should override this method to provide an async native version. + async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]: + """Get documents by their IDs. + + The returned documents are expected to have the ID field set to the ID of the + document in the vector store. + + Fewer documents may be returned than requested if some IDs are not found or + if there are duplicated IDs. + + Users should not assume that the order of the returned documents matches + the order of the input IDs. Instead, users should rely on the ID field of the + returned documents. + + This method should **NOT** raise exceptions if no documents are found for + some IDs. + + Args: + ids: List of ids to retrieve. + + Returns: + List of Documents. + """ + return await run_in_executor(None, self.get_by_ids, ids) + async def adelete( self, ids: Optional[List[str]] = None, **kwargs: Any ) -> Optional[bool]: