core[minor]: Add get_by_ids to vectorstore interface (#23594)

This PR adds a part of the indexing API proposed in this RFC https://github.com/langchain-ai/langchain/pull/23544/files. It allows rolling out `get_by_ids`, which should be uncontroversial, to existing vectorstores without introducing new abstractions. The semantics for this method depend on the ability to identify returned documents using the new optional ID field on documents: https://github.com/langchain-ai/langchain/pull/23411 Alternatives are: 1. Relax the sequence requirement ```python def get_by_ids(self, ids: Iterable[str], /) -> Iterable[Document]: ``` Rejected: - implementations are more likely to start batching with bad defaults - users would need to call list() or we'd need to introduce another convenience method 2. Support more kwargs ```python def get_by_ids(self, ids: Sequence[str], /, **kwargs) -> List[Document]: ... ``` Rejected: - No need for `batch` parameter since IDs is a sequence - Output cannot be customized since `Document` is fixed. (e.g., parameters could be useful to grab extra metadata like the vector that was indexed with the Document or to project a part of the document)
2024-11-10 01:10:59 +00:00 · 2024-07-01 13:04:33 -04:00 · 2024-07-01 13:04:33 -04:00 · 4f1821db3e
commit 4f1821db3e
parent bf402f902e
1 changed files with 52 additions and 0 deletions
--- a/libs/core/langchain_core/vectorstores.py
+++ b/libs/core/langchain_core/vectorstores.py
@ -34,6 +34,7 @@ from typing import (
    Iterable,
    List,
    Optional,
    Sequence,
    Tuple,
    Type,
    TypeVar,
@ -99,6 +100,57 @@ class VectorStore(ABC):
        raise NotImplementedError("delete method must be implemented by subclass.")
    def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
        """Look up documents in the vector store by ID.

        Contract for implementations and callers:

        * Each returned document is expected to carry, in its ID field, the
          ID under which it is stored in the vector store.
        * The result may contain fewer documents than IDs requested, either
          because some IDs were not found or because the input contained
          duplicated IDs.
        * The order of the result is not guaranteed to follow the order of
          ``ids``; callers should match documents through their ID field.
        * Missing IDs must **NOT** cause an exception to be raised.

        Args:
            ids: The IDs of the documents to fetch (positional-only).

        Returns:
            The documents that were found.

        Raises:
            NotImplementedError: Always, in this base implementation;
                subclasses that support lookup by ID override this method.
        """
        # Name the concrete subclass so the caller can tell which store lacks
        # support.
        message = f"{self.__class__.__name__} does not yet support get_by_ids."
        raise NotImplementedError(message)
    # Implementations should override this method to provide an async native version.
    async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]:
        """Asynchronously look up documents in the vector store by ID.

        This default implementation hands the synchronous ``get_by_ids`` to
        ``run_in_executor`` and awaits the result, so it follows the same
        contract:

        * Each returned document is expected to carry, in its ID field, the
          ID under which it is stored in the vector store.
        * The result may contain fewer documents than IDs requested, either
          because some IDs were not found or because the input contained
          duplicated IDs.
        * The order of the result is not guaranteed to follow the order of
          ``ids``; callers should match documents through their ID field.
        * Missing IDs must **NOT** cause an exception to be raised.

        Args:
            ids: The IDs of the documents to fetch (positional-only).

        Returns:
            The documents that were found.
        """
        # The bound sync method is passed uncalled; run_in_executor invokes it
        # with ``ids``.
        sync_lookup = self.get_by_ids
        documents = await run_in_executor(None, sync_lookup, ids)
        return documents
    async def adelete(
        self, ids: Optional[List[str]] = None, **kwargs: Any
    ) -> Optional[bool]: