core[patch]: Update documentation for base retriever (#20345)

Updating in code documentation for base retriever to direct folks toward
the .invoke and .ainvoke methods + explain how to implement
This commit is contained in:
Eugene Yurtsev 2024-04-11 16:20:14 -04:00 committed by GitHub
parent d2f4153fe6
commit 2900720cd3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -51,12 +51,48 @@ RetrieverOutputLike = Runnable[Any, RetrieverOutput]
class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
"""Abstract base class for a Document retrieval system.
A retrieval system is defined as something that can take string queries and return
the most 'relevant' Documents from some source.
Example:
A retrieval system is defined as something that can take string queries and return
the most 'relevant' Documents from some source.
Usage:
A retriever follows the standard Runnable interface, and should be used
via the standard runnable methods of `invoke`, `ainvoke`, `batch`, `abatch`.
Implementation:
When implementing a custom retriever, the class should implement
the `_get_relevant_documents` method to define the logic for retrieving documents.
Optionally, an async native implementations can be provided by overriding the
`_aget_relevant_documents` method.
Example: A retriever that returns the first 5 documents from a list of documents
.. code-block:: python
from langchain_core import Document, BaseRetriever
from typing import List
class SimpleRetriever(BaseRetriever):
docs: List[Document]
k: int = 5
def _get_relevant_documents(self, query: str) -> List[Document]:
\"\"\"Return the first k documents from the list of documents\"\"\"
return self.docs[:self.k]
async def _aget_relevant_documents(self, query: str) -> List[Document]:
\"\"\"(Optional) async native implementation.\"\"\"
return self.docs[:self.k]
Example: A simple retriever based on a scitkit learn vectorizer
.. code-block:: python
from sklearn.metrics.pairwise import cosine_similarity
class TFIDFRetriever(BaseRetriever, BaseModel):
vectorizer: Any
docs: List[Document]
@ -66,9 +102,7 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
class Config:
arbitrary_types_allowed = True
def get_relevant_documents(self, query: str) -> List[Document]:
from sklearn.metrics.pairwise import cosine_similarity
def _get_relevant_documents(self, query: str) -> List[Document]:
# Ip -- (n_docs,x), Op -- (n_docs,n_Feats)
query_vec = self.vectorizer.transform([query])
# Op -- (n_docs,1) -- Cosine Sim with each doc
@ -137,6 +171,24 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
def invoke(
self, input: str, config: Optional[RunnableConfig] = None, **kwargs: Any
) -> List[Document]:
"""Invoke the retriever to get relevant documents.
Main entry point for synchronous retriever invocations.
Args:
input: The query string
config: Configuration for the retriever
**kwargs: Additional arguments to pass to the retriever
Returns:
List of relevant documents
Examples:
.. code-block:: python
retriever.invoke("query")
"""
config = ensure_config(config)
return self.get_relevant_documents(
input,
@ -153,6 +205,24 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
config: Optional[RunnableConfig] = None,
**kwargs: Any,
) -> List[Document]:
"""Asynchronously invoke the retriever to get relevant documents.
Main entry point for asynchronous retriever invocations.
Args:
input: The query string
config: Configuration for the retriever
**kwargs: Additional arguments to pass to the retriever
Returns:
List of relevant documents
Examples:
.. code-block:: python
await retriever.ainvoke("query")
"""
config = ensure_config(config)
return await self.aget_relevant_documents(
input,
@ -203,6 +273,10 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
**kwargs: Any,
) -> List[Document]:
"""Retrieve documents relevant to a query.
Users should favor using `.invoke` or `.batch` rather than
`get_relevant_documents directly`.
Args:
query: string to find relevant documents for
callbacks: Callback manager or list of callbacks
@ -212,6 +286,8 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
metadata: Optional metadata associated with the retriever. Defaults to None
This metadata will be associated with each call to this retriever,
and passed as arguments to the handlers defined in `callbacks`.
run_name: Optional name for the run.
Returns:
List of relevant documents
"""
@ -260,6 +336,10 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
**kwargs: Any,
) -> List[Document]:
"""Asynchronously get documents relevant to a query.
Users should favor using `.ainvoke` or `.abatch` rather than
`aget_relevant_documents directly`.
Args:
query: string to find relevant documents for
callbacks: Callback manager or list of callbacks
@ -269,6 +349,8 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
metadata: Optional metadata associated with the retriever. Defaults to None
This metadata will be associated with each call to this retriever,
and passed as arguments to the handlers defined in `callbacks`.
run_name: Optional name for the run.
Returns:
List of relevant documents
"""