|
|
|
@ -51,12 +51,48 @@ RetrieverOutputLike = Runnable[Any, RetrieverOutput]
|
|
|
|
|
class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
|
|
|
|
|
"""Abstract base class for a Document retrieval system.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
A retrieval system is defined as something that can take string queries and return
|
|
|
|
|
the most 'relevant' Documents from some source.
|
|
|
|
|
the most 'relevant' Documents from some source.
|
|
|
|
|
|
|
|
|
|
Usage:
|
|
|
|
|
|
|
|
|
|
A retriever follows the standard Runnable interface, and should be used
|
|
|
|
|
via the standard runnable methods of `invoke`, `ainvoke`, `batch`, `abatch`.
|
|
|
|
|
|
|
|
|
|
Implementation:
|
|
|
|
|
|
|
|
|
|
When implementing a custom retriever, the class should implement
|
|
|
|
|
the `_get_relevant_documents` method to define the logic for retrieving documents.
|
|
|
|
|
|
|
|
|
|
Optionally, an async native implementations can be provided by overriding the
|
|
|
|
|
`_aget_relevant_documents` method.
|
|
|
|
|
|
|
|
|
|
Example: A retriever that returns the first 5 documents from a list of documents
|
|
|
|
|
|
|
|
|
|
Example:
|
|
|
|
|
.. code-block:: python
|
|
|
|
|
|
|
|
|
|
from langchain_core import Document, BaseRetriever
|
|
|
|
|
from typing import List
|
|
|
|
|
|
|
|
|
|
class SimpleRetriever(BaseRetriever):
|
|
|
|
|
docs: List[Document]
|
|
|
|
|
k: int = 5
|
|
|
|
|
|
|
|
|
|
def _get_relevant_documents(self, query: str) -> List[Document]:
|
|
|
|
|
\"\"\"Return the first k documents from the list of documents\"\"\"
|
|
|
|
|
return self.docs[:self.k]
|
|
|
|
|
|
|
|
|
|
async def _aget_relevant_documents(self, query: str) -> List[Document]:
|
|
|
|
|
\"\"\"(Optional) async native implementation.\"\"\"
|
|
|
|
|
return self.docs[:self.k]
|
|
|
|
|
|
|
|
|
|
Example: A simple retriever based on a scitkit learn vectorizer
|
|
|
|
|
|
|
|
|
|
.. code-block:: python
|
|
|
|
|
|
|
|
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
|
|
|
|
|
|
class TFIDFRetriever(BaseRetriever, BaseModel):
|
|
|
|
|
vectorizer: Any
|
|
|
|
|
docs: List[Document]
|
|
|
|
@ -66,9 +102,7 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
|
|
|
|
|
class Config:
|
|
|
|
|
arbitrary_types_allowed = True
|
|
|
|
|
|
|
|
|
|
def get_relevant_documents(self, query: str) -> List[Document]:
|
|
|
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
|
|
|
|
|
|
def _get_relevant_documents(self, query: str) -> List[Document]:
|
|
|
|
|
# Ip -- (n_docs,x), Op -- (n_docs,n_Feats)
|
|
|
|
|
query_vec = self.vectorizer.transform([query])
|
|
|
|
|
# Op -- (n_docs,1) -- Cosine Sim with each doc
|
|
|
|
@ -137,6 +171,24 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
|
|
|
|
|
def invoke(
|
|
|
|
|
self, input: str, config: Optional[RunnableConfig] = None, **kwargs: Any
|
|
|
|
|
) -> List[Document]:
|
|
|
|
|
"""Invoke the retriever to get relevant documents.
|
|
|
|
|
|
|
|
|
|
Main entry point for synchronous retriever invocations.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
input: The query string
|
|
|
|
|
config: Configuration for the retriever
|
|
|
|
|
**kwargs: Additional arguments to pass to the retriever
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
List of relevant documents
|
|
|
|
|
|
|
|
|
|
Examples:
|
|
|
|
|
|
|
|
|
|
.. code-block:: python
|
|
|
|
|
|
|
|
|
|
retriever.invoke("query")
|
|
|
|
|
"""
|
|
|
|
|
config = ensure_config(config)
|
|
|
|
|
return self.get_relevant_documents(
|
|
|
|
|
input,
|
|
|
|
@ -153,6 +205,24 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
|
|
|
|
|
config: Optional[RunnableConfig] = None,
|
|
|
|
|
**kwargs: Any,
|
|
|
|
|
) -> List[Document]:
|
|
|
|
|
"""Asynchronously invoke the retriever to get relevant documents.
|
|
|
|
|
|
|
|
|
|
Main entry point for asynchronous retriever invocations.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
input: The query string
|
|
|
|
|
config: Configuration for the retriever
|
|
|
|
|
**kwargs: Additional arguments to pass to the retriever
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
List of relevant documents
|
|
|
|
|
|
|
|
|
|
Examples:
|
|
|
|
|
|
|
|
|
|
.. code-block:: python
|
|
|
|
|
|
|
|
|
|
await retriever.ainvoke("query")
|
|
|
|
|
"""
|
|
|
|
|
config = ensure_config(config)
|
|
|
|
|
return await self.aget_relevant_documents(
|
|
|
|
|
input,
|
|
|
|
@ -203,6 +273,10 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
|
|
|
|
|
**kwargs: Any,
|
|
|
|
|
) -> List[Document]:
|
|
|
|
|
"""Retrieve documents relevant to a query.
|
|
|
|
|
|
|
|
|
|
Users should favor using `.invoke` or `.batch` rather than
|
|
|
|
|
`get_relevant_documents directly`.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
query: string to find relevant documents for
|
|
|
|
|
callbacks: Callback manager or list of callbacks
|
|
|
|
@ -212,6 +286,8 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
|
|
|
|
|
metadata: Optional metadata associated with the retriever. Defaults to None
|
|
|
|
|
This metadata will be associated with each call to this retriever,
|
|
|
|
|
and passed as arguments to the handlers defined in `callbacks`.
|
|
|
|
|
run_name: Optional name for the run.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
List of relevant documents
|
|
|
|
|
"""
|
|
|
|
@ -260,6 +336,10 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
|
|
|
|
|
**kwargs: Any,
|
|
|
|
|
) -> List[Document]:
|
|
|
|
|
"""Asynchronously get documents relevant to a query.
|
|
|
|
|
|
|
|
|
|
Users should favor using `.ainvoke` or `.abatch` rather than
|
|
|
|
|
`aget_relevant_documents directly`.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
query: string to find relevant documents for
|
|
|
|
|
callbacks: Callback manager or list of callbacks
|
|
|
|
@ -269,6 +349,8 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
|
|
|
|
|
metadata: Optional metadata associated with the retriever. Defaults to None
|
|
|
|
|
This metadata will be associated with each call to this retriever,
|
|
|
|
|
and passed as arguments to the handlers defined in `callbacks`.
|
|
|
|
|
run_name: Optional name for the run.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
List of relevant documents
|
|
|
|
|
"""
|
|
|
|
|