From 0adc282d7055e6abca1177b539b199b73a9f2673 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sun, 6 Aug 2023 17:00:57 -0700 Subject: [PATCH] Harrison/as retriever docstring (#8840) Co-authored-by: Bytestorm <31070777+Bytestorm5@users.noreply.github.com> --- .../modules/chains/popular/vector_db_qa.mdx | 35 ++++++++++++ .../popular/vector_db_qa_with_sources.mdx | 2 +- libs/langchain/langchain/vectorstores/base.py | 54 +++++++++++++++++++ 3 files changed, 90 insertions(+), 1 deletion(-) diff --git a/docs/snippets/modules/chains/popular/vector_db_qa.mdx b/docs/snippets/modules/chains/popular/vector_db_qa.mdx index 719c30eeb0..c353d16f5c 100644 --- a/docs/snippets/modules/chains/popular/vector_db_qa.mdx +++ b/docs/snippets/modules/chains/popular/vector_db_qa.mdx @@ -117,3 +117,38 @@ qa.run(query) ``` + +## Vectorstore Retriever Options +You can adjust how documents are retrieved from your vectorstore depending on the specific task. + +There are two main ways to retrieve documents relevant to a query: Similarity Search and Max Marginal Relevance Search (MMR Search). Similarity Search is the default, but you can use MMR by adding the `search_type` parameter: + +```python +docsearch.as_retriever(search_type="mmr") +``` + +You can also modify the search by passing specific search arguments through the retriever to the search function, using the `search_kwargs` keyword argument. + +- `k` defines how many documents are returned; defaults to 4. +- `score_threshold` allows you to set a minimum relevance for documents returned by the retriever, if you are using the "similarity_score_threshold" search type. +- `fetch_k` determines the number of documents to pass to the MMR algorithm; defaults to 20. +- `lambda_mult` controls the diversity of results returned by the MMR algorithm, with 1 being minimum diversity and 0 being maximum. Defaults to 0.5. +- `filter` allows you to define a filter on what documents should be retrieved, based on the documents' metadata. 
This has no effect if the Vectorstore doesn't store any metadata. + +Some examples of how these parameters can be used: +```python +# Retrieve more documents with higher diversity; useful if your dataset has many similar documents +docsearch.as_retriever(search_type="mmr", search_kwargs={'k': 6, 'lambda_mult': 0.25}) + +# Fetch more documents for the MMR algorithm to consider, but only return the top 5 +docsearch.as_retriever(search_type="mmr", search_kwargs={'k': 5, 'fetch_k': 50}) + +# Only retrieve documents that have a relevance score above a certain threshold +docsearch.as_retriever(search_type="similarity_score_threshold", search_kwargs={'score_threshold': 0.8}) + +# Only get the single most similar document from the dataset +docsearch.as_retriever(search_kwargs={'k': 1}) + +# Use a filter to only retrieve documents from a specific paper +docsearch.as_retriever(search_kwargs={'filter': {'paper_title':'GPT-4 Technical Report'}}) +``` \ No newline at end of file diff --git a/docs/snippets/modules/chains/popular/vector_db_qa_with_sources.mdx b/docs/snippets/modules/chains/popular/vector_db_qa_with_sources.mdx index 3135593d09..564406bb61 100644 --- a/docs/snippets/modules/chains/popular/vector_db_qa_with_sources.mdx +++ b/docs/snippets/modules/chains/popular/vector_db_qa_with_sources.mdx @@ -3,7 +3,7 @@ Additionally, we can return the source documents used to answer the question by ```python -qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=docsearch.as_retriever(), return_source_documents=True) +qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=docsearch.as_retriever(search_type="mmr", search_kwargs={'fetch_k': 30}), return_source_documents=True) ``` diff --git a/libs/langchain/langchain/vectorstores/base.py b/libs/langchain/langchain/vectorstores/base.py index 45ea079a21..8f27ce20b5 100644 --- a/libs/langchain/langchain/vectorstores/base.py +++ b/libs/langchain/langchain/vectorstores/base.py @@ -461,8 
+461,62 @@ class VectorStore(ABC): return tags def as_retriever(self, **kwargs: Any) -> VectorStoreRetriever: + """Return VectorStoreRetriever initialized from this VectorStore. + + Args: + search_type (Optional[str]): Defines the type of search that + the Retriever should perform. + Can be "similarity" (default), "mmr", or + "similarity_score_threshold". + search_kwargs (Optional[Dict]): Keyword arguments to pass to the + search function. Can include things like: + k: Amount of documents to return (Default: 4) + score_threshold: Minimum relevance threshold + for similarity_score_threshold + fetch_k: Amount of documents to pass to MMR algorithm (Default: 20) + lambda_mult: Diversity of results returned by MMR; + 1 for minimum diversity and 0 for maximum. (Default: 0.5) + filter: Filter by document metadata + + Returns: + VectorStoreRetriever: Retriever class for VectorStore. + + Examples: + + .. code-block:: python + + # Retrieve more documents with higher diversity + # Useful if your dataset has many similar documents + docsearch.as_retriever( + search_type="mmr", + search_kwargs={'k': 6, 'lambda_mult': 0.25} + ) + + # Fetch more documents for the MMR algorithm to consider + # But only return the top 5 + docsearch.as_retriever( + search_type="mmr", + search_kwargs={'k': 5, 'fetch_k': 50} + ) + + # Only retrieve documents that have a relevance score + # Above a certain threshold + docsearch.as_retriever( + search_type="similarity_score_threshold", + search_kwargs={'score_threshold': 0.8} + ) + + # Only get the single most similar document from the dataset + docsearch.as_retriever(search_kwargs={'k': 1}) + + # Use a filter to only retrieve documents from a specific paper + docsearch.as_retriever( + search_kwargs={'filter': {'paper_title':'GPT-4 Technical Report'}} + ) + """ tags = kwargs.pop("tags", None) or [] tags.extend(self._get_retriever_tags()) + return VectorStoreRetriever(vectorstore=self, **kwargs, tags=tags)