mirror of https://github.com/hwchase17/langchain
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
194 lines
6.9 KiB
Plaintext
194 lines
6.9 KiB
Plaintext
1 year ago
|
{
|
||
|
"cells": [
|
||
|
{
|
||
1 year ago
|
"attachments": {},
|
||
1 year ago
|
"cell_type": "markdown",
|
||
|
"id": "fc0db1bc",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# LOTR (Merger Retriever)\n",
|
||
|
"\n",
|
||
1 year ago
|
"`Lord of the Retrievers`, also known as `MergerRetriever`, takes a list of retrievers as input and merges the results of their get_relevant_documents() methods into a single list. The merged results will be a list of documents that are relevant to the query and that have been ranked by the different retrievers.\n",
|
||
1 year ago
|
"\n",
|
||
1 year ago
|
"The `MergerRetriever` class can be used to improve the accuracy of document retrieval in a number of ways. First, it can combine the results of multiple retrievers, which can help to reduce the risk of bias in the results. Second, it can rank the results of the different retrievers, which can help to ensure that the most relevant documents are returned first."
|
||
1 year ago
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "9fbcc58f",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import os\n",
|
||
|
"import chromadb\n",
|
||
|
"from langchain.retrievers.merger_retriever import MergerRetriever\n",
|
||
|
"from langchain.vectorstores import Chroma\n",
|
||
|
"from langchain.embeddings import HuggingFaceEmbeddings\n",
|
||
|
"from langchain.embeddings import OpenAIEmbeddings\n",
|
||
1 year ago
|
"from langchain.document_transformers import (\n",
|
||
|
" EmbeddingsRedundantFilter,\n",
|
||
|
" EmbeddingsClusteringFilter,\n",
|
||
|
")\n",
|
||
1 year ago
|
"from langchain.retrievers.document_compressors import DocumentCompressorPipeline\n",
|
||
|
"from langchain.retrievers import ContextualCompressionRetriever\n",
|
||
|
"\n",
|
||
|
"# Get 3 diff embeddings.\n",
|
||
|
"all_mini = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n",
|
||
|
"multi_qa_mini = HuggingFaceEmbeddings(model_name=\"multi-qa-MiniLM-L6-dot-v1\")\n",
|
||
|
"filter_embeddings = OpenAIEmbeddings()\n",
|
||
|
"\n",
|
||
|
"ABS_PATH = os.path.dirname(os.path.abspath(__file__))\n",
|
||
|
"DB_DIR = os.path.join(ABS_PATH, \"db\")\n",
|
||
|
"\n",
|
||
|
"# Instantiate 2 different chromadb indexes, each one with a different embedding.\n",
|
||
|
"client_settings = chromadb.config.Settings(\n",
|
||
1 year ago
|
" is_persistent=True,\n",
|
||
1 year ago
|
" persist_directory=DB_DIR,\n",
|
||
|
" anonymized_telemetry=False,\n",
|
||
|
")\n",
|
||
|
"db_all = Chroma(\n",
|
||
|
" collection_name=\"project_store_all\",\n",
|
||
|
" persist_directory=DB_DIR,\n",
|
||
|
" client_settings=client_settings,\n",
|
||
|
" embedding_function=all_mini,\n",
|
||
|
")\n",
|
||
|
"db_multi_qa = Chroma(\n",
|
||
|
" collection_name=\"project_store_multi\",\n",
|
||
|
" persist_directory=DB_DIR,\n",
|
||
|
" client_settings=client_settings,\n",
|
||
|
" embedding_function=multi_qa_mini,\n",
|
||
|
")\n",
|
||
|
"\n",
|
||
|
"# Define 2 diff retrievers with 2 diff embeddings and diff search type.\n",
|
||
|
"retriever_all = db_all.as_retriever(\n",
|
||
|
" search_type=\"similarity\", search_kwargs={\"k\": 5, \"include_metadata\": True}\n",
|
||
|
")\n",
|
||
|
"retriever_multi_qa = db_multi_qa.as_retriever(\n",
|
||
|
" search_type=\"mmr\", search_kwargs={\"k\": 5, \"include_metadata\": True}\n",
|
||
|
")\n",
|
||
|
"\n",
|
||
1 year ago
|
"# The Lord of the Retrievers will hold the output of both retrievers and can be used as any other\n",
|
||
1 year ago
|
"# retriever on different types of chains.\n",
|
||
1 year ago
|
"lotr = MergerRetriever(retrievers=[retriever_all, retriever_multi_qa])"
|
||
1 year ago
|
]
|
||
|
},
|
||
|
{
|
||
1 year ago
|
"attachments": {},
|
||
1 year ago
|
"cell_type": "markdown",
|
||
|
"id": "c152339d",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"## Remove redundant results from the merged retrievers."
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "039faea6",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
1 year ago
|
"# We can remove redundant results from both retrievers using yet another embedding.\n",
|
||
1 year ago
|
"# Using multiple embeddings in different steps could help reduce biases.\n",
|
||
|
"filter = EmbeddingsRedundantFilter(embeddings=filter_embeddings)\n",
|
||
|
"pipeline = DocumentCompressorPipeline(transformers=[filter])\n",
|
||
|
"compression_retriever = ContextualCompressionRetriever(\n",
|
||
|
" base_compressor=pipeline, base_retriever=lotr\n",
|
||
|
")"
|
||
|
]
|
||
1 year ago
|
},
|
||
|
{
|
||
|
"attachments": {},
|
||
|
"cell_type": "markdown",
|
||
|
"id": "c10022fa",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"## Pick a representative sample of documents from the merged retrievers."
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "b3885482",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# This filter will divide the document vectors into clusters or \"centers\" of meaning.\n",
|
||
|
"# Then it will pick the closest document to that center for the final results.\n",
|
||
|
"# By default the result document will be ordered/grouped by clusters.\n",
|
||
|
"filter_ordered_cluster = EmbeddingsClusteringFilter(\n",
|
||
1 year ago
|
" embeddings=filter_embeddings,\n",
|
||
|
" num_clusters=10,\n",
|
||
|
" num_closest=1,\n",
|
||
|
")\n",
|
||
1 year ago
|
"\n",
|
||
|
"# If you want the final document to be ordered by the original retriever scores\n",
|
||
|
"# you need to add the \"sorted\" parameter.\n",
|
||
|
"filter_ordered_by_retriever = EmbeddingsClusteringFilter(\n",
|
||
1 year ago
|
" embeddings=filter_embeddings,\n",
|
||
|
" num_clusters=10,\n",
|
||
|
" num_closest=1,\n",
|
||
|
" sorted=True,\n",
|
||
|
")\n",
|
||
1 year ago
|
"\n",
|
||
|
"pipeline = DocumentCompressorPipeline(transformers=[filter_ordered_by_retriever])\n",
|
||
|
"compression_retriever = ContextualCompressionRetriever(\n",
|
||
|
" base_compressor=pipeline, base_retriever=lotr\n",
|
||
1 year ago
|
")"
|
||
1 year ago
|
]
|
||
1 year ago
|
},
|
||
|
{
|
||
|
"attachments": {},
|
||
|
"cell_type": "markdown",
|
||
|
"id": "8f68956e",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"## Re-order results to avoid performance degradation.\n",
|
||
|
"No matter the architecture of your model, there is a substantial performance degradation when you include 10+ retrieved documents.\n",
|
||
|
"In brief: When models must access relevant information in the middle of long contexts, they tend to ignore the provided documents.\n",
|
||
|
"See: https://arxiv.org/abs/2307.03172"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "007283f3",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# You can use an additional document transformer to reorder documents after removing redundancy.\n",
|
||
|
"from langchain.document_transformers import LongContextReorder\n",
|
||
|
"\n",
|
||
|
"filter = EmbeddingsRedundantFilter(embeddings=filter_embeddings)\n",
|
||
|
"reordering = LongContextReorder()\n",
|
||
|
"pipeline = DocumentCompressorPipeline(transformers=[filter, reordering])\n",
|
||
|
"compression_retriever_reordered = ContextualCompressionRetriever(\n",
|
||
|
" base_compressor=pipeline, base_retriever=lotr\n",
|
||
|
")"
|
||
|
]
|
||
1 year ago
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3 (ipykernel)",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.10.6"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 5
|
||
|
}
|