forked from Archives/langchain
Harrison/base combine doc chain (#264)
This commit is contained in: parent c27a6fa8a4, commit e9b1c8cdfa
@@ -52,15 +52,32 @@ With these primitives in mind, the following chains exist:

**Vector Database Question-Answering**

- **Links Used**: Vectorstore, LLMChain
- - **Notes**: This chain takes user input (a question), uses the Vectorstore and semantic search to find relevant documents, and then passes the documents plus to the original question to another LLM to generate a final answer.
+ - **Notes**: This chain takes user input (a question), uses the Vectorstore and semantic search to find relevant documents, and then passes the documents plus the original question to another LLM to generate a final answer.
- `Example Notebook <chains/vector_db_qa.ipynb>`_

**Vector Database Question-Answering With Sources**

- **Links Used**: Vectorstore, LLMChain
- **Notes**: This chain takes user input (a question), uses the Vectorstore and semantic search to find relevant documents, and then passes the documents plus the original question to another LLM to generate a final answer with sources.
- `Example Notebook <chains/vector_db_qa_with_sources.ipynb>`_

**Question-Answering With Sources**

- **Links Used**: LLMChain
- - **Notes**: This chain takes a question and multiple documents as input. It then runs a first LLMChain over all documents attempting to answer the provided question. It then runs a second LLMChain over the results of the first pass, combining the answers from documents into a single response that is returned.
- - `Example Notebook <chains/combine_documents.ipynb>`_
+ - **Notes**: These types of chains take a question and multiple documents as input, and return an answer plus sources for where that answer came from. There are multiple underlying types of chains to do this; for more information, see TODO.
+ - `Example Notebook <chains/qa_with_sources.ipynb>`_

**Question-Answering**

- **Links Used**: LLMChain
- **Notes**: These types of chains take a question and multiple documents as input, and return an answer. There are multiple underlying types of chains to do this; for more information, see TODO.
- `Example Notebook <chains/question_answering.ipynb>`_

**Summarization**

- **Links Used**: LLMChain
- **Notes**: These types of chains take multiple documents as input, and return a summary of all documents. There are multiple underlying types of chains to do this; for more information, see TODO.
- `Example Notebook <chains/summarize.ipynb>`_
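As a minimal sketch of how the question-answering and summarization chains above are constructed (using the loader functions from the example notebooks linked here; the placeholder document text is illustrative):

.. code-block:: python

    from langchain.chains.question_answering import load_qa_chain
    from langchain.chains.summarize import load_summarize_chain
    from langchain.docstore.document import Document
    from langchain.llms import OpenAI

    docs = [Document(page_content="...")]  # documents fetched in any manner

    # chain_type may be "stuff", "map_reduce", or "refine"
    chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff")
    chain({"input_documents": docs, "question": "..."}, return_only_outputs=True)

    summary_chain = load_summarize_chain(OpenAI(temperature=0), chain_type="map_reduce")
    summary_chain.run(docs)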
.. toctree::
   :maxdepth: 1
@@ -1,93 +0,0 @@ (deleted notebook)

# Map Reduce

This notebook showcases an example of map-reduce chains: recursive summarization.

```python
from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain

llm = OpenAI(temperature=0)

_prompt = """Write a concise summary of the following:


{text}


CONCISE SUMMARY:"""
prompt = PromptTemplate(template=_prompt, input_variables=["text"])

text_splitter = CharacterTextSplitter()

mp_chain = MapReduceChain.from_params(llm, prompt, text_splitter)
```

```python
with open('../state_of_the_union.txt') as f:
    state_of_the_union = f.read()
mp_chain.run(state_of_the_union)
```

```
"\n\nThe President discusses the recent aggression by Russia, and the response by the United States and its allies. He announces new sanctions against Russia, and says that the free world is united in holding Putin accountable. The President also discusses the American Rescue Plan, the Bipartisan Infrastructure Law, and the Bipartisan Innovation Act. Finally, the President addresses the need for women's rights and equality for LGBTQ+ Americans."
```
docs/examples/chains/qa_with_sources.ipynb (new file, 250 lines)
@@ -0,0 +1,250 @@
# Question Answering with Sources

This notebook walks through how to use LangChain for question answering with sources over a list of documents. It covers three different chain types: `stuff`, `map_reduce`, and `refine`. For a more in-depth explanation of what these chain types are, see [here](../../explanation/combine_docs.md).

### Prepare Data
First we prepare the data. For this example we do similarity search over a vector database, but these documents could be fetched in any manner (the point of this notebook is to highlight what to do AFTER you fetch the documents).

```python
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
from langchain.vectorstores.faiss import FAISS
from langchain.docstore.document import Document
```

```python
with open('../state_of_the_union.txt') as f:
    state_of_the_union = f.read()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_text(state_of_the_union)

embeddings = OpenAIEmbeddings()
```

```python
docsearch = FAISS.from_texts(texts, embeddings, metadatas=[{"source": i} for i in range(len(texts))])
```

```python
query = "What did the president say about Justice Breyer"
docs = docsearch.similarity_search(query)
```

```python
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI
```

### The `stuff` Chain

This section shows the results of using the `stuff` Chain to do question answering with sources.

```python
chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="stuff")
```

```python
docs = [Document(page_content=t, metadata={"source": i}) for i, t in enumerate(texts[:3])]
```

```python
query = "What did the president say about Justice Breyer"
chain({"input_documents": docs, "question": query}, return_only_outputs=True)
```

```
{'output_text': ' The president did not mention Justice Breyer.\nSOURCES: 0-pl, 1-pl, 2-pl'}
```

### The `map_reduce` Chain

This section shows the results of using the `map_reduce` Chain to do question answering with sources.

```python
chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="map_reduce")
```

```python
query = "What did the president say about Justice Breyer"
chain({"input_documents": docs, "question": query}, return_only_outputs=True)
```

```
{'output_text': ' The president did not mention Justice Breyer.\nSOURCES: 0, 1, 2'}
```

### The `refine` Chain

This section shows the results of using the `refine` Chain to do question answering with sources.

```python
chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="refine")
```

```python
query = "What did the president say about Justice Breyer"
chain({"input_documents": docs, "query_str": query}, return_only_outputs=True)
```

```
{'output_text': "\n\nThe president said that Justice Breyer has dedicated his life to serve the country and has left a legacy of excellence. He also thanked Justice Breyer for his service and for his commitment to advancing liberty and justice, including protecting the rights of women and the constitutional right affirmed in Roe v. Wade, preserving access to health care and a woman's right to choose, and advancing the bipartisan Equality Act to protect LGBTQ+ Americans. The president also noted that the State of the Union is strong because of the courage and determination of the American people, and that the nation will meet and overcome the challenges of our time as one people, just as the Ukrainian people have done in the face of adversity. Source: 0, 29, 35"}
```
docs/examples/chains/question_answering.ipynb (new file, 248 lines)
@@ -0,0 +1,248 @@
# Question Answering

This notebook walks through how to use LangChain for question answering over a list of documents. It covers three different chain types: `stuff`, `map_reduce`, and `refine`. For a more in-depth explanation of what these chain types are, see [here](../../explanation/combine_docs.md).

### Prepare Data
First we prepare the data. For this example we do similarity search over a vector database, but these documents could be fetched in any manner (the point of this notebook is to highlight what to do AFTER you fetch the documents).

```python
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.docstore.document import Document
```

```python
with open('../state_of_the_union.txt') as f:
    state_of_the_union = f.read()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_text(state_of_the_union)

embeddings = OpenAIEmbeddings()
```

```python
docsearch = FAISS.from_texts(texts, embeddings)
```

```python
query = "What did the president say about Justice Breyer"
docs = docsearch.similarity_search(query)
```

```python
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
```

### The `stuff` Chain

This section shows the results of using the `stuff` Chain to do question answering.

```python
chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff")
```

```python
docs = [Document(page_content=t) for t in texts[:3]]
```

```python
query = "What did the president say about Justice Breyer"
chain({"input_documents": docs, "question": query}, return_only_outputs=True)
```

```
{'output_text': ' The president did not mention Justice Breyer.'}
```

### The `map_reduce` Chain

This section shows the results of using the `map_reduce` Chain to do question answering.

```python
chain = load_qa_chain(OpenAI(temperature=0), chain_type="map_reduce")
```

```python
query = "What did the president say about Justice Breyer"
chain({"input_documents": docs, "question": query}, return_only_outputs=True)
```

```
{'output_text': ' The president did not mention Justice Breyer.'}
```

### The `refine` Chain

This section shows the results of using the `refine` Chain to do question answering.

```python
chain = load_qa_chain(OpenAI(temperature=0), chain_type="refine")
```

```python
query = "What did the president say about Justice Breyer"
chain({"input_documents": docs, "query_str": query}, return_only_outputs=True)
```

```
{'output_text': '\n\nThe president did not mention Justice Breyer in the given page content.'}
```
docs/examples/chains/summarize.ipynb (new file, 234 lines)
@@ -0,0 +1,234 @@
# Summarization

This notebook walks through how to use LangChain for summarization over a list of documents. It covers three different chain types: `stuff`, `map_reduce`, and `refine`. For a more in-depth explanation of what these chain types are, see [here](../../explanation/combine_docs.md).

### Prepare Data
First we prepare the data. For this example we create multiple documents from one long one, but these documents could be fetched in any manner (the point of this notebook is to highlight what to do AFTER you fetch the documents).

```python
from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain

llm = OpenAI(temperature=0)

text_splitter = CharacterTextSplitter()
```

```python
with open('../state_of_the_union.txt') as f:
    state_of_the_union = f.read()
texts = text_splitter.split_text(state_of_the_union)
```

```python
from langchain.docstore.document import Document
```

```python
docs = [Document(page_content=t) for t in texts[:3]]
```

```python
from langchain.chains.summarize import load_summarize_chain
```

### The `stuff` Chain

This section shows the results of using the `stuff` Chain to do summarization.

```python
chain = load_summarize_chain(llm, chain_type="stuff")
```

```python
chain.run(docs)
```

```
' In his speech, President Biden addressed the ongoing conflict between Russia and Ukraine, and the need for the United States and its allies to stand with Ukraine. He also discussed the American Rescue Plan, the Bipartisan Infrastructure Law, and the Bipartisan Innovation Act, which will help to create jobs, modernize infrastructure, and level the playing field with China. He also emphasized the importance of buying American products to support American jobs.'
```

### The `map_reduce` Chain

This section shows the results of using the `map_reduce` Chain to do summarization.

```python
chain = load_summarize_chain(llm, chain_type="map_reduce")
```

```python
chain.run(docs)
```

```
' In response to Russian aggression in Ukraine, the US and its allies have imposed economic sanctions, cut off access to technology, seized assets of Russian oligarchs, and closed American airspace to Russian flights. The US is also providing military, economic, and humanitarian assistance to Ukraine, mobilizing ground forces, air squadrons, and ship deployments, and releasing 30 million barrels of oil from its Strategic Petroleum Reserve. President Biden has also passed the American Rescue Plan, Bipartisan Infrastructure Law, and Bipartisan Innovation Act to provide economic relief and rebuild America.'
```

### The `refine` Chain

This section shows the results of using the `refine` Chain to do summarization.

```python
chain = load_summarize_chain(llm, chain_type="refine")
```

```python
chain.run(docs)
```

```
"\nIn this speech, the speaker addresses the American people and their allies, discussing the recent aggression of Russia's Vladimir Putin in Ukraine. The speaker outlines the actions taken by the United States and its allies to hold Putin accountable, including economic sanctions, cutting off access to technology, and seizing the assets of Russian oligarchs. The speaker also announces the closing of American airspace to Russian flights, further isolating Russia and adding an additional squeeze on their economy. The Russian stock market has lost 40% of its value and trading remains suspended. Together with our allies, the United States is providing military, economic, and humanitarian assistance to Ukraine, and has mobilized forces to protect NATO countries. The speaker also announces the release of 60 million barrels of oil from reserves around the world, with the United States releasing 30 million barrels from its own Strategic Petroleum Reserve. The speaker emphasizes that the United States and its allies will defend every inch of NATO territory and that Putin will pay a high price for his aggression. The speaker also acknowledges the hardships faced by the American people due to the pandemic and the American Rescue Plan, which has provided immediate economic relief for tens of millions of Americans, helped put food on their table, keep a roof over their heads, and cut the cost of health insurance. The speaker"
```
@@ -41,27 +41,27 @@
 },
 {
 "cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
 "id": "3018f865",
 "metadata": {},
 "outputs": [],
 "source": [
- "qa = VectorDBQA(llm=OpenAI(), vectorstore=docsearch)"
+ "qa = VectorDBQA.from_llm(llm=OpenAI(), vectorstore=docsearch)"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 5,
+ "execution_count": 4,
 "id": "032a47f8",
 "metadata": {},
 "outputs": [
 {
 "data": {
 "text/plain": [
- "' The President said that Ketanji Brown Jackson is a consensus builder and has received a broad range of support since she was nominated.'"
+ "\" The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator and federal public defender, and from a family of public school educators and police officers. He also said that she has received a broad range of support since she was nominated, from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.\""
 ]
 },
- "execution_count": 5,
+ "execution_count": 4,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -74,7 +74,7 @@
 {
 "cell_type": "code",
 "execution_count": null,
- "id": "f0f20b92",
+ "id": "f056f6fd",
 "metadata": {},
 "outputs": [],
 "source": []
@@ -96,7 +96,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
- "version": "3.7.6"
+ "version": "3.10.8"
 }
 },
 "nbformat": 4,
@@ -61,72 +61,6 @@
 " d.metadata = {'source': f\"{i}-pl\"}"
 ]
 },
-{
- "cell_type": "markdown",
- "id": "aa1c1b60",
- "metadata": {},
- "source": [
- "### QAWithSourcesChain\n",
- "This shows how to use the `QAWithSourcesChain`, which takes in document objects and uses them directly."
- ]
-},
-{
- "cell_type": "code",
- "execution_count": 5,
- "id": "61bce191",
- "metadata": {},
- "outputs": [],
- "source": [
- "query = \"What did the president say about Justice Breyer\"\n",
- "docs = docsearch.similarity_search(query)"
- ]
-},
-{
- "cell_type": "code",
- "execution_count": 6,
- "id": "57ddf8c7",
- "metadata": {},
- "outputs": [],
- "source": [
- "from langchain.chains import QAWithSourcesChain\n",
- "from langchain.llms import OpenAI, Cohere\n",
- "from langchain.docstore.document import Document"
- ]
-},
-{
- "cell_type": "code",
- "execution_count": 7,
- "id": "f908a92a",
- "metadata": {},
- "outputs": [],
- "source": [
- "chain = QAWithSourcesChain.from_llm(OpenAI(temperature=0))"
- ]
-},
-{
- "cell_type": "code",
- "execution_count": 8,
- "id": "a505ac89",
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'answer': ' The president thanked Justice Breyer for his service.',\n",
- " 'sources': '27-pl'}"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "chain({\"docs\": docs, \"question\": query}, return_only_outputs=True)"
- ]
-},
 {
 "cell_type": "markdown",
 "id": "e6fc81de",
@@ -159,10 +93,22 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
 "id": "8ba36fa7",
 "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'answer': ' The president thanked Justice Breyer for his service.',\n",
+ " 'sources': '27-pl'}"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
 "source": [
 "chain({\"question\": \"What did the president say about Justice Breyer\"}, return_only_outputs=True)"
 ]
@@ -192,7 +138,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
- "version": "3.8.7"
+ "version": "3.9.1"
 }
 },
 "nbformat": 4,
@@ -1,180 +0,0 @@ (deleted notebook)

# HuggingFace Tokenizers

This notebook showcases how to use HuggingFace tokenizers to split text.

```python
from langchain.text_splitter import CharacterTextSplitter
```

```python
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
```

```python
with open('../state_of_the_union.txt') as f:
    state_of_the_union = f.read()
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(tokenizer, chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_text(state_of_the_union)
```

```python
print(texts[0])
```

```
Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.

Last year COVID-19 kept us apart. This year we are finally together again.

Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.

With a duty to one another to the American people to the Constitution.

And with an unwavering resolve that freedom will always triumph over tyranny.

Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated.

He thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined.

He met the Ukrainian people.

From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.

Groups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland.

In this struggle as President Zelenskyy said in his speech to the European Parliament “Light will win over darkness.” The Ukrainian Ambassador to the United States is here tonight.

Let each of us here tonight in this Chamber send an unmistakable signal to Ukraine and to the world.

Please rise if you are able and show that, Yes, we the United States of America stand with the Ukrainian people.

Throughout our history we’ve learned this lesson when dictators do not pay a price for their aggression they cause more chaos.

They keep moving.

And the costs and the threats to America and the world keep rising.

That’s why the NATO Alliance was created to secure peace and stability in Europe after World War 2.

The United States is a member along with 29 other nations.

It matters. American diplomacy matters. American resolve matters.

Putin’s latest attack on Ukraine was premeditated and unprovoked.

He rejected repeated efforts at diplomacy.

He thought the West and NATO wouldn’t respond. And he thought he could divide us at home. Putin was wrong. We were ready. Here is what we did.

We prepared extensively and carefully.

We spent months building a coalition of other freedom-loving nations from Europe and the Americas to Asia and Africa to confront Putin.

I spent countless hours unifying our European allies. We shared with the world in advance what we knew Putin was planning and precisely how he would try to falsely justify his aggression.

We countered Russia’s lies with truth.

And now that he has acted the free world is holding him accountable.

Along with twenty-seven members of the European Union including France, Germany, Italy, as well as countries like the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.

We are inflicting pain on Russia and supporting the people of Ukraine. Putin is now isolated from the world more than ever.

Together with our allies –we are right now enforcing powerful economic sanctions.

We are cutting off Russia’s largest banks from the international financial system.

Preventing Russia’s central bank from defending the Russian Ruble making Putin’s $630 Billion “war fund” worthless.

We are choking off Russia’s access to technology that will sap its economic strength and weaken its military for years to come.

Tonight I say to the Russian oligarchs and corrupt leaders who have bilked billions of dollars off this violent regime no more.

The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs.

We are joining with our European allies to find and seize your yachts your luxury apartments your private jets. We are coming for your ill-begotten gains.

And tonight I am announcing that we will join our allies in closing off American air space to all Russian flights – further isolating Russia – and adding an additional squeeze –on their economy. The Ruble has lost 30% of its value.

The Russian stock market has lost 40% of its value and trading remains suspended. Russia’s economy is reeling and Putin alone is to blame.

Together with our allies we are providing support to the Ukrainians in their fight for freedom. Military assistance. Economic assistance. Humanitarian assistance.

We are giving more than $1 Billion in direct assistance to Ukraine.

And we will continue to aid the Ukrainian people as they defend their country and to help ease their suffering.

Let me be clear, our forces are not engaged and will not engage in conflict with Russian forces in Ukraine.

Our forces are not going to Europe to fight in Ukraine, but to defend our NATO Allies – in the event that Putin decides to keep moving west.
```
docs/examples/integrations/textsplitter.ipynb (new file, 258 lines)
@@ -0,0 +1,258 @@
# Text Splitter

When you want to deal with long pieces of text, it is necessary to split up that text into chunks.
This notebook showcases several ways to do that.

At a high level, text splitters work as follows:

1. Split the text up into small, semantically meaningful chunks (often sentences).
2. Start combining these small chunks into a larger chunk until you reach a certain size (as measured by some function).
3. Once you reach that size, make that chunk its own piece of text and then start creating a new chunk of text with some overlap (to keep context between chunks).

```python
from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter, SpacyTextSplitter
# This is a long document we can split up.
with open('../state_of_the_union.txt') as f:
    state_of_the_union = f.read()
```

## Character Text Splitting

Let's start with the simplest method: splitting based on characters (by default "\n\n") and measuring chunk length by number of characters.

```python
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
```

```python
texts = text_splitter.split_text(state_of_the_union)
texts[0]
```

```
'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. \n\nGroups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland. '
```

## HuggingFace Length Function

Most LLMs are constrained by the number of tokens that you can pass in, which is not the same as the number of characters. In order to get a more accurate estimate, we can use HuggingFace tokenizers to count the text length.

```python
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
```

```
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
```

```python
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(tokenizer, chunk_size=100, chunk_overlap=0)
texts = text_splitter.split_text(state_of_the_union)
```

```python
print(texts[0])
```

```
Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.

Last year COVID-19 kept us apart. This year we are finally together again.

Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.

With a duty to one another to the American people to the Constitution.

And with an unwavering resolve that freedom will always triumph over tyranny.
```

## NLTK Text Splitter

Rather than just splitting on "\n\n", we can use NLTK to split based on tokenizers.

```python
text_splitter = NLTKTextSplitter(chunk_size=1000)
```

```python
texts = text_splitter.split_text(state_of_the_union)
texts[0]
```

```
'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman.\n\nMembers of Congress and the Cabinet.\n\nJustices of the Supreme Court.\n\nMy fellow Americans.\n\nLast year COVID-19 kept us apart.\n\nThis year we are finally together again.\n\nTonight, we meet as Democrats Republicans and Independents.\n\nBut most importantly as Americans.\n\nWith a duty to one another to the American people to the Constitution.\n\nAnd with an unwavering resolve that freedom will always triumph over tyranny.\n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways.\n\nBut he badly miscalculated.\n\nHe thought he could roll into Ukraine and the world would roll over.\n\nInstead he met a wall of strength he never imagined.\n\nHe met the Ukrainian people.\n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.\n\nGroups of citizens blocking tanks with their bodies.\n\nEveryone from students to retirees teachers turned soldiers defending their homeland.'
```

## Spacy Text Splitter

Another alternative to NLTK is to use Spacy.

```python
text_splitter = SpacyTextSplitter(chunk_size=1000)
```

```python
texts = text_splitter.split_text(state_of_the_union)
texts[0]
```

```
'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman.\n\nMembers of Congress and the Cabinet.\n\nJustices of the Supreme Court.\n\nMy fellow Americans. \n\n\n\nLast year COVID-19 kept us apart.\n\nThis year we are finally together again.\n\n\n\n\n\nTonight, we meet as Democrats Republicans and Independents.\n\nBut most importantly as Americans.\n\n\n\n\n\nWith a duty to one another to the American people to the Constitution. \n\n\n\nAnd with an unwavering resolve that freedom will always triumph over tyranny.\n\n\n\n\n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways.\n\nBut he badly miscalculated.\n\n\n\n\n\nHe thought he could roll into Ukraine and the world would roll over.\n\nInstead he met a wall of strength he never imagined.\n\n\n\n\n\nHe met the Ukrainian people.\n\n\n\n\n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.\n\n\n\n\n\nGroups of citizens blocking tanks with their bodies.\n\nEveryone from students to retirees teachers turned soldiers defending their homeland.'
```
docs/explanation/combine_docs.md (new file, 128 lines)
@@ -0,0 +1,128 @@
# Data Augmented Generation

## Overview

Language models are trained on large amounts of unstructured data, which makes them really good at general purpose text generation. However, there are many instances where you may want the language model to generate text based not on generic data but rather on specific data. Some common examples of this include:

- Summarization of a specific piece of text (a website, a private document, etc)
- Question answering over a specific piece of text (a website, a private document, etc)
- Question answering over multiple pieces of text (multiple websites, multiple private documents, etc)
- Using the results of some external call to an API (results from a SQL query, etc)

All of these examples are instances where you do not want the LLM to generate text based solely on the data it was trained on, but rather want it to incorporate other external data in some way. At a high level, this process can be broken down into two steps:

1. Fetching: Fetching the relevant data to include.
2. Augmenting: Passing the data in as context to the LLM.

This guide is intended to provide an overview of how to do this, covering the relevant literature as well as common tools, abstractions, and chains for doing so.

## Related Literature
There are a lot of related papers in this area. Most of them are focused on end-to-end methods that optimize the fetching of the relevant data as well as passing it in as context. These are a few of the papers that are particularly relevant:

**[RAG](https://arxiv.org/abs/2005.11401):** Retrieval Augmented Generation.
This paper introduces RAG models, where the parametric memory is a pre-trained seq2seq model and the non-parametric memory is a dense vector index of Wikipedia, accessed with a pre-trained neural retriever.

**[REALM](https://arxiv.org/abs/2002.08909):** Retrieval-Augmented Language Model Pre-Training.
To capture knowledge in a more modular and interpretable way, this paper augments language model pre-training with a latent knowledge retriever, which allows the model to retrieve and attend over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference.

**[Haystack](https://haystack.deepset.ai/):** This is not a paper, but rather an open source library aimed at semantic search, question answering, summarization, and document ranking for a wide range of NLP applications. The underpinnings of this library are focused on the same `fetching` and `augmenting` concepts discussed here, and incorporate some of the methods in the above papers.

These papers/open-source projects are centered around retrieval of documents, which is important for question-answering tasks over a large corpus of documents (which is how they are evaluated). However, we use the terminology of `Data Augmented Generation` to highlight that retrieval from some document store is only one possible way of fetching relevant data to include. Other methods to fetch relevant data could involve hitting an API, querying a database, or just working with user-provided data (e.g. a specific document that they want to summarize).

Let's now dive deeper into the two steps involved: fetching and augmenting.

## Fetching
There are many ways to fetch relevant data to pass in as context to a language model, and these methods largely depend on the use case.

**User provided:** In some cases, the user may provide the relevant data, and no algorithm for fetching is needed. An example of this is summarization of specific documents: the user will provide the document to be summarized, and task the language model with summarizing it.

**Document Retrieval:** One of the more common use cases involves fetching relevant documents or pieces of text from a large corpus of data. A common example of this is question answering over a private collection of documents.

**API Querying:** Another common way to fetch data is from an API query. One example of this is a WebGPT-like system, where you first query Google (or another search API) for relevant information, and then those results are used in the generation step. Another example could be querying a structured database (like SQL) and then using a language model to synthesize those results.
There are two big issues to deal with in fetching:
|
||||
|
||||
1. Fetching small enough pieces of information
|
||||
2. Not fetching too many pieces of information (eg fetching only the most relevant pieces)
|
||||
|
||||
### Text Splitting
|
||||
One big issue with all of these methods is how to make sure you are working with pieces of text that are not too large.
|
||||
This is important because most language models have a context length, and so you cannot (yet) just pass a
|
||||
large document in as context. Therefor, it is important to not only fetch relevant data but also make sure it is
|
||||
small enough chunks.
|
||||
|
||||
LangChain provides some utilities to help with splitting up larger pieces of data. This comes in the form of the TextSplitter class.
|
||||
The class takes in a document and splits it up into chunks, with several parameters that control the
|
||||
size of the chunks as well as the overlap in the chunks (important for maintaining context).
|
||||
See [this walkthrough](../examples/integrations/textsplitter.ipynb) for more information.
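
As a minimal sketch of what that looks like (using the `CharacterTextSplitter` from the walkthrough; the chunk size and overlap values here are illustrative, not defaults):

```python
from langchain.text_splitter import CharacterTextSplitter

with open("state_of_the_union.txt") as f:  # any long document
    text = f.read()

# Split into ~1000-character chunks that overlap by 200 characters,
# so context is not lost at chunk boundaries.
splitter = CharacterTextSplitter(separator="\n\n", chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_text(text)
```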

### Relevant Documents

A second large issue related to fetching data is making sure you are not fetching too many documents, and are only fetching the documents that are relevant to the query/question at hand. There are a few ways to deal with this.

One concrete example of this is vector stores for document retrieval, often used for semantic search or question answering. With this method, larger documents are split up into smaller chunks and then each chunk of text is passed to an embedding function, which creates an embedding for that piece of text. Those embeddings are then stored in a database. When a new search query or question comes in, an embedding is created for that query/question and then the documents with embeddings most similar to that embedding are fetched. Examples of vector database companies include [Pinecone](https://www.pinecone.io/) and [Weaviate](https://weaviate.io/).
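
A rough sketch of that flow, assuming the FAISS vector store and OpenAI embeddings wrappers (any embedding function and vector store would work the same way):

```python
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

# Embed each chunk and index the embeddings (reusing `chunks` from above).
docsearch = FAISS.from_texts(chunks, OpenAIEmbeddings())

# Embed the query and fetch the chunks whose embeddings are most similar.
relevant_docs = docsearch.similarity_search("What did the president say about Justice Breyer?")
```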

Although this is perhaps the most common way of document retrieval, people are starting to think about alternative data structures and indexing techniques specifically for working with language models. For a leading example of this, check out [GPT Index](https://github.com/jerryjliu/gpt_index) - a collection of data structures created by and optimized for language models.

## Augmenting

So you've fetched your relevant data - now what? How do you pass it to the language model in a format it can understand? There are a few different methods, or chains, for doing so. LangChain supports three of the more common ones - and we are actively looking to include more, so if you have any ideas please reach out! Note that there is no one best method - the decision of which one to use is often very context specific. In order from simplest to most complex:

### Stuffing

Stuffing is the simplest method, whereby you simply stuff all the related data into the prompt as context to pass to the language model. This is implemented in LangChain as the `StuffDocumentsChain`.

**Pros:** Only makes a single call to the LLM. When generating text, the LLM has access to all the data at once.

**Cons:** Most LLMs have a context length, and for large documents (or many documents) this will not work, as it will result in a prompt larger than the context length.

The main downside of this method is that it only works on smaller pieces of data. Once you are working with many pieces of data, this approach is no longer feasible. The next two approaches are designed to help deal with that.
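
A minimal sketch of constructing one directly, using the `StuffDocumentsChain` added in this commit (the prompt is illustrative):

```python
from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.docstore.document import Document

prompt = PromptTemplate(
    input_variables=["context"], template="Summarize this content:\n\n{context}"
)
llm_chain = LLMChain(llm=OpenAI(temperature=0), prompt=prompt)
# All documents are formatted, joined, and stuffed into the single prompt variable.
chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="context")

docs = [Document(page_content="Harrison went to Harvard."),
        Document(page_content="Ankush went to Princeton.")]
output = chain.combine_docs(docs)
```

Because `StuffDocumentsChain` subclasses the new `BaseCombineDocumentsChain`, the same `combine_docs` call works for the other two methods below.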

### Map Reduce

This method involves running an initial prompt on each chunk of data (for summarization tasks, this could be a summary of that chunk; for question-answering tasks, it could be an answer based solely on that chunk). Then a different prompt is run to combine all the initial outputs. This is implemented in LangChain as the `MapReduceDocumentsChain`.

**Pros:** Can scale to larger documents (and more documents) than `StuffDocumentsChain`. The calls to the LLM on individual documents are independent and can therefore be parallelized.

**Cons:** Requires many more calls to the LLM than `StuffDocumentsChain`. Loses some information during the final combining call.
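
Sketching its construction from the same pieces (prompt again illustrative): the map step is an `LLMChain` applied to each document, and the reduce step is itself a combine-documents chain (here, stuffing the per-document outputs into one final prompt):

```python
from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

prompt = PromptTemplate(
    input_variables=["text"], template="Write a concise summary of:\n\n{text}"
)
llm_chain = LLMChain(llm=OpenAI(temperature=0), prompt=prompt)

# Reduce step: stuff the intermediate outputs into one final prompt.
reduce_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")
# Map step: run llm_chain over each document independently.
chain = MapReduceDocumentsChain(
    llm_chain=llm_chain,
    combine_document_chain=reduce_chain,
    document_variable_name="text",
)
output = chain.combine_docs(docs)  # docs as in the stuffing example
```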

### Refine

This method involves running an initial prompt on the first chunk of data, generating some output. For the remaining documents, that output is passed in, along with the next document, asking the LLM to refine the output based on the new document. This is implemented in LangChain as the `RefineDocumentsChain`.

**Pros:** Can pull in more relevant context, and may be less lossy than `MapReduceDocumentsChain`.

**Cons:** Requires many more calls to the LLM than `StuffDocumentsChain`. The calls are also NOT independent, meaning they cannot be parallelized like the calls in `MapReduceDocumentsChain`. There are also some potential dependencies on the ordering of the documents.
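
A sketch, mirroring how the summarization loader in this commit wires it up (the prompts live in `langchain.chains.summarize.refine_prompt`):

```python
from langchain import OpenAI, LLMChain
from langchain.chains.combine_documents.refine import RefineDocumentsChain
from langchain.chains.summarize import refine_prompt

llm = OpenAI(temperature=0)
chain = RefineDocumentsChain(
    # First pass: summarize the first document.
    initial_llm_chain=LLMChain(llm=llm, prompt=refine_prompt.PROMPT),
    # Each later document refines the running output.
    refine_llm_chain=LLMChain(llm=llm, prompt=refine_prompt.REFINE_PROMPT),
    document_variable_name="text",
    initial_response_name="existing_answer",
)
output = chain.combine_docs(docs)
```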

## Use Cases

LangChain supports the above three methods of augmenting LLMs with external data. These methods can be used to underpin several common use cases, which are discussed below. For all three of these use cases, all three methods are supported. It is important to note that a large part of these implementations is the prompts that are used. We provide default prompts for all three use cases, but these can be configured, in case you discover a prompt that works better for your specific application. Each use case also comes with a loader function that assembles the right chain for you (see the sketch after this list).

- [Question-Answering With Sources](../examples/chains/qa_with_sources.ipynb)
- [Question-Answering](../examples/chains/question_answering.ipynb)
- [Summarization](../examples/chains/summarize.ipynb)
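
For example, the summarization loader added in this commit returns whichever combine-documents chain you ask for; the question-answering loaders (`load_qa_chain`, `load_qa_with_sources_chain`) work the same way:

```python
from langchain import OpenAI
from langchain.chains.summarize import load_summarize_chain

# chain_type is one of "stuff", "map_reduce", or "refine".
chain = load_summarize_chain(OpenAI(temperature=0), chain_type="map_reduce")
summary = chain.combine_docs(docs)
```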

@ -159,6 +159,7 @@ see detailed information about the various classes, methods, and APIs.
   :name: resources

   explanation/core_concepts.md
   explanation/combine_docs.md
   explanation/agents.md
   explanation/glossary.md
   explanation/cool_demos.md
@ -5,6 +5,7 @@ from langchain.chains.llm import LLMChain
from langchain.chains.llm_bash.base import LLMBashChain
from langchain.chains.llm_math.base import LLMMathChain
from langchain.chains.llm_requests import LLMRequestsChain
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains.pal.base import PALChain
from langchain.chains.qa_with_sources.base import QAWithSourcesChain
from langchain.chains.qa_with_sources.vector_db import VectorDBQAWithSourcesChain
@ -29,4 +30,5 @@ __all__ = [
    "APIChain",
    "LLMRequestsChain",
    "TransformChain",
    "MapReduceChain",
]
1 langchain/chains/combine_documents/__init__.py Normal file
@ -0,0 +1 @@
"""Different ways to combine documents."""
43 langchain/chains/combine_documents/base.py Normal file
@ -0,0 +1,43 @@
"""Base interface for chains combining documents."""

from abc import ABC, abstractmethod
from typing import Any, Dict, List

from pydantic import BaseModel

from langchain.chains.base import Chain
from langchain.docstore.document import Document


class BaseCombineDocumentsChain(Chain, BaseModel, ABC):
    """Base interface for chains combining documents."""

    input_key: str = "input_documents"  #: :meta private:
    output_key: str = "output_text"  #: :meta private:

    @property
    def input_keys(self) -> List[str]:
        """Expect input key.

        :meta private:
        """
        return [self.input_key]

    @property
    def output_keys(self) -> List[str]:
        """Return output key.

        :meta private:
        """
        return [self.output_key]

    @abstractmethod
    def combine_docs(self, docs: List[Document], **kwargs: Any) -> str:
        """Combine documents into a single string."""

    def _call(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        docs = inputs[self.input_key]
        # Other keys are assumed to be needed for LLM prediction
        other_keys = {k: v for k, v in inputs.items() if k != self.input_key}
        output = self.combine_docs(docs, **other_keys)
        return {self.output_key: output}
62 langchain/chains/combine_documents/map_reduce.py Normal file
@ -0,0 +1,62 @@
"""Combining documents by mapping a chain over them first, then combining results."""

from __future__ import annotations

from typing import Any, Dict, List

from pydantic import BaseModel, Extra, root_validator

from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.docstore.document import Document


class MapReduceDocumentsChain(BaseCombineDocumentsChain, BaseModel):
    """Combining documents by mapping a chain over them, then combining results."""

    llm_chain: LLMChain
    """Chain to apply to each document individually."""
    combine_document_chain: BaseCombineDocumentsChain
    """Chain to use to combine results of applying llm_chain to documents."""
    document_variable_name: str
    """The variable name in the llm_chain to put the documents in.
    If only one variable in the llm_chain, this need not be provided."""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    @root_validator(pre=True)
    def get_default_document_variable_name(cls, values: Dict) -> Dict:
        """Get default document variable name, if not provided."""
        if "document_variable_name" not in values:
            llm_chain_variables = values["llm_chain"].prompt.input_variables
            if len(llm_chain_variables) == 1:
                values["document_variable_name"] = llm_chain_variables[0]
            else:
                raise ValueError(
                    "document_variable_name must be provided if there are "
                    "multiple llm_chain input_variables"
                )
        else:
            llm_chain_variables = values["llm_chain"].prompt.input_variables
            if values["document_variable_name"] not in llm_chain_variables:
                raise ValueError(
                    f"document_variable_name {values['document_variable_name']} was "
                    f"not found in llm_chain input_variables: {llm_chain_variables}"
                )
        return values

    def combine_docs(self, docs: List[Document], **kwargs: Any) -> str:
        """Combine by mapping first chain over all, then stuffing into final chain."""
        results = self.llm_chain.apply(
            [{**{self.document_variable_name: d.page_content}, **kwargs} for d in docs]
        )
        question_result_key = self.llm_chain.output_key
        result_docs = [
            Document(page_content=r[question_result_key], metadata=docs[i].metadata)
            for i, r in enumerate(results)
        ]
        return self.combine_document_chain.combine_docs(result_docs, **kwargs)
88 langchain/chains/combine_documents/refine.py Normal file
@ -0,0 +1,88 @@
"""Combining documents by doing a first pass and then refining on more documents."""

from __future__ import annotations

from typing import Any, Dict, List

from pydantic import BaseModel, Extra, Field, root_validator

from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.docstore.document import Document
from langchain.prompts.base import BasePromptTemplate
from langchain.prompts.prompt import PromptTemplate


def _get_default_document_prompt() -> PromptTemplate:
    return PromptTemplate(input_variables=["page_content"], template="{page_content}")


class RefineDocumentsChain(BaseCombineDocumentsChain, BaseModel):
    """Combine documents by doing a first pass and then refining on more documents."""

    initial_llm_chain: LLMChain
    """LLM chain to use on initial document."""
    refine_llm_chain: LLMChain
    """LLM chain to use when refining."""
    document_variable_name: str
    """The variable name in the initial_llm_chain to put the documents in.
    If only one variable in the initial_llm_chain, this need not be provided."""
    initial_response_name: str
    """The variable name to format the initial response in when refining."""
    document_prompt: BasePromptTemplate = Field(
        default_factory=_get_default_document_prompt
    )
    """Prompt to use to format each document."""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    @root_validator(pre=True)
    def get_default_document_variable_name(cls, values: Dict) -> Dict:
        """Get default document variable name, if not provided."""
        if "document_variable_name" not in values:
            llm_chain_variables = values["initial_llm_chain"].prompt.input_variables
            if len(llm_chain_variables) == 1:
                values["document_variable_name"] = llm_chain_variables[0]
            else:
                raise ValueError(
                    "document_variable_name must be provided if there are "
                    "multiple llm_chain input_variables"
                )
        else:
            llm_chain_variables = values["initial_llm_chain"].prompt.input_variables
            if values["document_variable_name"] not in llm_chain_variables:
                raise ValueError(
                    f"document_variable_name {values['document_variable_name']} was "
                    f"not found in llm_chain input_variables: {llm_chain_variables}"
                )
        return values

    def combine_docs(self, docs: List[Document], **kwargs: Any) -> str:
        """Combine by running an initial chain on the first document, then refining."""
        base_info = {"page_content": docs[0].page_content}
        base_info.update(docs[0].metadata)
        document_info = {k: base_info[k] for k in self.document_prompt.input_variables}
        base_inputs: dict = {
            self.document_variable_name: self.document_prompt.format(**document_info)
        }
        inputs = {**base_inputs, **kwargs}
        res = self.initial_llm_chain.predict(**inputs)
        for doc in docs[1:]:
            base_info = {"page_content": doc.page_content}
            base_info.update(doc.metadata)
            document_info = {
                k: base_info[k] for k in self.document_prompt.input_variables
            }
            base_inputs = {
                self.document_variable_name: self.document_prompt.format(
                    **document_info
                ),
                self.initial_response_name: res,
            }
            inputs = {**base_inputs, **kwargs}
            res = self.refine_llm_chain.predict(**inputs)
        return res
@ -1,21 +1,22 @@
"""Document combining chain."""
"""Chain that combines documents by stuffing into context."""

from typing import Any, Dict, List

from pydantic import BaseModel, Extra, Field, root_validator

from langchain.chains.base import Chain
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.docstore.document import Document
from langchain.prompts.base import BasePromptTemplate
from langchain.prompts.prompt import Prompt
from langchain.prompts.prompt import PromptTemplate


def _get_default_document_prompt() -> Prompt:
    return Prompt(input_variables=["page_content"], template="{page_content}")
def _get_default_document_prompt() -> PromptTemplate:
    return PromptTemplate(input_variables=["page_content"], template="{page_content}")


class CombineDocumentsChain(Chain, BaseModel):
    """Combine documents."""
class StuffDocumentsChain(BaseCombineDocumentsChain, BaseModel):
    """Chain that combines documents by stuffing into context."""

    llm_chain: LLMChain
    """LLM wrapper to use after formatting documents."""
@ -26,8 +27,6 @@ class CombineDocumentsChain(Chain, BaseModel):
    document_variable_name: str
    """The variable name in the llm_chain to put the documents in.
    If only one variable in the llm_chain, this need not be provided."""
    input_key: str = "input_documents"  #: :meta private:
    output_key: str = "output_text"  #: :meta private:

    class Config:
        """Configuration for this pydantic object."""
@ -35,22 +34,6 @@ class CombineDocumentsChain(Chain, BaseModel):
        extra = Extra.forbid
        arbitrary_types_allowed = True

    @property
    def input_keys(self) -> List[str]:
        """Expect input key.

        :meta private:
        """
        return [self.input_key]

    @property
    def output_keys(self) -> List[str]:
        """Return output key.

        :meta private:
        """
        return [self.output_key]

    @root_validator(pre=True)
    def get_default_document_variable_name(cls, values: Dict) -> Dict:
        """Get default document variable name, if not provided."""
@ -72,10 +55,8 @@ class CombineDocumentsChain(Chain, BaseModel):
        )
        return values

    def _call(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        docs = inputs[self.input_key]
        # Other keys are assumed to be needed for LLM prediction
        other_keys = {k: v for k, v in inputs.items() if k != self.input_key}
    def combine_docs(self, docs: List[Document], **kwargs: Any) -> str:
        """Stuff all documents into one prompt and pass to LLM."""
        # Get relevant information from each document.
        doc_dicts = []
        for doc in docs:
@ -88,7 +69,7 @@ class CombineDocumentsChain(Chain, BaseModel):
        # Format each document according to the prompt
        doc_strings = [self.document_prompt.format(**doc) for doc in doc_dicts]
        # Join the documents together to put them in the prompt.
        other_keys[self.document_variable_name] = "\n".join(doc_strings)
        inputs = kwargs.copy()
        inputs[self.document_variable_name] = "\n\n".join(doc_strings)
        # Call predict on the LLM.
        output = self.llm_chain.predict(**other_keys)
        return {self.output_key: output}
        return self.llm_chain.predict(**inputs)
@ -10,7 +10,9 @@ from typing import Dict, List
from pydantic import BaseModel, Extra

from langchain.chains.base import Chain
from langchain.chains.combine_documents import CombineDocumentsChain
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.docstore.document import Document
from langchain.llms.base import LLM
@ -21,10 +23,8 @@ from langchain.text_splitter import TextSplitter
class MapReduceChain(Chain, BaseModel):
    """Map-reduce chain."""

    map_llm: LLMChain
    """LLM wrapper to use for the map step."""
    reduce_llm: LLMChain
    """LLM wrapper to use for the reduce step."""
    combine_documents_chain: BaseCombineDocumentsChain
    """Chain to use to combine documents."""
    text_splitter: TextSplitter
    """Text splitter to use."""
    input_key: str = "input_text"  #: :meta private:
@ -36,7 +36,13 @@ class MapReduceChain(Chain, BaseModel):
    ) -> MapReduceChain:
        """Construct a map-reduce chain that uses the chain for map and reduce."""
        llm_chain = LLMChain(llm=llm, prompt=prompt)
        return cls(map_llm=llm_chain, reduce_llm=llm_chain, text_splitter=text_splitter)
        reduce_chain = StuffDocumentsChain(llm_chain=llm_chain)
        combine_documents_chain = MapReduceDocumentsChain(
            llm_chain=llm_chain, combine_document_chain=reduce_chain
        )
        return cls(
            combine_documents_chain=combine_documents_chain, text_splitter=text_splitter
        )

    class Config:
        """Configuration for this pydantic object."""
@ -62,16 +68,7 @@ class MapReduceChain(Chain, BaseModel):

    def _call(self, inputs: Dict[str, str]) -> Dict[str, str]:
        # Split the larger text into smaller chunks.
        docs = self.text_splitter.split_text(inputs[self.input_key])

        # Now that we have the chunks, we send them to the LLM and track results.
        # This is the "map" part.
        input_list = [{self.map_llm.prompt.input_variables[0]: d} for d in docs]
        summary_results = self.map_llm.apply(input_list)
        summaries = [res[self.map_llm.output_key] for res in summary_results]
        summary_docs = [Document(page_content=text) for text in summaries]
        # We then need to combine these individual parts into one.
        # This is the reduce part.
        reduce_chain = CombineDocumentsChain(llm_chain=self.reduce_llm)
        outputs = reduce_chain({reduce_chain.input_key: summary_docs})
        return {self.output_key: outputs[self.output_key]}
        texts = self.text_splitter.split_text(inputs[self.input_key])
        docs = [Document(page_content=text) for text in texts]
        outputs = self.combine_documents_chain.combine_docs(docs)
        return {self.output_key: outputs}
@ -1 +1,74 @@
"""Question answering with sources over documents."""
"""Load question answering with sources chains."""
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
from langchain.chains.combine_documents.refine import RefineDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains.qa_with_sources import (
    map_reduce_prompt,
    refine_prompt,
    stuff_prompt,
)
from langchain.llms.base import LLM


def _load_stuff_chain(llm: LLM) -> StuffDocumentsChain:
    llm_chain = LLMChain(llm=llm, prompt=stuff_prompt.PROMPT)
    return StuffDocumentsChain(
        llm_chain=llm_chain,
        document_variable_name="summaries",
        document_prompt=stuff_prompt.EXAMPLE_PROMPT,
    )


def _load_map_reduce_chain(llm: LLM) -> MapReduceDocumentsChain:
    map_chain = LLMChain(llm=llm, prompt=map_reduce_prompt.QUESTION_PROMPT)
    reduce_chain = LLMChain(llm=llm, prompt=map_reduce_prompt.COMBINE_PROMPT)
    combine_document_chain = StuffDocumentsChain(
        llm_chain=reduce_chain,
        document_variable_name="summaries",
        document_prompt=map_reduce_prompt.EXAMPLE_PROMPT,
    )
    return MapReduceDocumentsChain(
        llm_chain=map_chain,
        combine_document_chain=combine_document_chain,
        document_variable_name="context",
    )


def _load_refine_chain(llm: LLM) -> RefineDocumentsChain:
    initial_chain = LLMChain(llm=llm, prompt=refine_prompt.DEFAULT_TEXT_QA_PROMPT)
    refine_chain = LLMChain(llm=llm, prompt=refine_prompt.DEFAULT_REFINE_PROMPT)
    return RefineDocumentsChain(
        initial_llm_chain=initial_chain,
        refine_llm_chain=refine_chain,
        document_variable_name="context_str",
        initial_response_name="existing_answer",
        document_prompt=refine_prompt.EXAMPLE_PROMPT,
    )


def load_qa_with_sources_chain(
    llm: LLM, chain_type: str = "stuff"
) -> BaseCombineDocumentsChain:
    """Load question answering with sources chain.

    Args:
        llm: Language Model to use in the chain.
        chain_type: Type of document combining chain to use. Should be one of "stuff",
            "map_reduce", and "refine".

    Returns:
        A chain to use for question answering with sources.
    """
    loader_mapping = {
        "stuff": _load_stuff_chain,
        "map_reduce": _load_map_reduce_chain,
        "refine": _load_refine_chain,
    }
    if chain_type not in loader_mapping:
        raise ValueError(
            f"Got unsupported chain type: {chain_type}. "
            f"Should be one of {loader_mapping.keys()}"
        )
    return loader_mapping[chain_type](llm)
@ -8,9 +8,11 @@ from typing import Any, Dict, List
from pydantic import BaseModel, Extra, root_validator

from langchain.chains.base import Chain
from langchain.chains.combine_documents import CombineDocumentsChain
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains.qa_with_sources.prompt import (
from langchain.chains.qa_with_sources.map_reduce_prompt import (
    COMBINE_PROMPT,
    EXAMPLE_PROMPT,
    QUESTION_PROMPT,
@ -23,12 +25,8 @@ from langchain.prompts.base import BasePromptTemplate
class BaseQAWithSourcesChain(Chain, BaseModel, ABC):
    """Question answering with sources over documents."""

    llm_question_chain: LLMChain
    """LLM wrapper to use for asking questions to each document."""
    combine_document_chain: CombineDocumentsChain
    combine_document_chain: BaseCombineDocumentsChain
    """Chain to use to combine documents."""
    doc_source_key: str = "source"
    """Key in document.metadata to use as source information"""
    question_key: str = "question"  #: :meta private:
    input_docs_key: str = "docs"  #: :meta private:
    answer_key: str = "answer"  #: :meta private:
@ -38,7 +36,7 @@ class BaseQAWithSourcesChain(Chain, BaseModel, ABC):
    def from_llm(
        cls,
        llm: LLM,
        combine_document_prompt: BasePromptTemplate = EXAMPLE_PROMPT,
        document_prompt: BasePromptTemplate = EXAMPLE_PROMPT,
        question_prompt: BasePromptTemplate = QUESTION_PROMPT,
        combine_prompt: BasePromptTemplate = COMBINE_PROMPT,
        **kwargs: Any,
@ -46,13 +44,17 @@ class BaseQAWithSourcesChain(Chain, BaseModel, ABC):
        """Construct the chain from an LLM."""
        llm_question_chain = LLMChain(llm=llm, prompt=question_prompt)
        llm_combine_chain = LLMChain(llm=llm, prompt=combine_prompt)
        combine_document_chain = CombineDocumentsChain(
        combine_results_chain = StuffDocumentsChain(
            llm_chain=llm_combine_chain,
            document_prompt=combine_document_prompt,
            document_prompt=document_prompt,
            document_variable_name="summaries",
        )
        combine_document_chain = MapReduceDocumentsChain(
            llm_chain=llm_question_chain,
            combine_document_chain=combine_results_chain,
            document_variable_name="context",
        )
        return cls(
            llm_question_chain=llm_question_chain,
            combine_document_chain=combine_document_chain,
            **kwargs,
        )
@ -82,7 +84,7 @@ class BaseQAWithSourcesChain(Chain, BaseModel, ABC):
    @root_validator(pre=True)
    def validate_question_chain(cls, values: Dict) -> Dict:
        """Validate question chain."""
        llm_question_chain = values["llm_question_chain"]
        llm_question_chain = values["combine_document_chain"].llm_chain
        if len(llm_question_chain.input_keys) != 2:
            raise ValueError(
                f"The llm_question_chain should have two inputs: a content key "
@ -104,23 +106,7 @@ class BaseQAWithSourcesChain(Chain, BaseModel, ABC):

    def _call(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        docs = self._get_docs(inputs)
        query = inputs[self.question_key]
        content_key, query_key = self.llm_question_chain.input_keys
        results = self.llm_question_chain.apply(
            [{content_key: d.page_content, query_key: query} for d in docs]
        )
        question_result_key = self.llm_question_chain.output_key
        result_docs = [
            Document(page_content=r[question_result_key], metadata=docs[i].metadata)
            for i, r in enumerate(results)
        ]
        answer_dict = self.combine_document_chain(
            {
                self.combine_document_chain.input_key: result_docs,
                self.question_key: query,
            }
        )
        answer = answer_dict[self.combine_document_chain.output_key]
        answer = self.combine_document_chain.combine_docs(docs, **inputs)
        if "\nSOURCES: " in answer:
            answer, sources = answer.split("\nSOURCES: ")
        else:
@ -142,4 +128,4 @@ class QAWithSourcesChain(BaseQAWithSourcesChain, BaseModel):
        return [self.input_docs_key, self.question_key]

    def _get_docs(self, inputs: Dict[str, Any]) -> List[Document]:
        return inputs[self.input_docs_key]
        return inputs.pop(self.input_docs_key)
38 langchain/chains/qa_with_sources/refine_prompt.py Normal file
@ -0,0 +1,38 @@
# flake8: noqa
from langchain.prompts import PromptTemplate

DEFAULT_REFINE_PROMPT_TMPL = (
    "The original question is as follows: {query_str}\n"
    "We have provided an existing answer, including sources: {existing_answer}\n"
    "We have the opportunity to refine the existing answer "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{context_str}\n"
    "------------\n"
    "Given the new context, refine the original answer to better "
    "answer the question. "
    "If you do update it, please update the sources as well. "
    "If the context isn't useful, return the original answer."
)
DEFAULT_REFINE_PROMPT = PromptTemplate(
    input_variables=["query_str", "existing_answer", "context_str"],
    template=DEFAULT_REFINE_PROMPT_TMPL,
)


DEFAULT_TEXT_QA_PROMPT_TMPL = (
    "Context information is below. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the question: {query_str}\n"
)
DEFAULT_TEXT_QA_PROMPT = PromptTemplate(
    input_variables=["context_str", "query_str"], template=DEFAULT_TEXT_QA_PROMPT_TMPL
)

EXAMPLE_PROMPT = PromptTemplate(
    template="Content: {page_content}\nSource: {source}",
    input_variables=["page_content", "source"],
)
44 langchain/chains/qa_with_sources/stuff_prompt.py Normal file
@ -0,0 +1,44 @@
# flake8: noqa
from langchain.prompts import PromptTemplate

template = """Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: Which state/country's law governs the interpretation of the contract?
=========
Content: This Agreement is governed by English law and the parties submit to the exclusive jurisdiction of the English courts in relation to any dispute (contractual or non-contractual) concerning this Agreement save that either party may apply to any court for an injunction or other relief to protect its Intellectual Property Rights.
Source: 28-pl
Content: No Waiver. Failure or delay in exercising any right or remedy under this Agreement shall not constitute a waiver of such (or any other) right or remedy.\n\n11.7 Severability. The invalidity, illegality or unenforceability of any term (or part of a term) of this Agreement shall not affect the continuation in force of the remainder of the term (if any) and this Agreement.\n\n11.8 No Agency. Except as expressly stated otherwise, nothing in this Agreement shall create an agency, partnership or joint venture of any kind between the parties.\n\n11.9 No Third-Party Beneficiaries.
Source: 30-pl
Content: (b) if Google believes, in good faith, that the Distributor has violated or caused Google to violate any Anti-Bribery Laws (as defined in Clause 8.5) or that such a violation is reasonably likely to occur,
Source: 4-pl
=========
FINAL ANSWER: This Agreement is governed by English law.
SOURCES: 28-pl

QUESTION: What did the president say about Michael Jackson?
=========
Content: Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. \n\nGroups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland.
Source: 0-pl
Content: And we won’t stop. \n\nWe have lost so much to COVID-19. Time with one another. And worst of all, so much loss of life. \n\nLet’s use this moment to reset. Let’s stop looking at COVID-19 as a partisan dividing line and see it for what it is: A God-awful disease. \n\nLet’s stop seeing each other as enemies, and start seeing each other for who we really are: Fellow Americans. \n\nWe can’t change how divided we’ve been. But we can change how we move forward—on COVID-19 and other issues we must face together. \n\nI recently visited the New York City Police Department days after the funerals of Officer Wilbert Mora and his partner, Officer Jason Rivera. \n\nThey were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. \n\nOfficer Mora was 27 years old. \n\nOfficer Rivera was 22. \n\nBoth Dominican Americans who’d grown up on the same streets they later chose to patrol as police officers. \n\nI spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves.
Source: 24-pl
Content: And a proud Ukrainian people, who have known 30 years of independence, have repeatedly shown that they will not tolerate anyone who tries to take their country backwards. \n\nTo all Americans, I will be honest with you, as I’ve always promised. A Russian dictator, invading a foreign country, has costs around the world. \n\nAnd I’m taking robust action to make sure the pain of our sanctions is targeted at Russia’s economy. And I will use every tool at our disposal to protect American businesses and consumers. \n\nTonight, I can announce that the United States has worked with 30 other countries to release 60 Million barrels of oil from reserves around the world. \n\nAmerica will lead that effort, releasing 30 Million barrels from our own Strategic Petroleum Reserve. And we stand ready to do more if necessary, unified with our allies. \n\nThese steps will help blunt gas prices here at home. And I know the news about what’s happening can seem alarming. \n\nBut I want you to know that we are going to be okay.
Source: 5-pl
Content: More support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. \n\nIt’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more. \n\nARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more. \n\nA unity agenda for the nation. \n\nWe can do this. \n\nMy fellow Americans—tonight , we have gathered in a sacred space—the citadel of our democracy. \n\nIn this Capitol, generation after generation, Americans have debated great questions amid great strife, and have done great things. \n\nWe have fought for freedom, expanded liberty, defeated totalitarianism and terror. \n\nAnd built the strongest, freest, and most prosperous nation the world has ever known. \n\nNow is the hour. \n\nOur moment of responsibility. \n\nOur test of resolve and conscience, of history itself. \n\nIt is in this moment that our character is formed. Our purpose is found. Our future is forged. \n\nWell I know this nation.
Source: 34-pl
=========
FINAL ANSWER: The president did not mention Michael Jackson.
SOURCES:

QUESTION: {question}
=========
{summaries}
=========
FINAL ANSWER:"""
PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])

EXAMPLE_PROMPT = PromptTemplate(
    template="Content: {page_content}\nSource: {source}",
    input_variables=["page_content", "source"],
)
67 langchain/chains/question_answering/__init__.py Normal file
@ -0,0 +1,67 @@
"""Load question answering chains."""
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
from langchain.chains.combine_documents.refine import RefineDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains.question_answering import (
    map_reduce_prompt,
    refine_prompt,
    stuff_prompt,
)
from langchain.llms.base import LLM


def _load_stuff_chain(llm: LLM) -> StuffDocumentsChain:
    llm_chain = LLMChain(llm=llm, prompt=stuff_prompt.PROMPT)
    # TODO: document prompt
    return StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="context")


def _load_map_reduce_chain(llm: LLM) -> MapReduceDocumentsChain:
    map_chain = LLMChain(llm=llm, prompt=map_reduce_prompt.QUESTION_PROMPT)
    reduce_chain = LLMChain(llm=llm, prompt=map_reduce_prompt.COMBINE_PROMPT)
    # TODO: document prompt
    combine_document_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="summaries"
    )
    return MapReduceDocumentsChain(
        llm_chain=map_chain,
        combine_document_chain=combine_document_chain,
        document_variable_name="context",
    )


def _load_refine_chain(llm: LLM) -> RefineDocumentsChain:
    initial_chain = LLMChain(llm=llm, prompt=refine_prompt.DEFAULT_TEXT_QA_PROMPT)
    refine_chain = LLMChain(llm=llm, prompt=refine_prompt.DEFAULT_REFINE_PROMPT)
    return RefineDocumentsChain(
        initial_llm_chain=initial_chain,
        refine_llm_chain=refine_chain,
        document_variable_name="context_str",
        initial_response_name="existing_answer",
    )


def load_qa_chain(llm: LLM, chain_type: str = "stuff") -> BaseCombineDocumentsChain:
    """Load question answering chain.

    Args:
        llm: Language Model to use in the chain.
        chain_type: Type of document combining chain to use. Should be one of "stuff",
            "map_reduce", and "refine".

    Returns:
        A chain to use for question answering.
    """
    loader_mapping = {
        "stuff": _load_stuff_chain,
        "map_reduce": _load_map_reduce_chain,
        "refine": _load_refine_chain,
    }
    if chain_type not in loader_mapping:
        raise ValueError(
            f"Got unsupported chain type: {chain_type}. "
            f"Should be one of {loader_mapping.keys()}"
        )
    return loader_mapping[chain_type](llm)
45 langchain/chains/question_answering/map_reduce_prompt.py Normal file
@ -0,0 +1,45 @@
# flake8: noqa
from langchain.prompts import PromptTemplate

question_prompt_template = """Use the following portion of a long document to see if any of the text is relevant to answer the question.
Return any relevant text verbatim.
{context}
Question: {question}
Relevant text, if any:"""
QUESTION_PROMPT = PromptTemplate(
    template=question_prompt_template, input_variables=["context", "question"]
)

combine_prompt_template = """Given the following extracted parts of a long document and a question, create a final answer.
If you don't know the answer, just say that you don't know. Don't try to make up an answer.

QUESTION: Which state/country's law governs the interpretation of the contract?
=========
Content: This Agreement is governed by English law and the parties submit to the exclusive jurisdiction of the English courts in relation to any dispute (contractual or non-contractual) concerning this Agreement save that either party may apply to any court for an injunction or other relief to protect its Intellectual Property Rights.

Content: No Waiver. Failure or delay in exercising any right or remedy under this Agreement shall not constitute a waiver of such (or any other) right or remedy.\n\n11.7 Severability. The invalidity, illegality or unenforceability of any term (or part of a term) of this Agreement shall not affect the continuation in force of the remainder of the term (if any) and this Agreement.\n\n11.8 No Agency. Except as expressly stated otherwise, nothing in this Agreement shall create an agency, partnership or joint venture of any kind between the parties.\n\n11.9 No Third-Party Beneficiaries.

Content: (b) if Google believes, in good faith, that the Distributor has violated or caused Google to violate any Anti-Bribery Laws (as defined in Clause 8.5) or that such a violation is reasonably likely to occur,
=========
FINAL ANSWER: This Agreement is governed by English law.

QUESTION: What did the president say about Michael Jackson?
=========
Content: Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. \n\nGroups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland.

Content: And we won’t stop. \n\nWe have lost so much to COVID-19. Time with one another. And worst of all, so much loss of life. \n\nLet’s use this moment to reset. Let’s stop looking at COVID-19 as a partisan dividing line and see it for what it is: A God-awful disease. \n\nLet’s stop seeing each other as enemies, and start seeing each other for who we really are: Fellow Americans. \n\nWe can’t change how divided we’ve been. But we can change how we move forward—on COVID-19 and other issues we must face together. \n\nI recently visited the New York City Police Department days after the funerals of Officer Wilbert Mora and his partner, Officer Jason Rivera. \n\nThey were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. \n\nOfficer Mora was 27 years old. \n\nOfficer Rivera was 22. \n\nBoth Dominican Americans who’d grown up on the same streets they later chose to patrol as police officers. \n\nI spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves.

Content: And a proud Ukrainian people, who have known 30 years of independence, have repeatedly shown that they will not tolerate anyone who tries to take their country backwards. \n\nTo all Americans, I will be honest with you, as I’ve always promised. A Russian dictator, invading a foreign country, has costs around the world. \n\nAnd I’m taking robust action to make sure the pain of our sanctions is targeted at Russia’s economy. And I will use every tool at our disposal to protect American businesses and consumers. \n\nTonight, I can announce that the United States has worked with 30 other countries to release 60 Million barrels of oil from reserves around the world. \n\nAmerica will lead that effort, releasing 30 Million barrels from our own Strategic Petroleum Reserve. And we stand ready to do more if necessary, unified with our allies. \n\nThese steps will help blunt gas prices here at home. And I know the news about what’s happening can seem alarming. \n\nBut I want you to know that we are going to be okay.

Content: More support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. \n\nIt’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more. \n\nARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more. \n\nA unity agenda for the nation. \n\nWe can do this. \n\nMy fellow Americans—tonight , we have gathered in a sacred space—the citadel of our democracy. \n\nIn this Capitol, generation after generation, Americans have debated great questions amid great strife, and have done great things. \n\nWe have fought for freedom, expanded liberty, defeated totalitarianism and terror. \n\nAnd built the strongest, freest, and most prosperous nation the world has ever known. \n\nNow is the hour. \n\nOur moment of responsibility. \n\nOur test of resolve and conscience, of history itself. \n\nIt is in this moment that our character is formed. Our purpose is found. Our future is forged. \n\nWell I know this nation.
=========
FINAL ANSWER: The president did not mention Michael Jackson.

QUESTION: {question}
=========
{summaries}
=========
FINAL ANSWER:"""
COMBINE_PROMPT = PromptTemplate(
    template=combine_prompt_template, input_variables=["summaries", "question"]
)
32 langchain/chains/question_answering/refine_prompt.py Normal file
@ -0,0 +1,32 @@
# flake8: noqa
from langchain.prompts import PromptTemplate

DEFAULT_REFINE_PROMPT_TMPL = (
    "The original question is as follows: {query_str}\n"
    "We have provided an existing answer: {existing_answer}\n"
    "We have the opportunity to refine the existing answer "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{context_str}\n"
    "------------\n"
    "Given the new context, refine the original answer to better "
    "answer the question. "
    "If the context isn't useful, return the original answer."
)
DEFAULT_REFINE_PROMPT = PromptTemplate(
    input_variables=["query_str", "existing_answer", "context_str"],
    template=DEFAULT_REFINE_PROMPT_TMPL,
)


DEFAULT_TEXT_QA_PROMPT_TMPL = (
    "Context information is below. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the question: {query_str}\n"
)
DEFAULT_TEXT_QA_PROMPT = PromptTemplate(
    input_variables=["context_str", "query_str"], template=DEFAULT_TEXT_QA_PROMPT_TMPL
)
12 langchain/chains/question_answering/stuff_prompt.py Normal file
@ -0,0 +1,12 @@
# flake8: noqa
from langchain.prompts import PromptTemplate

prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)
65 langchain/chains/summarize/__init__.py Normal file
@ -0,0 +1,65 @@
"""Load summarizing chains."""
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
from langchain.chains.combine_documents.refine import RefineDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains.summarize import map_reduce_prompt, refine_prompt, stuff_prompt
from langchain.llms.base import LLM


def _load_stuff_chain(llm: LLM) -> StuffDocumentsChain:
    llm_chain = LLMChain(llm=llm, prompt=stuff_prompt.PROMPT)
    # TODO: document prompt
    return StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")


def _load_map_reduce_chain(llm: LLM) -> MapReduceDocumentsChain:
    map_chain = LLMChain(llm=llm, prompt=map_reduce_prompt.PROMPT)
    reduce_chain = LLMChain(llm=llm, prompt=map_reduce_prompt.PROMPT)
    # TODO: document prompt
    combine_document_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="text"
    )
    return MapReduceDocumentsChain(
        llm_chain=map_chain,
        combine_document_chain=combine_document_chain,
        document_variable_name="text",
    )


def _load_refine_chain(llm: LLM) -> RefineDocumentsChain:
    initial_chain = LLMChain(llm=llm, prompt=refine_prompt.PROMPT)
    refine_chain = LLMChain(llm=llm, prompt=refine_prompt.REFINE_PROMPT)
    return RefineDocumentsChain(
        initial_llm_chain=initial_chain,
        refine_llm_chain=refine_chain,
        document_variable_name="text",
        initial_response_name="existing_answer",
    )


def load_summarize_chain(
    llm: LLM, chain_type: str = "stuff"
) -> BaseCombineDocumentsChain:
    """Load summarizing chain.

    Args:
        llm: Language Model to use in the chain.
        chain_type: Type of document combining chain to use. Should be one of "stuff",
            "map_reduce", and "refine".

    Returns:
        A chain to use for summarizing.
    """
    loader_mapping = {
        "stuff": _load_stuff_chain,
        "map_reduce": _load_map_reduce_chain,
        "refine": _load_refine_chain,
    }
    if chain_type not in loader_mapping:
        raise ValueError(
            f"Got unsupported chain type: {chain_type}. "
            f"Should be one of {loader_mapping.keys()}"
        )
    return loader_mapping[chain_type](llm)
11 langchain/chains/summarize/map_reduce_prompt.py Normal file
@ -0,0 +1,11 @@
# flake8: noqa
from langchain.prompts import PromptTemplate

prompt_template = """Write a concise summary of the following:


{text}


CONCISE SUMMARY:"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
28 langchain/chains/summarize/refine_prompt.py Normal file
@ -0,0 +1,28 @@
# flake8: noqa
from langchain.prompts import PromptTemplate

REFINE_PROMPT_TMPL = (
    "Your job is to produce a final summary.\n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary. "
    "If the context isn't useful, return the original summary."
)
REFINE_PROMPT = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=REFINE_PROMPT_TMPL,
)


prompt_template = """Write a concise summary of the following:


{text}


CONCISE SUMMARY:"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
11 langchain/chains/summarize/stuff_prompt.py Normal file
@ -0,0 +1,11 @@
# flake8: noqa
from langchain.prompts import PromptTemplate

prompt_template = """Write a concise summary of the following:


{text}


CONCISE SUMMARY:"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
@ -1,9 +1,13 @@
"""Chain for question-answering against a vector database."""
from typing import Dict, List
from __future__ import annotations

from pydantic import BaseModel, Extra
from typing import Any, Dict, List

from pydantic import BaseModel, Extra, root_validator

from langchain.chains.base import Chain
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains.vector_db_qa.prompt import PROMPT
from langchain.llms.base import LLM
@ -24,14 +28,12 @@ class VectorDBQA(Chain, BaseModel):

    """

    llm: LLM
    """LLM wrapper to use."""
    vectorstore: VectorStore
    """Vector Database to connect to."""
    k: int = 4
    """Number of documents to query for."""
    prompt: PromptTemplate = PROMPT
    """Prompt to use when questioning the documents."""
    combine_documents_chain: BaseCombineDocumentsChain
    """Chain to use to combine the documents."""
    input_key: str = "query"  #: :meta private:
    output_key: str = "result"  #: :meta private:

@ -57,13 +59,47 @@ class VectorDBQA(Chain, BaseModel):
        """
        return [self.output_key]

    # TODO: deprecate this
    @root_validator(pre=True)
    def load_combine_documents_chain(cls, values: Dict) -> Dict:
        """Validate question chain."""
        if "combine_documents_chain" not in values:
            if "llm" not in values:
                raise ValueError(
                    "If `combine_documents_chain` not provided, `llm` should be."
                )
            prompt = values.pop("prompt", PROMPT)
            llm = values.pop("llm")
            llm_chain = LLMChain(llm=llm, prompt=prompt)
            document_prompt = PromptTemplate(
                input_variables=["page_content"], template="Context:\n{page_content}"
            )
            combine_documents_chain = StuffDocumentsChain(
                llm_chain=llm_chain,
                document_variable_name="context",
                document_prompt=document_prompt,
            )
            values["combine_documents_chain"] = combine_documents_chain
        return values

    @classmethod
    def from_llm(
        cls, llm: LLM, prompt: PromptTemplate = PROMPT, **kwargs: Any
    ) -> VectorDBQA:
        """Initialize from LLM."""
        llm_chain = LLMChain(llm=llm, prompt=prompt)
        document_prompt = PromptTemplate(
            input_variables=["page_content"], template="Context:\n{page_content}"
        )
        combine_documents_chain = StuffDocumentsChain(
            llm_chain=llm_chain,
            document_variable_name="context",
            document_prompt=document_prompt,
        )
        return cls(combine_documents_chain=combine_documents_chain, **kwargs)

    def _call(self, inputs: Dict[str, str]) -> Dict[str, str]:
        question = inputs[self.input_key]
        llm_chain = LLMChain(llm=self.llm, prompt=self.prompt)
        docs = self.vectorstore.similarity_search(question, k=self.k)
        contexts = []
        for j, doc in enumerate(docs):
            contexts.append(f"Context {j}:\n{doc.page_content}")
        # TODO: handle cases where this context is too long.
        answer = llm_chain.predict(question=question, context="\n\n".join(contexts))
        answer = self.combine_documents_chain.combine_docs(docs, question=question)
        return {self.output_key: answer}