Harrison/combine documents chain (#212)

combine documents chain powering vector db qa with sources chain
This commit is contained in:
Harrison Chase 2022-11-30 22:00:02 -08:00 committed by GitHub
parent ab9abf53b7
commit 347fc49d4d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 530 additions and 8 deletions

View File

@ -0,0 +1,200 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "efc5be67",
"metadata": {},
"source": [
"# Question-Answering with Sources\n",
"\n",
"This notebook goes over how to do question-answering with sources. It does this in a few different ways - first showing how you can use the `QAWithSourcesChain` to take in documents and use those, and next showing the `VectorDBQAWithSourcesChain`, which also does the lookup of the documents from a vector database. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1c613960",
"metadata": {},
"outputs": [],
"source": [
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.embeddings.cohere import CohereEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch\n",
"from langchain.vectorstores.faiss import FAISS"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "17d1306e",
"metadata": {},
"outputs": [],
"source": [
"with open('../state_of_the_union.txt') as f:\n",
" state_of_the_union = f.read()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"texts = text_splitter.split_text(state_of_the_union)\n",
"\n",
"embeddings = OpenAIEmbeddings()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0e745d99",
"metadata": {},
"outputs": [],
"source": [
"docsearch = FAISS.from_texts(texts, embeddings)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f42d79dc",
"metadata": {},
"outputs": [],
"source": [
"# Add in a fake source information\n",
"for i, d in enumerate(docsearch.docstore._dict.values()):\n",
" d.metadata = {'source': f\"{i}-pl\"}"
]
},
{
"cell_type": "markdown",
"id": "aa1c1b60",
"metadata": {},
"source": [
"### QAWithSourcesChain\n",
"This shows how to use the `QAWithSourcesChain`, which takes in document objects and uses them directly."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "61bce191",
"metadata": {},
"outputs": [],
"source": [
"query = \"What did the president say about Justice Breyer\"\n",
"docs = docsearch.similarity_search(query)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "57ddf8c7",
"metadata": {},
"outputs": [],
"source": [
"from langchain.chains import QAWithSourcesChain\n",
"from langchain.llms import OpenAI, Cohere\n",
"from langchain.docstore.document import Document"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f908a92a",
"metadata": {},
"outputs": [],
"source": [
"chain = QAWithSourcesChain.from_llm(OpenAI(temperature=0))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "a505ac89",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"{'answer': ' The president thanked Justice Breyer for his service.',\n",
" 'sources': '27-pl'}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chain({\"docs\": docs, \"question\": query}, return_only_outputs=True)"
]
},
{
"cell_type": "markdown",
"id": "e6fc81de",
"metadata": {},
"source": [
"### VectorDBQAWithSourcesChain\n",
"\n",
"This shows how to use the `VectorDBQAWithSourcesChain`, which uses a vector database to look up relevant documents."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "8aa571ae",
"metadata": {},
"outputs": [],
"source": [
"from langchain.chains import VectorDBQAWithSourcesChain"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "aa859d4c",
"metadata": {},
"outputs": [],
"source": [
"chain = VectorDBQAWithSourcesChain.from_llm(OpenAI(temperature=0), vectorstore=docsearch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8ba36fa7",
"metadata": {},
"outputs": [],
"source": [
"chain({\"question\": \"What did the president say about Justice Breyer\"}, return_only_outputs=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "980fae3b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -39,7 +39,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"id": "99bbe19b",
"metadata": {},
"outputs": [
@ -49,7 +49,7 @@
"\"\\n\\nThe President discusses the recent aggression by Russia, and the response by the United States and its allies. He announces new sanctions against Russia, and says that the free world is united in holding Putin accountable. The President also discusses the American Rescue Plan, the Bipartisan Infrastructure Law, and the Bipartisan Innovation Act. Finally, the President addresses the need for women's rights and equality for LGBTQ+ Americans.\""
]
},
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@ -63,7 +63,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "b581501e",
"id": "baa6e808",
"metadata": {},
"outputs": [],
"source": []

View File

@ -12,8 +12,10 @@ from langchain.chains import (
LLMMathChain,
PALChain,
PythonChain,
QAWithSourcesChain,
SQLDatabaseChain,
VectorDBQA,
VectorDBQAWithSourcesChain,
)
from langchain.docstore import InMemoryDocstore, Wikipedia
from langchain.llms import Cohere, HuggingFaceHub, OpenAI
@ -51,5 +53,7 @@ __all__ = [
"ElasticVectorSearch",
"InMemoryDocstore",
"ConversationChain",
"VectorDBQAWithSourcesChain",
"QAWithSourcesChain",
"PALChain",
]

View File

@ -4,6 +4,8 @@ from langchain.chains.llm import LLMChain
from langchain.chains.llm_math.base import LLMMathChain
from langchain.chains.pal.base import PALChain
from langchain.chains.python import PythonChain
from langchain.chains.qa_with_sources.base import QAWithSourcesChain
from langchain.chains.qa_with_sources.vector_db import VectorDBQAWithSourcesChain
from langchain.chains.sequential import SequentialChain, SimpleSequentialChain
from langchain.chains.sql_database.base import SQLDatabaseChain
from langchain.chains.vector_db_qa.base import VectorDBQA
@ -17,5 +19,7 @@ __all__ = [
"SequentialChain",
"SimpleSequentialChain",
"ConversationChain",
"QAWithSourcesChain",
"VectorDBQAWithSourcesChain",
"PALChain",
]

View File

@ -0,0 +1,94 @@
"""Document combining chain."""
from typing import Any, Dict, List
from pydantic import BaseModel, Extra, Field, root_validator
from langchain.chains.base import Chain
from langchain.chains.llm import LLMChain
from langchain.prompts.base import BasePromptTemplate
from langchain.prompts.prompt import Prompt
def _get_default_document_prompt() -> Prompt:
return Prompt(input_variables=["page_content"], template="{page_content}")
class CombineDocumentsChain(Chain, BaseModel):
"""Combine documents."""
llm_chain: LLMChain
"""LLM wrapper to use after formatting documents."""
document_prompt: BasePromptTemplate = Field(
default_factory=_get_default_document_prompt
)
"""Prompt to use to format each document."""
document_variable_name: str
"""The variable name in the llm_chain to put the documents in.
If only one variable in the llm_chain, this need not be provided."""
input_key: str = "input_documents" #: :meta private:
output_key: str = "output_text" #: :meta private:
class Config:
"""Configuration for this pydantic object."""
extra = Extra.forbid
arbitrary_types_allowed = True
@property
def input_keys(self) -> List[str]:
"""Expect input key.
:meta private:
"""
return [self.input_key]
@property
def output_keys(self) -> List[str]:
"""Return output key.
:meta private:
"""
return [self.output_key]
@root_validator(pre=True)
def get_default_document_variable_name(cls, values: Dict) -> Dict:
"""Get default document variable name, if not provided."""
if "document_variable_name" not in values:
llm_chain_variables = values["llm_chain"].prompt.input_variables
if len(llm_chain_variables) == 1:
values["document_variable_name"] = llm_chain_variables[0]
else:
raise ValueError(
"document_variable_name must be provided if there are "
"multiple llm_chain_variables"
)
else:
llm_chain_variables = values["llm_chain"].prompt.input_variables
if values["document_variable_name"] not in llm_chain_variables:
raise ValueError(
f"document_variable_name {values['document_variable_name']} was "
f"not found in llm_chain input_variables: {llm_chain_variables}"
)
return values
def _call(self, inputs: Dict[str, Any]) -> Dict[str, str]:
docs = inputs[self.input_key]
# Other keys are assumed to be needed for LLM prediction
other_keys = {k: v for k, v in inputs.items() if k != self.input_key}
# Get relevant information from each document.
doc_dicts = []
for doc in docs:
base_info = {"page_content": doc.page_content}
base_info.update(doc.metadata)
document_info = {
k: base_info[k] for k in self.document_prompt.input_variables
}
doc_dicts.append(document_info)
# Format each document according to the prompt
doc_strings = [self.document_prompt.format(**doc) for doc in doc_dicts]
# Join the documents together to put them in the prompt.
other_keys[self.document_variable_name] = "\n".join(doc_strings)
# Call predict on the LLM.
output = self.llm_chain.predict(**other_keys)
return {self.output_key: output}

View File

@ -9,7 +9,9 @@ from typing import Dict, List
from pydantic import BaseModel, Extra
from langchain.chains.base import Chain
from langchain.chains.combine_documents import CombineDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.docstore.document import Document
from langchain.llms.base import LLM
from langchain.prompts.base import BasePromptTemplate
from langchain.text_splitter import TextSplitter
@ -66,10 +68,9 @@ class MapReduceChain(Chain, BaseModel):
input_list = [{self.map_llm.prompt.input_variables[0]: d} for d in docs]
summary_results = self.map_llm.apply(input_list)
summaries = [res[self.map_llm.output_key] for res in summary_results]
summary_docs = [Document(page_content=text) for text in summaries]
# We then need to combine these individual parts into one.
# This is the reduce part.
summary_str = "\n".join(summaries)
inputs = {self.reduce_llm.prompt.input_variables[0]: summary_str}
output = self.reduce_llm.predict(**inputs)
return {self.output_key: output}
reduce_chain = CombineDocumentsChain(llm_chain=self.reduce_llm)
outputs = reduce_chain({reduce_chain.input_key: summary_docs})
return {self.output_key: outputs[self.output_key]}

View File

@ -0,0 +1 @@
"""Question answering with sources over documents."""

View File

@ -0,0 +1,143 @@
"""Question answering with sources over documents."""
from abc import ABC, abstractmethod
from typing import Any, Dict, List
from pydantic import BaseModel, Extra, root_validator
from langchain.chains.base import Chain
from langchain.chains.combine_documents import CombineDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains.qa_with_sources.prompt import (
COMBINE_PROMPT,
EXAMPLE_PROMPT,
QUESTION_PROMPT,
)
from langchain.docstore.document import Document
from langchain.llms.base import LLM
from langchain.prompts.base import BasePromptTemplate
class BaseQAWithSourcesChain(Chain, BaseModel, ABC):
"""Question answering with sources over documents."""
llm_question_chain: LLMChain
"""LLM wrapper to use for asking questions to each document."""
combine_document_chain: CombineDocumentsChain
"""Chain to use to combine documents."""
doc_source_key: str = "source"
"""Key in document.metadata to use as source information"""
question_key: str = "question" #: :meta private:
input_docs_key: str = "docs" #: :meta private:
answer_key: str = "answer" #: :meta private:
sources_answer_key: str = "sources" #: :meta private:
@classmethod
def from_llm(
cls,
llm: LLM,
combine_document_prompt: BasePromptTemplate = EXAMPLE_PROMPT,
question_prompt: BasePromptTemplate = QUESTION_PROMPT,
combine_prompt: BasePromptTemplate = COMBINE_PROMPT,
**kwargs: Any,
) -> "BaseQAWithSourcesChain":
"""Construct the chain from an LLM."""
llm_question_chain = LLMChain(llm=llm, prompt=question_prompt)
llm_combine_chain = LLMChain(llm=llm, prompt=combine_prompt)
combine_document_chain = CombineDocumentsChain(
llm_chain=llm_combine_chain,
document_prompt=combine_document_prompt,
document_variable_name="summaries",
)
return cls(
llm_question_chain=llm_question_chain,
combine_document_chain=combine_document_chain,
**kwargs,
)
class Config:
"""Configuration for this pydantic object."""
extra = Extra.forbid
arbitrary_types_allowed = True
@property
def input_keys(self) -> List[str]:
"""Expect input key.
:meta private:
"""
return [self.question_key]
@property
def output_keys(self) -> List[str]:
"""Return output key.
:meta private:
"""
return [self.answer_key, self.sources_answer_key]
@root_validator(pre=True)
def validate_question_chain(cls, values: Dict) -> Dict:
"""Validate question chain."""
llm_question_chain = values["llm_question_chain"]
if len(llm_question_chain.input_keys) != 2:
raise ValueError(
f"The llm_question_chain should have two inputs: a content key "
f"(the first one) and a question key (the second one). Got "
f"{llm_question_chain.input_keys}."
)
return values
@root_validator()
def validate_combine_chain_can_be_constructed(cls, values: Dict) -> Dict:
"""Validate that the combine chain can be constructed."""
# Try to construct the combine documents chains.
return values
@abstractmethod
def _get_docs(self, inputs: Dict[str, Any]) -> List[Document]:
"""Get docs to run questioning over."""
def _call(self, inputs: Dict[str, Any]) -> Dict[str, str]:
docs = self._get_docs(inputs)
query = inputs[self.question_key]
content_key, query_key = self.llm_question_chain.input_keys
results = self.llm_question_chain.apply(
[{content_key: d.page_content, query_key: query} for d in docs]
)
question_result_key = self.llm_question_chain.output_key
result_docs = [
Document(page_content=r[question_result_key], metadata=docs[i].metadata)
for i, r in enumerate(results)
]
answer_dict = self.combine_document_chain(
{
self.combine_document_chain.input_key: result_docs,
self.question_key: query,
}
)
answer = answer_dict[self.combine_document_chain.output_key]
if "\nSOURCES: " in answer:
answer, sources = answer.split("\nSOURCES: ")
else:
sources = ""
return {self.answer_key: answer, self.sources_answer_key: sources}
class QAWithSourcesChain(BaseQAWithSourcesChain, BaseModel):
"""Question answering with sources over documents."""
input_docs_key: str = "docs" #: :meta private:
@property
def input_keys(self) -> List[str]:
"""Expect input key.
:meta private:
"""
return [self.input_docs_key, self.question_key]
def _get_docs(self, inputs: Dict[str, Any]) -> List[Document]:
return inputs[self.input_docs_key]

View File

@ -0,0 +1,55 @@
# flake8: noqa
from langchain.prompts import PromptTemplate
question_prompt_template = """Use the following portion of a long document to see if any of the text is relevant to answer the question.
Return any relevant text verbatim.
{context}
Question: {question}
Relevant text, if any:"""
QUESTION_PROMPT = PromptTemplate(
template=question_prompt_template, input_variables=["context", "question"]
)
combine_prompt_template = """Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.
QUESTION: Which state/country's law governs the interpretation of the contract?
=========
Content: This Agreement is governed by English law and the parties submit to the exclusive jurisdiction of the English courts in relation to any dispute (contractual or non-contractual) concerning this Agreement save that either party may apply to any court for an injunction or other relief to protect its Intellectual Property Rights.
Source: 28-pl
Content: No Waiver. Failure or delay in exercising any right or remedy under this Agreement shall not constitute a waiver of such (or any other) right or remedy.\n\n11.7 Severability. The invalidity, illegality or unenforceability of any term (or part of a term) of this Agreement shall not affect the continuation in force of the remainder of the term (if any) and this Agreement.\n\n11.8 No Agency. Except as expressly stated otherwise, nothing in this Agreement shall create an agency, partnership or joint venture of any kind between the parties.\n\n11.9 No Third-Party Beneficiaries.
Source: 30-pl
Content: (b) if Google believes, in good faith, that the Distributor has violated or caused Google to violate any Anti-Bribery Laws (as defined in Clause 8.5) or that such a violation is reasonably likely to occur,
Source: 4-pl
=========
FINAL ANSWER: This Agreement is governed by English law.
SOURCES: 28-pl
QUESTION: What did the president say about Michael Jackson?
=========
Content: Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russias Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. \n\nGroups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland.
Source: 0-pl
Content: And we wont stop. \n\nWe have lost so much to COVID-19. Time with one another. And worst of all, so much loss of life. \n\nLets use this moment to reset. Lets stop looking at COVID-19 as a partisan dividing line and see it for what it is: A God-awful disease. \n\nLets stop seeing each other as enemies, and start seeing each other for who we really are: Fellow Americans. \n\nWe cant change how divided weve been. But we can change how we move forwardon COVID-19 and other issues we must face together. \n\nI recently visited the New York City Police Department days after the funerals of Officer Wilbert Mora and his partner, Officer Jason Rivera. \n\nThey were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. \n\nOfficer Mora was 27 years old. \n\nOfficer Rivera was 22. \n\nBoth Dominican Americans whod grown up on the same streets they later chose to patrol as police officers. \n\nI spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves.
Source: 24-pl
Content: And a proud Ukrainian people, who have known 30 years of independence, have repeatedly shown that they will not tolerate anyone who tries to take their country backwards. \n\nTo all Americans, I will be honest with you, as Ive always promised. A Russian dictator, invading a foreign country, has costs around the world. \n\nAnd Im taking robust action to make sure the pain of our sanctions is targeted at Russias economy. And I will use every tool at our disposal to protect American businesses and consumers. \n\nTonight, I can announce that the United States has worked with 30 other countries to release 60 Million barrels of oil from reserves around the world. \n\nAmerica will lead that effort, releasing 30 Million barrels from our own Strategic Petroleum Reserve. And we stand ready to do more if necessary, unified with our allies. \n\nThese steps will help blunt gas prices here at home. And I know the news about whats happening can seem alarming. \n\nBut I want you to know that we are going to be okay.
Source: 5-pl
Content: More support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. \n\nIts based on DARPAthe Defense Department project that led to the Internet, GPS, and so much more. \n\nARPA-H will have a singular purposeto drive breakthroughs in cancer, Alzheimers, diabetes, and more. \n\nA unity agenda for the nation. \n\nWe can do this. \n\nMy fellow Americanstonight , we have gathered in a sacred spacethe citadel of our democracy. \n\nIn this Capitol, generation after generation, Americans have debated great questions amid great strife, and have done great things. \n\nWe have fought for freedom, expanded liberty, defeated totalitarianism and terror. \n\nAnd built the strongest, freest, and most prosperous nation the world has ever known. \n\nNow is the hour. \n\nOur moment of responsibility. \n\nOur test of resolve and conscience, of history itself. \n\nIt is in this moment that our character is formed. Our purpose is found. Our future is forged. \n\nWell I know this nation.
Source: 34-pl
=========
FINAL ANSWER: The president did not mention Michael Jackson.
SOURCES:
QUESTION: {question}
=========
{summaries}
=========
FINAL ANSWER:"""
COMBINE_PROMPT = PromptTemplate(
template=combine_prompt_template, input_variables=["summaries", "question"]
)
EXAMPLE_PROMPT = PromptTemplate(
template="Content: {page_content}\nSource: {source}",
input_variables=["page_content", "source"],
)

View File

@ -0,0 +1,20 @@
"""Question-answering with sources over a vector database."""
from typing import Any, Dict, List
from pydantic import BaseModel
from langchain.chains.qa_with_sources.base import BaseQAWithSourcesChain
from langchain.docstore.document import Document
from langchain.vectorstores.base import VectorStore
class VectorDBQAWithSourcesChain(BaseQAWithSourcesChain, BaseModel):
"""Question-answering with sources over a vector database."""
vectorstore: VectorStore
"""Vector Database to connect to."""
k: int = 4
def _get_docs(self, inputs: Dict[str, Any]) -> List[Document]:
question = inputs[self.question_key]
return self.vectorstore.similarity_search(question, k=self.k)