mirror of https://github.com/hwchase17/langchain
Harrison/combine documents chain (#212)
combine documents chain powering vector db qa with sources chain
pull/232/head
parent
ab9abf53b7
commit
347fc49d4d
@ -0,0 +1,200 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "efc5be67",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Question-Answering with Sources\n",
|
||||
"\n",
|
||||
"This notebook goes over how to do question-answering with sources. It does this in a few different ways - first showing how you can use the `QAWithSourcesChain` to take in documents and use those, and next showing the `VectorDBQAWithSourcesChain`, which also does the lookup of the documents from a vector database. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "1c613960",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
||||
"from langchain.embeddings.cohere import CohereEmbeddings\n",
|
||||
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||
"from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch\n",
|
||||
"from langchain.vectorstores.faiss import FAISS"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "17d1306e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('../state_of_the_union.txt') as f:\n",
|
||||
" state_of_the_union = f.read()\n",
|
||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
||||
"texts = text_splitter.split_text(state_of_the_union)\n",
|
||||
"\n",
|
||||
"embeddings = OpenAIEmbeddings()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "0e745d99",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docsearch = FAISS.from_texts(texts, embeddings)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "f42d79dc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Add fake source information to each document\n",
|
||||
"for i, d in enumerate(docsearch.docstore._dict.values()):\n",
|
||||
" d.metadata = {'source': f\"{i}-pl\"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "aa1c1b60",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### QAWithSourcesChain\n",
|
||||
"This shows how to use the `QAWithSourcesChain`, which takes in document objects and uses them directly."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "61bce191",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query = \"What did the president say about Justice Breyer\"\n",
|
||||
"docs = docsearch.similarity_search(query)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "57ddf8c7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chains import QAWithSourcesChain\n",
|
||||
"from langchain.llms import OpenAI, Cohere\n",
|
||||
"from langchain.docstore.document import Document"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "f908a92a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"chain = QAWithSourcesChain.from_llm(OpenAI(temperature=0))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "a505ac89",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'answer': ' The president thanked Justice Breyer for his service.',\n",
|
||||
" 'sources': '27-pl'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chain({\"docs\": docs, \"question\": query}, return_only_outputs=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e6fc81de",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### VectorDBQAWithSourcesChain\n",
|
||||
"\n",
|
||||
"This shows how to use the `VectorDBQAWithSourcesChain`, which uses a vector database to look up relevant documents."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "8aa571ae",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chains import VectorDBQAWithSourcesChain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "aa859d4c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"chain = VectorDBQAWithSourcesChain.from_llm(OpenAI(temperature=0), vectorstore=docsearch)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8ba36fa7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"chain({\"question\": \"What did the president say about Justice Breyer\"}, return_only_outputs=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "980fae3b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -0,0 +1,94 @@
|
||||
"""Document combining chain."""
|
||||
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from pydantic import BaseModel, Extra, Field, root_validator
|
||||
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain.prompts.base import BasePromptTemplate
|
||||
from langchain.prompts.prompt import Prompt
|
||||
|
||||
|
||||
def _get_default_document_prompt() -> Prompt:
    """Build the fallback per-document prompt, which emits only the page content."""
    default_prompt = Prompt(
        template="{page_content}",
        input_variables=["page_content"],
    )
    return default_prompt
|
||||
|
||||
|
||||
class CombineDocumentsChain(Chain, BaseModel):
    """Format a list of documents into one string and run an LLM chain on it.

    Each input document is rendered with ``document_prompt``; the rendered
    strings are joined with newlines and handed to ``llm_chain`` under
    ``document_variable_name``, together with every other input key.
    """

    llm_chain: LLMChain
    """LLM wrapper to use after formatting documents."""
    document_prompt: BasePromptTemplate = Field(
        default_factory=_get_default_document_prompt
    )
    """Prompt to use to format each document."""
    document_variable_name: str
    """The variable name in the llm_chain to put the documents in.
    If only one variable in the llm_chain, this need not be provided."""
    input_key: str = "input_documents"  #: :meta private:
    output_key: str = "output_text"  #: :meta private:

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    @property
    def input_keys(self) -> List[str]:
        """Expect input key.

        :meta private:
        """
        return [self.input_key]

    @property
    def output_keys(self) -> List[str]:
        """Return output key.

        :meta private:
        """
        return [self.output_key]

    @root_validator(pre=True)
    def get_default_document_variable_name(cls, values: Dict) -> Dict:
        """Fill in or check ``document_variable_name`` against the llm_chain prompt.

        If the name is absent and the llm_chain prompt has exactly one input
        variable, that variable is used. If the name is given, it must be one
        of the prompt's input variables.

        Raises:
            ValueError: if the name is absent and the prompt does not have
                exactly one input variable, or if the given name is not among
                the prompt's input variables.
        """
        if "llm_chain" not in values:
            # Nothing to inspect yet. Returning here lets pydantic's own field
            # validation report the missing required field, instead of this
            # pre-validator failing with a bare KeyError.
            return values

        llm_chain_variables = values["llm_chain"].prompt.input_variables
        if "document_variable_name" not in values:
            if len(llm_chain_variables) != 1:
                raise ValueError(
                    "document_variable_name must be provided if there are "
                    "multiple llm_chain_variables"
                )
            values["document_variable_name"] = llm_chain_variables[0]
        elif values["document_variable_name"] not in llm_chain_variables:
            raise ValueError(
                f"document_variable_name {values['document_variable_name']} was "
                f"not found in llm_chain input_variables: {llm_chain_variables}"
            )
        return values

    def _call(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        """Render the documents, join them, and run the LLM chain.

        Args:
            inputs: mapping holding the documents under ``input_key``; every
                other entry is forwarded to ``llm_chain.predict`` unchanged.

        Returns:
            A single-entry dict mapping ``output_key`` to the prediction.
        """
        docs = inputs[self.input_key]
        # Other keys are assumed to be needed for LLM prediction.
        other_keys = {k: v for k, v in inputs.items() if k != self.input_key}

        prompt_variables = self.document_prompt.input_variables
        doc_strings = []
        for doc in docs:
            # Metadata is layered on top of the page content, so a metadata
            # key named "page_content" takes precedence.
            base_info = {"page_content": doc.page_content}
            base_info.update(doc.metadata)
            # Keep only the variables the document prompt actually uses.
            document_info = {k: base_info[k] for k in prompt_variables}
            doc_strings.append(self.document_prompt.format(**document_info))

        # Join the documents together to put them in the prompt.
        other_keys[self.document_variable_name] = "\n".join(doc_strings)
        # Call predict on the LLM.
        output = self.llm_chain.predict(**other_keys)
        return {self.output_key: output}
|
@ -0,0 +1 @@
|
||||
"""Question answering with sources over documents."""
|
@ -0,0 +1,143 @@
|
||||
"""Question answering with sources over documents."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from pydantic import BaseModel, Extra, root_validator
|
||||
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.chains.combine_documents import CombineDocumentsChain
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain.chains.qa_with_sources.prompt import (
|
||||
COMBINE_PROMPT,
|
||||
EXAMPLE_PROMPT,
|
||||
QUESTION_PROMPT,
|
||||
)
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.llms.base import LLM
|
||||
from langchain.prompts.base import BasePromptTemplate
|
||||
|
||||
|
||||
class BaseQAWithSourcesChain(Chain, BaseModel, ABC):
    """Question answering with sources over documents.

    Subclasses supply the documents through ``_get_docs``. Each document is
    queried individually with ``llm_question_chain``; the per-document results
    are then merged by ``combine_document_chain`` and the merged text is split
    into an answer and a sources string.
    """

    llm_question_chain: LLMChain
    """LLM wrapper to use for asking questions to each document."""
    combine_document_chain: CombineDocumentsChain
    """Chain to use to combine documents."""
    doc_source_key: str = "source"
    """Key in document.metadata to use as source information"""
    question_key: str = "question"  #: :meta private:
    input_docs_key: str = "docs"  #: :meta private:
    answer_key: str = "answer"  #: :meta private:
    sources_answer_key: str = "sources"  #: :meta private:

    @classmethod
    def from_llm(
        cls,
        llm: LLM,
        combine_document_prompt: BasePromptTemplate = EXAMPLE_PROMPT,
        question_prompt: BasePromptTemplate = QUESTION_PROMPT,
        combine_prompt: BasePromptTemplate = COMBINE_PROMPT,
        **kwargs: Any,
    ) -> "BaseQAWithSourcesChain":
        """Construct the chain from an LLM.

        Args:
            llm: model used for both the per-document and the combine step.
            combine_document_prompt: prompt used to format each document
                before combining.
            question_prompt: prompt for the per-document question chain.
            combine_prompt: prompt for the combine chain; the formatted
                documents are inserted under the ``summaries`` variable.
            **kwargs: forwarded to the class constructor.
        """
        llm_question_chain = LLMChain(llm=llm, prompt=question_prompt)
        llm_combine_chain = LLMChain(llm=llm, prompt=combine_prompt)
        combine_document_chain = CombineDocumentsChain(
            llm_chain=llm_combine_chain,
            document_prompt=combine_document_prompt,
            document_variable_name="summaries",
        )
        return cls(
            llm_question_chain=llm_question_chain,
            combine_document_chain=combine_document_chain,
            **kwargs,
        )

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    @property
    def input_keys(self) -> List[str]:
        """Expect input key.

        :meta private:
        """
        return [self.question_key]

    @property
    def output_keys(self) -> List[str]:
        """Return output key.

        :meta private:
        """
        return [self.answer_key, self.sources_answer_key]

    @root_validator(pre=True)
    def validate_question_chain(cls, values: Dict) -> Dict:
        """Validate that the question chain takes exactly two inputs.

        Raises:
            ValueError: if ``llm_question_chain.input_keys`` does not have
                exactly two entries.
        """
        if "llm_question_chain" not in values:
            # Let pydantic's field validation report the missing required
            # field instead of failing here with a bare KeyError.
            return values

        llm_question_chain = values["llm_question_chain"]
        if len(llm_question_chain.input_keys) != 2:
            raise ValueError(
                "The llm_question_chain should have two inputs: a content key "
                "(the first one) and a question key (the second one). Got "
                f"{llm_question_chain.input_keys}."
            )
        return values

    @root_validator()
    def validate_combine_chain_can_be_constructed(cls, values: Dict) -> Dict:
        """Validate that the combine chain can be constructed.

        Placeholder: currently performs no checks and returns ``values``
        unchanged.
        """
        # Try to construct the combine documents chains.
        return values

    @abstractmethod
    def _get_docs(self, inputs: Dict[str, Any]) -> List[Document]:
        """Get docs to run questioning over."""

    def _call(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        """Answer the question over the documents and extract the sources.

        Args:
            inputs: mapping holding the question under ``question_key`` (plus
                whatever ``_get_docs`` needs).

        Returns:
            A dict with the answer under ``answer_key`` and the sources string
            under ``sources_answer_key`` (empty when no sources marker is
            present in the combined output).
        """
        docs = self._get_docs(inputs)
        query = inputs[self.question_key]

        # The question chain's two inputs are, in order, content and question.
        content_key, query_key = self.llm_question_chain.input_keys
        results = self.llm_question_chain.apply(
            [{content_key: d.page_content, query_key: query} for d in docs]
        )

        # Wrap each per-document result in a Document that keeps the metadata
        # of the document it was derived from.
        question_result_key = self.llm_question_chain.output_key
        result_docs = [
            Document(page_content=r[question_result_key], metadata=d.metadata)
            for d, r in zip(docs, results)
        ]

        answer_dict = self.combine_document_chain(
            {
                self.combine_document_chain.input_key: result_docs,
                self.question_key: query,
            }
        )
        answer = answer_dict[self.combine_document_chain.output_key]

        # Split at the first marker only. Unpacking a plain split() raised
        # ValueError when the marker appeared more than once in the output.
        # With no marker, partition leaves the answer intact and yields "".
        answer, _, sources = answer.partition("\nSOURCES: ")
        return {self.answer_key: answer, self.sources_answer_key: sources}
|
||||
|
||||
|
||||
class QAWithSourcesChain(BaseQAWithSourcesChain, BaseModel):
    """Answer a question with sources over documents supplied by the caller."""

    input_docs_key: str = "docs"  #: :meta private:

    @property
    def input_keys(self) -> List[str]:
        """Require the documents key followed by the question key.

        :meta private:
        """
        required_keys = [self.input_docs_key, self.question_key]
        return required_keys

    def _get_docs(self, inputs: Dict[str, Any]) -> List[Document]:
        """Return the documents found in ``inputs`` under ``input_docs_key``."""
        supplied_docs = inputs[self.input_docs_key]
        return supplied_docs
|
@ -0,0 +1,20 @@
|
||||
"""Question-answering with sources over a vector database."""
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from langchain.chains.qa_with_sources.base import BaseQAWithSourcesChain
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.vectorstores.base import VectorStore
|
||||
|
||||
|
||||
class VectorDBQAWithSourcesChain(BaseQAWithSourcesChain, BaseModel):
    """Answer a question with sources, fetching documents from a vector store."""

    vectorstore: VectorStore
    """Vector Database to connect to."""
    # Forwarded as ``k`` to the vector store's similarity search
    # (presumably the number of documents to retrieve; confirm against the store).
    k: int = 4

    def _get_docs(self, inputs: Dict[str, Any]) -> List[Document]:
        """Look up documents similar to the question in the vector store."""
        query_text = inputs[self.question_key]
        matches = self.vectorstore.similarity_search(query_text, k=self.k)
        return matches
|
Loading…
Reference in New Issue