diff --git a/docs/use_cases/evaluation/question_answering.ipynb b/docs/use_cases/evaluation/question_answering.ipynb
index b80844eb..98de7537 100644
--- a/docs/use_cases/evaluation/question_answering.ipynb
+++ b/docs/use_cases/evaluation/question_answering.ipynb
@@ -190,6 +190,51 @@
     "    print()"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "782ae8c8",
+   "metadata": {},
+   "source": [
+    "## Customize Prompt\n",
+    "\n",
+    "You can also customize the prompt that is used. Here is an example prompting the model to grade on a scale from 0 to 10.\n",
+    "The custom prompt requires three input variables: \"query\", \"answer\" and \"result\", where \"query\" is the question, \"answer\" is the ground-truth answer, and \"result\" is the predicted answer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "153425c4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.prompts.prompt import PromptTemplate\n",
+    "\n",
+    "_PROMPT_TEMPLATE = \"\"\"You are an expert professor specialized in grading students' answers to questions.\n",
+    "You are grading the following question:\n",
+    "{query}\n",
+    "Here is the real answer:\n",
+    "{answer}\n",
+    "You are grading the following predicted answer:\n",
+    "{result}\n",
+    "What grade do you give from 0 to 10, where 0 is the lowest (very low similarity) and 10 is the highest (very high similarity)?\n",
+    "\"\"\"\n",
+    "\n",
+    "PROMPT = PromptTemplate(input_variables=[\"query\", \"answer\", \"result\"], template=_PROMPT_TEMPLATE)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0a3b0fb7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eval_chain = QAEvalChain.from_llm(llm=llm, prompt=PROMPT)\n",
+    "eval_chain.evaluate(examples, predictions, question_key=\"question\", answer_key=\"answer\", prediction_key=\"text\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "aaa61f0c",
@@ -271,7 +316,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": ".venv",
    "language": "python",
    "name": "python3"
   },
@@ -285,7 +330,12 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.7 (default, Sep 16 2021, 08:50:36) \n[Clang 10.0.0 ]"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "53f3bc57609c7a84333bb558594977aa5b4026b1d6070b93987956689e367341"
+   }
   }
  },
  "nbformat": 4,
diff --git a/langchain/evaluation/qa/eval_chain.py b/langchain/evaluation/qa/eval_chain.py
index ded97302..e712cff7 100644
--- a/langchain/evaluation/qa/eval_chain.py
+++ b/langchain/evaluation/qa/eval_chain.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 
 from typing import Any, List
 
+from langchain import PromptTemplate
 from langchain.chains.llm import LLMChain
 from langchain.evaluation.qa.eval_prompt import PROMPT
 from langchain.llms.base import BaseLLM
@@ -12,9 +13,25 @@ class QAEvalChain(LLMChain):
     """LLM Chain specifically for evaluating question answering."""
 
     @classmethod
-    def from_llm(cls, llm: BaseLLM, **kwargs: Any) -> QAEvalChain:
-        """Load QA Eval Chain from LLM."""
-        return cls(llm=llm, prompt=PROMPT, **kwargs)
+    def from_llm(
+        cls, llm: BaseLLM, prompt: PromptTemplate = PROMPT, **kwargs: Any
+    ) -> QAEvalChain:
+        """Load QA Eval Chain from LLM.
+
+        Args:
+            llm (BaseLLM): the base language model to use.
+
+            prompt (PromptTemplate): A prompt template containing the
+                input_variables 'query', 'answer' and 'result' that will be
+                used as the prompt for evaluation.
+                Defaults to PROMPT.
+
+            **kwargs: additional keyword arguments.
+
+        Returns:
+            QAEvalChain: the loaded QA eval chain.
+        """
+        return cls(llm=llm, prompt=prompt, **kwargs)
 
     def evaluate(
         self,
@@ -33,4 +50,5 @@ class QAEvalChain(LLMChain):
                 "result": predictions[i][prediction_key],
             }
             inputs.append(_input)
+
         return self.apply(inputs)
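For anyone who wants to exercise the new `prompt` parameter end to end, a minimal usage sketch follows. It is not part of the diff: the `OpenAI` model choice and the `examples`/`predictions` data are assumptions made purely for illustration, and an OpenAI API key is assumed to be configured.

# Sketch exercising QAEvalChain.from_llm with a custom grading prompt.
# The example data below is hypothetical, not taken from the PR.
from langchain.evaluation.qa import QAEvalChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

_PROMPT_TEMPLATE = """You are an expert professor specialized in grading students' answers to questions.
You are grading the following question:
{query}
Here is the real answer:
{answer}
You are grading the following predicted answer:
{result}
What grade do you give from 0 to 10, where 0 is the lowest (very low similarity) and 10 is the highest (very high similarity)?
"""
PROMPT = PromptTemplate(
    input_variables=["query", "answer", "result"], template=_PROMPT_TEMPLATE
)

# Hand-written ground truth and model outputs, keyed the same way as in the notebook.
examples = [{"question": "What is the capital of France?", "answer": "Paris"}]
predictions = [{"text": "The capital of France is Paris."}]

llm = OpenAI(temperature=0)
eval_chain = QAEvalChain.from_llm(llm=llm, prompt=PROMPT)
graded = eval_chain.evaluate(
    examples,
    predictions,
    question_key="question",
    answer_key="answer",
    prediction_key="text",
)
print(graded)  # one dict per example, e.g. [{'text': '10'}]

Because `prompt` defaults to the existing PROMPT, existing callers of `from_llm(llm)` keep the current CORRECT/INCORRECT grading behavior unchanged.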