diff --git a/README.md b/README.md index 71bcfaf9..c137947b 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Please see [here](https://langchain.readthedocs.io/en/latest/?) for full documen ## 🚀 What can this help with? -There are five main areas that LangChain is designed to help with. +There are six main areas that LangChain is designed to help with. These are, in increasing order of complexity: **📃 LLMs and Prompts:** @@ -51,8 +51,13 @@ Agents involve an LLM making decisions about which Actions to take, taking that Memory is the concept of persisting state between calls of a chain/agent. LangChain provides a standard interface for memory, a collection of memory implementations, and examples of chains/agents that use memory. +**🧐 Evaluation:** + +[BETA] Generative models are notoriously hard to evaluate with traditional metrics. One new way of evaluating them is using language models themselves to do the evaluation. LangChain provides some prompts/chains for assisting in this. + For more information on these concepts, please see our [full documentation](https://langchain.readthedocs.io/en/latest/?). + ## 💁 Contributing As an open source project in a rapidly developing field, we are extremely open diff --git a/docs/examples/evaluation.rst b/docs/examples/evaluation.rst new file mode 100644 index 00000000..75900377 --- /dev/null +++ b/docs/examples/evaluation.rst @@ -0,0 +1,18 @@ +Evaluation +============== + +The examples here all highlight how to use language models to assist in evaluating themselves. + +`Question Answering <evaluation/question_answering.ipynb>`_: An overview of using LLMs to evaluate question answering systems in general. + +`Data Augmented Question Answering <evaluation/data_augmented_question_answering.ipynb>`_: An end-to-end example of evaluating a question answering system focused on a specific document (a VectorDBQAChain to be precise). This example highlights how to use LLMs to come up with question/answer examples to evaluate over, and then highlights how to use LLMs to evaluate performance on those generated examples. + +`Hugging Face Datasets <evaluation/huggingface_datasets.ipynb>`_: Covers an example of loading and using a dataset from Hugging Face for evaluation. + + +.. toctree:: + :maxdepth: 1 + :glob: + :hidden: + + evaluation/* diff --git a/docs/examples/evaluation/data_augmented_question_answering.ipynb b/docs/examples/evaluation/data_augmented_question_answering.ipynb new file mode 100644 index 00000000..329fd43d --- /dev/null +++ b/docs/examples/evaluation/data_augmented_question_answering.ipynb @@ -0,0 +1,248 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e78b7bb1", + "metadata": {}, + "source": [ + "# Data Augmented Question Answering\n", + "\n", + "This notebook uses some generic prompts/language models to evaluate a question answering system that uses other sources of data besides what is in the model. For example, this can be used to evaluate a question answering system over your proprietary data.\n", + "\n", + "## Setup\n", + "Let's set things up with our favorite example - the state of the union address."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ab4a6931", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.vectorstores.faiss import FAISS\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain import OpenAI, VectorDBQA" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "4fdc211d", + "metadata": {}, + "outputs": [], + "source": [ + "with open('../state_of_the_union.txt') as f:\n", + " state_of_the_union = f.read()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "texts = text_splitter.split_text(state_of_the_union)\n", + "\n", + "embeddings = OpenAIEmbeddings()\n", + "docsearch = FAISS.from_texts(texts, embeddings)\n", + "qa = VectorDBQA.from_llm(llm=OpenAI(), vectorstore=docsearch)" + ] + }, + { + "cell_type": "markdown", + "id": "30fd72f2", + "metadata": {}, + "source": [ + "## Examples\n", + "Now we need some examples to evaluate. We can do this in two ways:\n", + "\n", + "1. Hard code some examples ourselves\n", + "2. Generate examples automatically, using a language model" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3459b001", + "metadata": {}, + "outputs": [], + "source": [ + "# Hard-coded examples\n", + "examples = [\n", + " {\n", + " \"query\": \"What did the president say about Ketanji Brown Jackson\",\n", + " \"answer\": \"He praised her legal ability and said he nominated her for the supreme court.\"\n", + " },\n", + " {\n", + " \"query\": \"What did the president say about Michael Jackson\",\n", + " \"answer\": \"Nothing\"\n", + " }\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b9c3fa75", + "metadata": {}, + "outputs": [], + "source": [ + "# Generated examples\n", + "from langchain.evaluation.qa import QAGenerateChain\n", + "example_gen_chain = QAGenerateChain.from_llm(OpenAI())" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c24543a9", + "metadata": {}, + "outputs": [], + "source": [ + "new_examples = example_gen_chain.apply_and_parse([{\"doc\": t} for t in texts[:5]])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a2d27560", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'query': 'What did Vladimir Putin seek to do according to the document?',\n", + " 'answer': 'Vladimir Putin sought to shake the foundations of the free world and make it bend to his menacing ways.'},\n", + " {'query': 'What did President Zelenskyy say in his speech to the European Parliament?',\n", + " 'answer': 'President Zelenskyy said \"Light will win over darkness.\"'},\n", + " {'query': \"How many countries joined the European Union in opposing Putin's attack on Ukraine?\",\n", + " 'answer': '27'},\n", + " {'query': 'What is the U.S. 
Department of Justice assembling in response to the Russian oligarchs?',\n", + " 'answer': 'A dedicated task force.'},\n", + " {'query': 'How much direct assistance is the US providing to Ukraine?',\n", + " 'answer': 'The US is providing more than $1 Billion in direct assistance to Ukraine.'}]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_examples" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "558da6f3", + "metadata": {}, + "outputs": [], + "source": [ + "# Combine examples\n", + "examples += new_examples" + ] + }, + { + "cell_type": "markdown", + "id": "443dc34e", + "metadata": {}, + "source": [ + "## Evaluate\n", + "Now that we have examples, we can use the question answering evaluator to evaluate our question answering chain." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "782169a5", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.evaluation.qa import QAEvalChain" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1bb77416", + "metadata": {}, + "outputs": [], + "source": [ + "predictions = qa.apply(examples)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bcd0ad7f", + "metadata": {}, + "outputs": [], + "source": [ + "llm = OpenAI(temperature=0)\n", + "eval_chain = QAEvalChain.from_llm(llm)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2e6af79a", + "metadata": {}, + "outputs": [], + "source": [ + "graded_outputs = eval_chain.evaluate(examples, predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "32fac2dc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'text': ' CORRECT'},\n", + " {'text': ' CORRECT'},\n", + " {'text': ' INCORRECT'},\n", + " {'text': ' CORRECT'},\n", + " {'text': ' CORRECT'},\n", + " {'text': ' CORRECT'},\n", + " {'text': ' CORRECT'}]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graded_outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0bb9bc7e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/examples/evaluation/huggingface_datasets.ipynb b/docs/examples/evaluation/huggingface_datasets.ipynb new file mode 100644 index 00000000..599aaff4 --- /dev/null +++ b/docs/examples/evaluation/huggingface_datasets.ipynb @@ -0,0 +1,279 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3cadcf88", + "metadata": {}, + "source": [ + "# Using HuggingFace Datasets\n", + "\n", + "This example shows how to use HuggingFace datasets to evaluate models. Specifically, we show how to load evaluation examples from Hugging Face's `datasets` package." + ] + }, + { + "cell_type": "markdown", + "id": "0e3ce977", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "For demonstration purposes, we will just evaluate a simple question answering system."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c10054f", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.prompts import PromptTemplate\n", + "from langchain.chains import LLMChain\n", + "from langchain.llms import OpenAI" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9abdf160", + "metadata": {}, + "outputs": [], + "source": [ + "prompt = PromptTemplate(template=\"Question: {question}\\nAnswer:\", input_variables=[\"question\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d41ef7bb", + "metadata": {}, + "outputs": [], + "source": [ + "llm = OpenAI(model_name=\"text-davinci-003\", temperature=0)\n", + "chain = LLMChain(llm=llm, prompt=prompt)" + ] + }, + { + "cell_type": "markdown", + "id": "cbea2132", + "metadata": {}, + "source": [ + "# Examples\n", + "\n", + "Now we load a dataset from HuggingFace, and then convert it to a list of dictionaries for easier usage." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d2373cf1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset truthful_qa (/Users/harrisonchase/.cache/huggingface/datasets/truthful_qa/generation/1.1.0/70210b72382652635215516e59663843b88eda16bd2acef909fb46700beb039a)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "91c809ea00a549f3b20266ce95b4928d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00 Sequence[Union[str, List[str], Dict[str, str]]]: + """Call apply and then parse the results.""" + result = self.apply(input_list) + if self.prompt.output_parser is not None: + new_result = [] + for res in result: + text = res[self.output_key] + new_result.append(self.prompt.output_parser.parse(text)) + return new_result + else: + return result diff --git a/langchain/evaluation/__init__.py b/langchain/evaluation/__init__.py new file mode 100644 index 00000000..4714192a --- /dev/null +++ b/langchain/evaluation/__init__.py @@ -0,0 +1 @@ +"""[BETA] Functionality relating to evaluation.""" diff --git a/langchain/evaluation/qa/__init__.py b/langchain/evaluation/qa/__init__.py new file mode 100644 index 00000000..72863316 --- /dev/null +++ b/langchain/evaluation/qa/__init__.py @@ -0,0 +1,5 @@ +"""Chains and utils related to evaluating question answering functionality.""" +from langchain.evaluation.qa.eval_chain import QAEvalChain +from langchain.evaluation.qa.generate_chain import QAGenerateChain + +__all__ = ["QAEvalChain", "QAGenerateChain"] diff --git a/langchain/evaluation/qa/eval_chain.py b/langchain/evaluation/qa/eval_chain.py new file mode 100644 index 00000000..ded97302 --- /dev/null +++ b/langchain/evaluation/qa/eval_chain.py @@ -0,0 +1,36 @@ +"""LLM Chain specifically for evaluating question answering.""" +from __future__ import annotations + +from typing import Any, List + +from langchain.chains.llm import LLMChain +from langchain.evaluation.qa.eval_prompt import PROMPT +from langchain.llms.base import BaseLLM + + +class QAEvalChain(LLMChain): + """LLM Chain specifically for evaluating question answering.""" + + @classmethod + def from_llm(cls, llm: BaseLLM, **kwargs: Any) -> QAEvalChain: + """Load QA Eval Chain from LLM.""" + return cls(llm=llm, prompt=PROMPT, **kwargs) + + def evaluate( + self, + examples: List[dict], + predictions: List[dict], + question_key: str = "query", + answer_key: str = "answer", + prediction_key: str = "result", + ) -> List[dict]: + """Evaluate 
question answering examples and predictions.""" + inputs = [] + for i, example in enumerate(examples): + _input = { + "query": example[question_key], + "answer": example[answer_key], + "result": predictions[i][prediction_key], + } + inputs.append(_input) + return self.apply(inputs) diff --git a/langchain/evaluation/qa/eval_prompt.py b/langchain/evaluation/qa/eval_prompt.py new file mode 100644 index 00000000..4e6c7c6a --- /dev/null +++ b/langchain/evaluation/qa/eval_prompt.py @@ -0,0 +1,21 @@ +# flake8: noqa +from langchain.prompts import PromptTemplate + +template = """You are a teacher grading a quiz. +You are given a question, the student's answer, and the true answer, and are asked to score it as either CORRECT or INCORRECT. + +Example Format: +QUESTION: question here +STUDENT ANSWER: student's answer here +TRUE ANSWER: true answer here +GRADE: CORRECT or INCORRECT here + +Please remember to grade them based on being factually accurate. Begin! + +QUESTION: {query} +STUDENT ANSWER: {result} +TRUE ANSWER: {answer} +GRADE:""" +PROMPT = PromptTemplate( + input_variables=["query", "result", "answer"], template=template +) diff --git a/langchain/evaluation/qa/generate_chain.py b/langchain/evaluation/qa/generate_chain.py new file mode 100644 index 00000000..62941462 --- /dev/null +++ b/langchain/evaluation/qa/generate_chain.py @@ -0,0 +1,17 @@ +"""LLM Chain specifically for generating examples for question answering.""" +from __future__ import annotations + +from typing import Any + +from langchain.chains.llm import LLMChain +from langchain.evaluation.qa.generate_prompt import PROMPT +from langchain.llms.base import BaseLLM + + +class QAGenerateChain(LLMChain): + """LLM Chain specifically for generating examples for question answering.""" + + @classmethod + def from_llm(cls, llm: BaseLLM, **kwargs: Any) -> QAGenerateChain: + """Load QA Generate Chain from LLM.""" + return cls(llm=llm, prompt=PROMPT, **kwargs) diff --git a/langchain/evaluation/qa/generate_prompt.py b/langchain/evaluation/qa/generate_prompt.py new file mode 100644 index 00000000..9ee74e8c --- /dev/null +++ b/langchain/evaluation/qa/generate_prompt.py @@ -0,0 +1,40 @@ +# flake8: noqa +import re +from typing import Dict + +from langchain.prompts import PromptTemplate +from langchain.prompts.base import BaseOutputParser + + +class QAGenerationOutputParser(BaseOutputParser): + """Parse output in question/answer pair.""" + + def parse(self, text: str) -> Dict[str, str]: + regex = r"QUESTION: (.*?)\nANSWER: (.*)" + match = re.search(regex, text) + if match: + question = match.group(1) + answer = match.group(2) + return {"query": question, "answer": answer} + else: + raise ValueError(f"Could not parse output: {text}") + + +template = """You are a teacher coming up with questions to ask on a quiz. +Given the following document, please generate a question and answer based on that document. + +Example Format: +<Begin Document> +... +<End Document> +QUESTION: question here +ANSWER: answer here + +These questions should be detailed and be based explicitly on information in the document. Begin! + +<Begin Document> +{doc} +<End Document>""" +PROMPT = PromptTemplate( + input_variables=["doc"], template=template, output_parser=QAGenerationOutputParser() +)
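A minimal end-to-end usage sketch of how the pieces added above fit together may help when reviewing: QAGenerateChain writes question/answer pairs from raw document chunks, the chain under test answers those questions, and QAEvalChain grades each prediction against the reference answer. This sketch is not part of the diff; it assumes `texts` (the split document chunks) and `qa` (the VectorDBQA chain) have been built as in the data_augmented_question_answering notebook above, and that an OpenAI API key is available in the environment.

from langchain.llms import OpenAI
from langchain.evaluation.qa import QAEvalChain, QAGenerateChain

# Generate question/answer examples from the first few document chunks.
# apply_and_parse runs the generation prompt and parses each completion into a
# {"query": ..., "answer": ...} dict via QAGenerationOutputParser.
example_gen_chain = QAGenerateChain.from_llm(OpenAI())
examples = example_gen_chain.apply_and_parse([{"doc": t} for t in texts[:5]])

# Run the question answering chain under test; each prediction dict carries the
# model's answer under the "result" key, which matches QAEvalChain.evaluate's
# default prediction_key.
predictions = qa.apply(examples)

# Grade each prediction against the reference answer with a deterministic LLM.
# Each graded output looks like {'text': ' CORRECT'} or {'text': ' INCORRECT'}.
eval_chain = QAEvalChain.from_llm(OpenAI(temperature=0))
graded_outputs = eval_chain.evaluate(examples, predictions)
for example, graded in zip(examples, graded_outputs):
    print(example["query"], "->", graded["text"])

The " CORRECT"/" INCORRECT" strings come straight from the eval prompt's GRADE field, which is why the notebook output above shows a leading space in each grade.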