From 2d098e8869f32ed1c09a51031fb8f185c3bcdb58 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Tue, 14 Mar 2023 12:37:48 -0700 Subject: [PATCH] Harrison/agent eval (#1620) Co-authored-by: jerwelborn --- docs/modules/chains/examples/sqlite.ipynb | 2 +- docs/use_cases/evaluation.rst | 80 ++- .../evaluation/agent_benchmarking.ipynb | 343 ++++++++++++ .../evaluation/agent_vectordb_sota_pg.ipynb | 516 ++++++++++++++++++ .../evaluation/benchmarking_template.ipynb | 160 ++++++ .../evaluation/qa_benchmarking_pg.ipynb | 374 +++++++++++++ .../evaluation/qa_benchmarking_sota.ipynb | 451 +++++++++++++++ docs/use_cases/evaluation/qa_generation.ipynb | 117 ++++ .../evaluation/question_answering.ipynb | 5 +- .../sql_qa_benchmarking_chinook.ipynb | 423 ++++++++++++++ langchain/chains/__init__.py | 2 + langchain/chains/qa_generation/__init__.py | 0 langchain/chains/qa_generation/base.py | 55 ++ langchain/chains/qa_generation/prompt.py | 50 ++ langchain/evaluation/loading.py | 8 + langchain/indexes/vectorstore.py | 2 +- 16 files changed, 2581 insertions(+), 7 deletions(-) create mode 100644 docs/use_cases/evaluation/agent_benchmarking.ipynb create mode 100644 docs/use_cases/evaluation/agent_vectordb_sota_pg.ipynb create mode 100644 docs/use_cases/evaluation/benchmarking_template.ipynb create mode 100644 docs/use_cases/evaluation/qa_benchmarking_pg.ipynb create mode 100644 docs/use_cases/evaluation/qa_benchmarking_sota.ipynb create mode 100644 docs/use_cases/evaluation/qa_generation.ipynb create mode 100644 docs/use_cases/evaluation/sql_qa_benchmarking_chinook.ipynb create mode 100644 langchain/chains/qa_generation/__init__.py create mode 100644 langchain/chains/qa_generation/base.py create mode 100644 langchain/chains/qa_generation/prompt.py create mode 100644 langchain/evaluation/loading.py diff --git a/docs/modules/chains/examples/sqlite.ipynb b/docs/modules/chains/examples/sqlite.ipynb index 2f614c8c7c..bce2dbbcf3 100644 --- a/docs/modules/chains/examples/sqlite.ipynb +++ b/docs/modules/chains/examples/sqlite.ipynb @@ -679,7 +679,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/use_cases/evaluation.rst b/docs/use_cases/evaluation.rst index 9517bb2371..effc10bd45 100644 --- a/docs/use_cases/evaluation.rst +++ b/docs/use_cases/evaluation.rst @@ -1,9 +1,85 @@ Evaluation ============== -Generative models are notoriously hard to evaluate with traditional metrics. One new way of evaluating them is using language models themselves to do the evaluation. LangChain provides some prompts/chains for assisting in this. +This section of documentation covers how we approach and think about evaluation in LangChain. +Both evaluation of internal chains/agents, but also how we would recommend people building on top of LangChain approach evaluation. -The examples here all highlight how to use language models to assist in evaluation of themselves. +The Problem +----------- + +It can be really hard to evaluate LangChain chains and agents. +There are two main reasons for this: + +**# 1: Lack of data** + +You generally don't have a ton of data to evaluate your chains/agents over before starting a project. +This is usually because Large Language Models (the core of most chains/agents) are terrific few-shot and zero shot learners, +meaning you are almost always able to get started on a particular task (text-to-SQL, question answering, etc) without +a large dataset of examples. 
+This is in stark contrast to traditional machine learning, where you had to first collect a bunch of datapoints
+before even getting started using a model.
+
+**# 2: Lack of metrics**
+
+Most chains/agents are performing tasks for which there are not very good metrics to evaluate performance.
+For example, one of the most common use cases is generating text of some form.
+Evaluating generated text is much more complicated than evaluating a classification or numeric prediction.
+
+The Solution
+------------
+
+LangChain attempts to tackle both of those issues.
+What we have so far are initial passes at solutions - we do not think we have a perfect solution.
+So we very much welcome feedback, contributions, integrations, and thoughts on this.
+
+Here is what we have for each problem so far:
+
+**# 1: Lack of data**
+
+We have started `LangChainDatasets `_, a community space on Hugging Face.
+We intend this to be a collection of open-source datasets for evaluating common chains and agents.
+We have contributed five datasets of our own to start, but we very much intend for this to be a community effort.
+In order to contribute a dataset, you simply need to join the community and then you will be able to upload datasets.
+
+We're also aiming to make it as easy as possible for people to create their own datasets.
+As a first pass at this, we've added a QAGenerationChain, which, given a document, comes up
+with question-answer pairs that can be used to evaluate question-answering tasks over that document down the line.
+See `this notebook <./evaluation/qa_generation.html>`_ for an example of how to use this chain.
+
+**# 2: Lack of metrics**
+
+We have two solutions to the lack of metrics.
+
+The first solution is to use no metrics at all, and instead just look at the results by eye to get a sense for how the chain/agent is performing.
+To assist in this, we have developed (and will continue to develop) `tracing <../tracing.html>`_, a UI-based visualizer of your chain and agent runs.
+
+The second solution we recommend is to use language models themselves to evaluate outputs.
+For this, we have a few different chains and prompts aimed at tackling this issue.
+
+The Examples
+------------
+
+We have created a bunch of examples combining the above two solutions to show how we internally evaluate chains and agents as we develop them.
+In addition to the examples we've curated, we also very much welcome contributions here.
+To facilitate that, we've included a `template notebook <./evaluation/benchmarking_template.html>`_ for community members to use to build their own examples.
+
+The existing examples we have are:
+
+`Question Answering (State of Union) <./evaluation/qa_benchmarking_sota.html>`_: A notebook showing evaluation of a question-answering task over a State-of-the-Union address.
+
+`Question Answering (Paul Graham Essay) <./evaluation/qa_benchmarking_pg.html>`_: A notebook showing evaluation of a question-answering task over a Paul Graham essay.
+
+`SQL Question Answering (Chinook) <./evaluation/sql_qa_benchmarking_chinook.html>`_: A notebook showing evaluation of a question-answering task over a SQL database (the Chinook database).
+
+`Agent Vectorstore <./evaluation/agent_vectordb_sota_pg.html>`_: A notebook showing evaluation of an agent doing question answering while routing between two different vector databases.
+ +`Agent Search + Calculator <./evaluation/agent_benchmarking.html>`_: An notebook showing evaluation of an agent doing question answering using a Search engine and a Calculator as tools. + + +Other Examples +------------ + +In addition, we also have some more generic resources for evaluation. `Question Answering <./evaluation/question_answering.html>`_: An overview of LLMs aimed at evaluating question answering systems in general. diff --git a/docs/use_cases/evaluation/agent_benchmarking.ipynb b/docs/use_cases/evaluation/agent_benchmarking.ipynb new file mode 100644 index 0000000000..26fcb4adb0 --- /dev/null +++ b/docs/use_cases/evaluation/agent_benchmarking.ipynb @@ -0,0 +1,343 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "984169ca", + "metadata": {}, + "source": [ + "# Agent Benchmarking: Search + Calculator\n", + "\n", + "Here we go over how to benchmark performance of an agent on tasks where it has access to a calculator and a search tool.\n", + "\n", + "It is highly reccomended that you do any evaluation/benchmarking with tracing enabled. See [here](https://langchain.readthedocs.io/en/latest/tracing.html) for an explanation of what tracing is and how to set it up." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "46bf9205", + "metadata": {}, + "outputs": [], + "source": [ + "# Comment this out if you are NOT using tracing\n", + "import os\n", + "os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\"" + ] + }, + { + "cell_type": "markdown", + "id": "8a16b75d", + "metadata": {}, + "source": [ + "## Loading the data\n", + "First, let's load the data." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5b2d5e98", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset json (/Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--agent-search-calculator-8a025c0ce5fb99d2/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3a275586643f4ccfba1a8d54be28c351", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00._completion_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')).\n" + ] + } + ], + "source": [ + "predictions = []\n", + "predicted_dataset = []\n", + "error_dataset = []\n", + "for data in dataset:\n", + " new_data = {\"input\": data[\"question\"], \"answer\": data[\"answer\"]}\n", + " try:\n", + " predictions.append(agent(new_data))\n", + " predicted_dataset.append(new_data)\n", + " except Exception:\n", + " error_dataset.append(new_data)" + ] + }, + { + "cell_type": "markdown", + "id": "49d969fb", + "metadata": {}, + "source": [ + "## Evaluate performance\n", + "Now we can evaluate the predictions. The first thing we can do is look at them by eye." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1d583f03", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'input': 'How many people live in canada as of 2023?',\n", + " 'answer': 'approximately 38,625,801',\n", + " 'output': '38,630,316 people live in Canada as of 2023.',\n", + " 'intermediate_steps': [(AgentAction(tool='Search', tool_input='Population of Canada 2023', log=' I need to find population data\\nAction: Search\\nAction Input: Population of Canada 2023'),\n", + " '38,630,316')]}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions[0]" + ] + }, + { + "cell_type": "markdown", + "id": "4783344b", + "metadata": {}, + "source": [ + "Next, we can use a language model to score them programatically" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d0a9341d", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.evaluation.qa import QAEvalChain" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "1612dec1", + "metadata": {}, + "outputs": [], + "source": [ + "llm = OpenAI(temperature=0)\n", + "eval_chain = QAEvalChain.from_llm(llm)\n", + "graded_outputs = eval_chain.evaluate(dataset, predictions, question_key=\"question\", prediction_key=\"output\")" + ] + }, + { + "cell_type": "markdown", + "id": "79587806", + "metadata": {}, + "source": [ + "We can add in the graded output to the `predictions` dict and then get a count of the grades." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "2a689df5", + "metadata": {}, + "outputs": [], + "source": [ + "for i, prediction in enumerate(predictions):\n", + " prediction['grade'] = graded_outputs[i]['text']" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "27b61215", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({' CORRECT': 4, ' INCORRECT': 6})" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from collections import Counter\n", + "Counter([pred['grade'] for pred in predictions])" + ] + }, + { + "cell_type": "markdown", + "id": "12fe30f4", + "metadata": {}, + "source": [ + "We can also filter the datapoints to the incorrect examples and look at them." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "47c692a1", + "metadata": {}, + "outputs": [], + "source": [ + "incorrect = [pred for pred in predictions if pred['grade'] == \" INCORRECT\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "0ef976c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'input': \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\",\n", + " 'answer': 'her boyfriend is Romain Gravas. his age raised to the .43 power is approximately 4.9373857399466665',\n", + " 'output': \"Isaac Carew, Dua Lipa's boyfriend, is 36 years old and his age raised to the .43 power is 4.6688516567750975.\",\n", + " 'intermediate_steps': [(AgentAction(tool='Search', tool_input=\"Dua Lipa's boyfriend\", log=' I need to find out who Dua Lipa\\'s boyfriend is and then calculate his age raised to the .43 power\\nAction: Search\\nAction Input: \"Dua Lipa\\'s boyfriend\"'),\n", + " 'Dua and Isaac, a model and a chef, dated on and off from 2013 to 2019. 
The two first split in early 2017, which is when Dua went on to date LANY ...'),\n", + " (AgentAction(tool='Search', tool_input='Isaac Carew age', log=' I need to find out Isaac\\'s age\\nAction: Search\\nAction Input: \"Isaac Carew age\"'),\n", + " '36 years'),\n", + " (AgentAction(tool='Calculator', tool_input='36^.43', log=' I need to calculate 36 raised to the .43 power\\nAction: Calculator\\nAction Input: 36^.43'),\n", + " 'Answer: 4.6688516567750975\\n')],\n", + " 'grade': ' INCORRECT'}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "incorrect[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7710401a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/use_cases/evaluation/agent_vectordb_sota_pg.ipynb b/docs/use_cases/evaluation/agent_vectordb_sota_pg.ipynb new file mode 100644 index 0000000000..954dc7759e --- /dev/null +++ b/docs/use_cases/evaluation/agent_vectordb_sota_pg.ipynb @@ -0,0 +1,516 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "984169ca", + "metadata": {}, + "source": [ + "# Agent VectorDB Question Answering Benchmarking\n", + "\n", + "Here we go over how to benchmark performance on a question answering task using an agent to route between multiple vectordatabases.\n", + "\n", + "It is highly reccomended that you do any evaluation/benchmarking with tracing enabled. See [here](https://langchain.readthedocs.io/en/latest/tracing.html) for an explanation of what tracing is and how to set it up." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "7b57a50f", + "metadata": {}, + "outputs": [], + "source": [ + "# Comment this out if you are NOT using tracing\n", + "import os\n", + "os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\"" + ] + }, + { + "cell_type": "markdown", + "id": "8a16b75d", + "metadata": {}, + "source": [ + "## Loading the data\n", + "First, let's load the data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5b2d5e98", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset json (/Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--agent-vectordb-qa-sota-pg-d3ae24016b514f92/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a7abbc20615d4c58b75a055a790d7212", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00 QAGenerationChain: + _prompt = prompt or PROMPT_SELECTOR.get_prompt(llm) + chain = LLMChain(llm=llm, prompt=_prompt) + return cls(llm_chain=chain, **kwargs) + + @property + def _chain_type(self) -> str: + raise NotImplementedError + + @property + def input_keys(self) -> List[str]: + return [self.input_key] + + @property + def output_keys(self) -> List[str]: + return [self.output_key] + + def _call(self, inputs: Dict[str, str]) -> Dict[str, Any]: + docs = self.text_splitter.create_documents([inputs[self.input_key]]) + results = self.llm_chain.generate([{"text": d.page_content} for d in docs]) + qa = [json.loads(res[0].text) for res in results.generations] + return {self.output_key: qa} + + async def _acall(self, inputs: Dict[str, str]) -> Dict[str, str]: + raise NotImplementedError diff --git a/langchain/chains/qa_generation/prompt.py b/langchain/chains/qa_generation/prompt.py new file mode 100644 index 0000000000..3919c2a239 --- /dev/null +++ b/langchain/chains/qa_generation/prompt.py @@ -0,0 +1,50 @@ +# flake8: noqa +from langchain.chains.prompt_selector import ConditionalPromptSelector, is_chat_model +from langchain.prompts.chat import ( + ChatPromptTemplate, + HumanMessagePromptTemplate, + SystemMessagePromptTemplate, +) +from langchain.prompts.prompt import PromptTemplate + +templ1 = """You are a smart assistant designed to help high school teachers come up with reading comprehension questions. +Given a piece of text, you must come up with a question and answer pair that can be used to test a student's reading comprehension abilities. +When coming up with this question/answer pair, you must respond in the following format: +``` +{{ + "question": "$YOUR_QUESTION_HERE", + "answer": "$THE_ANSWER_HERE" +}} +``` + +Everything between the ``` must be valid json. +""" +templ2 = """Please come up with a question/answer pair, in the specified JSON format, for the following text: +---------------- +{text}""" +CHAT_PROMPT = ChatPromptTemplate.from_messages( + [ + SystemMessagePromptTemplate.from_template(templ1), + HumanMessagePromptTemplate.from_template(templ2), + ] +) +templ = """You are a smart assistant designed to help high school teachers come up with reading comprehension questions. +Given a piece of text, you must come up with a question and answer pair that can be used to test a student's reading comprehension abilities. +When coming up with this question/answer pair, you must respond in the following format: +``` +{{ + "question": "$YOUR_QUESTION_HERE", + "answer": "$THE_ANSWER_HERE" +}} +``` + +Everything between the ``` must be valid json. 
+ +Please come up with a question/answer pair, in the specified JSON format, for the following text: +---------------- +{text}""" +PROMPT = PromptTemplate.from_template(templ) + +PROMPT_SELECTOR = ConditionalPromptSelector( + default_prompt=PROMPT, conditionals=[(is_chat_model, CHAT_PROMPT)] +) diff --git a/langchain/evaluation/loading.py b/langchain/evaluation/loading.py new file mode 100644 index 0000000000..613e261303 --- /dev/null +++ b/langchain/evaluation/loading.py @@ -0,0 +1,8 @@ +from typing import Dict, List + + +def load_dataset(uri: str) -> List[Dict]: + from datasets import load_dataset + + dataset = load_dataset(f"LangChainDatasets/{uri}") + return [d for d in dataset["train"]] diff --git a/langchain/indexes/vectorstore.py b/langchain/indexes/vectorstore.py index b3d0ca7227..dc5807e490 100644 --- a/langchain/indexes/vectorstore.py +++ b/langchain/indexes/vectorstore.py @@ -50,8 +50,8 @@ class VectorstoreIndexCreator(BaseModel): """Logic for creating indexes.""" vectorstore_cls: Type[VectorStore] = Chroma - text_splitter: TextSplitter = Field(default_factory=_get_default_text_splitter) embedding: Embeddings = Field(default_factory=OpenAIEmbeddings) + text_splitter: TextSplitter = Field(default_factory=_get_default_text_splitter) vectorstore_kwargs: dict = Field(default_factory=dict) class Config:
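The benchmark notebooks added in this patch all follow the same basic pattern: pull a dataset from the LangChainDatasets space with `load_dataset`, run a chain or agent over each example, and grade the outputs with `QAEvalChain`. The sketch below is a minimal end-to-end version of that loop. The agent construction (`load_tools`, `initialize_agent`, the SerpAPI/OpenAI keys) is an illustrative assumption and is not part of this diff; the notebooks' actual tool and vectorstore setup may differ.

```python
from collections import Counter

from langchain.agents import initialize_agent, load_tools
from langchain.evaluation.loading import load_dataset
from langchain.evaluation.qa import QAEvalChain
from langchain.llms import OpenAI

# Pull one of the benchmark datasets from the LangChainDatasets space on Hugging Face.
dataset = load_dataset("agent-search-calculator")

# Illustrative agent setup (not part of this diff): a search + calculator agent.
# Requires OPENAI_API_KEY and SERPAPI_API_KEY to be set.
llm = OpenAI(temperature=0)
tools = load_tools(["serpapi", "llm-math"], llm=llm)
agent = initialize_agent(
    tools, llm, agent="zero-shot-react-description", return_intermediate_steps=True
)

# Run the agent over every example, keeping track of runs that raise errors.
predictions, predicted_dataset, error_dataset = [], [], []
for data in dataset:
    new_data = {"input": data["question"], "answer": data["answer"]}
    try:
        predictions.append(agent(new_data))
        predicted_dataset.append(new_data)
    except Exception:
        error_dataset.append(new_data)

# Grade each prediction against its reference answer with an LLM.
eval_chain = QAEvalChain.from_llm(llm)
graded_outputs = eval_chain.evaluate(
    predicted_dataset,
    predictions,
    question_key="input",
    answer_key="answer",
    prediction_key="output",
)
for i, prediction in enumerate(predictions):
    prediction["grade"] = graded_outputs[i]["text"]

print(Counter(pred["grade"] for pred in predictions))
```

Grading against `predicted_dataset` rather than the raw dataset keeps examples and predictions aligned when some runs raise errors.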
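For the "lack of data" problem, the new `QAGenerationChain` (`langchain/chains/qa_generation/base.py`) splits a document into chunks and asks the LLM for one question/answer pair per chunk, parsing the JSON format specified in `prompt.py`. A small usage sketch follows; it assumes the chain is re-exported from `langchain.chains` (as the `__init__.py` change in the diffstat suggests) and uses a placeholder file name, since only part of the class body appears in this diff.

```python
# Assumed re-export; otherwise import from langchain.chains.qa_generation.base.
from langchain.chains import QAGenerationChain
from langchain.chat_models import ChatOpenAI

# PROMPT_SELECTOR picks the chat-style prompt automatically for chat models.
chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0))

# Placeholder document; substitute any text you want to build an eval set for.
with open("state_of_the_union.txt") as f:
    doc = f.read()

# The chain splits the text, generates one JSON question/answer pair per chunk,
# and returns the parsed list of {"question": ..., "answer": ...} dicts.
qa_pairs = chain.run(doc)
print(qa_pairs[0])
```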