mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Harrison/llm math (#1808)
Co-authored-by: Vadym Barda <vadim.barda@gmail.com>
This commit is contained in:
parent
7b6ff7fe00
commit
d5b4393bb2
306
docs/use_cases/evaluation/llm_math.ipynb
Normal file
306
docs/use_cases/evaluation/llm_math.ipynb
Normal file
@ -0,0 +1,306 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "a4734146",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# LLM Math\n",
|
||||||
|
"\n",
|
||||||
|
"Evaluating chains that know how to do math."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "fdd7afae",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Comment this out if you are NOT using tracing\n",
|
||||||
|
"import os\n",
|
||||||
|
"os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "ce05ffea",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "d028a511cede4de2b845b9a9954d6bea",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading readme: 0%| | 0.00/21.0 [00:00<?, ?B/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Downloading and preparing dataset json/LangChainDatasets--llm-math to /Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--llm-math-509b11d101165afa/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "a71c8e5a21dd4da5a20a354b544f7a58",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files: 0%| | 0/1 [00:00<?, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "ae530ca624154a1a934075c47d1093a6",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data: 0%| | 0.00/631 [00:00<?, ?B/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "7a4968df05d84bc483aa2c5039aecafe",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Extracting data files: 0%| | 0/1 [00:00<?, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Generating train split: 0 examples [00:00, ? examples/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Dataset json downloaded and prepared to /Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--llm-math-509b11d101165afa/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "9a2caed96225410fb1cc0f8f155eb766",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langchain.evaluation.loading import load_dataset\n",
|
||||||
|
"dataset = load_dataset(\"llm-math\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "8a998d6f",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Setting up a chain\n",
|
||||||
|
"Now we need to create some pipelines for doing math."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "7078f7f8",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.llms import OpenAI\n",
|
||||||
|
"from langchain.chains import LLMMathChain"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "2bd70c46",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"llm = OpenAI()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "954c3270",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"chain = LLMMathChain(llm=llm)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "f252027e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"predictions = chain.apply(dataset)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"id": "c8af7041",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"numeric_output = [float(p['answer'].strip().strip(\"Answer: \")) for p in predictions]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 23,
|
||||||
|
"id": "cc09ffe4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"correct = [example['answer'] == numeric_output[i] for i, example in enumerate(dataset)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 24,
|
||||||
|
"id": "585244e4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1.0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 24,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"sum(correct) / len(correct)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 25,
|
||||||
|
"id": "0d14ac78",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"input: 5\n",
|
||||||
|
"expected output : 5.0\n",
|
||||||
|
"prediction: 5.0\n",
|
||||||
|
"input: 5 + 3\n",
|
||||||
|
"expected output : 8.0\n",
|
||||||
|
"prediction: 8.0\n",
|
||||||
|
"input: 2^3.171\n",
|
||||||
|
"expected output : 9.006708689094099\n",
|
||||||
|
"prediction: 9.006708689094099\n",
|
||||||
|
"input: 2 ^3.171 \n",
|
||||||
|
"expected output : 9.006708689094099\n",
|
||||||
|
"prediction: 9.006708689094099\n",
|
||||||
|
"input: two to the power of three point one hundred seventy one\n",
|
||||||
|
"expected output : 9.006708689094099\n",
|
||||||
|
"prediction: 9.006708689094099\n",
|
||||||
|
"input: five + three squared minus 1\n",
|
||||||
|
"expected output : 13.0\n",
|
||||||
|
"prediction: 13.0\n",
|
||||||
|
"input: 2097 times 27.31\n",
|
||||||
|
"expected output : 57269.07\n",
|
||||||
|
"prediction: 57269.07\n",
|
||||||
|
"input: two thousand ninety seven times twenty seven point thirty one\n",
|
||||||
|
"expected output : 57269.07\n",
|
||||||
|
"prediction: 57269.07\n",
|
||||||
|
"input: 209758 / 2714\n",
|
||||||
|
"expected output : 77.28739867354459\n",
|
||||||
|
"prediction: 77.28739867354459\n",
|
||||||
|
"input: 209758.857 divided by 2714.31\n",
|
||||||
|
"expected output : 77.27888745205964\n",
|
||||||
|
"prediction: 77.27888745205964\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for i, example in enumerate(dataset):\n",
|
||||||
|
" print(\"input: \", example[\"question\"])\n",
|
||||||
|
" print(\"expected output :\", example[\"answer\"])\n",
|
||||||
|
" print(\"prediction: \", numeric_output[i])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "b9021ffd",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -1,26 +1,17 @@
|
|||||||
# flake8: noqa
|
# flake8: noqa
|
||||||
from langchain.prompts.prompt import PromptTemplate
|
from langchain.prompts.prompt import PromptTemplate
|
||||||
|
|
||||||
_PROMPT_TEMPLATE = """You are GPT-3, and you can't do math.
|
_PROMPT_TEMPLATE = """Translate a math problem into Python code that can be executed in Python 3 REPL. Use the output of running this code to answer the question.
|
||||||
|
|
||||||
You can do basic math, and your memorization abilities are impressive, but you can't do any complex calculations that a human could not do in their head. You also have an annoying tendency to just make up highly specific, but wrong, answers.
|
Question: ${{Question with math problem.}}
|
||||||
|
|
||||||
So we hooked you up to a Python 3 kernel, and now you can execute code. If anyone gives you a hard math problem, just use this format and we’ll take care of the rest:
|
|
||||||
|
|
||||||
Question: ${{Question with hard calculation.}}
|
|
||||||
```python
|
```python
|
||||||
${{Code that prints what you need to know}}
|
${{Code that solves the problem and prints the solution}}
|
||||||
```
|
```
|
||||||
```output
|
```output
|
||||||
${{Output of your code}}
|
${{Output of running the code}}
|
||||||
```
|
```
|
||||||
Answer: ${{Answer}}
|
Answer: ${{Answer}}
|
||||||
|
|
||||||
Otherwise, use this simpler format:
|
|
||||||
|
|
||||||
Question: ${{Question without hard calculation}}
|
|
||||||
Answer: ${{Answer}}
|
|
||||||
|
|
||||||
Begin.
|
Begin.
|
||||||
|
|
||||||
Question: What is 37593 * 67?
|
Question: What is 37593 * 67?
|
||||||
|
Loading…
Reference in New Issue
Block a user