langchain/docs/use_cases/evaluation/llm_math.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a4734146",
   "metadata": {},
   "source": [
    "# LLM Math\n",
    "\n",
    "Evaluating chains that know how to do math."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "fdd7afae",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Comment this out if you are NOT using tracing\n",
    "import os\n",
    "os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "ce05ffea",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d028a511cede4de2b845b9a9954d6bea",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset json/LangChainDatasets--llm-math to /Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--llm-math-509b11d101165afa/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a71c8e5a21dd4da5a20a354b544f7a58",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ae530ca624154a1a934075c47d1093a6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0.00/631 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7a4968df05d84bc483aa2c5039aecafe",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset json downloaded and prepared to /Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--llm-math-509b11d101165afa/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9a2caed96225410fb1cc0f8f155eb766",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from langchain.evaluation.loading import load_dataset\n",
    "dataset = load_dataset(\"llm-math\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8a998d6f",
   "metadata": {},
   "source": [
    "## Setting up a chain\n",
    "Now we need to create some pipelines for doing math."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "7078f7f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.llms import OpenAI\n",
    "from langchain.chains import LLMMathChain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2bd70c46",
   "metadata": {},
   "outputs": [],
   "source": [
    "llm = OpenAI()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "954c3270",
   "metadata": {},
   "outputs": [],
   "source": [
    "chain = LLMMathChain(llm=llm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "f252027e",
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = chain.apply(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "c8af7041",
   "metadata": {},
   "outputs": [],
   "source": [
    "numeric_output = [float(p['answer'].strip().strip(\"Answer: \")) for p in predictions]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "cc09ffe4",
   "metadata": {},
   "outputs": [],
   "source": [
    "correct = [example['answer'] == numeric_output[i] for i, example in enumerate(dataset)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "585244e4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sum(correct) / len(correct)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "0d14ac78",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "input:  5\n",
      "expected output : 5.0\n",
      "prediction:  5.0\n",
      "input:  5 + 3\n",
      "expected output : 8.0\n",
      "prediction:  8.0\n",
      "input:  2^3.171\n",
      "expected output : 9.006708689094099\n",
      "prediction:  9.006708689094099\n",
      "input:    2 ^3.171 \n",
      "expected output : 9.006708689094099\n",
      "prediction:  9.006708689094099\n",
      "input:  two to the power of three point one hundred seventy one\n",
      "expected output : 9.006708689094099\n",
      "prediction:  9.006708689094099\n",
      "input:  five + three squared minus 1\n",
      "expected output : 13.0\n",
      "prediction:  13.0\n",
      "input:  2097 times 27.31\n",
      "expected output : 57269.07\n",
      "prediction:  57269.07\n",
      "input:  two thousand ninety seven times twenty seven point thirty one\n",
      "expected output : 57269.07\n",
      "prediction:  57269.07\n",
      "input:  209758 / 2714\n",
      "expected output : 77.28739867354459\n",
      "prediction:  77.28739867354459\n",
      "input:  209758.857 divided by 2714.31\n",
      "expected output : 77.27888745205964\n",
      "prediction:  77.27888745205964\n"
     ]
    }
   ],
   "source": [
    "for i, example in enumerate(dataset):\n",
    "    print(\"input: \", example[\"question\"])\n",
    "    print(\"expected output :\", example[\"answer\"])\n",
    "    print(\"prediction: \", numeric_output[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9021ffd",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}