fixed styling (#1119)

2 months ago · 27f7f361b9
parent 6333678834
commit 27f7f361b9
2 changed files with 514 additions and 235 deletions
--- a/.gitignore
+++ b/.gitignore
@ -137,3 +137,6 @@ dmypy.json
 *.DS_Store
 tmp_*
 examples/fine-tuned_qa/local_cache/*
 # PyCharm files
 .idea/
--- a/examples/evaluation/Getting_Started_with_OpenAI_Evals.ipynb
+++ b/examples/evaluation/Getting_Started_with_OpenAI_Evals.ipynb
@ -95,13 +95,16 @@
  {
   "cell_type": "code",
   "execution_count": 1,
-   "metadata": {},
+   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-03-27T02:24:59.629Z",
     "start_time": "2024-03-27T02:24:50.505893Z"
    }
   },
   "outputs": [],
   "source": [
    "from openai import OpenAI\n",
    "import pandas as pd\n",
    "import os\n",
    "import json\n",
    "\n",
    "client = OpenAI()"
   ]
@ -172,21 +175,21 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Q: What is the average horsepower for cars made by makers in Europe?\n",
+      "Q: What is the average horsepower for cars made in Europe?\n",
      "A: SELECT AVG(cars_data.Horsepower) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\n",
      "\n",
-      "Q: What is the average weight of cars produced by makers from the continent of Europe?\n",
+      "Q: What is the average horsepower for cars made in the USA?\n",
-      "A: SELECT AVG(cars_data.Weight) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\n",
+      "A: SELECT AVG(cars_data.Horsepower) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN car_makers ON car_names.MakeId = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId WHERE countries.CountryName = 'USA'\n",
      "\n",
-      "Q: What is the average MPG for cars made in countries in the continent of Europe?\n",
+      "Q: What is the average horsepower for cars produced in countries from the continent with the id '3'?\n",
-      "A: SELECT AVG(cars_data.MPG) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN car_makers ON car_names.MakeId = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\n",
+      "A: SELECT AVG(cars_data.Horsepower) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.ContId = '3'\n",
      "\n",
-      "Q: What is the average horsepower for cars made by a maker from Europe?\n",
+      "Q: What is the average horsepower for cars made by makers from Europe?\n",
      "A: SELECT AVG(cars_data.Horsepower) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\n",
      "\n",
-      "A: SELECT AVG(cars_data.Horsepower) AS AverageHorsepower FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\n",
+      "Q: What is the average horsepower for cars made in the USA?\n",
      "\n",
-      "Q: What is the average horsepower for cars made by makers in the continent of Europe?\n",
+      "A: SELECT AVG(cars_data.Horsepower) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN car_makers ON car_names.MakeId = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId WHERE countries.CountryName = 'USA'\n",
      "A: SELECT avg(cars_data.Horsepower) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\n",
      "\n"
     ]
    }
@ -262,11 +265,11 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "{'input': [{'role': 'system', 'content': 'TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]'}, {'role': 'user', 'content': 'What is the average horsepower for cars made by makers in Europe?'}], 'ideal': \"SELECT AVG(cars_data.Horsepower) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\"}\n",
+      "{'input': [{'role': 'system', 'content': 'TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]'}, {'role': 'user', 'content': 'What is the average horsepower for cars made in Europe?'}], 'ideal': \"SELECT AVG(cars_data.Horsepower) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\"}\n",
-      "{'input': [{'role': 'system', 'content': 'TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]'}, {'role': 'user', 'content': 'What is the average weight of cars produced by makers from the continent of Europe?'}], 'ideal': \"SELECT AVG(cars_data.Weight) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\"}\n",
+      "{'input': [{'role': 'system', 'content': 'TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]'}, {'role': 'user', 'content': 'What is the average horsepower for cars made in the USA?'}], 'ideal': \"SELECT AVG(cars_data.Horsepower) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN car_makers ON car_names.MakeId = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId WHERE countries.CountryName = 'USA'\"}\n",
-      "{'input': [{'role': 'system', 'content': 'TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]'}, {'role': 'user', 'content': 'What is the average MPG for cars made in countries in the continent of Europe?'}], 'ideal': \"SELECT AVG(cars_data.MPG) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN car_makers ON car_names.MakeId = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\"}\n",
+      "{'input': [{'role': 'system', 'content': 'TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]'}, {'role': 'user', 'content': \"What is the average horsepower for cars produced in countries from the continent with the id '3'?\"}], 'ideal': \"SELECT AVG(cars_data.Horsepower) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.ContId = '3'\"}\n",
-      "{'input': [{'role': 'system', 'content': 'TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]'}, {'role': 'user', 'content': 'What is the average horsepower for cars made by a maker from Europe?'}], 'ideal': \"SELECT AVG(cars_data.Horsepower) AS AverageHorsepower FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\"}\n",
+      "{'input': [{'role': 'system', 'content': 'TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]'}, {'role': 'user', 'content': 'What is the average horsepower for cars made by makers from Europe?'}], 'ideal': \"SELECT AVG(cars_data.Horsepower) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\"}\n",
-      "{'input': [{'role': 'system', 'content': 'TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]'}, {'role': 'user', 'content': 'What is the average horsepower for cars made by makers in the continent of Europe?'}], 'ideal': \"SELECT avg(cars_data.Horsepower) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\"}\n"
+      "{'input': [{'role': 'system', 'content': 'TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]'}, {'role': 'user', 'content': 'What is the average horsepower for cars made in the USA?'}], 'ideal': \"SELECT AVG(cars_data.Horsepower) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN car_makers ON car_names.MakeId = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId WHERE countries.CountryName = 'USA'\"}\n"
     ]
    }
   ],
@ -357,7 +360,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-03-18T07:29:03.774758Z",
@ -370,7 +373,7 @@
   },
   "outputs": [],
   "source": [
-    "# !pip install evals"
+    "!pip install evals --quiet"
   ]
  },
  {
@ -393,7 +396,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-03-18T07:31:42.602736Z",
@ -409,66 +412,76 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[2024-03-25 13:23:36,497] [registry.py:257] Loading registry from /Users/roy/Documents/Github/openai-cookbook/.venv/lib/python3.9/site-packages/evals/registry/evals\n",
+      "[2024-03-26 19:44:39,836] [registry.py:257] Loading registry from /Users/shyamal/.virtualenvs/openai/lib/python3.11/site-packages/evals/registry/evals\n",
-      "[2024-03-25 13:23:38,131] [registry.py:257] Loading registry from /Users/roy/.evals/evals\n",
+      "[2024-03-26 19:44:43,623] [registry.py:257] Loading registry from /Users/shyamal/.evals/evals\n",
-      "[2024-03-25 13:23:38,133] [oaieval.py:189] \u001b[1;35mRun started: 2403252023385ZVJZ3UF\u001b[0m\n",
+      "[2024-03-26 19:44:43,635] [oaieval.py:189] \u001b[1;35mRun started: 240327024443FACXGMKA\u001b[0m\n",
-      "[2024-03-25 13:23:38,143] [registry.py:257] Loading registry from /Users/roy/Documents/Github/openai-cookbook/.venv/lib/python3.9/site-packages/evals/registry/modelgraded\n",
+      "[2024-03-26 19:44:43,663] [registry.py:257] Loading registry from /Users/shyamal/.virtualenvs/openai/lib/python3.11/site-packages/evals/registry/modelgraded\n",
-      "[2024-03-25 13:23:38,217] [registry.py:257] Loading registry from /Users/roy/.evals/modelgraded\n",
+      "[2024-03-26 19:44:43,851] [registry.py:257] Loading registry from /Users/shyamal/.evals/modelgraded\n",
-      "[2024-03-25 13:23:38,218] [data.py:90] Fetching /Users/roy/Documents/Github/openai-cookbook/.venv/lib/python3.9/site-packages/evals/registry/data/sql/spider_sql.jsonl\n",
+      "[2024-03-26 19:44:43,853] [data.py:90] Fetching /Users/shyamal/.virtualenvs/openai/lib/python3.11/site-packages/evals/registry/data/sql/spider_sql.jsonl\n",
-      "[2024-03-25 13:23:38,224] [eval.py:36] Evaluating 20 samples\n",
+      "[2024-03-26 19:44:43,878] [eval.py:36] Evaluating 25 samples\n",
-      "[2024-03-25 13:23:38,282] [eval.py:144] Running in threaded mode with 10 threads!\n",
+      "[2024-03-26 19:44:43,952] [eval.py:144] Running in threaded mode with 10 threads!\n",
-      "  0%|                                                    | 0/20 [00:00<?, ?it/s][2024-03-25 13:23:38,795] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "  0%|                                                    | 0/25 [00:00<?, ?it/s][2024-03-26 19:44:44,810] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:38,836] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:44,829] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:38,839] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:44,991] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:38,862] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:45,090] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:38,875] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:45,145] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:38,981] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:45,971] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:39,070] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:46,040] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:39,581] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:46,069] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:39,829] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:46,378] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:40,234] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:46,587] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:40,593] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:47,412] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "  5%|██▏                                         | 1/20 [00:02<00:43,  2.31s/it][2024-03-25 13:23:40,868] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "  4%|█▊                                          | 1/25 [00:03<01:23,  3.46s/it][2024-03-26 19:44:47,714] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      " 10%|████▍                                       | 2/20 [00:02<00:20,  1.11s/it][2024-03-25 13:23:41,090] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "  8%|███▌                                        | 2/25 [00:03<00:36,  1.60s/it][2024-03-26 19:44:47,947] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      " 15%|██████▌                                     | 3/20 [00:02<00:12,  1.41it/s][2024-03-25 13:23:41,356] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      " 12%|█████▎                                      | 3/25 [00:03<00:21,  1.02it/s][2024-03-26 19:44:48,413] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:41,707] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:48,643] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:42,223] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      " 16%|███████                                     | 4/25 [00:04<00:18,  1.15it/s][2024-03-26 19:44:48,909] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      " 20%|████████▊                                   | 4/20 [00:03<00:13,  1.14it/s][2024-03-25 13:23:42,342] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      " 20%|████████▊                                   | 5/25 [00:04<00:12,  1.54it/s][2024-03-26 19:44:49,131] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      " 25%|███████████                                 | 5/20 [00:04<00:09,  1.66it/s][2024-03-25 13:23:42,532] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:49,500] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      " 30%|█████████████▏                              | 6/20 [00:04<00:06,  2.17it/s][2024-03-25 13:23:42,787] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:49,530] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      " 35%|███████████████▍                            | 7/20 [00:04<00:05,  2.54it/s][2024-03-25 13:23:42,963] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      " 24%|██████████▌                                 | 6/25 [00:05<00:12,  1.56it/s][2024-03-26 19:44:49,962] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:42,984] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:49,964] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      " 40%|█████████████████▌                          | 8/20 [00:04<00:03,  3.02it/s][2024-03-25 13:23:43,056] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:49,967] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:43,108] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      " 28%|████████████▎                               | 7/25 [00:06<00:10,  1.73it/s][2024-03-26 19:44:50,577] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:43,127] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:50,602] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      " 45%|███████████████████▊                        | 9/20 [00:04<00:02,  3.67it/s][2024-03-25 13:23:43,585] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:50,634] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      " 50%|█████████████████████▌                     | 10/20 [00:05<00:03,  3.04it/s][2024-03-25 13:23:43,653] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:50,862] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:43,699] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:51,503] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:43,839] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:51,608] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:43,927] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      " 40%|█████████████████▏                         | 10/25 [00:07<00:08,  1.79it/s][2024-03-26 19:44:51,801] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:44,946] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      " 44%|██████████████████▉                        | 11/25 [00:07<00:06,  2.09it/s][2024-03-26 19:44:51,856] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:45,205] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:51,969] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:45,213] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:52,227] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      " 55%|███████████████████████▋                   | 11/20 [00:06<00:06,  1.38it/s][2024-03-25 13:23:45,485] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      " 52%|██████████████████████▎                    | 13/25 [00:08<00:04,  2.65it/s][2024-03-26 19:44:52,450] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      " 65%|███████████████████████████▉               | 13/20 [00:07<00:03,  2.21it/s][2024-03-25 13:23:45,611] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:52,526] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      " 70%|██████████████████████████████             | 14/20 [00:07<00:02,  2.70it/s][2024-03-25 13:23:45,730] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:52,615] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      " 75%|████████████████████████████████▎          | 15/20 [00:07<00:01,  3.28it/s][2024-03-25 13:23:45,769] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      " 56%|████████████████████████                   | 14/25 [00:08<00:04,  2.64it/s][2024-03-26 19:44:52,625] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:46,265] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:52,777] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      " 85%|████████████████████████████████████▌      | 17/20 [00:07<00:00,  3.46it/s][2024-03-25 13:23:46,393] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:53,653] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      " 90%|██████████████████████████████████████▋    | 18/20 [00:08<00:00,  3.99it/s][2024-03-25 13:23:47,284] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      " 60%|█████████████████████████▊                 | 15/25 [00:09<00:05,  1.87it/s][2024-03-26 19:44:53,670] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      " 95%|████████████████████████████████████████▊  | 19/20 [00:09<00:00,  2.43it/s][2024-03-25 13:23:49,136] [_client.py:1013] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "[2024-03-26 19:44:54,028] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "100%|███████████████████████████████████████████| 20/20 [00:10<00:00,  1.84it/s]\n",
+      " 68%|█████████████████████████████▏             | 17/25 [00:10<00:03,  2.54it/s][2024-03-26 19:44:54,388] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:49,153] [record.py:360] Final report: {'counts/Correct': 17, 'counts/Incorrect': 3, 'score': 0.85}. Logged to /tmp/evallogs/2403252023385ZVJZ3UF_gpt-3.5-turbo_spider-sql.jsonl\n",
+      "[2024-03-26 19:44:54,396] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:49,154] [oaieval.py:229] Final report:\n",
+      " 72%|██████████████████████████████▉            | 18/25 [00:10<00:02,  2.58it/s][2024-03-26 19:44:54,529] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:49,154] [oaieval.py:231] counts/Correct: 17\n",
+      "[2024-03-26 19:44:54,585] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:49,154] [oaieval.py:231] counts/Incorrect: 3\n",
+      " 76%|████████████████████████████████▋          | 19/25 [00:10<00:02,  2.94it/s][2024-03-26 19:44:54,980] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:49,154] [oaieval.py:231] score: 0.85\n",
+      " 80%|██████████████████████████████████▍        | 20/25 [00:11<00:01,  2.82it/s][2024-03-26 19:44:55,152] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
-      "[2024-03-25 13:23:49,176] [record.py:349] Logged 60 rows of events to /tmp/evallogs/2403252023385ZVJZ3UF_gpt-3.5-turbo_spider-sql.jsonl: insert_time=20.087ms\n"
+      " 84%|████████████████████████████████████       | 21/25 [00:11<00:01,  3.27it/s][2024-03-26 19:44:56,420] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
      " 88%|█████████████████████████████████████▊     | 22/25 [00:12<00:01,  1.75it/s][2024-03-26 19:44:56,984] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
      " 92%|███████████████████████████████████████▌   | 23/25 [00:13<00:01,  1.76it/s][2024-03-26 19:44:57,370] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
      " 96%|█████████████████████████████████████████▎ | 24/25 [00:13<00:00,  1.94it/s][2024-03-26 19:44:59,589] [_client.py:1026] HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
      "100%|███████████████████████████████████████████| 25/25 [00:15<00:00,  1.60it/s]\n",
      "[2024-03-26 19:44:59,607] [record.py:360] Final report: {'counts/Correct': 20, 'counts/Incorrect': 5, 'score': 0.8}. Logged to /tmp/evallogs/240327024443FACXGMKA_gpt-3.5-turbo_spider-sql.jsonl\n",
      "[2024-03-26 19:44:59,608] [oaieval.py:229] Final report:\n",
      "[2024-03-26 19:44:59,608] [oaieval.py:231] counts/Correct: 20\n",
      "[2024-03-26 19:44:59,608] [oaieval.py:231] counts/Incorrect: 5\n",
      "[2024-03-26 19:44:59,608] [oaieval.py:231] score: 0.8\n",
      "[2024-03-26 19:44:59,640] [record.py:349] Logged 75 rows of events to /tmp/evallogs/240327024443FACXGMKA_gpt-3.5-turbo_spider-sql.jsonl: insert_time=27.915ms\n"
     ]
    }
   ],
   "source": [
-    "!oaieval gpt-3.5-turbo spider-sql --max_samples 20"
+    "!oaieval gpt-3.5-turbo spider-sql --max_samples 25"
   ]
  },
  {
@ -489,7 +502,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-03-18T20:37:01.920497Z",
@ -536,7 +549,7 @@
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
-       "      <td>{'completion_fns': ['gpt-3.5-turbo'], 'eval_na...</td>\n",
+       "      <td>{'completion_fns': ['gpt-3.5-turbo'], 'eval_name': 'spider-sql.dev.v0', 'base_eval': 'spider-sql', 'split': 'dev', 'run_config': {'completion_fns': ['gpt-3.5-turbo'], 'eval_spec': {'cls': 'evals.elsuite.modelgraded.classify:ModelBasedClassify', 'registry_path': '/Users/shyamal/.virtualenvs/openai/lib/python3.11/site-packages/evals/registry', 'args': {'samples_jsonl': 'sql/spider_sql.jsonl', 'eval_type': 'cot_classify', 'modelgraded_spec': 'sql'}, 'key': 'spider-sql.dev.v0', 'group': 'sql'}, 'seed': 20220722, 'max_samples': 25, 'command': '/Users/shyamal/.virtualenvs/openai/bin/oaieval gpt-3.5-turbo spider-sql --max_samples 25', 'initial_settings': {'visible': False}}, 'created_by': '', 'run_id': '240327024443FACXGMKA', 'created_at': '2024-03-27 02:44:43.626043'}</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
@ -549,7 +562,7 @@
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
-       "      <td>{'counts/Correct': 17, 'counts/Incorrect': 3, ...</td>\n",
+       "      <td>{'counts/Correct': 20, 'counts/Incorrect': 5, 'score': 0.8}</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
@ -562,77 +575,131 @@
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
-       "      <td>2403252023385ZVJZ3UF</td>\n",
+       "      <td>240327024443FACXGMKA</td>\n",
       "      <td>0.0</td>\n",
-       "      <td>spider-sql.dev.117</td>\n",
+       "      <td>spider-sql.dev.88</td>\n",
       "      <td>sampling</td>\n",
-       "      <td>{'prompt': [{'content': 'Answer the following ...</td>\n",
+       "      <td>{'prompt': [{'content': 'Answer the following question with syntactically correct SQLite SQL. Be creative but the SQL must be correct.\n",
       "Use only the following tables and columns:\n",
       "Table: players. Columns: player_id (number), first_name (text), last_name (text), hand (text), birth_date (time), country_code (text)\n",
       "Table: matches. Columns: best_of (number), draw_size (number), loser_age (number), loser_entry (text), loser_hand (text), loser_ht (number), loser_id (number), loser_ioc (text), loser_name (text), loser_rank (number), loser_rank_points (number), loser_seed (number), match_num (number), minutes (number), round (text), score (text), surface (text), tourney_date (time), tourney_id (text), tourney_level (text), tourney_name (text), winner_age (number), winner_entry (text), winner_hand (text), winner_ht (number), winner_id (number), winner_ioc (text), winner_name (text), winner_rank (number), winner_rank_points (number), winner_seed (number), year (number)\n",
       "Table: rankings. Columns: ranking_date (time), ranking (number), player_id (number), ranking_points (number), tours (number)\n",
       "\n",
       "Question: Find the average rank of winners in all matches.\n",
       "', 'role': 'system'}], 'sampled': ['SELECT AVG(winner_rank) AS average_rank_of_winners\n",
       "FROM matches;']}</td>\n",
       "      <td></td>\n",
-       "      <td>2024-03-25 20:23:38.803226+00:00</td>\n",
+       "      <td>2024-03-27 02:44:44.821110+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
-       "      <td>2403252023385ZVJZ3UF</td>\n",
+       "      <td>240327024443FACXGMKA</td>\n",
       "      <td>1.0</td>\n",
-       "      <td>spider-sql.dev.72</td>\n",
+       "      <td>spider-sql.dev.82</td>\n",
       "      <td>sampling</td>\n",
-       "      <td>{'prompt': [{'content': 'Answer the following ...</td>\n",
+       "      <td>{'prompt': [{'content': 'Answer the following question with syntactically correct SQLite SQL. Be creative but the SQL must be correct.\n",
       "Use only the following tables and columns:\n",
       "Table: players. Columns: player_id (number), first_name (text), last_name (text), hand (text), birth_date (time), country_code (text)\n",
       "Table: matches. Columns: best_of (number), draw_size (number), loser_age (number), loser_entry (text), loser_hand (text), loser_ht (number), loser_id (number), loser_ioc (text), loser_name (text), loser_rank (number), loser_rank_points (number), loser_seed (number), match_num (number), minutes (number), round (text), score (text), surface (text), tourney_date (time), tourney_id (text), tourney_level (text), tourney_name (text), winner_age (number), winner_entry (text), winner_hand (text), winner_ht (number), winner_id (number), winner_ioc (text), winner_name (text), winner_rank (number), winner_rank_points (number), winner_seed (number), year (number)\n",
       "Table: rankings. Columns: ranking_date (time), ranking (number), player_id (number), ranking_points (number), tours (number)\n",
       "\n",
       "Question: Find the total number of matches.\n",
       "', 'role': 'system'}], 'sampled': ['SELECT COUNT(*) AS total_matches\n",
       "FROM matches;']}</td>\n",
       "      <td></td>\n",
-       "      <td>2024-03-25 20:23:38.840276+00:00</td>\n",
+       "      <td>2024-03-27 02:44:44.831848+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
-       "      <td>2403252023385ZVJZ3UF</td>\n",
+       "      <td>240327024443FACXGMKA</td>\n",
       "      <td>2.0</td>\n",
-       "      <td>spider-sql.dev.88</td>\n",
+       "      <td>spider-sql.dev.25</td>\n",
       "      <td>sampling</td>\n",
-       "      <td>{'prompt': [{'content': 'Answer the following ...</td>\n",
+       "      <td>{'prompt': [{'content': 'Answer the following question with syntactically correct SQLite SQL. Be creative but the SQL must be correct.\n",
       "Use only the following tables and columns:\n",
       "Table: continents. Columns: ContId (number), Continent (text)\n",
       "Table: countries. Columns: CountryId (number), CountryName (text), Continent (number)\n",
       "Table: car_makers. Columns: Id (number), Maker (text), FullName (text), Country (text)\n",
       "Table: model_list. Columns: ModelId (number), Maker (number), Model (text)\n",
       "Table: car_names. Columns: MakeId (number), Model (text), Make (text)\n",
       "Table: cars_data. Columns: Id (number), MPG (text), Cylinders (number), Edispl (number), Horsepower (text), Weight (number), Accelerate (number), Year (number)\n",
       "\n",
       "Question: How many countries exist?\n",
       "', 'role': 'system'}], 'sampled': ['SELECT COUNT(*) AS TotalCountries\n",
       "FROM countries;']}</td>\n",
       "      <td></td>\n",
-       "      <td>2024-03-25 20:23:38.841729+00:00</td>\n",
+       "      <td>2024-03-27 02:44:44.996647+00:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
-       "                                                spec  \\\n",
+       "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   spec  \\\n",
-       "0  {'completion_fns': ['gpt-3.5-turbo'], 'eval_na...   \n",
+       "0  {'completion_fns': ['gpt-3.5-turbo'], 'eval_name': 'spider-sql.dev.v0', 'base_eval': 'spider-sql', 'split': 'dev', 'run_config': {'completion_fns': ['gpt-3.5-turbo'], 'eval_spec': {'cls': 'evals.elsuite.modelgraded.classify:ModelBasedClassify', 'registry_path': '/Users/shyamal/.virtualenvs/openai/lib/python3.11/site-packages/evals/registry', 'args': {'samples_jsonl': 'sql/spider_sql.jsonl', 'eval_type': 'cot_classify', 'modelgraded_spec': 'sql'}, 'key': 'spider-sql.dev.v0', 'group': 'sql'}, 'seed': 20220722, 'max_samples': 25, 'command': '/Users/shyamal/.virtualenvs/openai/bin/oaieval gpt-3.5-turbo spider-sql --max_samples 25', 'initial_settings': {'visible': False}}, 'created_by': '', 'run_id': '240327024443FACXGMKA', 'created_at': '2024-03-27 02:44:43.626043'}   \n",
-       "1                                                NaN   \n",
+       "1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   NaN   \n",
-       "2                                                NaN   \n",
+       "2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   NaN   \n",
-       "3                                                NaN   \n",
+       "3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   NaN   \n",
-       "4                                                NaN   \n",
+       "4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   NaN   \n",
       "\n",
       "                                                  final_report  \\\n",
       "0                                                          NaN   \n",
       "1  {'counts/Correct': 20, 'counts/Incorrect': 5, 'score': 0.8}   \n",
       "2                                                          NaN   \n",
       "3                                                          NaN   \n",
       "4                                                          NaN   \n",
       "\n",
       "                 run_id  event_id          sample_id      type  \\\n",
       "0                   NaN       NaN                NaN       NaN   \n",
       "1                   NaN       NaN                NaN       NaN   \n",
       "2  240327024443FACXGMKA       0.0  spider-sql.dev.88  sampling   \n",
       "3  240327024443FACXGMKA       1.0  spider-sql.dev.82  sampling   \n",
       "4  240327024443FACXGMKA       2.0  spider-sql.dev.25  sampling   \n",
       "\n",
       "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           data  \\\n",
       "0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           NaN   \n",
       "1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           NaN   \n",
       "2  {'prompt': [{'content': 'Answer the following question with syntactically correct SQLite SQL. Be creative but the SQL must be correct.\n",
       "Use only the following tables and columns:\n",
       "Table: players. Columns: player_id (number), first_name (text), last_name (text), hand (text), birth_date (time), country_code (text)\n",
       "Table: matches. Columns: best_of (number), draw_size (number), loser_age (number), loser_entry (text), loser_hand (text), loser_ht (number), loser_id (number), loser_ioc (text), loser_name (text), loser_rank (number), loser_rank_points (number), loser_seed (number), match_num (number), minutes (number), round (text), score (text), surface (text), tourney_date (time), tourney_id (text), tourney_level (text), tourney_name (text), winner_age (number), winner_entry (text), winner_hand (text), winner_ht (number), winner_id (number), winner_ioc (text), winner_name (text), winner_rank (number), winner_rank_points (number), winner_seed (number), year (number)\n",
       "Table: rankings. Columns: ranking_date (time), ranking (number), player_id (number), ranking_points (number), tours (number)\n",
       "\n",
-       "                                        final_report                run_id  \\\n",
+       "Question: Find the average rank of winners in all matches.\n",
-       "0                                                NaN                   NaN   \n",
+       "', 'role': 'system'}], 'sampled': ['SELECT AVG(winner_rank) AS average_rank_of_winners\n",
-       "1  {'counts/Correct': 17, 'counts/Incorrect': 3, ...                   NaN   \n",
+       "FROM matches;']}   \n",
-       "2                                                NaN  2403252023385ZVJZ3UF   \n",
+       "3                                   {'prompt': [{'content': 'Answer the following question with syntactically correct SQLite SQL. Be creative but the SQL must be correct.\n",
-       "3                                                NaN  2403252023385ZVJZ3UF   \n",
+       "Use only the following tables and columns:\n",
-       "4                                                NaN  2403252023385ZVJZ3UF   \n",
+       "Table: players. Columns: player_id (number), first_name (text), last_name (text), hand (text), birth_date (time), country_code (text)\n",
       "Table: matches. Columns: best_of (number), draw_size (number), loser_age (number), loser_entry (text), loser_hand (text), loser_ht (number), loser_id (number), loser_ioc (text), loser_name (text), loser_rank (number), loser_rank_points (number), loser_seed (number), match_num (number), minutes (number), round (text), score (text), surface (text), tourney_date (time), tourney_id (text), tourney_level (text), tourney_name (text), winner_age (number), winner_entry (text), winner_hand (text), winner_ht (number), winner_id (number), winner_ioc (text), winner_name (text), winner_rank (number), winner_rank_points (number), winner_seed (number), year (number)\n",
       "Table: rankings. Columns: ranking_date (time), ranking (number), player_id (number), ranking_points (number), tours (number)\n",
       "\n",
-       "   event_id           sample_id      type  \\\n",
+       "Question: Find the total number of matches.\n",
-       "0       NaN                 NaN       NaN   \n",
+       "', 'role': 'system'}], 'sampled': ['SELECT COUNT(*) AS total_matches\n",
-       "1       NaN                 NaN       NaN   \n",
+       "FROM matches;']}   \n",
-       "2       0.0  spider-sql.dev.117  sampling   \n",
+       "4                                                                                                                                                                                                                                                                                                                                                                                                                                   {'prompt': [{'content': 'Answer the following question with syntactically correct SQLite SQL. Be creative but the SQL must be correct.\n",
-       "3       1.0   spider-sql.dev.72  sampling   \n",
+       "Use only the following tables and columns:\n",
-       "4       2.0   spider-sql.dev.88  sampling   \n",
+       "Table: continents. Columns: ContId (number), Continent (text)\n",
       "Table: countries. Columns: CountryId (number), CountryName (text), Continent (number)\n",
       "Table: car_makers. Columns: Id (number), Maker (text), FullName (text), Country (text)\n",
       "Table: model_list. Columns: ModelId (number), Maker (number), Model (text)\n",
       "Table: car_names. Columns: MakeId (number), Model (text), Make (text)\n",
       "Table: cars_data. Columns: Id (number), MPG (text), Cylinders (number), Edispl (number), Horsepower (text), Weight (number), Accelerate (number), Year (number)\n",
       "\n",
-       "                                                data created_by  \\\n",
+       "Question: How many countries exist?\n",
-       "0                                                NaN        NaN   \n",
+       "', 'role': 'system'}], 'sampled': ['SELECT COUNT(*) AS TotalCountries\n",
-       "1                                                NaN        NaN   \n",
+       "FROM countries;']}   \n",
       "2  {'prompt': [{'content': 'Answer the following ...              \n",
       "3  {'prompt': [{'content': 'Answer the following ...              \n",
       "4  {'prompt': [{'content': 'Answer the following ...              \n",
       "\n",
-       "                        created_at  \n",
+       "  created_by                       created_at  \n",
-       "0                              NaT  \n",
+       "0        NaN                              NaT  \n",
-       "1                              NaT  \n",
+       "1        NaN                              NaT  \n",
-       "2 2024-03-25 20:23:38.803226+00:00  \n",
+       "2            2024-03-27 02:44:44.821110+00:00  \n",
-       "3 2024-03-25 20:23:38.840276+00:00  \n",
+       "3            2024-03-27 02:44:44.831848+00:00  \n",
-       "4 2024-03-25 20:23:38.841729+00:00  "
+       "4            2024-03-27 02:44:44.996647+00:00  "
      ]
     },
     "metadata": {},
@ -640,14 +707,14 @@
    }
   ],
   "source": [
-    "log_name = '2403252023385ZVJZ3UF_gpt-3.5-turbo_spider-sql.jsonl' # \"EDIT THIS\" - copy from above\n",
+    "log_name = '240327024443FACXGMKA_gpt-3.5-turbo_spider-sql.jsonl' # \"EDIT THIS\" - copy from above\n",
    "events = f\"/tmp/evallogs/{log_name}\"\n",
    "display(pd.read_json(events, lines=True).head(5))"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 16,
   "metadata": {
    "collapsed": false,
    "jupyter": {
@ -671,7 +738,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
@ -683,19 +750,19 @@
       " 'split': 'dev',\n",
       " 'run_config': {'completion_fns': ['gpt-3.5-turbo'],\n",
       "  'eval_spec': {'cls': 'evals.elsuite.modelgraded.classify:ModelBasedClassify',\n",
-       "   'registry_path': '/Users/roy/Documents/Github/openai-cookbook/.venv/lib/python3.9/site-packages/evals/registry',\n",
+       "   'registry_path': '/Users/shyamal/.virtualenvs/openai/lib/python3.11/site-packages/evals/registry',\n",
       "   'args': {'samples_jsonl': 'sql/spider_sql.jsonl',\n",
       "    'eval_type': 'cot_classify',\n",
       "    'modelgraded_spec': 'sql'},\n",
       "   'key': 'spider-sql.dev.v0',\n",
       "   'group': 'sql'},\n",
       "  'seed': 20220722,\n",
-       "  'max_samples': 20,\n",
+       "  'max_samples': 25,\n",
-       "  'command': '/Users/roy/Documents/Github/openai-cookbook/.venv/bin/oaieval gpt-3.5-turbo spider-sql --max_samples 20',\n",
+       "  'command': '/Users/shyamal/.virtualenvs/openai/bin/oaieval gpt-3.5-turbo spider-sql --max_samples 25',\n",
       "  'initial_settings': {'visible': False}},\n",
       " 'created_by': '',\n",
-       " 'run_id': '2403252023385ZVJZ3UF',\n",
+       " 'run_id': '240327024443FACXGMKA',\n",
-       " 'created_at': '2024-03-25 20:23:38.132021'}"
+       " 'created_at': '2024-03-27 02:44:43.626043'}"
      ]
     },
     "metadata": {},
@ -715,13 +782,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "{'counts/Correct': 17, 'counts/Incorrect': 3, 'score': 0.85}"
+       "{'counts/Correct': 20, 'counts/Incorrect': 5, 'score': 0.8}"
      ]
     },
     "metadata": {},
@ -741,27 +808,26 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "run_id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         2403252023385ZVJZ3UF\n",
+       "run_id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                240327024443FACXGMKA\n",
-       "event_id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        0.0\n",
+       "event_id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               0.0\n",
-       "sample_id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        spider-sql.dev.117\n",
+       "sample_id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                spider-sql.dev.88\n",
-       "type                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       sampling\n",
+       "type                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              sampling\n",
       "data          {'prompt': [{'content': 'Answer the following question with syntactically correct SQLite SQL. Be creative but the SQL must be correct.\n",
       "Use only the following tables and columns:\n",
-       "Table: TV_Channel. Columns: id (text), series_name (text), Country (text), Language (text), Content (text), Pixel_aspect_ratio_PAR (text), Hight_definition_TV (text), Pay_per_view_PPV (text), Package_Option (text)\n",
+       "Table: players. Columns: player_id (number), first_name (text), last_name (text), hand (text), birth_date (time), country_code (text)\n",
-       "Table: TV_series. Columns: id (number), Episode (text), Air_Date (text), Rating (text), Share (number), 18_49_Rating_Share (text), Viewers_m (text), Weekly_Rank (number), Channel (text)\n",
+       "Table: matches. Columns: best_of (number), draw_size (number), loser_age (number), loser_entry (text), loser_hand (text), loser_ht (number), loser_id (number), loser_ioc (text), loser_name (text), loser_rank (number), loser_rank_points (number), loser_seed (number), match_num (number), minutes (number), round (text), score (text), surface (text), tourney_date (time), tourney_id (text), tourney_level (text), tourney_name (text), winner_age (number), winner_entry (text), winner_hand (text), winner_ht (number), winner_id (number), winner_ioc (text), winner_name (text), winner_rank (number), winner_rank_points (number), winner_seed (number), year (number)\n",
-       "Table: Cartoon. Columns: id (number), Title (text), Directed_by (text), Written_by (text), Original_air_date (text), Production_code (number), Channel (text)\n",
+       "Table: rankings. Columns: ranking_date (time), ranking (number), player_id (number), ranking_points (number), tours (number)\n",
       "\n",
-       "Question: What is the name and directors of all the cartoons that are ordered by air date?\n",
+       "Question: Find the average rank of winners in all matches.\n",
-       "', 'role': 'system'}], 'sampled': ['SELECT Title, Directed_by\n",
+       "', 'role': 'system'}], 'sampled': ['SELECT AVG(winner_rank) AS average_rank_of_winners\n",
-       "FROM Cartoon\n",
+       "FROM matches;']}\n",
-       "ORDER BY Original_air_date;']}\n",
+       "created_at                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                2024-03-27 02:44:44.821110+00:00\n",
       "created_at                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         2024-03-25 20:23:38.803226+00:00\n",
       "Name: 2, dtype: object"
      ]
     },
@ -776,27 +842,27 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prompt: [{'content': 'Answer the following question with syntactically correct SQLite SQL. Be creative but the SQL must be correct.\\nUse only the following tables and columns:\\nTable: TV_Channel. Columns: id (text), series_name (text), Country (text), Language (text), Content (text), Pixel_aspect_ratio_PAR (text), Hight_definition_TV (text), Pay_per_view_PPV (text), Package_Option (text)\\nTable: TV_series. Columns: id (number), Episode (text), Air_Date (text), Rating (text), Share (number), 18_49_Rating_Share (text), Viewers_m (text), Weekly_Rank (number), Channel (text)\\nTable: Cartoon. Columns: id (number), Title (text), Directed_by (text), Written_by (text), Original_air_date (text), Production_code (number), Channel (text)\\n\\nQuestion: What is the name and directors of all the cartoons that are ordered by air date?\\n', 'role': 'system'}]\n",
      "Sampled: ['SELECT Title, Directed_by\\nFROM Cartoon\\nORDER BY Original_air_date;']\n",
      "----------\n",
      "Prompt: [{'content': 'Answer the following question with syntactically correct SQLite SQL. Be creative but the SQL must be correct.\\nUse only the following tables and columns:\\nTable: museum. Columns: Museum_ID (number), Name (text), Num_of_Staff (number), Open_Year (text)\\nTable: visitor. Columns: ID (number), Name (text), Level_of_membership (number), Age (number)\\nTable: visit. Columns: Museum_ID (number), visitor_ID (text), Num_of_Ticket (number), Total_spent (number)\\n\\nQuestion: What is the average age of the visitors whose membership level is not higher than 4?\\n', 'role': 'system'}]\n",
      "Sampled: ['SELECT AVG(Age) \\nFROM visitor \\nWHERE Level_of_membership <= 4;']\n",
      "----------\n",
      "Prompt: [{'content': 'Answer the following question with syntactically correct SQLite SQL. Be creative but the SQL must be correct.\\nUse only the following tables and columns:\\nTable: players. Columns: player_id (number), first_name (text), last_name (text), hand (text), birth_date (time), country_code (text)\\nTable: matches. Columns: best_of (number), draw_size (number), loser_age (number), loser_entry (text), loser_hand (text), loser_ht (number), loser_id (number), loser_ioc (text), loser_name (text), loser_rank (number), loser_rank_points (number), loser_seed (number), match_num (number), minutes (number), round (text), score (text), surface (text), tourney_date (time), tourney_id (text), tourney_level (text), tourney_name (text), winner_age (number), winner_entry (text), winner_hand (text), winner_ht (number), winner_id (number), winner_ioc (text), winner_name (text), winner_rank (number), winner_rank_points (number), winner_seed (number), year (number)\\nTable: rankings. Columns: ranking_date (time), ranking (number), player_id (number), ranking_points (number), tours (number)\\n\\nQuestion: Find the average rank of winners in all matches.\\n', 'role': 'system'}]\n",
-      "Sampled: ['SELECT AVG(winner_rank) AS average_winner_rank\\nFROM matches;']\n",
+      "Sampled: ['SELECT AVG(winner_rank) AS average_rank_of_winners\\nFROM matches;']\n",
      "----------\n",
      "Prompt: [{'content': 'Answer the following question with syntactically correct SQLite SQL. Be creative but the SQL must be correct.\\nUse only the following tables and columns:\\nTable: players. Columns: player_id (number), first_name (text), last_name (text), hand (text), birth_date (time), country_code (text)\\nTable: matches. Columns: best_of (number), draw_size (number), loser_age (number), loser_entry (text), loser_hand (text), loser_ht (number), loser_id (number), loser_ioc (text), loser_name (text), loser_rank (number), loser_rank_points (number), loser_seed (number), match_num (number), minutes (number), round (text), score (text), surface (text), tourney_date (time), tourney_id (text), tourney_level (text), tourney_name (text), winner_age (number), winner_entry (text), winner_hand (text), winner_ht (number), winner_id (number), winner_ioc (text), winner_name (text), winner_rank (number), winner_rank_points (number), winner_seed (number), year (number)\\nTable: rankings. Columns: ranking_date (time), ranking (number), player_id (number), ranking_points (number), tours (number)\\n\\nQuestion: Find the total number of matches.\\n', 'role': 'system'}]\n",
      "Sampled: ['SELECT COUNT(*) AS total_matches\\nFROM matches;']\n",
      "----------\n",
      "Prompt: [{'content': 'Answer the following question with syntactically correct SQLite SQL. Be creative but the SQL must be correct.\\nUse only the following tables and columns:\\nTable: continents. Columns: ContId (number), Continent (text)\\nTable: countries. Columns: CountryId (number), CountryName (text), Continent (number)\\nTable: car_makers. Columns: Id (number), Maker (text), FullName (text), Country (text)\\nTable: model_list. Columns: ModelId (number), Maker (number), Model (text)\\nTable: car_names. Columns: MakeId (number), Model (text), Make (text)\\nTable: cars_data. Columns: Id (number), MPG (text), Cylinders (number), Edispl (number), Horsepower (text), Weight (number), Accelerate (number), Year (number)\\n\\nQuestion: How many countries exist?\\n', 'role': 'system'}]\n",
-      "Sampled: ['```sql\\nSELECT COUNT(*) AS TotalCountries\\nFROM countries;\\n```']\n",
+      "Sampled: ['SELECT COUNT(*) AS TotalCountries\\nFROM countries;']\n",
      "----------\n",
-      "Prompt: [{'content': 'Answer the following question with syntactically correct SQLite SQL. Be creative but the SQL must be correct.\\nUse only the following tables and columns:\\nTable: city. Columns: ID (number), Name (text), CountryCode (text), District (text), Population (number)\\nTable: sqlite_sequence. Columns: name (text), seq (text)\\nTable: country. Columns: Code (text), Name (text), Continent (text), Region (text), SurfaceArea (number), IndepYear (number), Population (number), LifeExpectancy (number), GNP (number), GNPOld (number), LocalName (text), GovernmentForm (text), HeadOfState (text), Capital (number), Code2 (text)\\nTable: countrylanguage. Columns: CountryCode (text), Language (text), IsOfficial (text), Percentage (number)\\n\\nQuestion: How many countries have a republic as their form of government?\\n', 'role': 'system'}]\n",
+      "Prompt: [{'content': 'Answer the following question with syntactically correct SQLite SQL. Be creative but the SQL must be correct.\\nUse only the following tables and columns:\\nTable: TV_Channel. Columns: id (text), series_name (text), Country (text), Language (text), Content (text), Pixel_aspect_ratio_PAR (text), Hight_definition_TV (text), Pay_per_view_PPV (text), Package_Option (text)\\nTable: TV_series. Columns: id (number), Episode (text), Air_Date (text), Rating (text), Share (number), 18_49_Rating_Share (text), Viewers_m (text), Weekly_Rank (number), Channel (text)\\nTable: Cartoon. Columns: id (number), Title (text), Directed_by (text), Written_by (text), Original_air_date (text), Production_code (number), Channel (text)\\n\\nQuestion: What is the name and directors of all the cartoons that are ordered by air date?\\n', 'role': 'system'}]\n",
-      "Sampled: [\"```sql\\nSELECT COUNT(*) \\nFROM country \\nWHERE GovernmentForm = 'Republic';\\n```\"]\n",
+      "Sampled: ['SELECT Title, Directed_by\\nFROM Cartoon\\nORDER BY Original_air_date;']\n",
      "----------\n",
      "Prompt: [{'content': 'Answer the following question with syntactically correct SQLite SQL. Be creative but the SQL must be correct.\\nUse only the following tables and columns:\\nTable: stadium. Columns: Stadium_ID (number), Location (text), Name (text), Capacity (number), Highest (number), Lowest (number), Average (number)\\nTable: singer. Columns: Singer_ID (number), Name (text), Country (text), Song_Name (text), Song_release_year (text), Age (number), Is_male (others)\\nTable: concert. Columns: concert_ID (number), concert_Name (text), Theme (text), Stadium_ID (text), Year (text)\\nTable: singer_in_concert. Columns: concert_ID (number), Singer_ID (text)\\n\\nQuestion: Show the name and the release year of the song by the youngest singer.\\n', 'role': 'system'}]\n",
      "Sampled: ['```sql\\nSELECT s.Name, s.Song_release_year\\nFROM singer s\\nWHERE s.Age = (SELECT MIN(Age) FROM singer)\\n```']\n",
      "----------\n"
     ]
    }
@ -819,96 +885,306 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pretty_print_text(prompt):\n",
-    "    # Define markers for the start of each section\n",
-    "    question_marker = \"[Question]:\"\n",
-    "    expert_marker = \"[Expert]:\"\n",
-    "    submission_marker = \"[Submission]:\"\n",
-    "\n",
-    "    # Find the start indices of each section\n",
-    "    question_start = prompt.find(question_marker) + len(question_marker)\n",
-    "    expert_start = prompt.find(expert_marker) + len(expert_marker)\n",
-    "    submission_start = prompt.find(submission_marker) + len(submission_marker)\n",
-    "\n",
-    "    # Find the end index for the question and expert sections by looking for the next section's start\n",
-    "    question_end = prompt.find(expert_marker)\n",
-    "    expert_end = prompt.find(submission_marker)\n",
-    "    submission_end = prompt.find('[END DATA]')\n",
-    "\n",
-    "    # Extract the text for each section\n",
-    "    question_text = prompt[question_start:question_end].strip()\n",
-    "    expert_answer_text = prompt[expert_start:expert_end].strip()\n",
-    "    submission_text = prompt[submission_start:submission_end].strip().replace(\"```sql\", \"\").replace(\"```\", \"\").strip()\n",
-    "\n",
-    "    # Remove table definitions from the question text\n",
-    "    question_text = question_text.split(\"\\n\\nQuestion:\")[1].strip() if \"\\n\\nQuestion:\" in question_text else question_text\n",
-    "\n",
-    "    # Define ANSI color codes for readability\n",
-    "    color_question = '\\033[94m'  # Blue\n",
-    "    color_expert = '\\033[92m'   # Green\n",
-    "    color_submission = '\\033[93m' # Yellow\n",
-    "    color_end = '\\033[0m'        # Reset to default color\n",
-    "\n",
-    "    # Print with section headers and colors\n",
-    "    print(f\"{color_question}QUESTION:\\n{question_text}{color_end}\")\n",
-    "    print(f\"{color_expert}EXPECTED:\\n{expert_answer_text}{color_end}\")\n",
-    "    print(f\"{color_submission}SUBMISSION:\\n{submission_text}{color_end}\")\n"
+    "    \"\"\"Display the question, expected answer and submission of a grading prompt.\n",
+    "\n",
+    "    The text between the [Question]:, [Expert]:, [Submission]: and [END DATA]\n",
+    "    markers is extracted and rendered as three colored HTML sections. A section\n",
+    "    whose start marker is missing from the prompt is rendered empty.\n",
+    "\n",
+    "    Args:\n",
+    "        prompt (str): Prompt text containing the section markers.\n",
+    "\n",
+    "    Returns:\n",
+    "        None. Each section is rendered with IPython's display().\n",
+    "    \"\"\"\n",
+    "    import html\n",
+    "    from IPython.display import display, HTML\n",
+    "\n",
+    "    # Define markers for the sections\n",
+    "    markers = {\n",
+    "        \"question\": \"[Question]:\",\n",
+    "        \"expert\": \"[Expert]:\",\n",
+    "        \"submission\": \"[Submission]:\",\n",
+    "        \"end\": \"[END DATA]\"\n",
+    "    }\n",
+    "\n",
+    "    # Function to extract text between markers\n",
+    "    def extract_text(start_marker, end_marker):\n",
+    "        marker_pos = prompt.find(start_marker)\n",
+    "        if marker_pos == -1:\n",
+    "            # A missing marker yields an empty section instead of a slice from a bogus offset\n",
+    "            return \"\"\n",
+    "        start = marker_pos + len(start_marker)\n",
+    "        # Search for the end marker only after the start marker\n",
+    "        end = prompt.find(end_marker, start)\n",
+    "        text = (prompt[start:end] if end != -1 else prompt[start:]).strip()\n",
+    "        if start_marker == markers[\"question\"]:\n",
+    "            # Remove table definitions from the question text\n",
+    "            text = text.split(\"\\n\\nQuestion:\")[-1].strip() if \"\\n\\nQuestion:\" in text else text\n",
+    "        elif start_marker == markers[\"submission\"]:\n",
+    "            # Remove Markdown code fences around the submitted SQL\n",
+    "            text = text.replace(\"```sql\", \"\").replace(\"```\", \"\").strip()\n",
+    "        return text\n",
+    "\n",
+    "    # Escape the text so characters such as < and > in SQL are shown literally,\n",
+    "    # and turn newlines into <br> because HTML collapses plain line breaks\n",
+    "    def to_html(text):\n",
+    "        return html.escape(text, quote=False).replace(\"\\n\", \"<br>\")\n",
+    "\n",
+    "    # Extracting text for each section\n",
+    "    question_text = extract_text(markers[\"question\"], markers[\"expert\"])\n",
+    "    expert_text = extract_text(markers[\"expert\"], markers[\"submission\"])\n",
+    "    submission_text = extract_text(markers[\"submission\"], markers[\"end\"])\n",
+    "\n",
+    "    # HTML color codes and formatting\n",
+    "    colors = {\n",
+    "        \"question\": '<span style=\"color: #0000FF;\">QUESTION:<br>',\n",
+    "        \"expert\": '<span style=\"color: #008000;\">EXPECTED:<br>',\n",
+    "        \"submission\": '<span style=\"color: #FFA500;\">SUBMISSION:<br>'\n",
+    "    }\n",
+    "    color_end = '</span>'\n",
+    "\n",
+    "    # Display each section with color\n",
+    "    display(HTML(f\"{colors['question']}{to_html(question_text)}{color_end}\"))\n",
+    "    display(HTML(f\"{colors['expert']}{to_html(expert_text)}{color_end}\"))\n",
+    "    display(HTML(f\"{colors['submission']}{to_html(submission_text)}{color_end}\"))"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<span style=\"color: #0000FF;\">QUESTION:<br>How many countries have a republic as their form of government?\n",
       "\n",
       "************</span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<span style=\"color: #008000;\">EXPECTED:<br>SELECT count(*) FROM country WHERE GovernmentForm  =  \"Republic\"\n",
       "************</span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<span style=\"color: #FFA500;\">SUBMISSION:<br>SELECT COUNT(*) \n",
       "FROM country \n",
       "WHERE GovernmentForm LIKE '%Republic%'\n",
       "\n",
       "************</span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "----------------------------------------\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<span style=\"color: #0000FF;\">QUESTION:<br>Return the document id, template id, and description for the document with the name Robbin CV.\n",
       "\n",
       "************</span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<span style=\"color: #008000;\">EXPECTED:<br>SELECT document_id ,  template_id ,  Document_Description FROM Documents WHERE document_name  =  \"Robbin CV\"\n",
       "************</span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<span style=\"color: #FFA500;\">SUBMISSION:<br>SELECT Documents.Document_ID, Documents.Template_ID, Documents.Document_Description\n",
       "FROM Documents\n",
       "JOIN Templates ON Documents.Template_ID = Templates.Template_ID\n",
       "WHERE Documents.Document_Name = 'Robbin CV';\n",
       "\n",
       "************</span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "----------------------------------------\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<span style=\"color: #0000FF;\">QUESTION:<br>Which professionals live in the state of Indiana or have done treatment on more than 2 treatments? List his or her id, last name and cell phone.\n",
       "\n",
       "************</span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<span style=\"color: #008000;\">EXPECTED:<br>SELECT professional_id ,  last_name ,  cell_number FROM Professionals WHERE state  =  'Indiana' UNION SELECT T1.professional_id ,  T1.last_name ,  T1.cell_number FROM Professionals AS T1 JOIN Treatments AS T2 ON T1.professional_id  =  T2.professional_id GROUP BY T1.professional_id HAVING count(*)  >  2\n",
       "************</span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<span style=\"color: #FFA500;\">SUBMISSION:<br>SELECT professional_id, last_name, cell_number\n",
       "FROM Professionals\n",
       "WHERE state = 'Indiana'\n",
       "OR professional_id IN (\n",
       "    SELECT professional_id\n",
       "    FROM Treatments\n",
       "    GROUP BY professional_id\n",
       "    HAVING COUNT(*) > 2\n",
       ");\n",
       "************</span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "----------------------------------------\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<span style=\"color: #0000FF;\">QUESTION:<br>What is the continent name which Anguilla belongs to?\n",
       "\n",
       "************</span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<span style=\"color: #008000;\">EXPECTED:<br>SELECT Continent FROM country WHERE Name  =  \"Anguilla\"\n",
       "************</span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<span style=\"color: #FFA500;\">SUBMISSION:<br>SELECT c.Continent\n",
       "FROM country c\n",
       "WHERE c.Code = 'AIA';\n",
       "\n",
       "************</span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "----------------------------------------\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<span style=\"color: #0000FF;\">QUESTION:<br>How many airlines do we have?\n",
       "\n",
       "************</span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<span style=\"color: #008000;\">EXPECTED:<br>SELECT count(*) FROM AIRLINES\n",
       "************</span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<span style=\"color: #FFA500;\">SUBMISSION:<br>SELECT COUNT(DISTINCT Airline) AS TotalAirlines\n",
       "FROM airlines;\n",
       "************</span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[94mQUESTION:\n",
      "Return the document id, template id, and description for the document with the name Robbin CV.\n",
      "\n",
      "************\u001b[0m\n",
      "\u001b[92mEXPECTED:\n",
      "SELECT document_id ,  template_id ,  Document_Description FROM Documents WHERE document_name  =  \"Robbin CV\"\n",
      "************\u001b[0m\n",
      "\u001b[93mSUBMISSION:\n",
      "SELECT Documents.Document_ID, Documents.Template_ID, Documents.Document_Description\n",
      "FROM Documents\n",
      "JOIN Templates ON Documents.Template_ID = Templates.Template_ID\n",
      "WHERE Documents.Document_Name = 'Robbin CV';\n",
      "\n",
      "************\u001b[0m\n",
      "----------------------------------------\n",
      "\u001b[94mQUESTION:\n",
      "What country is Jetblue Airways affiliated with?\n",
      "\n",
      "************\u001b[0m\n",
      "\u001b[92mEXPECTED:\n",
      "SELECT Country FROM AIRLINES WHERE Airline  =  \"JetBlue Airways\"\n",
      "************\u001b[0m\n",
      "\u001b[93mSUBMISSION:\n",
      "SELECT Country\n",
      "FROM airlines\n",
      "WHERE Airline = 'Jetblue Airways';\n",
      "************\u001b[0m\n",
      "----------------------------------------\n",
      "\u001b[94mQUESTION:\n",
      "Find the maximum weight for each type of pet. List the maximum weight and pet type.\n",
      "\n",
      "************\u001b[0m\n",
      "\u001b[92mEXPECTED:\n",
      "SELECT max(weight) ,  petType FROM pets GROUP BY petType\n",
      "************\u001b[0m\n",
      "\u001b[93mSUBMISSION:\n",
      "SELECT PetType, MAX(weight) AS max_weight\n",
      "FROM Pets\n",
      "GROUP BY PetType;\n",
      "\n",
      "************\u001b[0m\n",
      "----------------------------------------\n"
     ]
    }
@ -933,9 +1209,9 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Reviewing each of these failures we see the following:\n",
+    "Reviewing some of the failures, we see the following:\n",
-    "* The first incorrect answer had an unnecessary join with the 'Templates' table. Our eval was able to accurately identify this and flag this as incorrect. \n",
+    "* The second incorrect answer had an unnecessary join with the 'Templates' table. Our eval was able to accurately identify this and flag this as incorrect. \n",
-    "* The following two answers are technically correct and would succeeed if we compared the results, however they have minor syntax differences that caused the answers to get flagged.\n",
+    "* A few other answers have minor syntax differences that caused them to get flagged.\n",
    "  * In situations like this, it would be worthwhile exploring whether we should continue iterating on the prompt to ensure certain stylistic choices, or if we should modify the evaluation suite to capture this variation.\n",
    "  * This type of failure hints at the potential need for model-graded evals as a way to ensure accuracy in grading the results."
   ]
@ -973,7 +1249,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.6"
+   "version": "3.11.8"
  }
 },
 "nbformat": 4,