Eval dataset creation

3 months ago · 000391c623
parent 79a6cff533
commit 000391c623
1 changed files with 133 additions and 12 deletions
--- a/examples/evaluation/Getting_Started_with_OpenAI_Evals.ipynb
+++ b/examples/evaluation/Getting_Started_with_OpenAI_Evals.ipynb
@ -90,22 +90,22 @@
    "2/ The eval template to be used\n",
    "\n",
    "### Creating the eval dataset\n",
-    "Lets create a dataset for a use case where we are evaluating the model's ability to generate syntactically correct SQL.\n",
+    "Let's create a dataset for a use case where we are evaluating the model's ability to generate syntactically correct SQL. In this use case, we have a series of tables that are related to car manufacturing.\n",
    "\n",
    "First we will need to create a system prompt that we would like to evaluate. We will pass in instructions for the model as well as an overview of the table structure:\n",
-    "`\"TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable city, columns = [*,ID,Name,CountryCode,District,Population]\\nTable country, columns = [*,Code,Name,Continent,Region,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,LocalName,GovernmentForm,HeadOfState,Capital,Code2]\\nTable countrylanguage, columns = [*,CountryCode,Language,IsOfficial,Percentage]\\nTable sqlite_sequence, columns = [*,name,seq]\\nForeign_keys = [city.CountryCode = country.Code,countrylanguage.CountryCode = country.Code]\\n\"`\n",
+    "`\"TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]\"`\n",
    "\n",
    "For this prompt, we can ask a specific question:\n",
-    "`\"Q: What is the GNP of Afghanistan?\"`\n",
+    "`\"Q: how many car makers are there in germany?\"`\n",
    "\n",
    "And we have an expected answer:\n",
-    "`\"A: SELECT GNP FROM country WHERE name = \\\"Afghanistan\\\"\"`\n",
+    "`\"A: SELECT count ( * )  FROM CAR_MAKERS AS T1 JOIN COUNTRIES AS T2 ON T1.Country   =   T2.CountryId WHERE T2.CountryName   =   'germany'\"`\n",
    "\n",
    "The dataset needs to be in the followingformat\"\n",
-    "`\"input\": [{\"role\": \"system\", \"content\": \"<input prompt>\",\"name\":\"example-user\"}, \"ideal\": \"correct answer\"]`\n",
+    "`{\"input\": [{\"role\": \"system\", \"content\": \"<input prompt>\"}, {\"role\": \"user\", \"content\": \"<user input>\"}], \"ideal\": \"correct answer\"}`\n",
    "\n",
    "Putting it all together, we get:\n",
-    "`{\"input\": [{\"role\": \"system\", \"content\": \"TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable city, columns = [*,ID,Name,CountryCode,District,Population]\\nTable country, columns = [*,Code,Name,Continent,Region,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,LocalName,GovernmentForm,HeadOfState,Capital,Code2]\\nTable countrylanguage, columns = [*,CountryCode,Language,IsOfficial,Percentage]\\nTable sqlite_sequence, columns = [*,name,seq]\\nForeign_keys = [city.CountryCode = country.Code,countrylanguage.CountryCode = country.Code]\\n\"}, {\"role\": \"user\", \"content\": \"Q: What is the GNP of Afghanistan?\"}], \"ideal\": [\"A: SELECT GNP FROM country WHERE name = \\\"Afghanistan\\\"\"]}`\n",
+    "`{\"input\": [{\"role\": \"system\", \"content\": \"TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]\\n\"}, {\"role\": \"user\", \"content\": \"Q: how many car makers are there in germany?\"}], \"ideal\": [\"A: SELECT count ( * )  FROM CAR_MAKERS AS T1 JOIN COUNTRIES AS T2 ON T1.Country   =   T2.CountryId WHERE T2.CountryName   =   'germany'\"]}`\n",
    "\n",
    "\n",
    "One way to speed up the process of building eval datasets, is to use GPT-4 to generate synthetic data"
@ -113,13 +113,134 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Q: Which maker has the highest number of models available in the dataset?\n",
+      "A: SELECT Maker, COUNT(Model) AS ModelCount FROM model_list GROUP BY Maker ORDER BY ModelCount DESC LIMIT 1\n",
+      "\n",
+      "Q: What is the average horsepower of cars made by a maker from the continent 'Europe'?\n",
+      "A: SELECT AVG(cars_data.Horsepower) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\n",
+      "\n",
+      "Q: What are the average horsepower and weight for cars made by makers from the continent of Europe?\n",
+      "A: SELECT AVG(cars_data.Horsepower) AS AVG_Horsepower, AVG(cars_data.Weight) AS AVG_Weight FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\n",
+      "\n",
+      "Q: Which car maker has the most models with horsepower greater than 200?\n",
+      "A: SELECT Maker, count(*) as ModelCount FROM car_names AS cn JOIN model_list AS ml ON cn.Model = ml.Model JOIN car_makers AS cm ON ml.Maker = cm.Id JOIN cars_data AS cd ON cn.MakeId = cd.Id WHERE cd.Horsepower > 200 GROUP BY Maker ORDER BY ModelCount DESC LIMIT 1\n",
+      "\n",
+      "Q: What is the average MPG (Miles Per Gallon) for cars made by manufacturers from Europe?\n",
+      "A: SELECT AVG(cars_data.MPG) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "## Use GPT-4 to generate synthetic data\n",
+    "\n",
+    "from openai import OpenAI\n",
+    "\n",
+    "client = OpenAI()\n",
+    "# Define the system prompt and user input (these should be filled as per the specific use case)\n",
+    "system_prompt = \"\"\"You are a helpful assistant that can ask questions about a database table and write SQL queries to answer the question.\n",
+    "    A user will pass in a table schema and your job is to return a question answer pairing. The question should relevant to the schema of the table,\n",
+    "    and you can speculate on its contents. You will then have to generate a SQL query to answer the question. Below are some examples of what this should look like.\n",
+    "\n",
+    "    Example 1\n",
+    "    ```````````\n",
+    "    User input: Table museum, columns = [*,Museum_ID,Name,Num_of_Staff,Open_Year]\\nTable visit, columns = [*,Museum_ID,visitor_ID,Num_of_Ticket,Total_spent]\\nTable visitor, columns = [*,ID,Name,Level_of_membership,Age]\\nForeign_keys = [visit.visitor_ID = visitor.ID,visit.Museum_ID = museum.Museum_ID]\\n\n",
+    "    Assistant Response:\n",
+    "    Q: How many visitors have visited the museum with the most staff?\n",
+    "    A: SELECT count ( * )  FROM VISIT AS T1 JOIN MUSEUM AS T2 ON T1.Museum_ID   =   T2.Museum_ID WHERE T2.Num_of_Staff   =   ( SELECT max ( Num_of_Staff )  FROM MUSEUM ) \n",
+    "    ```````````\n",
+    "\n",
+    "    Example 2\n",
+    "    ```````````\n",
+    "    User input: Table museum, columns = [*,Museum_ID,Name,Num_of_Staff,Open_Year]\\nTable visit, columns = [*,Museum_ID,visitor_ID,Num_of_Ticket,Total_spent]\\nTable visitor, columns = [*,ID,Name,Level_of_membership,Age]\\nForeign_keys = [visit.visitor_ID = visitor.ID,visit.Museum_ID = museum.Museum_ID]\\n\n",
+    "    Assistant Response:\n",
+    "    Q: What are the names who have a membership level higher than 4?\n",
+    "    A: SELECT Name   FROM VISITOR AS T1 WHERE T1.Level_of_membership   >   4 \n",
+    "    ```````````\n",
+    "\n",
+    "    Example 3\n",
+    "    ```````````\n",
+    "    User input: Table museum, columns = [*,Museum_ID,Name,Num_of_Staff,Open_Year]\\nTable visit, columns = [*,Museum_ID,visitor_ID,Num_of_Ticket,Total_spent]\\nTable visitor, columns = [*,ID,Name,Level_of_membership,Age]\\nForeign_keys = [visit.visitor_ID = visitor.ID,visit.Museum_ID = museum.Museum_ID]\\n\n",
+    "    Assistant Response:\n",
+    "    Q: How many tickets of customer id 5?\n",
+    "    A: SELECT count ( * )  FROM VISIT AS T1 JOIN VISITOR AS T2 ON T1.visitor_ID   =   T2.ID WHERE T2.ID   =   5 \n",
+    "    ```````````\n",
+    "    \"\"\"\n",
+    "\n",
+    "user_input = \"Table car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]\"\n",
+    "\n",
+    "messages = []\n",
+    "messages.append({\n",
+    "    \"role\": \"system\",\n",
+    "    \"content\": system_prompt\n",
+    "})\n",
+    "messages.append({\n",
+    "    \"role\": \"user\",\n",
+    "    \"content\": user_input\n",
+    "})\n",
+    "\n",
+    "\n",
+    "completion = client.chat.completions.create(\n",
+    "  model=\"gpt-4-turbo-preview\",\n",
+    "  messages=messages,\n",
+    "  temperature=1.0,\n",
+    "  n=5\n",
+    ")\n",
+    "\n",
+    "for choice in completion.choices:\n",
+    "    print(choice.message.content + \"\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once we have the synthetic data, we need to convert it to match the format of the eval dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'input': [{'role': 'system', 'content': 'TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]'}, {'role': 'user', 'content': 'Which maker has the highest number of models available in the dataset?'}], 'ideal': 'SELECT Maker, COUNT(Model) AS ModelCount FROM model_list GROUP BY Maker ORDER BY ModelCount DESC LIMIT 1'}\n",
+      "{'input': [{'role': 'system', 'content': 'TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]'}, {'role': 'user', 'content': \"What is the average horsepower of cars made by a maker from the continent 'Europe'?\"}], 'ideal': \"SELECT AVG(cars_data.Horsepower) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\"}\n",
+      "{'input': [{'role': 'system', 'content': 'TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]'}, {'role': 'user', 'content': 'What are the average horsepower and weight for cars made by makers from the continent of Europe?'}], 'ideal': \"SELECT AVG(cars_data.Horsepower) AS AVG_Horsepower, AVG(cars_data.Weight) AS AVG_Weight FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\"}\n",
+      "{'input': [{'role': 'system', 'content': 'TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]'}, {'role': 'user', 'content': 'Which car maker has the most models with horsepower greater than 200?'}], 'ideal': 'SELECT Maker, count(*) as ModelCount FROM car_names AS cn JOIN model_list AS ml ON cn.Model = ml.Model JOIN car_makers AS cm ON ml.Maker = cm.Id JOIN cars_data AS cd ON cn.MakeId = cd.Id WHERE cd.Horsepower > 200 GROUP BY Maker ORDER BY ModelCount DESC LIMIT 1'}\n",
+      "{'input': [{'role': 'system', 'content': 'TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]'}, {'role': 'user', 'content': 'What is the average MPG (Miles Per Gallon) for cars made by manufacturers from Europe?'}], 'ideal': \"SELECT AVG(cars_data.MPG) FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id JOIN countries ON car_makers.Country = countries.CountryId JOIN continents ON countries.Continent = continents.ContId WHERE continents.Continent = 'Europe'\"}\n"
+     ]
+    }
+   ],
   "source": [
-    "## Use GPT-4 to generate synthetic data"
+    "eval_data = []\n",
+    "input_prompt = \"TASK: Answer the following question with syntactically correct SQLite SQL. The SQL should be correct and be in context of the previous question-answer pairs.\\nTable car_makers, columns = [*,Id,Maker,FullName,Country]\\nTable car_names, columns = [*,MakeId,Model,Make]\\nTable cars_data, columns = [*,Id,MPG,Cylinders,Edispl,Horsepower,Weight,Accelerate,Year]\\nTable continents, columns = [*,ContId,Continent]\\nTable countries, columns = [*,CountryId,CountryName,Continent]\\nTable model_list, columns = [*,ModelId,Maker,Model]\\nForeign_keys = [countries.Continent = continents.ContId,car_makers.Country = countries.CountryId,model_list.Maker = car_makers.Id,car_names.Model = model_list.Model,cars_data.Id = car_names.MakeId]\"\n",
+    "\n",
+    "for choice in completion.choices:\n",
+    "    question = choice.message.content.split(\"Q: \")[1].split(\"\\n\")[0]  # Extracting the question\n",
+    "    answer = choice.message.content.split(\"\\nA: \")[1].split(\"\\n\")[0]  # Extracting the answer\n",
+    "    eval_data.append({\n",
+    "        \"input\": [\n",
+    "            {\"role\": \"system\", \"content\": input_prompt},\n",
+    "            {\"role\": \"user\", \"content\": question},\n",
+    "        ],\n",
+    "        \"ideal\": answer\n",
+    "    })\n",
+    "\n",
+    "for item in eval_data:\n",
+    "    print(item)\n"
   ]
  },
  {
@ -157,14 +278,14 @@
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
-    "version": 2
+    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.6"
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
  }
 },
 "nbformat": 4,