show cleaned output

pull/1147/head
FardinAhsan146 2 months ago
parent cfaf8500e8
commit 835823ab69

@ -2,14 +2,16 @@
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 76,
"id": "ca3f397e-5fbb-4f4c-a191-4e6de2cd9d2d",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json \n",
"import pandas as pd \n",
"from openai import OpenAI\n",
"pd.set_option('display.max_colwidth', None)\n",
"\n",
"try:\n",
" from dotenv import load_dotenv\n",
@ -33,7 +35,7 @@
"\n",
"Using an LLM to parse the data can significantly streamline data cleaning. \n",
"\n",
"In this notebook, we clean up some artificial user input data that includes order items and their respective dimensions. "
"In this notebook, we clean up some artificial user input data from an imaginary e-commerce company. Each entry contains an order item ID and the item's dimensions. "
]
},
{
@ -125,40 +127,177 @@
"id": "a14e80ce",
"metadata": {},
"source": [
"### The function call "
"### The function call\n",
"\n",
"Function calling is a feature of the Chat Completions endpoint, used through the `tools` parameter. It lets you describe a function's parameters as a JSON Schema so that the model returns its arguments as a structured JSON object. \n",
"\n",
"Let's define a Python function that accepts a ticket's text and returns the parsed fields as a dictionary. "
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 80,
"id": "ecddfe05-d9e8-4a75-ae4b-ef2fb2914fea",
"metadata": {},
"outputs": [],
"source": [
"def call_and_clean(text: str, model: str) -> str:\n",
"def call_and_clean(text: str, model: str = \"gpt-4-turbo-preview\") -> dict:\n",
" \"\"\"\n",
" args-- \n",
" text: The text that you want to parse and clean \n",
" model: The OpenAi model alias \n",
" Parse and clean ticket text data utilizing a predefined tool to get item numbers and dimensions.\n",
"\n",
" Args:\n",
" text (str): The text that you want to parse and clean.\n",
" model (str): The OpenAI model alias.\n",
"\n",
" Returns:\n",
" dict: The cleaned and parsed text output from the model.\n",
" \"\"\"\n",
" completion = client.chat.completions.create(\n",
" model=\"gpt-3.5-turbo\",\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"Hello!\"}\n",
"\n",
" # View this as a structured output format you want \n",
" tools = [{\"type\": \"function\",\n",
" \"function\": {\n",
" \"name\": \"get_item_id_and_dimensions\",\n",
" \"description\": \"Gets the order item number and dimensions from a user ticket about an item.\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"order_item_id\": {\n",
" \"type\": \"number\",\n",
" \"description\": \"The order item identifier, its only the number that usually follows a hashtag, or oi tag.\"\n",
" },\n",
" \"length\": { \"type\": \"number\" },\n",
" \"width\": { \"type\": \"number\" },\n",
" \"height\": { \"type\": \"number\" },\n",
" \"length_units\": { \"type\": \"string\", \"enum\": [\"cm\", \"in\", \"ft\"] },\n",
" \"width_units\": { \"type\": \"string\", \"enum\": [\"cm\", \"in\", \"ft\"] },\n",
" \"height_units\": { \"type\": \"string\", \"enum\": [\"cm\", \"in\", \"ft\"] }\n",
" },\n",
" \"required\": [\"order_item_id\", \"length\", \"width\", \"height\", \"length_units\", \"width_units\", \"height_units\"]\n",
" }\n",
" }\n",
" }\n",
" ]\n",
" \n",
" system_prompt = \"\"\"\n",
" Given an item ticket, parse the ticket such that the get_item_id_and_dimensions function can be called for the contents of the ticket.\n",
"\n",
" If no dimensions are provided assume they are in cm. Sometimes the units might be spelled incorrectly, infer the unit in those cases.\n",
" \"\"\"\n",
" response = client.chat.completions.create(\n",
" model=model,\n",
" messages=[\n",
" {\"role\":\"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": f\"Item Ticket: {text}\"}\n",
" ],\n",
" tools=tools \n",
" )\n",
"\n",
" return completion.choices[0].message"
" return json.loads(response.choices[0].message.tool_calls[0].function.arguments)"
]
},
{
"cell_type": "markdown",
"id": "d017fd74",
"metadata": {},
"source": [
"### Clean the data "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e9e1568-8295-4a8c-b543-46651f85ced3",
"execution_count": 81,
"id": "0d0adf8d",
"metadata": {},
"outputs": [],
"source": []
"source": [
"df['parsed_tickets'] = df['ticket_content'].apply(call_and_clean)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "64c6899b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ticket_content</th>\n",
" <th>parsed_tickets</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>OI-842342, length = 73cm, width = 45cm, height = 55cm</td>\n",
" <td>{'height': 55, 'height_units': 'cm', 'length': 73, 'length_units': 'cm', 'order_item_id': 842342, 'width': 45, 'width_units': 'cm'}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>#Item-325364, 46x34x56 cm</td>\n",
" <td>{'height': 56, 'height_units': 'cm', 'length': 46, 'length_units': 'cm', 'order_item_id': 325364, 'width': 34, 'width_units': 'cm'}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>#OI-43253252, l-45cm,w-34cm,h-67cm</td>\n",
" <td>{'height': 67, 'height_units': 'cm', 'length': 45, 'length_units': 'cm', 'order_item_id': 43253252, 'width': 34, 'width_units': 'cm'}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>#452453 34inx56cmx2ft</td>\n",
" <td>{'height': 2, 'height_units': 'ft', 'length': 34, 'length_units': 'in', 'order_item_id': 452453, 'width': 56, 'width_units': 'cm'}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>OrderItem#373578 96,56,23</td>\n",
" <td>{'height': 23, 'height_units': 'cm', 'length': 96, 'length_units': 'cm', 'order_item_id': 373578, 'width': 56, 'width_units': 'cm'}</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ticket_content \\\n",
"0 OI-842342, length = 73cm, width = 45cm, height = 55cm \n",
"1 #Item-325364, 46x34x56 cm \n",
"2 #OI-43253252, l-45cm,w-34cm,h-67cm \n",
"3 #452453 34inx56cmx2ft \n",
"4 OrderItem#373578 96,56,23 \n",
"\n",
" parsed_tickets \n",
"0 {'height': 55, 'height_units': 'cm', 'length': 73, 'length_units': 'cm', 'order_item_id': 842342, 'width': 45, 'width_units': 'cm'} \n",
"1 {'height': 56, 'height_units': 'cm', 'length': 46, 'length_units': 'cm', 'order_item_id': 325364, 'width': 34, 'width_units': 'cm'} \n",
"2 {'height': 67, 'height_units': 'cm', 'length': 45, 'length_units': 'cm', 'order_item_id': 43253252, 'width': 34, 'width_units': 'cm'} \n",
"3 {'height': 2, 'height_units': 'ft', 'length': 34, 'length_units': 'in', 'order_item_id': 452453, 'width': 56, 'width_units': 'cm'} \n",
"4 {'height': 23, 'height_units': 'cm', 'length': 96, 'length_units': 'cm', 'order_item_id': 373578, 'width': 56, 'width_units': 'cm'} "
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
}
],
"metadata": {

Loading…
Cancel
Save