@ -21,7 +21,32 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 1,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"import pandas as pd\n",
"import tiktoken\n",
"\n",
"from openai.embeddings_utils import get_embedding\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# embedding model parameters\n",
"embedding_model = \"text-embedding-ada-002\"\n",
"embedding_encoding = \"cl100k_base\" # this is the encoding for text-embedding-ada-002\n",
"max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [
{
{
@ -97,25 +122,26 @@
"1 Title: Arrived in pieces; Content: Not pleased... "
"1 Title: Arrived in pieces; Content: Not pleased... "
]
]
},
},
"execution_count": 1 ,
"execution_count": 8 ,
"metadata": {},
"metadata": {},
"output_type": "execute_result"
"output_type": "execute_result"
}
}
],
],
"source": [
"source": [
"import pandas as pd\n",
"# load & inspect dataset\n",
"\n",
"input_datapath = \"data/fine_food_reviews_1k.csv\" # to save space, we provide a pre-filtered dataset\n",
"input_datapath = 'data/fine_food_reviews_1k.csv' # to save space, we provide a pre-filtered dataset\n",
"df = pd.read_csv(input_datapath, index_col=0)\n",
"df = pd.read_csv(input_datapath, index_col=0)\n",
"df = df[['Time', 'ProductId', 'UserId', 'Score', 'Summary', 'Text' ]]\n",
"df = df[[\"Time\", \"ProductId\", \"UserId\", \"Score\", \"Summary\", \"Text\" ]]\n",
"df = df.dropna()\n",
"df = df.dropna()\n",
"df['combined'] = \"Title: \" + df.Summary.str.strip() + \"; Content: \" + df.Text.str.strip()\n",
"df[\"combined\"] = (\n",
"df.head(2)"
" \"Title: \" + df.Summary.str.strip() + \"; Content: \" + df.Text.str.strip()\n",
")\n",
"df.head(2)\n"
]
]
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 2 ,
"execution_count": 9 ,
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [
{
{
@ -124,54 +150,52 @@
"1000"
"1000"
]
]
},
},
"execution_count": 2 ,
"execution_count": 9 ,
"metadata": {},
"metadata": {},
"output_type": "execute_result"
"output_type": "execute_result"
}
}
],
],
"source": [
"source": [
"# subsample to 1k most recent reviews and remove samples that are too long\n",
"# subsample to 1k most recent reviews and remove samples that are too long\n",
"df = df.sort_values('Time').tail(1_100)\n",
"top_n = 1000\n",
"df.drop('Time', axis=1, inplace=True)\n",
"df = df.sort_values(\"Time\").tail(top_n * 2) # first cut to first 2k entries, assuming less than half will be filtered out\n",
"df.drop(\"Time\", axis=1, inplace=True)\n",
"\n",
"\n",
"from transformers import GPT2TokenizerFast\n",
"encoding = tiktoken.get_encoding(embedding_encoding)\n",
"tokenizer = GPT2TokenizerFast.from_pretrained(\"gpt2\")\n",
"\n",
"\n",
"# remove reviews that are too long \n",
"# omit reviews that are too long to embed \n",
"df['n_tokens'] = df.combined.apply(lambda x: len(tokenizer .encode(x)))\n",
"df[\"n_tokens\"] = df.combined.apply(lambda x: len(encoding .encode(x)))\n",
"df = df[df.n_tokens<8000].tail(1_000 )\n",
"df = df[df.n_tokens <= max_tokens].tail(top_n )\n",
"len(df)"
"len(df)\n "
]
]
},
},
{
{
"attachments": {},
"cell_type": "markdown",
"cell_type": "markdown",
"metadata": {},
"metadata": {},
"source": [
"source": [
"### 2. Get embeddings and save them for future reuse"
"## 2. Get embeddings and save them for future reuse"
]
]
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 3 ,
"execution_count": 10 ,
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
"source": [
"source": [
"import openai\n",
"from openai.embeddings_utils import get_embedding\n",
"# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage\n",
"# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage\n",
"\n",
"\n",
"# This will take just between 5 and 10 minutes\n",
"# This may take a few minutes\n",
"df['ada_similarity'] = df.combined.apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))\n",
"df[\"embedding\"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model))\n",
"df['ada_search'] = df['ada_similarity']\n",
"df.to_csv(\"data/fine_food_reviews_with_embeddings_1k.csv\")\n"
"df.to_csv('data/fine_food_reviews_with_embeddings_1k.csv')"
]
]
}
}
],
],
"metadata": {
"metadata": {
"kernelspec": {
"kernelspec": {
"display_name": "openai-cookbook ",
"display_name": "openai",
"language": "python",
"language": "python",
"name": "openai-cookbook "
"name": "python3 "
},
},
"language_info": {
"language_info": {
"codemirror_mode": {
"codemirror_mode": {
@ -183,12 +207,12 @@
"name": "python",
"name": "python",
"nbconvert_exporter": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"pygments_lexer": "ipython3",
"version": "3.9.6"
"version": "3.9.9 (main, Dec 7 2021, 18:04:56) \n[Clang 13.0.0 (clang-1300.0.29.3)] "
},
},
"orig_nbformat": 4,
"orig_nbformat": 4,
"vscode": {
"vscode": {
"interpreter": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6 "
"hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97 "
}
}
}
}
},
},