Fix for issue #858: User_and_product_embeddings.ipynb points to incorrect CSV (#863)

pull/890/head
Gabor Cselle 6 months ago committed by GitHub
parent 6390b8c66e
commit 786d0a0b9b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -22,16 +22,90 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ProductId</th>\n",
" <th>UserId</th>\n",
" <th>Score</th>\n",
" <th>Summary</th>\n",
" <th>Text</th>\n",
" <th>combined</th>\n",
" <th>n_tokens</th>\n",
" <th>embedding</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>B003XPF9BO</td>\n",
" <td>A3R7JR3FMEBXQB</td>\n",
" <td>5</td>\n",
" <td>where does one start...and stop... with a tre...</td>\n",
" <td>Wanted to save some to bring to my Chicago fam...</td>\n",
" <td>Title: where does one start...and stop... wit...</td>\n",
" <td>52</td>\n",
" <td>[0.007018072064965963, -0.02731654793024063, 0...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>297</th>\n",
" <td>B003VXHGPK</td>\n",
" <td>A21VWSCGW7UUAR</td>\n",
" <td>4</td>\n",
" <td>Good, but not Wolfgang Puck good</td>\n",
" <td>Honestly, I have to admit that I expected a li...</td>\n",
" <td>Title: Good, but not Wolfgang Puck good; Conte...</td>\n",
" <td>178</td>\n",
" <td>[-0.003140551969408989, -0.009995664469897747,...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"(24502, 19035)"
" ProductId UserId Score \n",
"0 B003XPF9BO A3R7JR3FMEBXQB 5 \\\n",
"297 B003VXHGPK A21VWSCGW7UUAR 4 \n",
"\n",
" Summary \n",
"0 where does one start...and stop... with a tre... \\\n",
"297 Good, but not Wolfgang Puck good \n",
"\n",
" Text \n",
"0 Wanted to save some to bring to my Chicago fam... \\\n",
"297 Honestly, I have to admit that I expected a li... \n",
"\n",
" combined n_tokens \n",
"0 Title: where does one start...and stop... wit... 52 \\\n",
"297 Title: Good, but not Wolfgang Puck good; Conte... 178 \n",
"\n",
" embedding \n",
"0 [0.007018072064965963, -0.02731654793024063, 0... \n",
"297 [-0.003140551969408989, -0.009995664469897747,... "
]
},
"execution_count": 2,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@ -42,8 +116,28 @@
"from sklearn.model_selection import train_test_split\n",
"from ast import literal_eval\n",
"\n",
"df = pd.read_csv('output/embedded_babbage_similarity_50k.csv', index_col=0) # note that you will need to generate this file to run the code below\n",
"df['babbage_similarity'] = df.babbage_similarity.apply(literal_eval).apply(np.array)\n",
"df = pd.read_csv('data/fine_food_reviews_with_embeddings_1k.csv', index_col=0) # note that you will need to generate this file to run the code below\n",
"df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(577, 706)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['babbage_similarity'] = df[\"embedding\"].apply(literal_eval).apply(np.array)\n",
"X_train, X_test, y_train, y_test = train_test_split(df, df.Score, test_size = 0.2, random_state=42)\n",
"\n",
"user_embeddings = X_train.groupby('UserId').babbage_similarity.apply(np.mean)\n",
@ -71,7 +165,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@ -105,7 +199,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 10,
"metadata": {},
"outputs": [
{

Loading…
Cancel
Save