diff --git a/examples/User_and_product_embeddings.ipynb b/examples/User_and_product_embeddings.ipynb index a0249bac..dc1e0509 100644 --- a/examples/User_and_product_embeddings.ipynb +++ b/examples/User_and_product_embeddings.ipynb @@ -22,16 +22,90 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ProductIdUserIdScoreSummaryTextcombinedn_tokensembedding
0B003XPF9BOA3R7JR3FMEBXQB5where does one start...and stop... with a tre...Wanted to save some to bring to my Chicago fam...Title: where does one start...and stop... wit...52[0.007018072064965963, -0.02731654793024063, 0...
297B003VXHGPKA21VWSCGW7UUAR4Good, but not Wolfgang Puck goodHonestly, I have to admit that I expected a li...Title: Good, but not Wolfgang Puck good; Conte...178[-0.003140551969408989, -0.009995664469897747,...
\n", + "
" + ], "text/plain": [ - "(24502, 19035)" + " ProductId UserId Score \n", + "0 B003XPF9BO A3R7JR3FMEBXQB 5 \\\n", + "297 B003VXHGPK A21VWSCGW7UUAR 4 \n", + "\n", + " Summary \n", + "0 where does one start...and stop... with a tre... \\\n", + "297 Good, but not Wolfgang Puck good \n", + "\n", + " Text \n", + "0 Wanted to save some to bring to my Chicago fam... \\\n", + "297 Honestly, I have to admit that I expected a li... \n", + "\n", + " combined n_tokens \n", + "0 Title: where does one start...and stop... wit... 52 \\\n", + "297 Title: Good, but not Wolfgang Puck good; Conte... 178 \n", + "\n", + " embedding \n", + "0 [0.007018072064965963, -0.02731654793024063, 0... \n", + "297 [-0.003140551969408989, -0.009995664469897747,... " ] }, - "execution_count": 2, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -42,8 +116,28 @@ "from sklearn.model_selection import train_test_split\n", "from ast import literal_eval\n", "\n", - "df = pd.read_csv('output/embedded_babbage_similarity_50k.csv', index_col=0) # note that you will need to generate this file to run the code below\n", - "df['babbage_similarity'] = df.babbage_similarity.apply(literal_eval).apply(np.array)\n", + "df = pd.read_csv('data/fine_food_reviews_with_embeddings_1k.csv', index_col=0) # note that you will need to generate this file to run the code below\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(577, 706)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['babbage_similarity'] = df[\"embedding\"].apply(literal_eval).apply(np.array)\n", "X_train, X_test, y_train, y_test = train_test_split(df, df.Score, test_size = 0.2, random_state=42)\n", "\n", "user_embeddings = X_train.groupby('UserId').babbage_similarity.apply(np.mean)\n", @@ -71,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -105,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 10, "metadata": {}, "outputs": [ {