diff --git a/examples/User_and_product_embeddings.ipynb b/examples/User_and_product_embeddings.ipynb
index a0249bac..dc1e0509 100644
--- a/examples/User_and_product_embeddings.ipynb
+++ b/examples/User_and_product_embeddings.ipynb
@@ -22,16 +22,90 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ProductId | \n",
+ " UserId | \n",
+ " Score | \n",
+ " Summary | \n",
+ " Text | \n",
+ " combined | \n",
+ " n_tokens | \n",
+ " embedding | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " B003XPF9BO | \n",
+ " A3R7JR3FMEBXQB | \n",
+ " 5 | \n",
+ " where does one start...and stop... with a tre... | \n",
+ " Wanted to save some to bring to my Chicago fam... | \n",
+ " Title: where does one start...and stop... wit... | \n",
+ " 52 | \n",
+ " [0.007018072064965963, -0.02731654793024063, 0... | \n",
+ "
\n",
+ " \n",
+ " 297 | \n",
+ " B003VXHGPK | \n",
+ " A21VWSCGW7UUAR | \n",
+ " 4 | \n",
+ " Good, but not Wolfgang Puck good | \n",
+ " Honestly, I have to admit that I expected a li... | \n",
+ " Title: Good, but not Wolfgang Puck good; Conte... | \n",
+ " 178 | \n",
+ " [-0.003140551969408989, -0.009995664469897747,... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
"text/plain": [
- "(24502, 19035)"
+ " ProductId UserId Score \n",
+ "0 B003XPF9BO A3R7JR3FMEBXQB 5 \\\n",
+ "297 B003VXHGPK A21VWSCGW7UUAR 4 \n",
+ "\n",
+ " Summary \n",
+ "0 where does one start...and stop... with a tre... \\\n",
+ "297 Good, but not Wolfgang Puck good \n",
+ "\n",
+ " Text \n",
+ "0 Wanted to save some to bring to my Chicago fam... \\\n",
+ "297 Honestly, I have to admit that I expected a li... \n",
+ "\n",
+ " combined n_tokens \n",
+ "0 Title: where does one start...and stop... wit... 52 \\\n",
+ "297 Title: Good, but not Wolfgang Puck good; Conte... 178 \n",
+ "\n",
+ " embedding \n",
+ "0 [0.007018072064965963, -0.02731654793024063, 0... \n",
+ "297 [-0.003140551969408989, -0.009995664469897747,... "
]
},
- "execution_count": 2,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -42,8 +116,28 @@
"from sklearn.model_selection import train_test_split\n",
"from ast import literal_eval\n",
"\n",
- "df = pd.read_csv('output/embedded_babbage_similarity_50k.csv', index_col=0) # note that you will need to generate this file to run the code below\n",
- "df['babbage_similarity'] = df.babbage_similarity.apply(literal_eval).apply(np.array)\n",
+ "df = pd.read_csv('data/fine_food_reviews_with_embeddings_1k.csv', index_col=0) # note that you will need to generate this file to run the code below\n",
+ "df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(577, 706)"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['babbage_similarity'] = df[\"embedding\"].apply(literal_eval).apply(np.array)\n",
"X_train, X_test, y_train, y_test = train_test_split(df, df.Score, test_size = 0.2, random_state=42)\n",
"\n",
"user_embeddings = X_train.groupby('UserId').babbage_similarity.apply(np.mean)\n",
@@ -71,7 +165,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -105,7 +199,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{