replace eval with safer literal_eval (#561)

pull/593/head
Tomas Dulka 11 months ago committed by GitHub
parent 3bd18cc07d
commit 4fd2b1a6d2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1114,10 +1114,11 @@
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from ast import literal_eval\n",
"from openai.embeddings_utils import distances_from_embeddings, cosine_similarity\n",
"\n",
"df=pd.read_csv('processed/embeddings.csv', index_col=0)\n",
"df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)\n",
"df['embeddings'] = df['embeddings'].apply(literal_eval).apply(np.array)\n",
"\n",
"df.head()"
]

@ -15,6 +15,7 @@ import tiktoken
import openai
import numpy as np
from openai.embeddings_utils import distances_from_embeddings, cosine_similarity
from ast import literal_eval
# Regex pattern to match a URL
HTTP_URL_PATTERN = r'^http[s]{0,1}://.+$'
@ -300,7 +301,7 @@ df.head()
################################################################################
df=pd.read_csv('processed/embeddings.csv', index_col=0)
df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)
df['embeddings'] = df['embeddings'].apply(literal_eval).apply(np.array)
df.head()

@ -40,6 +40,7 @@
"# imports\n",
"import pandas as pd\n",
"import numpy as np\n",
"from ast import literal_eval\n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import train_test_split\n",
@ -49,7 +50,7 @@
"datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
"\n",
"df = pd.read_csv(datafile_path)\n",
"df[\"embedding\"] = df.embedding.apply(eval).apply(np.array) # convert string to array\n",
"df[\"embedding\"] = df.embedding.apply(literal_eval).apply(np.array) # convert string to array\n",
"\n",
"# split data into train and test\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
@ -67,6 +68,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -105,6 +107,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -29,17 +30,19 @@
"# imports\n",
"import numpy as np\n",
"import pandas as pd\n",
"from ast import literal_eval\n",
"\n",
"# load data\n",
"datafile_path = \"./data/fine_food_reviews_with_embeddings_1k.csv\"\n",
"\n",
"df = pd.read_csv(datafile_path)\n",
"df[\"embedding\"] = df.embedding.apply(eval).apply(np.array) # convert string to numpy array\n",
"df[\"embedding\"] = df.embedding.apply(literal_eval).apply(np.array) # convert string to numpy array\n",
"matrix = np.vstack(df.embedding.values)\n",
"matrix.shape\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -47,6 +50,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -155,6 +159,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -240,6 +245,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -12,6 +13,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -33,6 +35,7 @@
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"import os\n",
"from ast import literal_eval\n",
"\n",
"openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
"COMPLETIONS_MODEL = \"text-davinci-003\"\n",
@ -42,6 +45,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -77,7 +81,7 @@
],
"source": [
"embedding_df = pd.read_csv(embedding_path)\n",
"embedding_df[\"embedding\"] = embedding_df.embedding.apply(eval).apply(np.array)\n",
"embedding_df[\"embedding\"] = embedding_df.embedding.apply(literal_eval).apply(np.array)\n",
"matrix = np.vstack(embedding_df.embedding.values)\n",
"matrix.shape"
]
@ -264,6 +268,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -15,6 +16,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -48,6 +50,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -247,6 +250,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -303,6 +307,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -663,6 +668,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -670,6 +676,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -914,6 +921,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -1078,9 +1086,10 @@
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import classification_report, accuracy_score\n",
"from ast import literal_eval\n",
"\n",
"fs_df = pd.read_csv(embedding_path)\n",
"fs_df[\"babbage_similarity\"] = fs_df.babbage_similarity.apply(eval).apply(np.array)\n",
"fs_df[\"babbage_similarity\"] = fs_df.babbage_similarity.apply(literal_eval).apply(np.array)\n",
"fs_df.head()"
]
},
@ -1135,6 +1144,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -1144,6 +1154,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -1153,6 +1164,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -1703,6 +1715,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -1832,6 +1845,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -2159,6 +2173,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -27,6 +28,7 @@
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from ast import literal_eval\n",
"\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.model_selection import train_test_split\n",
@ -35,7 +37,7 @@
"datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
"\n",
"df = pd.read_csv(datafile_path)\n",
"df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)\n",
"df[\"embedding\"] = df.embedding.apply(literal_eval).apply(np.array)\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(list(df.embedding.values), df.Score, test_size=0.2, random_state=42)\n",
"\n",
@ -79,6 +81,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

@ -18,11 +18,12 @@
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from ast import literal_eval\n",
"\n",
"datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
"\n",
"df = pd.read_csv(datafile_path)\n",
"df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)\n"
"df[\"embedding\"] = df.embedding.apply(literal_eval).apply(np.array)\n"
]
},
{
@ -101,6 +102,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -126,6 +128,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -10,6 +11,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -38,9 +40,10 @@
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from ast import literal_eval\n",
"\n",
"df = pd.read_csv('output/embedded_babbage_similarity_50k.csv', index_col=0) # note that you will need to generate this file to run the code below\n",
"df['babbage_similarity'] = df.babbage_similarity.apply(eval).apply(np.array)\n",
"df['babbage_similarity'] = df.babbage_similarity.apply(literal_eval).apply(np.array)\n",
"X_train, X_test, y_train, y_test = train_test_split(df, df.Score, test_size = 0.2, random_state=42)\n",
"\n",
"user_embeddings = X_train.groupby('UserId').babbage_similarity.apply(np.mean)\n",
@ -49,6 +52,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -56,6 +60,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -89,6 +94,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -138,6 +144,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

@ -11,6 +11,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -39,13 +40,14 @@
"import pandas as pd\n",
"from sklearn.manifold import TSNE\n",
"import numpy as np\n",
"from ast import literal_eval\n",
"\n",
"# Load the embeddings\n",
"datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
"df = pd.read_csv(datafile_path)\n",
"\n",
"# Convert to a list of lists of floats\n",
"matrix = np.array(df.embedding.apply(eval).to_list())\n",
"matrix = np.array(df.embedding.apply(literal_eval).to_list())\n",
"\n",
"# Create a t-SNE model and transform the data\n",
"tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)\n",
@ -54,6 +56,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -20,6 +21,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -37,13 +39,14 @@
"import pandas as pd\n",
"from sklearn.manifold import TSNE\n",
"import numpy as np\n",
"from ast import literal_eval\n",
"\n",
"# Load the embeddings\n",
"datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
"df = pd.read_csv(datafile_path)\n",
"\n",
"# Convert to a list of lists of floats\n",
"matrix = np.array(df.embedding.apply(eval).to_list())"
"matrix = np.array(df.embedding.apply(literal_eval).to_list())"
]
},
{
@ -68,6 +71,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -75,6 +79,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -82,6 +87,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -20,6 +21,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -29,20 +31,19 @@
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
]
"text": []
}
],
"source": [
"!pip install nomic"
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
@ -52,13 +53,14 @@
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from ast import literal_eval\n",
"\n",
"# Load the embeddings\n",
"datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
"df = pd.read_csv(datafile_path)\n",
"\n",
"# Convert to a list of lists of floats\n",
"embeddings = np.array(df.embedding.apply(eval).to_list())\n",
"embeddings = np.array(df.embedding.apply(literal_eval).to_list())\n",
"df = df.drop('embedding', axis=1)\n",
"df = df.rename(columns={'Unnamed: 0': 'id'})\n"
]
@ -71,8 +73,7 @@
{
"name": "stderr",
"output_type": "stream",
"text": [
]
"text": []
}
],
"source": [
@ -88,6 +89,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -97,11 +99,65 @@
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": "meek-laborer: https://atlas.nomic.ai/map/fddc0e07-97c5-477c-827c-96bca44519aa/463f4614-7689-47e4-b55b-1da0cc679559",
"text/html": "\n <h3>Project: meek-laborer</h3>\n <script>\n destroy = function() {\n document.getElementById(\"iframe463f4614-7689-47e4-b55b-1da0cc679559\").remove()\n }\n </script>\n\n <h4>Projection ID: 463f4614-7689-47e4-b55b-1da0cc679559</h4>\n <div class=\"actions\">\n <div id=\"hide\" class=\"action\" onclick=\"destroy()\">Hide embedded project</div>\n <div class=\"action\" id=\"out\">\n <a href=\"https://atlas.nomic.ai/map/fddc0e07-97c5-477c-827c-96bca44519aa/463f4614-7689-47e4-b55b-1da0cc679559\" target=\"_blank\">Explore on atlas.nomic.ai</a>\n </div>\n </div>\n \n <iframe class=\"iframe\" id=\"iframe463f4614-7689-47e4-b55b-1da0cc679559\" allow=\"clipboard-read; clipboard-write\" src=\"https://atlas.nomic.ai/map/fddc0e07-97c5-477c-827c-96bca44519aa/463f4614-7689-47e4-b55b-1da0cc679559\">\n </iframe>\n\n <style>\n .iframe {\n /* vh can be **very** large in vscode ipynb. */\n height: min(75vh, 66vw);\n width: 100%;\n }\n </style>\n \n <style>\n .actions {\n display: block;\n }\n .action {\n min-height: 18px;\n margin: 5px;\n transition: all 500ms ease-in-out;\n }\n .action:hover {\n cursor: pointer;\n }\n #hide:hover::after {\n content: \" X\";\n }\n #out:hover::after {\n content: \"\";\n }\n </style>\n \n "
"text/html": [
"\n",
" <h3>Project: meek-laborer</h3>\n",
" <script>\n",
" destroy = function() {\n",
" document.getElementById(\"iframe463f4614-7689-47e4-b55b-1da0cc679559\").remove()\n",
" }\n",
" </script>\n",
"\n",
" <h4>Projection ID: 463f4614-7689-47e4-b55b-1da0cc679559</h4>\n",
" <div class=\"actions\">\n",
" <div id=\"hide\" class=\"action\" onclick=\"destroy()\">Hide embedded project</div>\n",
" <div class=\"action\" id=\"out\">\n",
" <a href=\"https://atlas.nomic.ai/map/fddc0e07-97c5-477c-827c-96bca44519aa/463f4614-7689-47e4-b55b-1da0cc679559\" target=\"_blank\">Explore on atlas.nomic.ai</a>\n",
" </div>\n",
" </div>\n",
" \n",
" <iframe class=\"iframe\" id=\"iframe463f4614-7689-47e4-b55b-1da0cc679559\" allow=\"clipboard-read; clipboard-write\" src=\"https://atlas.nomic.ai/map/fddc0e07-97c5-477c-827c-96bca44519aa/463f4614-7689-47e4-b55b-1da0cc679559\">\n",
" </iframe>\n",
"\n",
" <style>\n",
" .iframe {\n",
" /* vh can be **very** large in vscode ipynb. */\n",
" height: min(75vh, 66vw);\n",
" width: 100%;\n",
" }\n",
" </style>\n",
" \n",
" <style>\n",
" .actions {\n",
" display: block;\n",
" }\n",
" .action {\n",
" min-height: 18px;\n",
" margin: 5px;\n",
" transition: all 500ms ease-in-out;\n",
" }\n",
" .action:hover {\n",
" cursor: pointer;\n",
" }\n",
" #hide:hover::after {\n",
" content: \" X\";\n",
" }\n",
" #out:hover::after {\n",
" content: \"\";\n",
" }\n",
" </style>\n",
" \n",
" "
],
"text/plain": [
"meek-laborer: https://atlas.nomic.ai/map/fddc0e07-97c5-477c-827c-96bca44519aa/463f4614-7689-47e4-b55b-1da0cc679559"
]
},
"execution_count": 10,
"metadata": {},
@ -110,19 +166,16 @@
],
"source": [
"map"
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
"outputs": [],
"source": []
}
],
"metadata": {

@ -23,6 +23,7 @@
"# imports\n",
"import pandas as pd\n",
"import numpy as np\n",
"from ast import literal_eval\n",
"\n",
"from sklearn.metrics import classification_report\n",
"\n",
@ -33,7 +34,7 @@
"datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
"\n",
"df = pd.read_csv(datafile_path)\n",
"df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)\n",
"df[\"embedding\"] = df.embedding.apply(literal_eval).apply(np.array)\n",
"\n",
"# convert 5-star rating to binary sentiment\n",
"df = df[df.Score != 3]\n",
@ -41,6 +42,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -109,6 +111,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -153,6 +156,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -197,6 +201,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [

Loading…
Cancel
Save