replace eval with safer literal_eval (#561)

11 months ago · 4fd2b1a6d2
parent 3bd18cc07d
commit 4fd2b1a6d2
13 changed files with 141 additions and 30 deletions
--- a/apps/web-crawl-q-and-a/web-qa.ipynb
+++ b/apps/web-crawl-q-and-a/web-qa.ipynb
@@ -1114,10 +1114,11 @@
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
+    "from ast import literal_eval\n",
    "from openai.embeddings_utils import distances_from_embeddings, cosine_similarity\n",
    "\n",
    "df=pd.read_csv('processed/embeddings.csv', index_col=0)\n",
-    "df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)\n",
+    "df['embeddings'] = df['embeddings'].apply(literal_eval).apply(np.array)\n",
    "\n",
    "df.head()"
   ]
--- a/apps/web-crawl-q-and-a/web-qa.py
+++ b/apps/web-crawl-q-and-a/web-qa.py
@@ -15,6 +15,7 @@ import tiktoken
 import openai
 import numpy as np
 from openai.embeddings_utils import distances_from_embeddings, cosine_similarity
+from ast import literal_eval

 # Regex pattern to match a URL
 HTTP_URL_PATTERN = r'^http[s]{0,1}://.+$'
@@ -300,7 +301,7 @@ df.head()
 ################################################################################

 df=pd.read_csv('processed/embeddings.csv', index_col=0)
-df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)
+df['embeddings'] = df['embeddings'].apply(literal_eval).apply(np.array)

 df.head()

--- a/examples/Classification_using_embeddings.ipynb
+++ b/examples/Classification_using_embeddings.ipynb
@@ -40,6 +40,7 @@
    "# imports\n",
    "import pandas as pd\n",
    "import numpy as np\n",
+    "from ast import literal_eval\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
@@ -49,7 +50,7 @@
    "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
    "\n",
    "df = pd.read_csv(datafile_path)\n",
-    "df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)  # convert string to array\n",
+    "df[\"embedding\"] = df.embedding.apply(literal_eval).apply(np.array)  # convert string to array\n",
    "\n",
    "# split data into train and test\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
@@ -67,6 +68,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -105,6 +107,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
--- a/examples/Clustering.ipynb
+++ b/examples/Clustering.ipynb
@@ -1,6 +1,7 @@
 {
 "cells": [
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -29,17 +30,19 @@
    "# imports\n",
    "import numpy as np\n",
    "import pandas as pd\n",
+    "from ast import literal_eval\n",
    "\n",
    "# load data\n",
    "datafile_path = \"./data/fine_food_reviews_with_embeddings_1k.csv\"\n",
    "\n",
    "df = pd.read_csv(datafile_path)\n",
-    "df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)  # convert string to numpy array\n",
+    "df[\"embedding\"] = df.embedding.apply(literal_eval).apply(np.array)  # convert string to numpy array\n",
    "matrix = np.vstack(df.embedding.values)\n",
    "matrix.shape\n"
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -47,6 +50,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -155,6 +159,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -240,6 +245,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
--- a/examples/Clustering_for_transaction_classification.ipynb
+++ b/examples/Clustering_for_transaction_classification.ipynb
@@ -1,6 +1,7 @@
 {
 "cells": [
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -12,6 +13,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -33,6 +35,7 @@
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
+    "from ast import literal_eval\n",
    "\n",
    "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
    "COMPLETIONS_MODEL = \"text-davinci-003\"\n",
@@ -42,6 +45,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -77,7 +81,7 @@
   ],
   "source": [
    "embedding_df = pd.read_csv(embedding_path)\n",
-    "embedding_df[\"embedding\"] = embedding_df.embedding.apply(eval).apply(np.array)\n",
+    "embedding_df[\"embedding\"] = embedding_df.embedding.apply(literal_eval).apply(np.array)\n",
    "matrix = np.vstack(embedding_df.embedding.values)\n",
    "matrix.shape"
   ]
@@ -264,6 +268,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
--- a/examples/Multiclass_classification_for_transactions.ipynb
+++ b/examples/Multiclass_classification_for_transactions.ipynb
@@ -1,6 +1,7 @@
 {
 "cells": [
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -15,6 +16,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -48,6 +50,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -247,6 +250,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -303,6 +307,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -663,6 +668,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -670,6 +676,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -914,6 +921,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -1078,9 +1086,10 @@
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import classification_report, accuracy_score\n",
+    "from ast import literal_eval\n",
    "\n",
    "fs_df = pd.read_csv(embedding_path)\n",
-    "fs_df[\"babbage_similarity\"] = fs_df.babbage_similarity.apply(eval).apply(np.array)\n",
+    "fs_df[\"babbage_similarity\"] = fs_df.babbage_similarity.apply(literal_eval).apply(np.array)\n",
    "fs_df.head()"
   ]
  },
@@ -1135,6 +1144,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -1144,6 +1154,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -1153,6 +1164,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -1703,6 +1715,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -1832,6 +1845,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -2159,6 +2173,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
--- a/examples/Regression_using_embeddings.ipynb
+++ b/examples/Regression_using_embeddings.ipynb
@@ -1,6 +1,7 @@
 {
 "cells": [
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -27,6 +28,7 @@
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
+    "from ast import literal_eval\n",
    "\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.model_selection import train_test_split\n",
@@ -35,7 +37,7 @@
    "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
    "\n",
    "df = pd.read_csv(datafile_path)\n",
-    "df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)\n",
+    "df[\"embedding\"] = df.embedding.apply(literal_eval).apply(np.array)\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(list(df.embedding.values), df.Score, test_size=0.2, random_state=42)\n",
    "\n",
@@ -79,6 +81,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
--- a/examples/Semantic_text_search_using_embeddings.ipynb
+++ b/examples/Semantic_text_search_using_embeddings.ipynb
@@ -18,11 +18,12 @@
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
+    "from ast import literal_eval\n",
    "\n",
    "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
    "\n",
    "df = pd.read_csv(datafile_path)\n",
-    "df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)\n"
+    "df[\"embedding\"] = df.embedding.apply(literal_eval).apply(np.array)\n"
   ]
  },
  {
@@ -101,6 +102,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -126,6 +128,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
--- a/examples/User_and_product_embeddings.ipynb
+++ b/examples/User_and_product_embeddings.ipynb
@@ -1,6 +1,7 @@
 {
 "cells": [
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -10,6 +11,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -38,9 +40,10 @@
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
+    "from ast import literal_eval\n",
    "\n",
    "df = pd.read_csv('output/embedded_babbage_similarity_50k.csv', index_col=0)  # note that you will need to generate this file to run the code below\n",
-    "df['babbage_similarity'] = df.babbage_similarity.apply(eval).apply(np.array)\n",
+    "df['babbage_similarity'] = df.babbage_similarity.apply(literal_eval).apply(np.array)\n",
    "X_train, X_test, y_train, y_test = train_test_split(df, df.Score, test_size = 0.2, random_state=42)\n",
    "\n",
    "user_embeddings = X_train.groupby('UserId').babbage_similarity.apply(np.mean)\n",
@@ -49,6 +52,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -56,6 +60,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -89,6 +94,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -138,6 +144,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
--- a/examples/Visualizing_embeddings_in_2D.ipynb
+++ b/examples/Visualizing_embeddings_in_2D.ipynb
@@ -11,6 +11,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -39,13 +40,14 @@
    "import pandas as pd\n",
    "from sklearn.manifold import TSNE\n",
    "import numpy as np\n",
+    "from ast import literal_eval\n",
    "\n",
    "# Load the embeddings\n",
    "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
    "df = pd.read_csv(datafile_path)\n",
    "\n",
    "# Convert to a list of lists of floats\n",
-    "matrix = np.array(df.embedding.apply(eval).to_list())\n",
+    "matrix = np.array(df.embedding.apply(literal_eval).to_list())\n",
    "\n",
    "# Create a t-SNE model and transform the data\n",
    "tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)\n",
@@ -54,6 +56,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
--- a/examples/Visualizing_embeddings_in_W&B.ipynb
+++ b/examples/Visualizing_embeddings_in_W&B.ipynb
@@ -1,6 +1,7 @@
 {
 "cells": [
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -20,6 +21,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -37,13 +39,14 @@
    "import pandas as pd\n",
    "from sklearn.manifold import TSNE\n",
    "import numpy as np\n",
+    "from ast import literal_eval\n",
    "\n",
    "# Load the embeddings\n",
    "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
    "df = pd.read_csv(datafile_path)\n",
    "\n",
    "# Convert to a list of lists of floats\n",
-    "matrix = np.array(df.embedding.apply(eval).to_list())"
+    "matrix = np.array(df.embedding.apply(literal_eval).to_list())"
   ]
  },
  {
@@ -68,6 +71,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -75,6 +79,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -82,6 +87,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
--- a/examples/Visualizing_embeddings_with_Atlas.ipynb
+++ b/examples/Visualizing_embeddings_with_Atlas.ipynb
@@ -1,6 +1,7 @@
 {
 "cells": [
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -20,6 +21,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -29,20 +31,19 @@
  {
   "cell_type": "code",
   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
-     "text": [
-     ]
+     "text": []
    }
   ],
   "source": [
    "!pip install nomic"
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  },
  {
   "cell_type": "code",
@@ -52,13 +53,14 @@
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
+    "from ast import literal_eval\n",
    "\n",
    "# Load the embeddings\n",
    "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
    "df = pd.read_csv(datafile_path)\n",
    "\n",
    "# Convert to a list of lists of floats\n",
-    "embeddings = np.array(df.embedding.apply(eval).to_list())\n",
+    "embeddings = np.array(df.embedding.apply(literal_eval).to_list())\n",
    "df = df.drop('embedding', axis=1)\n",
    "df = df.rename(columns={'Unnamed: 0': 'id'})\n"
   ]
@@ -71,8 +73,7 @@
    {
     "name": "stderr",
     "output_type": "stream",
-     "text": [
-     ]
+     "text": []
    }
   ],
   "source": [
@@ -88,6 +89,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -97,11 +99,65 @@
  {
   "cell_type": "code",
   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
   "outputs": [
    {
     "data": {
-      "text/plain": "meek-laborer: https://atlas.nomic.ai/map/fddc0e07-97c5-477c-827c-96bca44519aa/463f4614-7689-47e4-b55b-1da0cc679559",
-      "text/html": "\n            <h3>Project: meek-laborer</h3>\n            <script>\n            destroy = function() {\n                document.getElementById(\"iframe463f4614-7689-47e4-b55b-1da0cc679559\").remove()\n            }\n        </script>\n\n        <h4>Projection ID: 463f4614-7689-47e4-b55b-1da0cc679559</h4>\n        <div class=\"actions\">\n            <div id=\"hide\" class=\"action\" onclick=\"destroy()\">Hide embedded project</div>\n            <div class=\"action\" id=\"out\">\n                <a href=\"https://atlas.nomic.ai/map/fddc0e07-97c5-477c-827c-96bca44519aa/463f4614-7689-47e4-b55b-1da0cc679559\" target=\"_blank\">Explore on atlas.nomic.ai</a>\n            </div>\n        </div>\n        \n        <iframe class=\"iframe\" id=\"iframe463f4614-7689-47e4-b55b-1da0cc679559\" allow=\"clipboard-read; clipboard-write\" src=\"https://atlas.nomic.ai/map/fddc0e07-97c5-477c-827c-96bca44519aa/463f4614-7689-47e4-b55b-1da0cc679559\">\n        </iframe>\n\n        <style>\n            .iframe {\n                /* vh can be **very** large in vscode ipynb. */\n                height: min(75vh, 66vw);\n                width: 100%;\n            }\n        </style>\n        \n        <style>\n            .actions {\n              display: block;\n            }\n            .action {\n              min-height: 18px;\n              margin: 5px;\n              transition: all 500ms ease-in-out;\n            }\n            .action:hover {\n              cursor: pointer;\n            }\n            #hide:hover::after {\n                content: \" X\";\n            }\n            #out:hover::after {\n                content: \"\";\n            }\n        </style>\n        \n            "
+      "text/html": [
+       "\n",
+       "            <h3>Project: meek-laborer</h3>\n",
+       "            <script>\n",
+       "            destroy = function() {\n",
+       "                document.getElementById(\"iframe463f4614-7689-47e4-b55b-1da0cc679559\").remove()\n",
+       "            }\n",
+       "        </script>\n",
+       "\n",
+       "        <h4>Projection ID: 463f4614-7689-47e4-b55b-1da0cc679559</h4>\n",
+       "        <div class=\"actions\">\n",
+       "            <div id=\"hide\" class=\"action\" onclick=\"destroy()\">Hide embedded project</div>\n",
+       "            <div class=\"action\" id=\"out\">\n",
+       "                <a href=\"https://atlas.nomic.ai/map/fddc0e07-97c5-477c-827c-96bca44519aa/463f4614-7689-47e4-b55b-1da0cc679559\" target=\"_blank\">Explore on atlas.nomic.ai</a>\n",
+       "            </div>\n",
+       "        </div>\n",
+       "        \n",
+       "        <iframe class=\"iframe\" id=\"iframe463f4614-7689-47e4-b55b-1da0cc679559\" allow=\"clipboard-read; clipboard-write\" src=\"https://atlas.nomic.ai/map/fddc0e07-97c5-477c-827c-96bca44519aa/463f4614-7689-47e4-b55b-1da0cc679559\">\n",
+       "        </iframe>\n",
+       "\n",
+       "        <style>\n",
+       "            .iframe {\n",
+       "                /* vh can be **very** large in vscode ipynb. */\n",
+       "                height: min(75vh, 66vw);\n",
+       "                width: 100%;\n",
+       "            }\n",
+       "        </style>\n",
+       "        \n",
+       "        <style>\n",
+       "            .actions {\n",
+       "              display: block;\n",
+       "            }\n",
+       "            .action {\n",
+       "              min-height: 18px;\n",
+       "              margin: 5px;\n",
+       "              transition: all 500ms ease-in-out;\n",
+       "            }\n",
+       "            .action:hover {\n",
+       "              cursor: pointer;\n",
+       "            }\n",
+       "            #hide:hover::after {\n",
+       "                content: \" X\";\n",
+       "            }\n",
+       "            #out:hover::after {\n",
+       "                content: \"\";\n",
+       "            }\n",
+       "        </style>\n",
+       "        \n",
+       "            "
+      ],
+      "text/plain": [
+       "meek-laborer: https://atlas.nomic.ai/map/fddc0e07-97c5-477c-827c-96bca44519aa/463f4614-7689-47e4-b55b-1da0cc679559"
+      ]
     },
     "execution_count": 10,
     "metadata": {},
@@ -110,19 +166,16 @@
   ],
   "source": [
    "map"
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "outputs": [],
-   "source": [],
   "metadata": {
    "collapsed": false
-   }
+   },
+   "outputs": [],
+   "source": []
  }
 ],
 "metadata": {
--- a/examples/Zero-shot_classification_with_embeddings.ipynb
+++ b/examples/Zero-shot_classification_with_embeddings.ipynb
@@ -23,6 +23,7 @@
    "# imports\n",
    "import pandas as pd\n",
    "import numpy as np\n",
+    "from ast import literal_eval\n",
    "\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
@@ -33,7 +34,7 @@
    "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
    "\n",
    "df = pd.read_csv(datafile_path)\n",
-    "df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)\n",
+    "df[\"embedding\"] = df.embedding.apply(literal_eval).apply(np.array)\n",
    "\n",
    "# convert 5-star rating to binary sentiment\n",
    "df = df[df.Score != 3]\n",
@@ -41,6 +42,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -109,6 +111,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -153,6 +156,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -197,6 +201,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [