diff --git a/examples/Clustering.ipynb b/examples/Clustering.ipynb index c5960ab8..8849b725 100644 --- a/examples/Clustering.ipynb +++ b/examples/Clustering.ipynb @@ -29,12 +29,11 @@ "import pandas as pd\n", "import numpy as np\n", "\n", - "\n", "datafile_path = \"https://cdn.openai.com/API/examples/data/fine_food_reviews_with_embeddings_1k.csv\" # for your convenience, we precomputed the embeddings\n", "df = pd.read_csv(datafile_path)\n", - "df['babbage_similarity'] = df.babbage_similarity.apply(eval).apply(np.array)\n", + "df[\"babbage_similarity\"] = df.babbage_similarity.apply(eval).apply(np.array)\n", "matrix = np.vstack(df.babbage_similarity.values)\n", - "matrix.shape" + "matrix.shape\n" ] }, { @@ -77,12 +76,12 @@ "\n", "n_clusters = 4\n", "\n", - "kmeans = KMeans(n_clusters = n_clusters,init='k-means++',random_state=42)\n", + "kmeans = KMeans(n_clusters=n_clusters, init=\"k-means++\", random_state=42)\n", "kmeans.fit(matrix)\n", "labels = kmeans.labels_\n", - "df['Cluster'] = labels\n", + "df[\"Cluster\"] = labels\n", "\n", - "df.groupby('Cluster').Score.mean().sort_values()" + "df.groupby(\"Cluster\").Score.mean().sort_values()\n" ] }, { @@ -125,22 +124,24 @@ "import matplotlib\n", "import matplotlib.pyplot as plt\n", "\n", - "tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)\n", + "tsne = TSNE(\n", + " n_components=2, perplexity=15, random_state=42, init=\"random\", learning_rate=200\n", + ")\n", "vis_dims2 = tsne.fit_transform(matrix)\n", "\n", - "x = [x for x,y in vis_dims2]\n", - "y = [y for x,y in vis_dims2]\n", + "x = [x for x, y in vis_dims2]\n", + "y = [y for x, y in vis_dims2]\n", "\n", - "for category, color in enumerate(['purple', 'green', 'red', 'blue']):\n", - " xs = np.array(x)[df.Cluster==category]\n", - " ys = np.array(y)[df.Cluster==category]\n", + "for category, color in enumerate([\"purple\", \"green\", \"red\", \"blue\"]):\n", + " xs = np.array(x)[df.Cluster == category]\n", + " ys = np.array(y)[df.Cluster == category]\n", " plt.scatter(xs, ys, color=color, alpha=0.3)\n", "\n", " avg_x = xs.mean()\n", " avg_y = ys.mean()\n", - " \n", - " plt.scatter(avg_x, avg_y, marker='x', color=color, s=100)\n", - "plt.title(\"Clusters identified visualized in language 2d using t-SNE\")" + "\n", + " plt.scatter(avg_x, avg_y, marker=\"x\", color=color, s=100)\n", + "plt.title(\"Clusters identified visualized in language 2d using t-SNE\")\n" ] }, { @@ -199,26 +200,32 @@ "\n", "for i in range(n_clusters):\n", " print(f\"Cluster {i} Theme:\", end=\" \")\n", - " \n", - " reviews = \"\\n\".join(df[df.Cluster == i].combined.str.replace(\"Title: \", \"\").str.replace(\"\\n\\nContent: \", \": \").sample(rev_per_cluster, random_state=42).values)\n", + "\n", + " reviews = \"\\n\".join(\n", + " df[df.Cluster == i]\n", + " .combined.str.replace(\"Title: \", \"\")\n", + " .str.replace(\"\\n\\nContent: \", \": \")\n", + " .sample(rev_per_cluster, random_state=42)\n", + " .values\n", + " )\n", " response = openai.Completion.create(\n", " engine=\"davinci-instruct-beta-v3\",\n", - " prompt=f\"What do the following customer reviews have in common?\\n\\nCustomer reviews:\\n\\\"\\\"\\\"\\n{reviews}\\n\\\"\\\"\\\"\\n\\nTheme:\",\n", + " prompt=f'What do the following customer reviews have in common?\\n\\nCustomer reviews:\\n\"\"\"\\n{reviews}\\n\"\"\"\\n\\nTheme:',\n", " temperature=0,\n", " max_tokens=64,\n", " top_p=1,\n", " frequency_penalty=0,\n", - " presence_penalty=0\n", + " presence_penalty=0,\n", " )\n", - " print(response[\"choices\"][0][\"text\"].replace('\\n',''))\n", + " print(response[\"choices\"][0][\"text\"].replace(\"\\n\", \"\"))\n", "\n", - " sample_cluster_rows = df[df.Cluster == i].sample(rev_per_cluster, random_state=42) \n", + " sample_cluster_rows = df[df.Cluster == i].sample(rev_per_cluster, random_state=42)\n", " for j in range(rev_per_cluster):\n", " print(sample_cluster_rows.Score.values[j], end=\", \")\n", " print(sample_cluster_rows.Summary.values[j], end=\": \")\n", " print(sample_cluster_rows.Text.str[:70].values[j])\n", - " \n", - " print(\"-\" * 100)" + "\n", + " print(\"-\" * 100)\n" ] }, { @@ -237,11 +244,9 @@ } ], "metadata": { - "interpreter": { - "hash": "be4b5d5b73a21c599de40d6deb1129796d12dc1cc33a738f7bac13269cfcafe8" - }, "kernelspec": { - "display_name": "Python 3.7.3 64-bit ('base': conda)", + "display_name": "Python 3.9.9 ('openai')", + "language": "python", "name": "python3" }, "language_info": { @@ -256,7 +261,12 @@ "pygments_lexer": "ipython3", "version": "3.9.9" }, - "orig_nbformat": 4 + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97" + } + } }, "nbformat": 4, "nbformat_minor": 2