You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
openai-cookbook/examples/Clustering_for_transaction_...

290 lines
41 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Clustering for Transaction Classification\n",
"\n",
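"In this notebook we cluster transaction embeddings with K-Means, visualize the clusters in 2D with t-SNE, and ask a completion model to describe what the transactions in each cluster have in common.\n",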
"\n",
"To feed the model, we use embeddings created using the approach shown in the [Multiclass classification for transactions Notebook](Multiclass_classification_for_transactions.ipynb), applied to all 359 transactions in the dataset to give us a bigger pool for learning."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"# Mode 2 reloads all imported modules before each cell runs, so edits to local modules such as helpers are picked up\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"\n",
"import openai\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.manifold import TSNE\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from helpers import OPENAI_API_KEY\n",
"\n",
"openai.api_key = OPENAI_API_KEY\n",
"COMPLETIONS_MODEL = \"text-alpha-002-latest\"\n",
"\n",
"# This path leads to a file with embeddings created in the notebook linked above\n",
"embedding_path = 'data/transactions_with_embeddings_359.csv'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Clustering\n",
"\n",
"We'll now try something different - let's see if the model can discern useful categories of its own for us to use to sift through and explore our data. \n",
"\n",
"This clustering approach draws heavily on the [Clustering Notebook](Clustering.ipynb)."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(359, 2048)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"embedding_df = pd.read_csv(embedding_path)\n",
"# Each embedding is read back from the CSV as a string (assumed to be a Python list literal of floats).\n",
"# ast.literal_eval only parses literals, so unlike eval it cannot execute code embedded in the file.\n",
"embedding_df[\"babbage_similarity\"] = embedding_df.babbage_similarity.apply(ast.literal_eval).apply(np.array)\n",
"# Stack the per-row vectors into a single 2D matrix (one row per transaction) for clustering\n",
"matrix = np.vstack(embedding_df.babbage_similarity.values)\n",
"matrix.shape"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Number of groups to look for in the embedding space\n",
"n_clusters = 5\n",
"\n",
"# Fit K-Means on the embedding matrix; random_state is fixed so reruns give the same clusters\n",
"kmeans = KMeans(n_clusters=n_clusters, init=\"k-means++\", random_state=42).fit(matrix)\n",
"\n",
"# Record each transaction's cluster id alongside its row\n",
"labels = kmeans.labels_\n",
"embedding_df[\"Cluster\"] = labels"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5, 1.0, 'Clusters identified visualized in language 2d using t-SNE')"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEICAYAAABS0fM3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAABVRklEQVR4nO29eXycV3X//z6zL1pGm2VJI2/xFtshTmxlIUkT1iwNmNCUL2UNhORLf0ADbb4FSksohbbQlDRQ2hIaylIKBEMIpFlIAgkhcWI5dpzE8W7H1ki2rF0aSbPf3x9nRhrL2ixb+32/Xnpp5nmeeZ77zHI+955z7rlijMFisVgs8xfHdDfAYrFYLNOLFQKLxWKZ51ghsFgslnmOFQKLxWKZ51ghsFgslnmOFQKLxWKZ58xqIRCRL4jIf093O04HEXlYRD44wr4lImJExDVJ146KyLLsY7+I/EpEukTkpyLyXhH59QTPe5OI/P5M2zQZDH1PR3v/z+AaI34PReQKEdk7wfNO+H2dD0zG+yMii7LfSefZPO9MZ8YLgYi8R0S2ZT+cY9kf8uVn8fyTanyHYoy51hjzvcm+jog8KSIfGXLtAmPMoezTG4FKoMwY88fGmB8aY9462e0aypA2TcX1puT9z7ve08aYVVN1vdmMiFwiIo+JSLuItGQ7KFVT2QZjzNHsdzJ9ts8tIt8VkS+NcUxIRL4jIsdFpEdE9onIZ/L2GxF5WUQcedu+JCLfzT7O2bPokL//M9p1Z7QQiMifA/8C/D1qtBYB/wZsmsZmncRUCcgksBjYZ4xJTXdDLJYsJcA9wBL0+9kD/Nd0NmgauAsoAM4FioG3AweGHFMNvHuM84Sygpb7+8moRxtjZuRf9k2IAn88yjFfAP47+/gqIDJk/2vAm7OPLwK2Ad1AM/C17PajgMleKwpcmt3+YWA30AE8CizOO68BPgbsBw4Dkv0AT2TP/zKwboQ2Pwl8JPvYCdwJtAKHsuc0gCvvPbgXOAY0Al8CnNl9NwG/z76+I9uOa7P7vgykgVj2nv41r93Lgb8FEkAyu//m3Pny2rkaeAxoB/YC78rbVwb8MnuvW4G/y3/tkPt9GPj4kG07gXfmtyn7+DrgVdQANAK359/rkHPkv+4PgR3Z9jQAX8g7bsmQ9zT//d+Z97lHs8ddld13CfAs0Jk97qq8cy4Fnsq28zHgX8l+D4e5/6vI+16i38nbgZeALuAngG+E1w79TO7O3l838AJwxZDfwn3A97Pt2gVszNt/YfY96gF+mr3ul870/c3u/wBwBGgD/oaTf3cO4DPAwez++4DScdqAC4GeCX7vTnrfT8MeDPd9+Tvgmex792ugfDz3PuTat6K/twT6XfvVCO1+BXjHKO+JAT6N2p5cG78EfHe49o/3b9oN/ig3fA2QGu2GOD0h2AK8P/u4ALhkpDcOHXEcQFXZBfw18OyQD+MxoBTwA1ejP8wQKgrnAlUjtPlJBg3RR4E9QG32XL8d8iW8H/gWEAQWZL/8/zfvx5sEbkEF5U+BJkCGXmeEH/fAe5d3vt9nHwfRH/yHsvd/ASpWa7L7f4z+oIPAOtRoj/SD/ADwTN7zNahx9Q7TpmNkjRvaO7xwaNtGuJergPNQo/M69If9jlF+2B8Zpp23Zj+LIqAG/WFflz3nW7LPK/K+S18DvMAfoAbidIRgK9qrK0U7Gx8d4bUn3TfwPtQYuoC/AI6TFZHs5xnLttkJ/APwXHafBzVWtwFu4J2oQRqvEIz2/q5BDdvl2evciX4vc7+724DngHD2/foW8KNx2oBP5u5hAt+7k973idqD7PflILAS/a0/CfzjeO59mDZ9N/eej3LP/4mK+IeAFcPsN8AK1N7k7MgZC8FMdg2VAa3m7LkuksByESk3xkSNMc+NcuxHgX8wxuzOXv/vgfUisjjvmH8wxrQbY/qz5y5Ee9GSfd2xcbTpXcC/GGMajDHt6I8XABGpRH/UnzTG9B
pjTqCjjvwh4RFjzLeN+jO/B1ShLrQz5XrgNWPMfxljUsaYHcDPgD/OBtH+CPh8tl2vZK89Evdz8nv3XuDnxpj4MMcmgTUiUmSM6TDGbB9PY40xTxpjXjbGZIwxLwE/Aq4c361CNub0JeDtxphu1OA+ZIx5KHvOx9De43UisgioA/7GGBM3xvwO+NV4r5Xl68aYpuxn/itg/XheZIz5b2NMW/Yz+WfUsObHH36fbXMa+AFwfnb7Jah4fN0YkzTG/BwVo3Exxvt7I9q7/b0xJgF8HjVEOT4KfM4YE8l+5l8AbhzLpSoir8ue6/9ln5/u924sTsce/JcxZl/2t34fg5/XWPc+ET4B/BD4OPCqiBwQkWuHHGPQ0cffiIhnhPO0ikhn3t+5o110JgtBG1B+Fn3wN6OqvkdE6kXk+lGOXQzcnXsTUfeIoD3FHA25B8aY36DugW8CJ0TkHhEpGkebqvPPg/ba8tvgBo7lteNb6Mggx/G8NvRlHxaM47pjsRi4OP+LhBrwhUAFalRGavdJGGN6gP9lUMD+BP2iD8cfoeJ3RESeEpFLx9NYEblYRH6bDTB2ocanfJyvrUV/3B80xuzLbl6Mil7+/V+OCm010GGM6c07zYj3PwLH8x73Mc7PTERuF5Hd2UyvTtR1mH+fQ8/ry/5+qoFGk+0yZsn//Ma67mjv70nf4ez3sC3v5YuB+/Pex92o23LEDouILEddircZY57Obj6t7904OB17MNLnNda9j0o2Uy8XzH04e45+Y8zfG2M2oJ3h+4Cfikhp/muNMQ8BEeD/jnD6cmNMKO9v92htmclCsAWIA+8Y5/G9QCD3JNuDqMg9N8bsN8b8CWpIvwJsFpEgwyt4A+qCyX8j/caYZ/OOOel1xpivZz+8NegX7P+No83HULdQjkVD2hDn5A+0yBizdhznPaV9p0kD8NSQ+y8wxvwp0IK67EZq93D8CPiTrGH3oS6wUxtsTL0xZhP6Gf0C/RHAqZ/twiEv/R/Ud1xrjCkG/gMV7lEREX/2Ov9ijHk4b1cD8IMh9x80xvwj+pmVZL87Oca6/zNGRK4A/hIdRZYYY0JojGHM+0TbXCMi+cfmf35n8v4eQ90+udf6UQOWowGNXeW/lz5jTOMI97kYeBz4O2PMD/J2ne73bqL24HQY696HMtRm/NAMBnOH9vrJjk7/HnWFLR3mfJ8D/oq8+5woM1YIjDFd6FDrmyLyDhEJiIhbRK4Vka8O85J9aA/oD0XEjfr1vbmdIvI+EakwxmRQHzVABv2CZYD8XPb/AD4rImuzry0WkT8eqa0iUpftNbnRL2Ase86xuA/4MxEJi0gJGlTL3f8xNDD1zyJSJCIOETlHRMbr8mgeck+nw4PAShF5f/Y9d2fv8dys2+HnwBeyn8ka4INjnO8htGf4ReAn2c/gJETEk+0hFRtjkmgQL3fcTmCtiKwXER/qXsinEGg3xsRE5CLgPeO8z+8Ae4wxQ79P/w28TUSuFhGniPhE5CoRCRtjjqBuor/Ntvly4G3jvN6ZUIgawhbAJSKfR+MZ42EL2gv/uIi4RGQTGizNcSbv72b0vXp91k3xBU4Wp/8AvpxzDYpIRfb6pyAiNcBv0OSG/8jfN4Hv3UTtwekw1r0PZczfpIj8Tfa35sl+Frdl23fKXBRjzJNocHms39+YzFghAMj6Qf8c/RBb0N7Fx9Fe3NBju4D/Dw22NKIGOZJ3yDXALhGJotkX784Ow/rQLJtnssPXS4wx96O9hB+LSDf6Zp+i2HkUAd9Gs3dyGQT/NI5b/DaakbQT2I5+0fP5ABqEejV77s2oe2I83I36YjtE5OvjfA0w4M55K+rOaUKHxl9h8If0cXR4fBwNgP3XGOeLo/f2ZrR3ORLvB17LvucfRd1RZF02X0R7ivvRbKl8/j/giyLSg3Ye7mN8vBu4QU7Ot77CGNOAJgz8FYPfu//H4O/lPcDFqMvwDjRTZ7J5FHgENXBH0M7GuNw7Wf/1O1F3SC
caA3kQHXGe0ftrjNmF+rV/jPaQo2j2XC4GdDc6mvh19vXPoe/dcHwENZRfyP9M8vaP+3s3UXsw0vlGuMZY9z6Ue9E
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Project the embeddings down to 2 dimensions with t-SNE so the clusters can be plotted\n",
"tsne = TSNE(\n",
"    n_components=2, perplexity=15, random_state=42, init=\"random\", learning_rate=200\n",
")\n",
"vis_dims2 = tsne.fit_transform(matrix)\n",
"\n",
"x = vis_dims2[:, 0]\n",
"y = vis_dims2[:, 1]\n",
"\n",
"# One colour per cluster; colours repeat if n_clusters exceeds the palette size\n",
"cluster_colors = [\"purple\", \"green\", \"red\", \"blue\", \"yellow\"]\n",
"\n",
"# Iterate over the clusters themselves (not the palette) so the plot follows n_clusters\n",
"for category in range(n_clusters):\n",
"    color = cluster_colors[category % len(cluster_colors)]\n",
"    in_cluster = (embedding_df.Cluster == category).values\n",
"    xs = x[in_cluster]\n",
"    ys = y[in_cluster]\n",
"    plt.scatter(xs, ys, color=color, alpha=0.3)\n",
"\n",
"    # Mark the centre of the cluster in the 2D projection with a cross\n",
"    avg_x = xs.mean()\n",
"    avg_y = ys.mean()\n",
"    plt.scatter(avg_x, avg_y, marker=\"x\", color=color, s=100)\n",
"plt.title(\"Clusters identified visualized in language 2d using t-SNE\")\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cluster 0 Theme: Literary & Archival ItemsOne possible way to group these transactions into a meaningful cluster is by their theme or purpose. A common theme that appears in most of these transactions is literary and archival items, which suggests that these are payments for acquiring, preserving, or accessing various collections of books, manuscripts, papers, and other materials related to literature and history. This theme could be relevant for a library, a museum, a research institution, or a collector.Some possible indicators of this theme are:-\n",
"JISC SERVICES LTD SUBSCRIPTION ACCOUNT, Annual Subscription\n",
" Private Sale, Literary & Archival Items\n",
" ALDL, Oct19-Dec19 charge from Agency for Legal Deposit Libraries\n",
" ALDL, Legal Deposit Services\n",
" Private Sale, Literary & Archival Items\n",
" SONYA LEONARD, Literary and personal papers of Tom Leonard 1961 to 2018\n",
" FROST AND SULLIVAN LTD, Literary & Archival Items\n",
" Cengage Learning (Emea )Ltd, Literary & Archival Items\n",
" Agency for the Legal Deposit Libraries, Agency services\n",
" Robert Harland, Correspondance and Literary papers - Thomas Carlyle\n",
" ----------------------------------------------------------------------------------------------------\n",
"Cluster 1 Theme: There are different ways to approach the task of grouping transactions into meaningful clusters, but one possible theme that these transactions have in common is:- They are all payments to the same supplier, ECG Facilities Service, for various services related to facility management and maintenance.- They are all relatively large amounts, ranging from £27,013.16 to £125,000.00, with a mean of £54,798.67 and a standard deviation of £19,894.67.-\n",
"ECG FACILITIES SERVICE, Maintenance contract - all properties\n",
" ECG, This payment covers 16 invoices including upgrade to boiler control panel & remedial works following 5 year test\n",
" ECG FACILITIES SERVICE, This payment covers multiple invoices for facility management fees\n",
" ECG FACILITIES SERVICE, CB Bolier Replacement (1),USP Batteries,Gutter Works & Cleaning of pigeon fouling\n",
" ECG FACILITIES SERVICE, Facilities Management Charge\n",
" ECG FACILITIES SERVICE, Facilities Management Charge\n",
" ECG Facilities Service, Facilities Management Charge\n",
" ECG FACILITIES SERVICE, Maintenance contract - August 21 period\n",
" ECG Facilities Service, Facilities Management Charge\n",
" ECG Facilities Service, Facilities Management Charge\n",
" ----------------------------------------------------------------------------------------------------\n",
"Cluster 2 Theme: There are different ways to approach the task of grouping transactions into meaningful clusters, but one possible theme that these transactions have in common is:- They are all related to building or refurbishment projects at different locations in Edinburgh- They are all paid to either John Graham Construction Ltd or Arthur McKay Building Services, two contractors that provide similar services- They are all relatively large and consistent in value, ranging from around £27,000 to £125,000 per transaction\n",
"John Graham Construction Ltd, Causewayside Refurbishment\n",
" John Graham Construction Ltd, Causewayside Refurbishment\n",
" SJS Property Services, George IV Bridge Work\n",
" John Graham Construction Ltd, Causewayside Refurbishment\n",
" John Graham Construction Ltd, Causewayside Refurbishment\n",
" John Graham Construction Ltd, Causewayside Refurbishment\n",
" John Graham Construction Ltd, Causewayside Refurbishment\n",
" ARTHUR MCKAY BUILDING SERVICES, Causewayside Work\n",
" John Graham Construction Ltd, Causewayside Refurbishment\n",
" John Graham Construction Ltd, Causewayside Refurbishment\n",
" ----------------------------------------------------------------------------------------------------\n",
"Cluster 3 Theme: There are different ways to approach the task of grouping transactions into meaningful clusters, and different criteria or methods may yield different results. However, one possible theme that these transactions have in common is that they are all related to operational costs or capital expenditures for a large organization, such as a university, a hospital, or a government agency. Some possible sub-themes or categories within this theme are:- Electricity and utility services (EDF, EDF Energy)- IT work, equipment, and software\n",
"EDF, Electricity\n",
" m-hance, IT Work\n",
" EDF Energy, This payment covers 5 invoices for utility services\n",
" JISC Services Ltd, Managed router service charge annual subscription 01/04/22 to 31/03/23\n",
" ALDL, ALDL Charges\n",
" COMPUTACENTER UK, IT equipment\n",
"\n",
" BSI, Subscription\n",
" PHOENIX SOFTWARE LTD, IT Hardware plus 5 year licence\n",
" AM Phillip, Vehicle Purchase\n",
" XMA SCOTLAND LTD, Purchase of IT equipment and renewal of maintenance agreement. This payment covers 2 invoices\n",
" ----------------------------------------------------------------------------------------------------\n",
"Cluster 4 Theme: There are different ways to approach the task of clustering transactions, and different criteria to define meaningful clusters, but one possible theme that these transactions have in common is:- They are all related to property or facility costs, such as rent, rates, service charges, or maintenance.- They are all paid to either Glasgow City Council, Glasgow Life, or City of Edinburgh Council, which are public sector organisations that manage or provide various services in Scotland.- They are all relatively large amounts, ranging from\n",
"GLASGOW CITY COUNCIL, Kelvin Hall\n",
" XMA Scotland Ltd, Kelvin Hall\n",
" Creative Video Productions Ltd, Kelvin Hall\n",
" GLASGOW LIFE, Oct 20 to Dec 20 service charge - Kelvin Hall\n",
" CITY OF EDINBURGH COUNCIL, Rates for 33 Salisbury Place\n",
" Glasgow Life, Service Charges\n",
" City Of Edinburgh Council, Non Domestic Rates \n",
" Glasgow Life, Service charges\n",
" Glasgow Life, Kelvin Hall Service Charges\n",
" Glasgow City Council, Kelvin Hall\n",
" ----------------------------------------------------------------------------------------------------\n"
]
}
],
"source": [
"# Ask the completion model what a sample of transactions from each cluster has in common,\n",
"# then list the sampled transactions under the suggested theme.\n",
"transactions_per_cluster = 10\n",
"\n",
"for i in range(n_clusters):\n",
"    print(f\"Cluster {i} Theme:\", end=\" \")\n",
"\n",
"    # Sample once and reuse the rows for both the prompt and the listing below, so the\n",
"    # printed transactions are exactly the ones the model was shown. The sample size is\n",
"    # capped at the cluster size so a small cluster does not raise.\n",
"    cluster_rows = embedding_df[embedding_df.Cluster == i]\n",
"    sample_cluster_rows = cluster_rows.sample(\n",
"        min(transactions_per_cluster, len(cluster_rows)), random_state=42\n",
"    )\n",
"\n",
"    # Strip the field labels from the combined text to keep the prompt compact\n",
"    transactions = \"\\n\".join(\n",
"        sample_cluster_rows.combined.str.replace(\"Supplier: \", \"\")\n",
"        .str.replace(\"Description: \", \": \")\n",
"        .str.replace(\"Value: \", \": \")\n",
"        .values\n",
"    )\n",
"    response = openai.Completion.create(\n",
"        engine=COMPLETIONS_MODEL,  # model name is configured once in the setup cell\n",
"        prompt=f'''We want to group these transactions into meaningful clusters so we can target the areas we are spending the most money. \n",
" What do the following transactions have in common?\\n\\nTransactions:\\n\"\"\"\\n{transactions}\\n\"\"\"\\n\\nTheme:''',\n",
"        temperature=0,\n",
"        max_tokens=100,\n",
"        top_p=1,\n",
"        frequency_penalty=0,\n",
"        presence_penalty=0,\n",
"    )\n",
"    print(response[\"choices\"][0][\"text\"].replace(\"\\n\", \"\"))\n",
"\n",
"    for supplier, description in zip(\n",
"        sample_cluster_rows.Supplier.values, sample_cluster_rows.Description.values\n",
"    ):\n",
"        print(supplier, end=\", \")\n",
"        print(description, end=\"\\n \")\n",
"\n",
"    print(\"-\" * 100)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}