From b01900d5d91203459611355fae723eafcc11336d Mon Sep 17 00:00:00 2001 From: Colin Jarvis Date: Thu, 20 Oct 2022 23:31:42 +0100 Subject: [PATCH] Initial commit of transaction classification notebooks --- .gitignore | 6 + ...ering_for_transaction_classification.ipynb | 289 +++ ...lass_classification_for_transactions.ipynb | 2201 +++++++++++++++++ examples/data/25000_spend_dataset_current.csv | 362 +++ examples/data/labelled_transactions.csv | 102 + 5 files changed, 2960 insertions(+) create mode 100644 examples/Clustering_for_transaction_classification.ipynb create mode 100644 examples/Multiclass_classification_for_transactions.ipynb create mode 100644 examples/data/25000_spend_dataset_current.csv create mode 100644 examples/data/labelled_transactions.csv diff --git a/.gitignore b/.gitignore index b6e4761..d4fa226 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,9 @@ dmypy.json # Pyre type checker .pyre/ + +# helpers +*helpers.py +*transactions*.jsonl +/examples/data/transactions* +*.DS_Store diff --git a/examples/Clustering_for_transaction_classification.ipynb b/examples/Clustering_for_transaction_classification.ipynb new file mode 100644 index 0000000..5dbaf27 --- /dev/null +++ b/examples/Clustering_for_transaction_classification.ipynb @@ -0,0 +1,289 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Clustering for Transaction Classification\n", + "\n", + "In this notebook we cluster transaction embeddings with K-Means to see whether meaningful spending categories emerge without any labels.\n", + "\n", + "To feed the model we use embeddings created using the approach displayed in the notebook [Multiclass classification for transactions Notebook](Multiclass_classification_for_transactions.ipynb), applied to the full 359 transactions in the dataset to give us a bigger pool for learning." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + 
"cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import openai\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.manifold import TSNE\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from helpers import OPENAI_API_KEY  # local helpers.py (git-ignored) must define the key\n", + "\n", + "openai.api_key = OPENAI_API_KEY\n", + "COMPLETIONS_MODEL = \"text-alpha-002-latest\"  # NOTE(review): confirm this model name is available to your API key\n", + "\n", + "# This path leads to a file with embeddings created in the notebook linked above\n", + "embedding_path = 'data/transactions_with_embeddings_359.csv'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clustering\n", + "\n", + "We'll now try something different - let's see if the model can discern useful categories of its own for us to use to sift through and explore our data. \n", + "\n", + "This clustering approach draws heavily on the [Clustering Notebook](Clustering.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(359, 2048)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embedding_df = pd.read_csv(embedding_path)\n", + "embedding_df[\"babbage_similarity\"] = embedding_df.babbage_similarity.apply(lambda s: np.array([float(v) for v in s.strip(\"[]\").split(\",\")]))  # assumes each cell is the text of a flat list \"[v1, v2, ...]\"; parsed numerically instead of eval-ing file contents\n", + "matrix = np.vstack(embedding_df.babbage_similarity.values)  # stack the per-row vectors into one 2-D array\n", + "matrix.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "n_clusters = 5\n", + "\n", + "kmeans = KMeans(n_clusters=n_clusters, init=\"k-means++\", n_init=10, random_state=42)  # n_init pinned so results do not shift with the scikit-learn default\n", + "kmeans.fit(matrix)\n", + "labels = kmeans.labels_  # cluster id assigned to each row of matrix\n", + "embedding_df[\"Cluster\"] = labels" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Clusters identified visualized in language 2d using t-SNE')" 
+ ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEICAYAAABS0fM3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAABVRklEQVR4nO29eXycV3X//z6zL1pGm2VJI2/xFtshTmxlIUkT1iwNmNCUL2UNhORLf0ADbb4FSksohbbQlDRQ2hIaylIKBEMIpFlIAgkhcWI5dpzE8W7H1ki2rF0aSbPf3x9nRhrL2ixb+32/Xnpp5nmeeZ77zHI+955z7rlijMFisVgs8xfHdDfAYrFYLNOLFQKLxWKZ51ghsFgslnmOFQKLxWKZ51ghsFgslnmOFQKLxWKZ58xqIRCRL4jIf093O04HEXlYRD44wr4lImJExDVJ146KyLLsY7+I/EpEukTkpyLyXhH59QTPe5OI/P5M2zQZDH1PR3v/z+AaI34PReQKEdk7wfNO+H2dD0zG+yMii7LfSefZPO9MZ8YLgYi8R0S2ZT+cY9kf8uVn8fyTanyHYoy51hjzvcm+jog8KSIfGXLtAmPMoezTG4FKoMwY88fGmB8aY9462e0aypA2TcX1puT9z7ve08aYVVN1vdmMiFwiIo+JSLuItGQ7KFVT2QZjzNHsdzJ9ts8tIt8VkS+NcUxIRL4jIsdFpEdE9onIZ/L2GxF5WUQcedu+JCLfzT7O2bPokL//M9p1Z7QQiMifA/8C/D1qtBYB/wZsmsZmncRUCcgksBjYZ4xJTXdDLJYsJcA9wBL0+9kD/Nd0NmgauAsoAM4FioG3AweGHFMNvHuM84Sygpb7+8moRxtjZuRf9k2IAn88yjFfAP47+/gqIDJk/2vAm7OPLwK2Ad1AM/C17PajgMleKwpcmt3+YWA30AE8CizOO68BPgbsBw4Dkv0AT2TP/zKwboQ2Pwl8JPvYCdwJtAKHsuc0gCvvPbgXOAY0Al8CnNl9NwG/z76+I9uOa7P7vgykgVj2nv41r93Lgb8FEkAyu//m3Pny2rkaeAxoB/YC78rbVwb8MnuvW4G/y3/tkPt9GPj4kG07gXfmtyn7+DrgVdQANAK359/rkHPkv+4PgR3Z9jQAX8g7bsmQ9zT//d+Z97lHs8ddld13CfAs0Jk97qq8cy4Fnsq28zHgX8l+D4e5/6vI+16i38nbgZeALuAngG+E1w79TO7O3l838AJwxZDfwn3A97Pt2gVszNt/YfY96gF+mr3ul870/c3u/wBwBGgD/oaTf3cO4DPAwez++4DScdqAC4GeCX7vTnrfT8MeDPd9+Tvgmex792ugfDz3PuTat6K/twT6XfvVCO1+BXjHKO+JAT6N2p5cG78EfHe49o/3b9oN/ig3fA2QGu2GOD0h2AK8P/u4ALhkpDcOHXEcQFXZBfw18OyQD+MxoBTwA1ejP8wQKgrnAlUjtPlJBg3RR4E9QG32XL8d8iW8H/gWEAQWZL/8/zfvx5sEbkEF5U+BJkCGXmeEH/fAe5d3vt9nHwfRH/yHsvd/ASpWa7L7f4z+oIPAOtRoj/SD/ADwTN7zNahx9Q7TpmNkjRvaO7xwaNtGuJergPNQo/M69If9jlF+2B8Zpp23Zj+LIqAG/WFflz3nW7LPK/K+S18DvMAfoAbidIRgK9qrK0U7Gx8d4bUn3TfwPtQYuoC/AI6TFZHs5xnLttkJ/APwXHafBzVWtwFu4J2oQRqvEIz2/q5BDdvl2evciX4vc7+724DngHD2/foW8KNx2oBP5u5hAt+7k973idqD7PflILAS/a0/CfzjeO59mDZ9N/eej3LP/4mK+IeAFcPsN8AK1N7k7MgZC8FMdg
2VAa3m7LkuksByESk3xkSNMc+NcuxHgX8wxuzOXv/vgfUisjjvmH8wxrQbY/qz5y5Ee9GSfd2xcbTpXcC/GGMajDHt6I8XABGpRH/UnzTG9BpjTqCjjvwh4RFjzLeN+jO/B1ShLrQz5XrgNWPMfxljUsaYHcDPgD/OBtH+CPh8tl2vZK89Evdz8nv3XuDnxpj4MMcmgTUiUmSM6TDGbB9PY40xTxpjXjbGZIwxLwE/Aq4c361CNub0JeDtxphu1OA+ZIx5KHvOx9De43UisgioA/7GGBM3xvwO+NV4r5Xl68aYpuxn/itg/XheZIz5b2NMW/Yz+WfUsObHH36fbXMa+AFwfnb7Jah4fN0YkzTG/BwVo3Exxvt7I9q7/b0xJgF8HjVEOT4KfM4YE8l+5l8AbhzLpSoir8ue6/9ln5/u924sTsce/JcxZl/2t34fg5/XWPc+ET4B/BD4OPCqiBwQkWuHHGPQ0cffiIhnhPO0ikhn3t+5o110JgtBG1B+Fn3wN6OqvkdE6kXk+lGOXQzcnXsTUfeIoD3FHA25B8aY36DugW8CJ0TkHhEpGkebqvPPg/ba8tvgBo7lteNb6Mggx/G8NvRlHxaM47pjsRi4OP+LhBrwhUAFalRGavdJGGN6gP9lUMD+BP2iD8cfoeJ3RESeEpFLx9NYEblYRH6bDTB2ocanfJyvrUV/3B80xuzLbl6Mil7+/V+OCm010GGM6c07zYj3PwLH8x73Mc7PTERuF5Hd2UyvTtR1mH+fQ8/ry/5+qoFGk+0yZsn//Ma67mjv70nf4ez3sC3v5YuB+/Pex92o23LEDouILEddircZY57Obj6t7904OB17MNLnNda9j0o2Uy8XzH04e45+Y8zfG2M2oJ3h+4Cfikhp/muNMQ8BEeD/jnD6cmNMKO9v92htmclCsAWIA+8Y5/G9QCD3JNuDqMg9N8bsN8b8CWpIvwJsFpEgwyt4A+qCyX8j/caYZ/OOOel1xpivZz+8NegX7P+No83HULdQjkVD2hDn5A+0yBizdhznPaV9p0kD8NSQ+y8wxvwp0IK67EZq93D8CPiTrGH3oS6wUxtsTL0xZhP6Gf0C/RHAqZ/twiEv/R/Ud1xrjCkG/gMV7lEREX/2Ov9ijHk4b1cD8IMh9x80xvwj+pmVZL87Oca6/zNGRK4A/hIdRZYYY0JojGHM+0TbXCMi+cfmf35n8v4eQ90+udf6UQOWowGNXeW/lz5jTOMI97kYeBz4O2PMD/J2ne73bqL24HQY696HMtRm/NAMBnOH9vrJjk7/HnWFLR3mfJ8D/oq8+5woM1YIjDFd6FDrmyLyDhEJiIhbRK4Vka8O85J9aA/oD0XEjfr1vbmdIvI+EakwxmRQHzVABv2CZYD8XPb/AD4rImuzry0WkT8eqa0iUpftNbnRL2Ase86xuA/4MxEJi0gJGlTL3f8xNDD1zyJSJCIOETlHRMbr8mgeck+nw4PAShF5f/Y9d2fv8dys2+HnwBeyn8ka4INjnO8htGf4ReAn2c/gJETEk+0hFRtjkmgQL3fcTmCtiKwXER/qXsinEGg3xsRE5CLgPeO8z+8Ae4wxQ79P/w28TUSuFhGniPhE5CoRCRtjjqBuor/Ntvly4G3jvN6ZUIgawhbAJSKfR+MZ42EL2gv/uIi4RGQTGizNcSbv72b0vXp91k3xBU4Wp/8AvpxzDYpIRfb6pyAiNcBv0OSG/8jfN4Hv3UTtwekw1r0PZczfpIj8Tfa35sl+Frdl23fKXBRjzJNocHms39+YzFghAMj6Qf8c/RBb0N7Fx9Fe3NBju4D/Dw22NKIGOZJ3yDXALhGJotkX784Ow/rQLJtnssPXS4wx96O9hB+LSDf6Zp+i2HkUAd9Gs3dyGQT/NI5b/DaakbQT2I5+0fP5ABqEejV77s2oe2I83I36YjtE5OvjfA0w4M55K+rOaUKHxl9h8If0cXR4fBwNgP3XGOeLo/f2ZrR3ORLvB17LvucfRd
1RZF02X0R7ivvRbKl8/j/giyLSg3Ye7mN8vBu4QU7Ot77CGNOAJgz8FYPfu//H4O/lPcDFqMvwDjRTZ7J5FHgENXBH0M7GuNw7Wf/1O1F3SCcaA3kQHXGe0ftrjNmF+rV/jPaQo2j2XC4GdDc6mvh19vXPoe/dcHwENZRfyP9M8vaP+3s3UXsw0vlGuMZY9z6Ue9EYWKeI/GKk06L31Yr+9t4C/KExJjrC8X+NJh0MpXPI9/rPR7uXXIaJxWKZR4jI88B/GGNGFfEJnLcAFZsVxpjDZ/PcM53ZfO8zekRgsVjODiJypYgszLqGPoimgT5yls79tqy7JoimUL6MpmrOeebKvVshsFjmB6tQF2QnOgfhRjO+FOfxsAl1YzShOe7vNvPH1TAn7n3SXUPZaP02NH3tehFZivrUytBJEe/P+jAtFovFMg1MxYjgNjR3OMdXgLuMMcvRAOjNU9AGi8VisYzApI4IRCSMzv77Mpr98zY0C2OhMSaVzSv/gjHm6tHOU15ebpYsWTJp7bRYLJa5yAsvvNBqjKkY67jJrpz5L+gkmMLs8zKg0wyWjYhw8mzdYVmyZAnbtm2blAZaLBbLXEVExjX7etJcQ6JTtk8YY16Y4OtvFV2HYFtLS8tZbp3FYrFYckxmjOAy4O0i8hoaHH4jOnEjJIP1g8LoZI9TMMbcY4zZaIzZWFEx5sjGYrFYLBNk0oTAGPNZY0zYGLMEncH5G2PMe9E6MzdmD/sg8MBktcFisVgsYzMd8wg+Dfy5iBxAYwb3TkMbLBaLxZJlqtbpfRJd0AGj69NeNNrxFovFYpk6Zut6uxbLHCUC1KNZ1hXoGjjhUV9hsZwpVggslhlDBA2ZhdB1W46g67OcgxaYFLRSshUIy9nFCoFlnjITe971qAg0o0siHEDXJGlHqy8bdInkPlQwNjH9bbbMBawQWOYwOWO/F61m4kB71A50yYh16Ho5UabXsNaja+Q8CPSgy+n60VGBE10e4E3oSqEHgUvzXmeFwHLmWCGwzDFyxn8fajRr0MKQrcCLqBB0oT3tZ4Bz0ZVFFzJoWA2jLzQ11v7TGW3Uo2vCNKAi0J89fybb7hp06epDwBIGF9MqQEcOFsuZY4XAMoeIoKWtWoA9qEHdgxrh3egiVUl0ca9e1PAmUP97G+py+QJqbO9ieGNvgE+ho4orgK3Za4AKyhWobz+E9ujHGm1szrYpii50l1vcKoOuTNmKikFHtt3F2f1R8pbgtVjOCCsEljlEbiXHVPZ/AjXuzaiBTWe35VxEoP73RtRF1J7dfzdqaK9DDXGuV1+DisDdwFWoUX4a8KEunCbgH4CrGVxXPbes8EhunEZUcBLoKKUr+zh/1FGYbX8rGiPoRsVqvMtXWyyjY4XAMofYhvauX0SNaAA1oEfQuYv96BLQKVQM0qjbpQE10mVojcQoOs8xgi6d3Isuk/1idvsfopPl70cNfc5Qx9Cf1IvABXntGs2NU4MaeA8qACF0dOLOntuVbc/b0eV809n7uhIbH7CcLawQWOYQgo4E/Kjxbcs+7kVHBpnsMQ7Am33uQXvza1ED+wv0Z7EeXS/eg66p/oPs89vQuEJh9vw594wne51ydG31fEZz49wIvMqgWLhQEXBl2/cH6Prodaf3Vlgsp4EVAssc4kLUWJejhrkIdfcsQI1xCO21e9HedwlQhYpAD52d3ezf78fvd+DxXMWSJSk8nl8Bv8qe/41o7OAX2fOVoca/MHu+AOomCqDum4LscZ2M7MapAz6PZg1tQV1Di9HMoA9jBcAyFVghsMwhrkWDxd0Mun2WArWosV4M/Ax1EQWzfwXAJbS3G7ZtC+DxFFFUFCKVirNr11u44IJX8s7/MXREUYcGgM8Hfo2KgJNBQbgNdVE1oyOBsdw4dViDb5lOrBBY5hBh4DNowNaFjgwCqHH+EGps38twqZ2vvXYPHk8BgQD09a2gpOQ5Kiu3Djn/Q8AN2etsyp6nn8GsoWrU1WONum
V2YYXAMse4Hk3b3Ixm5Aw1zmGG6523tFRQVhYlkykimSwjEDhEWdmLtLWto6zsb1ERuBcdQdyVd54bJvuGLJZJxwqBZQ5y+q6WTKaOTOYBXC5DdfVPWLDgcY4fv4odO77PtdfWoga/AE0dhZHnGVgssw8rBJZ5xnCzfqGurp69e7tZtuy/KC9/gUjkD3n88X/nzW+uzb5OUOMPVgwscw0rBJZ5QD3qKtqHZvFcgebvPwV8FwhSXn4JsIZ4vJ7GxkJefllFIHySFylfDEJYEbDMFawQWOY49cCdaKqnA52x+yCaSbQEzdVvB16lvLwQDSp3UVOzLXvMUHJiYEXAMnewQmCZ42xGRaAMze6JAcfQyWBhNM3UQVNTkN279/Pii+WUlRVy5ZXNLF060jmtCFjmFtOxZrHFMoU0om6cLrRwWxydbBZHy1MnaW938eyzPpzOLioqwJgomzdXUF8/bY22WKYUKwSWOU4NOrP3GINlHtLoDGAB4hw+XERFRQciRfj93ZSXd9LeXsfmzdPTYotlqrFCYJnj3Ii6gU6gJSdyhdwKs/8X88orF+HzpYjFQiSTAXbv3oRImMbG6Wu1xTKV2BiBZY5TB9wO/D06KlgIvA2tM7QTjRnU8otfvBOXazBFqLMTamqmvLGznkgE6uuhpQUqKqCujiGZV5aZyKQKgYjUAt9Hp3oa4B5jzN0iUgr8BE3beA14lzGmYzLbYpnP1AHfYHBh+FwxuBXAJtasCfPQQ1BWBqGQikBbG3zoQ9PV3tlJJAIPPKDvYWUlRKP6fNMmFQMrEjMXMcZM3slFqoAqY8x2ESkEXgDeAdwEtBtj/lFEPgOUGGM+PdJ5Nm7caLZt2zZp7bScXWbuD37kJSTr62HzZmhs1JHAjTdquy3j5ytfgV/+Eo4e1ecLFoDPBy4XrFkDRUXwutdBQQEcOQKvvALLlsGqVYPv9cz83sxeROQFY8zGMY+bTCE45WIiDwD/mv27yhhzLCsWTxpjVo30OisEs4f8XmFBgfYKX3tNjYIxp/7AZ65oWE6H+np43/t0NOX1Qk+P/vl8sGKFjhA6O+Fd74KSEvjNb1QwurqgtFRFYsUKWL9+8HvT2Tk4mrBMjPEKwZTFCERkCbps0/NApTHmWHbXcdR1NPT4W4FbARYtWjR0t2VGcGoPu74+TCikP2yAPXvg5z+HdBqqqiCRgGQSNmyAP/xD7RkO70o4nQXgLdPN5s0Qj2vvP/cZu936uTc1qfH3+eC++8Dvh2PZX7/XC4WFsHcvNDfryMHhGPz+1NdbIZgKpkQIRKQALQT/SWNMt8jghBxjjBGRU4Ylxph7gHtARwRT0U7L6RBh0Oc+uEh7b+8mCgv1l7tvH/zoR2oUYjH98Xs86go4fBg+8xkVAZdL/fOvfz0sXAgvvxwhHD713CMvAG+ZDvJHc1u2qAFPJvV/JgNOpz4X0Z5/IgGplApGTiwKClQg3G7tCOzfD+Xlev6CAhUHy+Qz6UIgIm5UBH5ojPl5dnOziFTluYZOTHY7LGebetRQ5xZn1//nnFPPoUNhiorg8cf1xx0M6o/e5xvsIQYCcPCgGo3KShWJnTtVDDZtGv7cIy8Ab5lqhgaGi4rUwDscKgA5MfB6obhY96VS2vtPpfTP4VB3Iehx8bgKRo5oVN2FlslnUucRiHb97wV2G2O+lrfrl8AHs48/iHb3LLOKFjT7Jp8CVq9uobMTurvV7eN26+Ncb9Dh0Iyc115TgwG6v6VFn7/0EvT1tdDaeuq59ZqWmUB9PQMuQIcD3vAGfd7fryO84mIVfbeb7Gxt7eGvXav7QJ+DCoDXqx0Gt1sFpLtbYwQ2YD81TPaEssuA9wNvFJEXs3/XAf8IvEVE9gNvzj63zCoqUJdNPlFKSirYtEl7/Mmk/nm9g/7ivj51Ezmdui13TCajAtHbCyUlFRw+fOq5R14A3jLVtLQMGnKAlSvhpptg6VI17Mao0V+9WjsBtbXw9rfDH/0RfPCDGi
MyRvdlMuoS/IM/0Nc0N+v3xwaKp45JdQ0ZY37PyBW63jSZ17ZMNrl1e2HoIu3hsP6Af/5z9R37/Wr4T2QdgE6n/vX3D+5Lp1UEzjsP3O464vHhz22ZGVRUqOsmF9QFqK6GO+7QXvzQTDBQV1J3t2YJveENg599MAgbN8I111jDP13YmcWWCZK/bu/wi7SvXasphI2N2vsLBrUH2NU1KAZOp/YIUykNKhoDe/eGWbFi9HNPJnZOwdjU1alhh5PTPa+8koGOwFA2bdL3trlZRwjvfKc1/DMFKwSWM2D49X9zrFqlQ/zjx9X4FxerCyAW0zRCl0sDxydOqCCce64KwTPPwKWXjn7uyaK+Hu68U7OYFi1S43bnnXD77VYM8gmHTzbsFRWDIjDaa6zhn5lYIbBMGnV1aujXrj251/ie9+j+3/1Ot5WUaMpgLn/8sss0K2U6DO/mzSoCZWX6PPd/82YrBEOxhn3uYIVgGpnuWbWRCDzyCGzbpm6ZCy+Ea689e20Yrdf4nveoq+iFF3SUEI9rjODii9WH3NwMDz4I996rYlJdDTffDNdff3baNhKNjToSyCcUGiybYLHMRawQTAORCHzzm+pjTae13sqll6rBm6pMiUgEvvc92LFDjXB/v84C3rcPbrttctqwfz/8+tcaN6ipgSuuUAPb3Ky56OvW6cigu1snnH3rWzpaqK3VkcQdd6jr6G1vG/kaxqioTZSaGr1WbiQAthKpZe5jhWCKiUTg7rtVBHy+wQJcbW3a252qKfX19XDoEHR06CSf0lI10Nu2wcMPwy23nPmIJX/SUXe3ZhF5PFp4rKcHfvADeP/7B8tMFBQM5o/v2KEikO+iOXQIPvlJfZ+GM/bGwKc+pef6whcm9r7ceKPGBMBWIp1LRLoi1DfV09LbQkWwgrpq9fPlbwsXhon0RE46Jlw8P3xfVgimmPp62L1bc+iLitSgORzaK3/xRXWTTAUtLeoGKSjQPH9QQUgkYPv2sUsKj4f8SUc//7mKjcejI4DVq/WYp5+GT3wC/ud/4IkndGSydi00NGjwOIcxGlA+cECN/V13nSwGORG4+24d0Ux0ZFBXp4HhzZt1tFJToyJg4wOzi3zDLyLsb91PwiRIpBJ4XB5+feDXdMW78Ll8VBZUsrd1L881Pke4KMzSkqUc6znGw/sf5pySc1hZvnLOi4IVgimmpUV7w0VFmjLpdmv2TDI56EefCioq1OjnRAD0eTCoRjTfiMP4ioANHUHs2zdo8NvadJuIxgJg0Pd+7JjGCs45Z7AX3tmpI4Vly/RYEfXdZzJq7GFQDIaKwFCROF3q6qzhn61EuiI8cuARnjj8BJUFldQU1PDIwUc41HGI5SXLCReG2d+2n53NOykPlHNR9UVsa9zGyy0vU+wtpthTTHtfO6+2vMqykmV0xjvpS/bxwN4H2LRq05wVAysEU0xFhfa8o1EVBBicYVlaOnUGqK5ORx9Hjw66X3IZPBs3qjGvHFITdrQiYMONIA4e1ElDS5fq/4MH1ZD7/ZpOmkppj3u4TJ3XvU5HJsXFJ4vDv/yLjhzyxeBsioBl9hLpivC9F7/HlsgW+pP9tPa18uzRZ+mOdxNwBdjXto9XWl7BgYNEKkFjTyNPvPYEhe5C3OImnorzcsvLHOk+gsfpoaGrgRN9J+iOd+N2unn4wMPcsuGW6b7NScEKwRRTVwfPPqsukYICLbnQ0aHGbrKCtMMRDg+6WJqbdWSyZIn2wK+5Jpvpc6SX1PF2Yl0xfMU+XAtLEX+Q++8/NW4w3Ahi3TpdfATU+Hd3q2uookLdYMXF8PnPqxEfmqlzwQU6cigsVDdRdfVgfCCXOXT33YOCYEXA8siBR9jXvo94Oo7P5WNf+z56Ej3EU3H8Lj8pk8ItbtpibRhjEISkO0lDTwMucZHMJCn1l9IT78EhDg50HGBp8VJWlqxEEH57+Ldcu/zaOTkqsEIwxYTDarTKy+G3v9XRQF3d9Pih6+rgq18dPi
DcfaybLT9uJ1QmFIf8dHamOfpyO+VrHAQC/lPiBi0t8Nxzmo7a06MG/JprtAbNnj3amy8v11hINKq9/iuu0OuNlKlz6aW66hUMup3uuUfb+Rd/MSgCYEXAAtuObaPMX0ZrbyuRnggpk6LQXUh3vJtYKqaL5QLpTBqHw0Eqk6K9rx2XuEhLmrRJ0xHrIGMyeBwe3OLG4XCwv2M/tUW1LChYQH1TvRUCy9khHIZPf1r/ppuRJgVJpJE3X2bYd7yYti4npSGhMp4mGG+nqEhzKfPjBs8+Cz/9qWZCFRXp7OGf/EQXn3E6deUpv1+39/aqAKTT+vqxMnWGup16euADHzi5vcMFkC2zlwf3PMi9O+6lqaeJ6sJqbr7gZq5fPfokEjGCwYCAMQa3w00sGcNkDG6nm2Q6SSqTwoiOBpwOp44KTBJjDOX+cvqT/aRMilg6RrgwjFO0RO6RziO8a927aOmdmxVwrRDMIc7mBLXell4WLS5g8dLBKqA/frgIR2LweWurBoQbGnQk4HRq8FlE/6dSuiTh+943mB0VCOhrt2zREUCuh//+96u7bLhMnXy3kzFanuLJJ1VkfvWrwRgBWDGYCzy450HueOoOSnwl1BbX0hnr5I6n7gAYVQwurL6QZxqeIZFOsKhoEZGeCM3RZvxuP1UFVTRGG8kkM7jFjdftJZaKqSAYJ16nl6A7SDwdp9xXTpG3iL5kHwZD0B0k6Anic/kIuANT9TZMKVYI5giRCHz96/Dqq4OumS1b4M/+bGJiEKwIkogmSCfStO/XOEH6aBX9C0OAisDzz6txD4e1p+92q6FOp1UUyso0/nDeeWrMYTBQvGOHliOurBxcyPycc+CSS04VsFzg2hj4z/9U45+LFYio8QcrBnOFe3fcS4mvhLKA+gpz/+/dce+oQnDt8mtpjjbT2tdKf6qfRaFFYKA0UEpjTyPl/nJSnhS9yV6SJkllsBJB6Ix1UuwrptBXiM/lI+gJEvKFONRxiI1VGwl6ghgMnbFOrlw8NyvgWiGYI/zP/8BTT2nm0YIF6od/6in1y//lX57++QrDhTx/1/N0vNaBO+BGRAg0tfPy7mVEm6O0u6tIFxTiCfhYtUqFJ7fOQM5l1NWl230+LR2xf78Gxo8fV1fR0qUqKK++qim0nZ0aPB86X6GiQsXtvvtUBN72Nl0EPRjU/VYM5hZNPU3UFteetC3kC9HQ1TDq68LFYW5afxOVBZX89vBvWVCwgEVFi8iYDDWFNTgdTrY2bcXn1p590BOkta+VFWUrqAhWUFVQxd7WvcTSMcQhXL7ocgKeAM3RZt6w9A1zNlAMVghmNF2RLprqm+ht6SVYEaS6rpricPGwxz7xhKZ+5mftGKPbRxKCkc7fFemiaWsTLr8Ld8BNd6SbjMlQVhpgY0kTrzWUc9S1kIWlbZx3dQnl5QGuuUZjBD09aqCjUTXqn/iEGvhQSMUgGlUxeP3rtQ379+vxPp8eN9x8hVzJY5dLRwHvepeKzFVXDd5LvhiEQlYEZjPVhdV0xjoHRgIAnbFOqgurx3xtuDjMLRtu4drl11LfVM++1n0c7DhIXU0di0OLWRpaymOHHqOmqIZlJcvwu/zsad1Db7KXgCfAB9Z/gN5EL6+0vMKy0DJWla+a85PJwArBuIjUR9i9eTfdjd0U1RRx7o3nEq6b3C9GpD7C9m9vx6QNwYogqViKnqYeVm1aNawY9PerAcwnldL6RTk/fL7LpSvSxd4H9uIL+SioLCARTbD3gb2s2rSKpvomfCEfLp8LX8iHp8BDtCmKSRtqag2l7UcpLS7FXRlC2tqAAO98p44IfvN4iuONhiJ/ilve1sFf3Bqkh+KTCs+94Q1q+EENekmJtj9/CcP8+Qq54nXV1VqyOhhUERjq8sqJgRWB2c3NF9w8EBMI+UJ0xjrpiHXwyYs/Oe5zhIvDA8Y7N8u4OdrM6xa+jutWXHdSKYkPX/BhYLDcRG1xLe88951z3vjnY4VgDCL1Eb
bcuQV/mZ/iRcXEOmNsuXMLl95+6aSJQVekix3f3oHD5cC/wE+qP0XLqy1UrKmgqb5pWCFYu1b97Ol4gnRPPx2dhqY2H8uWO3B2d/PYT1r45lEvi6tSXHp9KeeUdFAW8tGRCPLy8z7au5wUuvvpffgEhaaXgsoCvMVe2va14SvxkUlnAMgkMvhDfpYEWjiYrqT1eC81Ge3pb1zXz3vW7mHxUieeAk9WXCKs2rSKG24YbHMuCwjUddTerhPN1q3TbcMtWj7eksdWBGY/uTjAvTvupaGrgerCaj558SfHzBoaiXxRyFHHqbna88nwD8UKwRjs3rwbf5mfQJlmC+T+7968+4yEIFIf4cX/epGWV1pwBpwse8MyznvveRSHi2mqbyKdSlNYUYiI4A64AYg2RXH5hv/IPvxh+PxnE7Qd6cXpcdDV7yXgS7HKE+H5H7RzNFWNv8BBd0+KnT8/wM7iYq54cyG7DgYoCKYpK0nR1+vlkd86uOYNxXii/ZSuKKXphSYSPQnEKWSSGZKxJMWLiikrF6oWtrO3sXCgp3/hgghlASfeIq1bkfs/VLzyy1OHQioE69ZpfCNXdO7KuRmTOysMVz68qkqzrubKqmrXr75+wobfcvpYIRiD7sZuihed3AP3hXx0He2a8Dkj9RGe+uJTxDvi+Ep9ZBIZ9ty/h97WXi657RJ6W7RHnoqlBkTA5XcRPRal5uLh6yHX1cFNb4nwyNMB2qJ+xJXm9ef3kn7lODvbK+n1B0jHBPBTvaiXZGs7Dz91DmtWpSkIaG/f64hTtsBDk5QT7NyFL+Rj2ZuXceChA2RSGbyFXtxBN52vdRLviuNr7eOGWy4gnDU42+7pwpO/ojmoW6l56EL0J/fwc2mv413parKZjnUQxkt++fBEQudkbN2qYrpokabn5gLwn//87BYDy9RhhWAMimqKiHXGBkYCALHOGEU1RaO8anR2b96NSRgC5QGcXif4AYG23W001TcNxARaXtXJKy6fi/6OfsQpVNeNHDA7p6SdT9+SQBxdPLalgP64sLvLweGuMlxpD6m0A5cjwysnKjnHfYRjzQ7OX9kPGTfJWJJkb5Jw3QKi6eBgrKDYx3nvOQ8E2g+207StiUBpgJKlJRRUF9C0tYnCqkKKw8UDKae5kQBAIpogWBEc9f2YSStdPfigrnswdB0EmBliMLR8eFmZzuXo7tYYS2WlCkR7O3znOzpSmM7Fjyyzg2kTAhG5BrgbcAL/aYz5x+lqy2jUXlHLM//wjPrry/24A24yiQzrP7R+zKyekfZ3N3ZjHAaHxzFwrLvATd+JPnpbellx3Qp6mnqoWFNB9FiUnmM9OF1OVr59JQceOcCxbccwYqi+sJrl1y4fuGa+IT5vRYzfPF/Awb4qYmkn3rQDJ4YCX5LWTheukhrOfX0hiUySTGcfvmIfC9YtIO0JUpHVvL7WPo7tOIYYoWpjFaXnlFK5rvIkQx/vjg+4fqrrqtn7wF6AgRhBrDPG4isXT8EndXa4995T10HIbZ8JQjBc+fD+fp3D0dMzOJkvFNKifWdaSnymMNx6AvPZp3+2mRYhEBEn8E3gLUAEqBeRXxpjXp2O9oxEV6SL7iPdnHP1ORx/8TjR41E8AQ8X3XYRhVWFI2bd5FIwR9pfVFNEf0s/mURGRwRAMprEW+hFRGiqbyIejRNrj+Er8VFzUQ2F4UIO/vog7fva8Zf5wUDDMw1Em6Osv2n9KYa4stRw2Zo2HnykgqAnSTLtoKgwRcCRoCctdLgqeO+H/GzdWjOwKExuTeELVnbz4ndfpONAB74SHwgcffoo/Z39rH776pOEIN/1UxwuHhhJRJujBCuCLL5y8Ygpr0OZjuysoTQ16Uggn1BIZ0/PBCoqtGR5fvlwh0OD7fkkk1rX6XRLiQ+lPlLP5t2baexupKaohhvPvZGqwqopNcqRrggP7H2AkC9EZUEl0USUX+z5Be9Y/Y5TMoNybdpYtZHaUO0YZz47bctf90AQWv
ta6Yh1UOorpSxQhsFgjJnRAjZdI4KLgAPGmEMAIvJjYBMwY4SgK9LF1m9spbell8KFhSy6fBH9rf30HO+h4ekGeo/14gv58BZ5advfxqHHDtG6r5Wn/u4pPAUeAEJLQqy4bgVlRWV4i7z0tfWx9RtbQaCvrQ8jhsKqQjKJDLHOGAvOX0D0RBR3wE3F6oqBHnV1XTVN9U30t/QTKA8MxA3EIfS39g/0yIca4traIOuvKMSRTtC6t4PjrU7i4qVieZCFS/3U1Q26DvL98z31jfS39uMv0xFQrDtGrCNG5+FOdt+/mzV/tAaD4dgLx2jb3Ya4BBEZGJ2M1/DnMx3ZWcNRXT18AbzqsVPYp4S6Ou3d55cPLyrS2kx+vwpCb6+6jpYtU4HPZ7RS4kOpj9Rz55Y7KfOXsah4EZ2xTr741BdZU7GG1y183YBRHlqrfzjxqAtPPFjxyIFH2H5sO409jQhCW38bAVeA6sJqwsXhU4SiJ97DB37xAS5YeAFfu+ZrE77uWORf1ylOnm54mmg8itflpdhbTGNXI7F0jAJvAVfUXjGj1zWYLiGoAfL7WBHg4vwDRORW4FaARUNrFE8yud58X2sfBQsL6G3t5cjTRyg/t1yfN/fS1dDF0jctJdocZef3dtJ7opd4NE46liYRTeDyuug0nbzy41dY9+51+Ep8tOxqIZ1Ms3rTalx+F4cfP0y0OYqvxMfqG1ar4fW7h826iZ6Ikkqk8Jf4B9rp8rvob++nt6VXy+qKnGKIL22Dp5/2seyqItb4NNe/tVUrf8Lw/vltD/XqtUr9xLpjtO1tw+V14SvzEWuLse9/92HShr7WPhwuByW1JaeMTk6XycrOOl1uvnkwJpArgNfRoSWwZwL55cOPH9ey3hs2qLvI69V5FoWFKujLl+sorygvnDVcau5IbN69mTJ/2UmlHg51HGJXyy4uX3w5AEVePXmuKudw4nHnlju5/dLbJyQG9ZF6vvXCt+iOdVPkK8Lr8HKo/RBtsTZeOvES39/5fQRh/cL1LCpehDGG+169jyePPKmlIbK/i8mgvqmekC9EkbeIXSd2UeYvo72/nUQ8wbKSZUR6IgCU+cs42HGQS2svHXjdSCOZ6RoxzNhgsTHmHuAegI0bN5qpuGbOp3/w0YO4fC4y6QwnXjlBd2M3khGijVG8hV6ClUFSfSlaXm6hdW8r/W39YMBkDA6XA3EI6UQaYwwmaWh4poGyVWWIU/B4PDQ+30isK0b1hmoq1lWw4SMbANh2z7aB0UQOT4GH7f+5HQyUnFNyUiZRqj+Fy+MiUB7g0U89ii/k44KPXHBSXOL159fQ3FxES4saNI9HS0Nfc83I70OwIojL4yLVnyLaFMXt0+v5inwULSqi+7VuOo90UrK0hNDSEN4iL8m+5Emjk9NlMrKzJkIuDnDvvaeugzBTGK58eDisGUX5QWEYnK+R7/obb2puY3cji4pP7oSJCD3xnpO2FXgKaI7qMGM48chtH0sIhhrFcGGYb2//NslMkqAnSCqT4nj0OIWeQnoSPXTEOthxfAeVgUpa+lpIppI8evhRth3bxoaqDVwWvmzcIpC79t7WvQNunbGWqGzpbaGyQFdv6op3UeIrIZVODezPPfa7/HTEOk55r4ZzeU3XiGG6hKARyHfghbPbpo18nz4OiEfjnHjlBC6fC5MyiEdoO9CGt9TL0jcuhQwcevwQHYc6wAkmYTBpDQBn0hlM0hDviuPwOuht7SXWFaN1dyvJWBJ/iZ+FFyykuLaYw08cZvk1y0fMuon3xMHA/v/dT82lNZQuLyVQHgADsY4YvnIfz33tORqeaaB0dSmte1sJXxomtDikrqWte7jhravZGykad+ZIdV01J3adoONAB/0d/bgDbhLRBIU1hVRvqIYMxHpiLDhvAeLQH1r+6GQiTEZ21kTJX/xmpjLcSG64VNHcfI2JpObWFNWcUurBGEOht/Ck46KJKBVBHWYMJx4hX4ijXUdHvdZwRvHbO7
5NV7yLUm8pnfFOehO9eJwe2vrb8Lv8FDoLOdp9lEQ6wbLiZXxrx7do7W+lrrqODQs3cLjrMJGuyJhGNXftdCbN4c7DOMVJZ38nfrefpp6mEQ1zRbCCaCJKkbeIYm8x/al+XE7XwLoHLqea1/5UP8XeYlp7W/l9w+850HaAn+3+GbFkjIUFC/G5fJzoO4HBUFNYg8fpmfKV0KZLCOqBFSKyFBWAdwPvmaa2AAyUVfAWefGH/Jx45YSmPTrApA3x7jjeIi++Yh/B8iDx7jhL37SU4zuPk4wmyaQ0WpdO6KIXOMEddJPqSdHr6CXWGkNE8Jf6ySQzNDzdQPriNKVLS0fNuol3xbnu36/juX9+jufvfh5PwIPb7wYHlK0u49XNr3L8heMsvGAhBdUFdEe6adzaiLfQq4KBri1www3jN6jF4WLW37SeAw8foPtn3aR6UyxYs4CFGxYSLNfRgrfQO+zoZKxU0ZE498Zz2XLnFkBHArHOGP1t/az/0PoJnc+inElq7o3n3sidW3ShiFypB7fDzZqKNXTHuynwFBBNRE+qyjmceHTGOqkpGn7+S458NwuoyymVTnGi9wQNXQ1Ek1EdiQi64pjbTzgYptRXyv6O/RzvPQ5AXXUdl9deTl+yj3UV68a1kEzu2rtO7KLAU0DAHaAv2cfx3uOsrVg74jnqqut4YK8Ouc4pOYenG57G4/DgdXlp62ujyF1ELB2jrb+NteVr2fzqZl5qfolkJglAT7yHjMngFCe1RbWEi8Mc6jhEa2/rlBe4c4x9yNnHGJMCPg48CuwG7jPG7JqOtuTobekdcMuUrigl1hXD4XaAgYrzKvCV+Kg8vxKTUVGIdcaoPL+S8pXluINuXAGXikbSkE6kcXqc6rapCCBGKFlegsPjGJgp7PA4aN7ZTMV5FQO96Fyw1x1wE23WoPGqTasI1Ya4+q6rufi2izn8xGHcfjfX/9v1NPy+YUAEll+3HAz4S/0kuhO0728HVFAm0ksvDhez4ZYNbPrPTax+52rCl4YJlAaId8fxl/upWFtBX2sfid4EiWiC/rZ+/OX+Uec5jEa4Lsylt1+Kt9BL19EuvIXeKQ8UW06mLlzH7ZfeTqG3kKNdRyn0FvL5Kz/PJy7+BAG3VuUMuAMn9ZhvPPdG2vrbaOtrI51J09bXRlt/Gzeee+Oo12rpbaHAc3JkO51Jc6DtwIB7J2mSJFIJHOLAgYP2/nYqAycvrL2xaiM+t4+LwxezOLR4XAvJ5K7dFe/C79IYnM/loyvWRYGnYMRzhIvDbFq1iYA7QNqkuaz2Mq5Zfg3nLTiPQm8h51WexzXLr+Gy2st46fhLHOg4QDKTxOvy6r2kE8QzcXBAa38rx6LHMMaQJk19U/2Y7T6bTFuMwBjzEPDQdF1/KPlumUB5gMrzKtWYCgTLg5TdUEa0ITrQC1585WKa6ptY8sYl4ICOgx2k+lKk0diAv9RP6cpSlr5xKVu/uZWKNRWk42kS3Vrj3+V3kU6mcfvcA71qYMSsGxHh6ruuBuD5u5/n+bufB2DBeQs455pzBgQmlUiRSWWIdcWA8U3oGo3hUkLX37Qe4KQ5DbWX1Z40p2EihOvC1vDPMOrCdSf59usj9Xxj6zcGMoKqC6qpb6rnof0PDQQ7b7/0djbv3szRrqPUFNXwofUfGjM+kO9myRHpjlDoLSRcFGbnsZ24xU3SkcTn1DUDMLCzZedJ54n0RLhm+TWICN3x7gGX1XiunXPvBNwBYqkYxb7ik9xewzFcHaPh2N60HY/Tg8/tI5VOYTIGp8NJOpMmk8kgLqE70U1XTMXori130do3dSODGRssnmqGumVKV5bS29JL7WW1A/52j89zUvXP/Q/tJ7Q4xKq3raJ9fzvHth/D4XbgK/Wx9sa1gE64Ci4IEuuMUVhVSHemG5fHRbIvibfAe1oTrnJikBMBgCVvWEI6nsYRcFBQVUDLrhacLq33kxu5nOmErpHEacNHNsBHzujUlhnM0OCtW9
z84OUfDGQENXQ1cMdTd7Bp1SbOX3j+ScHOr7zlK6d1rXw3S87l1JPooa6qjr50H06nk7A/jNfppTPeSU1hDXva9tCT6GFD1QbuuOIOvvb81/jVvl9hjOH/rP0/dMW7xrWQTO7aCwsWsqtlF/3JftKZNIuLF5+1xWh6Ej10xbroT/WTSCUwGFwOlwqByWCMoSfeg8vhoixYRrGvmGcanqE52sxN62+adDGwQpBlaM+3uLaY8O1heiI9I06Oyo0iAuUBAuUBSleU8tqTr+Ev8WMyZmAewIY/3cC2b2wDoGBhAT0NPcQ6Y5z3/vNGLCs9HMYYHv3Uoydta6xvHAggO5wO/KV+HG4H3pB3YORyJr10y/xkuODtd3d8l6rCqgH/fywdo8RXws7mnVxQdcEpqaSnQ87NkisXXRGsYEPVBkSERYFFRLojJNIJAGo8NWTI0BnvZEPVBupvqUdEWF+1nj996E95cP+DGAz/ft2/j6sd+dfuS/YNZA3VFteelXTOSFeEDBk8Tg/JdJIYMZLpJGIEl8OF2+XG7/LTn+rH5/QR8oZYGlqK2+mmtb91Qu/n6WKFII9he76jjGiHjiKcHielK0spWFBwingEy4PsuHcHPU09hJaHuODmC1h9/epxty0nAs/f/TwX33YxV9919cDz/ADyOW89Z1wumpkwi9cycxkueNuX6qM3ORhv6kv2EfKFaO1rHdiWnx55ugx1s4QLwwPB6pWlK3m64WnECF6Xl91tu1lRuoJ/u/bfBmIItaFafvUnv+JTj36Ku5+/m39+7p+56+q7xpVCOl4Xz0Sob6rnkppLiCfj7GrZhcHQnekmQ4YKfwWVhZU0dDdgMCwvWc55ledR7CvGGEN7f/u44hxnihWCM2BY//kHh59Qtfr61adl+PMZTgSGxgwWrFswsH009jy4h2e++gytL7fiKfZQfVE18Z44T33xKRZfsRhjzEBpi/JV5aOuimaZu+TnyOdYEFygRinPZb63fS9ucfPrg7/GYOhL9FERrKA+Un/S4i8T6VnngtWbd2+mO97NZeHLQGDn8Z1sqNrAv137b1xUe9FJrxER7rpal6oL+UKTNpnsdGjpbWFxaDHXr7qe6qJq9rfvJ5aI0Z/uZ2XpSgo8BdRV15HIJAj5QgTcmu3Xn+rH4/KMK85xpogxUzJX64zYuHGj2bZt23Q3Y1oYSQTGuz+fPQ/u4ak7nqKvtQ+cIEZI9adYWLeQRFcCh9tB6fJSHE4HJm2oWFuBw+k4LfeVZfYxXEmISE+EvmTfScHb7U3beezQYywrWUZrXyu7TuyiL9XHhZUX0p3opjPeidfpJegO0p3o5uKai1ldvppIT4QT0RNndd3frQ1b+dmen41YxmIyZxSfLvfvvv+U97I73k3AHeCGc28A1H30vRe/x772fZT5tT5RR6yD5SXLzyhGICIvGGM2jnXctKSPWsaPiOAL+UY08rmRwcW3XYwv5Bv1y7/j3h1aRA5weV24/W5cfhfHXziOL+Sjt7kXb4GXQFlAi8kdj+IL+Wiqb5rUe7RMH7mSED3xHhYVL6In3sOdW+7ELW46Y510x3W96u54N+XBct656p3sbN7JC8deQESoClax88RODncdJpPJkEgl6Ip30dHfwW9f+y3f2fEdjnQcoSfRw49e/hFf/N0XqY+cWWpkfaSef37un09pc/55Z4oIgAajh76XnbFO6qoHhStcHOaD6z/IFYuuoC/VRywZ47Lay6YkUAx2RDBrGKuHM54e0D0X30NxbTEnXj5BJp3B5XGRyWSIHosSfn2Y3uNaAlscoi6ijhjnvPUcos1RNt46ZqfCMgv59GOfpifec9IEsLa+Ngq9hXziok8MW/Ihloqx49gO2vvbcTld9CZ6cYmLpEniFCfJTBJjDIl0gqAnSDKdpLqwGoc4CBfpJLCvvvWrEzZwo7X5dLOVporpqik03hGBjRHMEsYy8uPpARVWF2oaa20hHXs7SJEinUzj8DiIdegEud
w8iVR/Cm+x94znIVjGZjoD96OVhBgaQL1/9/10xbvo7O+kP92Pz+WjP9lPZ6wTj1MnYybSCZ1Zn8lWx032kzZpmnubWRJagtfp5XD3YR4+8PCEyyhMtIzFdDKZweizgXUNnSUi9REe+/Rj/Ox9P+OxTz9GpD4y3U06hQtuvoBYRwxBKF5eTCaZIdGdoKquisVXLqbqwiri0Th9bX0kogkKFhYMlMG2TA658tvHdx6n8flGXvyvF/nxO37M77/2+ym5fq4kRD4jlYRo6W0hmU7icDjwu/wkMgl6k72ICGmT1pEABgykSePAocKAAwQEQRxCub+c7U3bp6TNlvFhheAskPsxx3viFC8qJt4TZ8udW2acGKy+fjVX/u2VeAo9pPpSLLxwIe/4wTv4wEMf4OJPXExxbTEly0rwFnoJLQ1RXFtsA8WTzO7Nu4l1xWh+sZlMKoOvVGM4W76yhT0P7pn0659OSYiKYAUuh4uMyeB3+XGKk7RJ43F4cIoTpzhxiIOMyeB1evG5fDpZCkPIEyJlUlpoLbgQIxN3SU+0jIVlZKxr6CwwU2rpj4eR0lgnuqCM5czobuym42AHLr8G70HXoYi1x9hx744JpxyPl/wUzbFKQtRV1/HwgYfxOr24xMXRrqP4nD7KgmX0JnoHYgNd8S68Li8LAgsQEQxaTsHv8rOoeBHxdJwrwldMSZst48MKwVlgptTSt8w+imqKONR1aGAkAJBJZvCGvPQ09YzyyrPH0HpCIxEuDnPLBbfw7e3fZqFrIevNejr6O2jrb6MsUKallvtaKfQW4nV6McawMLiQhQU6AqgtqiXgCRCSEO397Xz0wY/SG++lwFPAktCSMev/T6TNM4WZsgDNSFghOE2GC+zNpFr6ltnFuTeey87/3km8R8ucZ5IZ0vE0wQVBCqsLxz7BFFMXrhtYs3hv614OdR6ipqCGE30n2H5sO8W+YlaUrCCeidPe386Gqg2cU3oOgpAxGUSEA20H2NWyC4/DQ2NPI6lMiu5Y95j1/2crM2kBmpGYt0LQFeniwMMHaNrehBihamPVwAIxIzHSurrLrl3GoYcPAbaWvuX0CNeFueQvLmHLV7YQa4/hDXkJLtAsrQtuvmCaWzc8+Rkw+T3dCxZeMOZC7ffvvp9dLbsoD5RztOsoRb4iMNCT6hmz/v9sZbhyHbntM+U+57QQ5Jae7G3RzAZEl5MUh+gSky39OsFK4MCjB9j/4H76O/vpeq2LjGQoX1XORR+/aMBPO1IsoH13O5fefim7N++m62gXRTVFrP/Q+hkXH7DMTC7/88spX1k+UIuqsLrwtGtRTRenmxbZ0ttCIpWgxF9CX7KPoFtFL5qIDtT/n2itopnKcOU6Ztp9zlkhyF96UpxCw9MNGGNY/AeLOfHyCY7vPE7pilI8BR7i3XF6mnroPNJJojuBN+TFIQ7a97XzxGefADTImh8LaOnysOdYER1RF56+Ls79RJi3fMUafsvEOJNaVGdKpCvCIwceYduxbYgRLqy+cNLq4FcEK/C4PMRSMQLuAIlMAowu6zie+v+zkeHWWphp9zlnhSB/6ckTu07gL9OVhzoOdpBOphGn0N3YTawjRsfBDmLdMWIdMdw+N74CH5mUpr2JCD/96ms0/e9qHn3kTfTGXPi8GYoCKTYsbac22E6iMMgDD+j6sBNdFtBimQ6Gq3EzmXXw66rrePXEq+xr30fIG+Jgx0FSmRS1hbUsDC48a/X/ZxLDrbUw0+5zzs4jyF96Mt4Vx+V34fK5iHXF8BX7tKjTgQ4yyQwItPX5eTm2nK2J9ezsXkpnugCTMRzpX8B9L67i2WehlyCpNLT0eGju9PLs7lKONLlZ/cZqQiFdJNximU3UN9XT0t9CeaCcoCdIgaeAMn/ZQB38s01+TR2n00lNYQ1rK9byuoWvo7a4dkYFUM8W+UtaDre850xgzo4I8pee9BZ7SfWnAPAV+yhdUcrhJw8jTs1k6EgGeKWvCqczSjDTQz
xTxEs9S1gXPMT26FLE66G3FwpDbhyF0NycJpEU0n6htXQl5SuCZDLQPHNcfhbLuMj32efwu/yTWgc/XBzmIxs+wkfm0fJ2tsTENFFdV02sM0a8O07JOSX0t/XT19pHyTklOD1OghVBqjZUkYlnaEzXECoXQgscmHQGZ7IXbzrGkb4KOhNBQouKSCTA7QaX301BiQ9nMECotoQ+kw12RaFi5rj8LJZxke+zzzGVdfAtM4M5OyLIXzQm0Zug9rJaEMikM7gDblZevxJ3wI23yMuhR4pZ6uylbU8LHQc6dH1iZxxTXsn555YTiRbi6YJkEjwecDrB4YBEAkpLobsbOjvhypnj8rNYxkW+z35oHfz8MsmWuc2cFQIYvWxCLqsIoKQwRU+3i9JlpZz3J+cRLA/S3Q2BgAZ/v/hFCAahpQXicUiltPfv8cCqVXrclVfaQLFl9pHz2ednDV1We9mkZQ1ZZiaTth6BiPwT8DYgARwEPmSM6czu+yxwM5AG/swY8+hI54HJW48gN8/g0N4Ezx1aQO26IioXB4lGtYefywKqr4fvfAe2bNHtJSVwySXw4Q9Dne00WSyWGcp41yOYTCF4K/AbY0xKRL4CYIz5tIisAX4EXARUA48DK40x6ZHONRUL00QiavBbWrS3X1dne/gWi2V2M+0L0xhjfp339DkgVyN2E/BjY0wcOCwiB1BR2DJZbRkP4bA1/BaLZX4yVTGCDwM/yT6uQYUhRyS77SRE5FbgVoBFixYN3W2xWCxjEumK8PCBh9netB0jho1VGzm/8nwiPZEZWwl0Ojij9FEReVxEXhnmb1PeMZ8DUsAPT+fcxph7jDEbjTEbK2xepsViOU0iXRG+++J3eabhGXxuHwFXgEcOPMIXn/oiDV0NVBZU0pfs44G9DxDpmlmLSE01ZzQiMMa8ebT9InITcD3wJjMYjGgEavMOC2e3WSwWy1mjvqme1v5WyvxlBNxaIDKRSZAwCY73HmdpydIZWQl0Opg015CIXAP8JXClMaYvb9cvgf8Rka+hweIVwNbJaofFYpmf5GZNl/pLB7al0ikcxkFXbHDRqNOuBFpfD5s3Q2Mj1NTAjTfO+vTByYwR/CvgBR4TEYDnjDEfNcbsEpH7gFdRl9HHRssYslgslomQmzXdn+ofGBG4nC7i6TjFvsH5RaNWAh2aTuh2ww9+AGVlsGiR5pN/8Ytw+eWaVz5LUw4nM2to+Sj7vgx8ebKubbFYLHXVdew6sYsDHQd0dTQEj8ODyegSmhmTGb0SaCQCDzwAoRBUVmodme9+FxYuVGF44gno6tISA01N8PnP6zGzsBTxnJ5ZbLFY5i/h4jA3rb/ppKyha5ZfM5A11BxtpiJYwZWLrxw+PlBfryJQlF1HoKgI+vpg/344cgS8Xq01IwJ798K2bXDRRYOvtUJgsVgs00+4OMwtG26BDSdvr2McPv2WFh0J5FNZCU8+qaIQCGjNGbdbC5A99pgKQUHBrCtFPGerj1osFssZUVGhrp581q9XV5DDAcboXzKp7qL2dj1mFpYitkJgsVgsw1FXp8Hg7m7IZPR/WRmcd55Wnuzr02qUFRWQTmuwOFeKeJZlEVnXkMVimTOc1Zph4bAGfevr1dVTUaFlhhcvhjvuUMMfCsGxY/p35ZWzthSxFQKLxTInGC7JZyCBhwg88ogGdEXgwgvh2mvHNtjDFSHLPb/3XmhogOpq+Oxn4frrJ+W+poJJqz56NpmK6qMWi2UWE4lw/zca6Gvpo2hhAFasgPJyug+3Edj9Ajc0fVPdNsuXazC3owMWLNDjjJm1+f9jMd7qozZGYLFYZjfZoUBLq4OChUHN5Hn+edi/j4Jdz9Gyr119/H6/9uAzGV1VautW2LVLhw99fTp8iMzPmkPWNWSxWGY32Xz/ikohGndTFMj2b595lmjlCircEQ3mFhQMTv4yBlwuzfhxOAbnCpxJ/v8sXtTECoHFYpndZPP961Z08cDzmvdf4PURPd5HZ3
kBVy7vgKMuFQGPB3p79bHfD8V5S9kOzf83RuMJQ8kZ/BMn1L2UyxAaLkBx0UV6fL44wIwTDCsEFotldpPN9w+XO9h0cTP1+4tpbhYqFgS5cu1BwoWV0FWsReK8Xh0JpFLg82mMIEd+/v8XvqBpoHfdpa/LGW4ROHAAYjH47W8Hs4S6u9UltXChppi2tcHBg/CLX2gdov5+dVf99KdaqG79+mEi2tMnBlYILBbL7OTBBzVz5/Bh7b2/5S2Er7iC8NoTUNOpvfGtW8ETgquugmefhX37YMkSuOKKwRFCJsPAQuVXXqnnikT03Nu2wbnnqpHu74enn9ZRg9Op51q3TrORYjFYtgyOHlXBWLAAWlv1vF1davirqvT1L72k5wiHVTQaGuBLX4Krr5620YEVAovFMvt48MHBXP7ly+H4cfj5z9Ugv+lNg7n8VVXam+/t1fTOfEObc/HkzxEIh3X7+vVqmB99VHv3paWwdq0GlTs7dduSJSoiuRnFqZSOFtrb1fj7/RqDaGrS1y1erPMN3G593toKv/sdrF6tgpQLWE/D6MAKgcVimX3ce6+KQFmZPq+pUVdPYyPccMPgcaMtRj7SvkceUYO+bJka7z179PzptG6PRtUlVFiocYJEQvft3q0CAGrkPR4VB7dbBcHphJ4edU/t3auvB9i5U+8jFFLX0jQUrLNCYLFYZh9NTVBbe/K2UEjdLGdCJKLlpXNzDQoL1eg3NuofqEHPZHQUkkzqnzE6EshlIxmjIpBMalwhHteKpTmKi/V5NKoCtnatHrNrl7qgphgrBBaLZfZRXa0umtyIAPR5dfWZnbe+Xg3/wYOaUrp4sQpAX94iiyIqBLnevcOhz0UGBcDpHBSIVEqPS2fX38oFq3OjBL9f3VNFRfo852qawnRUO6HMYrHMPm6+WXvsbW1qYNva9PnNN5/ZeVtatIeeSxv1+9XI5+PzDfb6/X4dNeRGBum0CgOoyyiVUrEKh3V7MqlxjHAYystVdAIBdSO99tpg8bpcvYy+vimZ8GZHBBaLZfaRq+uTX+/nk58883o/FRXqmlmzRgO727ap3z8QUMMdi2lcIBjUXn4iodeOxdRYi6g45Hr9Tqcek8moOyj3uKFBjwuF9Lq50cPateryGm5RHJi0+IEVAovFMju5/vqzX+itrg4efliN+NGj2ktfvVrFIZehtH07HDqkQgA6Ili3ToPFuZhAbkGb4mJ19XR363Hl5bq/rEwFob9f01ODQRUCp1Pb8NBDpy6KM4kL3ljXkMViseQIh+EjH4Hf/1573xs2wEc/Cuefry6hzk5NLV23ToO76bQa+kRC4wl+vwrGypXwB3+gLh+PRw3/ZZep0V+zRkcP6bSmigYCauDXrh1MHR1uUZxJXPDGjggsFosln4sugre8RQ37296mk8P++q91TkFrqxr94mJ1EeXmF3R1Dc4sXrBAhSAWUxE4dkzPW14+UBWVw4c1CF1UpPMXhgaC6+o0JgA6Esif8DYJTHoZahH5C+BOoMIY0yoiAtwNXAf0ATcZY7aPdg5bhtpisUw5Q2sN5WfxHDyoPfljx9St4/NpsPrYMR0V+Hzq2qmqUreQiE5AyzfqY00cOwtZQ+MtQz2pIwIRqQXeChzN23wtsCL7dzHw79n/FovFMnMYWnAufwLaPfeooV+0CPbvV8NeXKyjiPe+91QDDsPPYh6N0SbDnWUm2zV0F/CXwAN52zYB3zc6FHlOREIiUmWMOTbJbbFYLJazQ86HX16uf6A9/0BgZAM+g0tST1qwWEQ2AY3GmJ1DdtUA+dP/ItltQ19/q4hsE5FtLS0tk9VMi8ViOX2GW9h+Fi5an+OMRgQi8jiwcJhdnwP+CnULTQhjzD3APaAxgomex2KxWM46Iy1sP4N7/aNxRkJgjHnzcNtF5DxgKbBTY8OEge0ichHQCOQXCQlnt1ksFsvsYQp9+JPNpLiGjDEvG2MWGGOWGGOWoO6fC40xx4FfAh8Q5RKgy8YHLB
aLZfqYjnkED6GpowfQ9NEPTUMbLBaLxZJlSoQgOyrIPTbAx6biuhaLxWIZG1tiwmKxWOY5VggsFotlnmOFwGKxWOY5VggsFotlnmOFwGKxWOY5VggsFotlnmOFwGKxWOY5VggsFotlnmOFwGKxWOY5VggsFotlnmOFwGKxWOY5VggsFotlnmOFwGKxWOY5VggsFotlnmOFwGKxWOY5VggsFotlnmOFwGKxWOY5VggsFotlnmOFwGKxWOY5VggsFotlnmOFwGKxWOY5VggsFotlnjOpQiAinxCRPSKyS0S+mrf9syJyQET2isjVk9kGi8VisYyOa7JOLCJvADYB5xtj4iKyILt9DfBuYC1QDTwuIiuNMenJaovFYrFYRmYyRwR/CvyjMSYOYIw5kd2+CfixMSZujDkMHAAumsR2WCwWi2UUJlMIVgJXiMjzIvKUiNRlt9cADXnHRbLbTkJEbhWRbSKyraWlZRKbabFYLPObM3INicjjwMJhdn0ue+5S4BKgDrhPRJaN99zGmHuAewA2btxozqSdFovFYhmZMxICY8ybR9onIn8K/NwYY4CtIpIByoFGoDbv0HB2m8VisVimgcl0Df0CeAOAiKwEPEAr8Evg3SLiFZGlwApg6yS2w2KxWCyjMGlZQ8B3gO+IyCtAAvhgdnSwS0TuA14FUsDHbMaQxWKxTB+TJgTGmATwvhH2fRn48mRd22KxWCzjx84stlgslnmOFQKLxWKZ51ghsFgslnmOFQKLxWKZ51ghsFgslnmOFQKLxWKZ51ghsFgslnmOFQKLxWKZ51ghsFgslnmOFQKLxWKZ51ghsFgslnmOFQKLxWKZ51ghsFgslnmOFQKLxWKZ51ghsFgslnmOFQKLxWKZ51ghsFgslnmOFQKLxWKZ51ghsFgslnmOFQKLxWKZ51ghsFgslnnOpAmBiKwXkedE5EUR2SYiF2W3i4h8XUQOiMhLInLhZLXBYrFYLGMzmSOCrwJ/a4xZD3w++xzgWmBF9u9W4N8nsQ0Wi8ViGYPJFAIDFGUfFwNN2cebgO8b5TkgJCJVk9gOi8VisYyCaxLP/UngURG5ExWc12e31wANecdFstuO5b9YRG5FRwwsWrRoEptpsVgs85szEgIReRxYOMyuzwFvAj5ljPmZiLwLuBd483jPbYy5B7gHYOPGjeZM2mmxWCyWkTkjITDGjGjYReT7wG3Zpz8F/jP7uBGozTs0nN1msVgslmlgMmMETcCV2cdvBPZnH/8S+EA2e+gSoMsYc2y4E1gsFotl8pnMGMEtwN0i4gJiZP39wEPAdcABoA/40CS2wWKxWCxjMGlCYIz5PbBhmO0G+NhkXddisVgsp4edWWyxWCzzHCsEFovFMs+xQmCxWCzzHCsEFovFMs+xQmCxWCzzHCsEFovFMs+xQmCxWCzzHCsEFovFMs+xQmCxWCzzHCsEFovFMs+xQmCxWCzzHCsEFovFMs+xQmCxWCzzHCsEFovFMs+xQmCxWCzznMlcmGZuE4lAfT20tEBFBdTVQTg83a2yWCyW08aOCCZCJAIPPAB9fVBZqf8feEC3WywWyyzDCsFEqK+HUAiKisDh0P+hkG63WCyWWYYVgonQ0gIFBSdvKyjQ7RaLxTLLsEIwESoqIBo9eVs0qtstFotllmGFYCLU1UFnJ3R3Qyaj/zs7dbvFYrHMMs5ICETkj0Vkl4hkRGTjkH2fFZEDIrJXRK7O235NdtsBEfnMmVx/2giHYdMmCASguVn/b9pks4YsFsus5EzTR18B3gl8K3+jiKwB3g2sBaqBx0VkZXb3N4G3ABGgXkR+aYx59QzbMfWEw9bwWyyWOcEZCYExZjeAiAzdtQn4sTEmDhwWkQPARdl9B4wxh7Kv+3H22NknBBaLxTJHmKwYQQ3QkPc8kt020naLxWKxTBNjjghE5HFg4TC7PmeMeeDsN2ngurcCtwIsWrRosi5jsVgs854xhcAY8+YJnLcRqM17Hs5uY5TtQ6
97D3APwMaNG80E2mCxWCyWcTBZrqFfAu8WEa+ILAVWAFuBemCFiCwVEQ8aUP7lJLXBYrFYLOPgjILFInID8A2gAvhfEXnRGHO1MWaXiNyHBoFTwMeMMensaz4OPAo4ge8YY3aNdZ0XXnihVUSOnElbJ0A50DrF15xM7P3MfObaPdn7mX4Wj+cgMcZ6XYZDRLYZYzaOfeTswN7PzGeu3ZO9n9mDnVlssVgs8xwrBBaLxTLPsUIwMvdMdwPOMvZ+Zj5z7Z7s/cwSbIzAYrFY5jl2RGCxWCzzHCsEFovFMs+xQjAEEfk7EXlJRF4UkV+LSHV2u4jI17Pls18SkQunu63jQUT+SUT2ZNt8v4iE8vYNWyp8JjOR0ucznblQml1EviMiJ0TklbxtpSLymIjsz/4vmc42jhcRqRWR34rIq9nv2m3Z7bPyfsaDFYJT+SdjzOuMMeuBB4HPZ7dfi86QXoHWQPr36WneafMYsM4Y8zpgH/BZOKVU+DXAv4mIc9paOX5ypc9/l79xtt5Pto3fRL9fa4A/yd7LbOO76Puez2eAJ4wxK4Anss9nAyngL4wxa4BLgI9lP5PZej9jYoVgCMaY7rynQSAXTd8EfN8ozwEhEama8gaeJsaYXxtjUtmnz6H1nSCvVLgx5jCQXyp8xmKM2W2M2TvMrll5P2gbDxhjDhljEkCuNPuswhjzO6B9yOZNwPeyj78HvGMq2zRRjDHHjDHbs497gN1oleRZeT/jwQrBMIjIl0WkAXgvgyOCuVBC+8PAw9nHc+F+8pmt9zNb2z0eKo0xx7KPjwOV09mYiSAiS4ALgOeZA/czEme6QtmsZKzS2saYzwGfE5HPAh8H7pjSBp4m4ykVLiKfQ4e8P5zKtk2E6Sp9bpk8jDFGRGZVrrqIFAA/Az5pjOnOX4BrNt7PaMxLITiN0to/BB5ChWC00trTylj3IyI3AdcDbzKDE0dm7f2MwIy9nzGYre0eD80iUmWMOZZ1o56Y7gaNFxFxoyLwQ2PMz7ObZ+39jIV1DQ1BRFbkPd0E7Mk+/iXwgWz20CVAV94wccYiItcAfwm83RjTl7drpFLhs5XZej9zuTT7L4EPZh9/EJgVoznRrv+9wG5jzNfyds3K+xkPdmbxEETkZ8AqIAMcAT5qjGnMfjn+Fc2M6AM+ZIzZNn0tHR/Z9aK9QFt203PGmI9m930OjRuk0OHvw8OfZeYwpPR5J/CiMebq7L5Zdz8AInId8C8Mlmb/8vS26PQRkR8BV6GlmpvRUfQvgPuARehv6V3GmKEB5RmHiFwOPA28jNoBgL9C4wSz7n7GgxUCi8VimedY15DFYrHMc6wQWCwWyzzHCoHFYrHMc6wQWCwWyzzHCoHFYrHMc6wQWCwWyzzHCoHFYrHMc/5/SKLpCDe8EKIAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "tsne = TSNE(\n", + " n_components=2, perplexity=15, random_state=42, init=\"random\", learning_rate=200\n", + ")\n", + "vis_dims2 = tsne.fit_transform(matrix)\n", + "\n", + "x = [x for x, y in vis_dims2]\n", + "y = [y for x, y in vis_dims2]\n", + "\n", + "for category, color in enumerate([\"purple\", \"green\", \"red\", \"blue\",\"yellow\"]):\n", + " xs = np.array(x)[embedding_df.Cluster == category]\n", + " ys = np.array(y)[embedding_df.Cluster == category]\n", + " plt.scatter(xs, ys, color=color, alpha=0.3)\n", + "\n", + " avg_x = xs.mean()\n", + " avg_y = ys.mean()\n", + "\n", + " plt.scatter(avg_x, avg_y, marker=\"x\", color=color, s=100)\n", + "plt.title(\"Clusters identified visualized in language 2d using t-SNE\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cluster 0 Theme: Literary & Archival ItemsOne possible way to group these transactions into a meaningful cluster is by their theme or purpose. A common theme that appears in most of these transactions is literary and archival items, which suggests that these are payments for acquiring, preserving, or accessing various collections of books, manuscripts, papers, and other materials related to literature and history. 
This theme could be relevant for a library, a museum, a research institution, or a collector.Some possible indicators of this theme are:-\n", + "JISC SERVICES LTD SUBSCRIPTION ACCOUNT, Annual Subscription\n", + " Private Sale, Literary & Archival Items\n", + " ALDL, Oct19-Dec19 charge from Agency for Legal Deposit Libraries\n", + " ALDL, Legal Deposit Services\n", + " Private Sale, Literary & Archival Items\n", + " SONYA LEONARD, Literary and personal papers of Tom Leonard 1961 to 2018\n", + " FROST AND SULLIVAN LTD, Literary & Archival Items\n", + " Cengage Learning (Emea )Ltd, Literary & Archival Items\n", + " Agency for the Legal Deposit Libraries, Agency services\n", + " Robert Harland, Correspondance and Literary papers - Thomas Carlyle\n", + " ----------------------------------------------------------------------------------------------------\n", + "Cluster 1 Theme: There are different ways to approach the task of grouping transactions into meaningful clusters, but one possible theme that these transactions have in common is:- They are all payments to the same supplier, ECG Facilities Service, for various services related to facility management and maintenance.- They are all relatively large amounts, ranging from £27,013.16 to £125,000.00, with a mean of £54,798.67 and a standard deviation of £19,894.67.-\n", + "ECG FACILITIES SERVICE, Maintenance contract - all properties\n", + " ECG, This payment covers 16 invoices including upgrade to boiler control panel & remedial works following 5 year test\n", + " ECG FACILITIES SERVICE, This payment covers multiple invoices for facility management fees\n", + " ECG FACILITIES SERVICE, CB Bolier Replacement (1),USP Batteries,Gutter Works & Cleaning of pigeon fouling\n", + " ECG FACILITIES SERVICE, Facilities Management Charge\n", + " ECG FACILITIES SERVICE, Facilities Management Charge\n", + " ECG Facilities Service, Facilities Management Charge\n", + " ECG FACILITIES SERVICE, Maintenance contract - August 21 period\n", 
+ " ECG Facilities Service, Facilities Management Charge\n", + " ECG Facilities Service, Facilities Management Charge\n", + " ----------------------------------------------------------------------------------------------------\n", + "Cluster 2 Theme: There are different ways to approach the task of grouping transactions into meaningful clusters, but one possible theme that these transactions have in common is:- They are all related to building or refurbishment projects at different locations in Edinburgh- They are all paid to either John Graham Construction Ltd or Arthur McKay Building Services, two contractors that provide similar services- They are all relatively large and consistent in value, ranging from around £27,000 to £125,000 per transaction\n", + "John Graham Construction Ltd, Causewayside Refurbishment\n", + " John Graham Construction Ltd, Causewayside Refurbishment\n", + " SJS Property Services, George IV Bridge Work\n", + " John Graham Construction Ltd, Causewayside Refurbishment\n", + " John Graham Construction Ltd, Causewayside Refurbishment\n", + " John Graham Construction Ltd, Causewayside Refurbishment\n", + " John Graham Construction Ltd, Causewayside Refurbishment\n", + " ARTHUR MCKAY BUILDING SERVICES, Causewayside Work\n", + " John Graham Construction Ltd, Causewayside Refurbishment\n", + " John Graham Construction Ltd, Causewayside Refurbishment\n", + " ----------------------------------------------------------------------------------------------------\n", + "Cluster 3 Theme: There are different ways to approach the task of grouping transactions into meaningful clusters, and different criteria or methods may yield different results. However, one possible theme that these transactions have in common is that they are all related to operational costs or capital expenditures for a large organization, such as a university, a hospital, or a government agency. 
Some possible sub-themes or categories within this theme are:- Electricity and utility services (EDF, EDF Energy)- IT work, equipment, and software\n", + "EDF, Electricity\n", + " m-hance, IT Work\n", + " EDF Energy, This payment covers 5 invoices for utility services\n", + " JISC Services Ltd, Managed router service charge annual subscription 01/04/22 to 31/03/23\n", + " ALDL, ALDL Charges\n", + " COMPUTACENTER UK, IT equipment\n", + "\n", + " BSI, Subscription\n", + " PHOENIX SOFTWARE LTD, IT Hardware plus 5 year licence\n", + " AM Phillip, Vehicle Purchase\n", + " XMA SCOTLAND LTD, Purchase of IT equipment and renewal of maintenance agreement. This payment covers 2 invoices\n", + " ----------------------------------------------------------------------------------------------------\n", + "Cluster 4 Theme: There are different ways to approach the task of clustering transactions, and different criteria to define meaningful clusters, but one possible theme that these transactions have in common is:- They are all related to property or facility costs, such as rent, rates, service charges, or maintenance.- They are all paid to either Glasgow City Council, Glasgow Life, or City of Edinburgh Council, which are public sector organisations that manage or provide various services in Scotland.- They are all relatively large amounts, ranging from\n", + "GLASGOW CITY COUNCIL, Kelvin Hall\n", + " XMA Scotland Ltd, Kelvin Hall\n", + " Creative Video Productions Ltd, Kelvin Hall\n", + " GLASGOW LIFE, Oct 20 to Dec 20 service charge - Kelvin Hall\n", + " CITY OF EDINBURGH COUNCIL, Rates for 33 Salisbury Place\n", + " Glasgow Life, Service Charges\n", + " City Of Edinburgh Council, Non Domestic Rates \n", + " Glasgow Life, Service charges\n", + " Glasgow Life, Kelvin Hall Service Charges\n", + " Glasgow City Council, Kelvin Hall\n", + " ----------------------------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "# Reading a 
sample of the transactions which belong to each cluster.\n", + "transactions_per_cluster = 10\n", + "\n", + "for i in range(n_clusters):\n", + " print(f\"Cluster {i} Theme:\", end=\" \")\n", + "\n", + " transactions = \"\\n\".join(\n", + " embedding_df[embedding_df.Cluster == i]\n", + " .combined.str.replace(\"Supplier: \", \"\")\n", + " .str.replace(\"Description: \", \": \")\n", + " .str.replace(\"Value: \", \": \")\n", + " .sample(transactions_per_cluster, random_state=42)\n", + " .values\n", + " )\n", + " response = openai.Completion.create(\n", + " engine=\"text-alpha-002-latest\",\n", + " prompt=f'''We want to group these transactions into meaningful clusters so we can target the areas we are spending the most money. \n", + " What do the following transactions have in common?\\n\\nTransactions:\\n\"\"\"\\n{transactions}\\n\"\"\"\\n\\nTheme:''',\n", + " temperature=0,\n", + " max_tokens=100,\n", + " top_p=1,\n", + " frequency_penalty=0,\n", + " presence_penalty=0,\n", + " )\n", + " print(response[\"choices\"][0][\"text\"].replace(\"\\n\", \"\"))\n", + "\n", + " sample_cluster_rows = embedding_df[embedding_df.Cluster == i].sample(transactions_per_cluster, random_state=42)\n", + " for j in range(transactions_per_cluster):\n", + " print(sample_cluster_rows.Supplier.values[j], end=\", \")\n", + " print(sample_cluster_rows.Description.values[j], end=\"\\n \")\n", + " #print(str(sample_cluster_rows.Value).values[j])\n", + "\n", + " print(\"-\" * 100)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff
--git a/examples/Multiclass_classification_for_transactions.ipynb b/examples/Multiclass_classification_for_transactions.ipynb new file mode 100644 index 0000000..7029ff2 --- /dev/null +++ b/examples/Multiclass_classification_for_transactions.ipynb @@ -0,0 +1,2201 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multiclass Classification for Transactions\n", + "\n", + "For this notebook we will be looking to classify a public dataset of transactions into a number of categories that we have predefined. These approaches should be replicable to any multiclass classification use case where we are trying to fit transactional data into predefined categories, and by the end of running through this you should have a few approaches for dealing with both labelled and unlabelled datasets.\n", + "\n", + "The different approaches we'll be taking in this notebook are:\n", + "- **Zero-shot Classification:** First we'll do zero-shot classification to put transactions in one of five named buckets using only a prompt for guidance\n", + "- **Classification with Embeddings:** Following this we'll create embeddings on a labelled dataset, and then use a traditional classification model to test their effectiveness at identifying our categories\n", + "- **Fine-tuned Classification:** Lastly we'll produce a fine-tuned model trained on our labelled dataset to see how this compares to the zero-shot and few-shot classification approaches" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload " + ] + }, + { + "cell_type": "code", + "execution_count": 311, + "metadata": {}, + "outputs": [], + "source": [ + "import openai\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from helpers import OPENAI_API_KEY\n", + "\n", + "openai.api_key = OPENAI_API_KEY\n", +
"COMPLETIONS_MODEL = \"text-davinci-002\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load dataset\n", + "\n", + "We're using a public dataset of transactions over £25k for the National Library of Scotland. The dataset has three features that we'll be using:\n", + "- Supplier: The name of the supplier\n", + "- Description: A text description of the transaction\n", + "- Value: The value of the transaction in GBP\n", + "\n", + "**Source**:\n", + "\n", + "https://data.nls.uk/data/organisational-data/transactions-over-25k/" + ] + }, + { + "cell_type": "code", + "execution_count": 312, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "359" + ] + }, + "execution_count": 312, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transactions = pd.read_csv('./data/25000_spend_dataset_current.csv', encoding= 'unicode_escape')\n", + "len(transactions)" + ] + }, + { + "cell_type": "code", + "execution_count": 313, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateSupplierDescriptionTransaction value (£)
021/04/2016M & J Ballantyne LtdGeorge IV Bridge Work35098.0
126/04/2016Private SaleLiterary & Archival Items30000.0
230/04/2016City Of Edinburgh CouncilNon Domestic Rates40800.0
309/05/2016Computacenter UkKelvin Hall72835.0
409/05/2016John Graham Construction LtdCausewayside Refurbishment64361.0
\n", + "
" + ], + "text/plain": [ + " Date Supplier Description \\\n", + "0 21/04/2016 M & J Ballantyne Ltd George IV Bridge Work \n", + "1 26/04/2016 Private Sale Literary & Archival Items \n", + "2 30/04/2016 City Of Edinburgh Council Non Domestic Rates \n", + "3 09/05/2016 Computacenter Uk Kelvin Hall \n", + "4 09/05/2016 John Graham Construction Ltd Causewayside Refurbishment \n", + "\n", + " Transaction value (£) \n", + "0 35098.0 \n", + "1 30000.0 \n", + "2 40800.0 \n", + "3 72835.0 \n", + "4 64361.0 " + ] + }, + "execution_count": 313, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transactions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 316, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "31\n", + "8\n", + "All good\n" + ] + } + ], + "source": [ + "def request_completion(prompt):\n", + " \n", + " completion_response = openai.Completion.create(\n", + " prompt=prompt,\n", + " temperature=0,\n", + " max_tokens=5,\n", + " top_p=1,\n", + " frequency_penalty=0,\n", + " presence_penalty=0,\n", + " model=COMPLETIONS_MODEL\n", + " )\n", + " \n", + " return completion_response\n", + "\n", + "def classify_transaction(transaction,prompt):\n", + " \n", + " prompt = prompt.replace('SUPPLIER_NAME',transaction['Supplier'])\n", + " prompt = prompt.replace('DESCRIPTION_TEXT',transaction['Description'])\n", + " prompt = prompt.replace('TRANSACTION_VALUE',str(transaction['Transaction value (£)']))\n", + " \n", + " classification = request_completion(prompt)['choices'][0]['text'].replace('\\n','')\n", + " \n", + " return classification\n", + "\n", + "import json\n", + "\n", + "def check_finetune_classes(train_file,valid_file):\n", + "\n", + " train_classes = set()\n", + " valid_classes = set()\n", + " with open(train_file, 'r') as json_file:\n", + " json_list = list(json_file)\n", + " print(len(json_list))\n", + "\n", + " for json_str in json_list:\n", + " result = 
json.loads(json_str)\n", + " train_classes.add(result['completion'])\n", + " #print(f\"result: {result['completion']}\")\n", + " #print(isinstance(result, dict))\n", + "\n", + " with open(valid_file, 'r') as json_file:\n", + " json_list = list(json_file)\n", + " print(len(json_list))\n", + "\n", + " for json_str in json_list:\n", + " result = json.loads(json_str)\n", + " valid_classes.add(result['completion'])\n", + " #print(f\"result: {result['completion']}\")\n", + " #print(isinstance(result, dict))\n", + " \n", + " if len(train_classes) == len(valid_classes):\n", + " print('All good')\n", + " \n", + " else:\n", + " print('Classes do not match, please prepare data again')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zero-shot Classification\n", + "\n", + "We'll first assess the performance of the base models at classifying these transactions using a simple prompt. We'll provide the model with 5 categories and a catch-all of \"Could not classify\" for ones that it cannot place." + ] + }, + { + "cell_type": "code", + "execution_count": 277, + "metadata": {}, + "outputs": [], + "source": [ + "zero_shot_prompt = '''You are a data expert working for the National Library of Scotland. 
\n", + " You are analysing all transactions over £25,000 in value and classifying them into one of five categories.\n", + " The five categories are Building Improvement, Literature & Archive, Utility Bills, Professional Services and Software/IT.\n", + " If you can't tell what it is, say Could not classify\n", + " \n", + " Transaction:\n", + " \n", + " Supplier: SUPPLIER_NAME\n", + " Description: DESCRIPTION_TEXT\n", + " Value: TRANSACTION_VALUE\n", + " \n", + " \n", + " The classification is:'''" + ] + }, + { + "cell_type": "code", + "execution_count": 315, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Building Improvement\n" + ] + } + ], + "source": [ + "# Get a test transaction\n", + "transaction = transactions.iloc[0]\n", + "\n", + "# Interpolate the values into the prompt\n", + "prompt = zero_shot_prompt.replace('SUPPLIER_NAME',transaction['Supplier'])\n", + "prompt = prompt.replace('DESCRIPTION_TEXT',transaction['Description'])\n", + "prompt = prompt.replace('TRANSACTION_VALUE',str(transaction['Transaction value (£)']))\n", + "\n", + "# Use our completion function to return a prediction\n", + "completion_response = request_completion(prompt)\n", + "print(completion_response['choices'][0]['text'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our first attempt is correct: M & J Ballantyne Ltd are a house builder and the work they performed is indeed Building Improvement.\n", + "\n", + "Let's expand the sample size to 25 and see how it performs, again with just a simple prompt to guide it." + ] + }, + { + "cell_type": "code", + "execution_count": 291, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using
.loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " \n" + ] + } + ], + "source": [ + "test_transactions = transactions.iloc[:25]\n", + "test_transactions['Classification'] = test_transactions.apply(lambda x: classify_transaction(x,zero_shot_prompt),axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 292, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " Building Improvement 14\n", + " Could not classify 5\n", + " Literature & Archive 3\n", + " Software/IT 2\n", + " Utility Bills 1\n", + "Name: Classification, dtype: int64" + ] + }, + "execution_count": 292, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_transactions['Classification'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 293, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateSupplierDescriptionTransaction value (£)Classification
021/04/2016M & J Ballantyne LtdGeorge IV Bridge Work35098.0Building Improvement
126/04/2016Private SaleLiterary & Archival Items30000.0Literature & Archive
230/04/2016City Of Edinburgh CouncilNon Domestic Rates40800.0Utility Bills
309/05/2016Computacenter UkKelvin Hall72835.0Software/IT
409/05/2016John Graham Construction LtdCausewayside Refurbishment64361.0Building Improvement
509/05/2016A McGillivrayCausewayside Refurbishment53690.0Building Improvement
616/05/2016John Graham Construction LtdCausewayside Refurbishment365344.0Building Improvement
723/05/2016Computacenter UkKelvin Hall26506.0Software/IT
823/05/2016ECG Facilities ServiceFacilities Management Charge32777.0Building Improvement
923/05/2016ECG Facilities ServiceFacilities Management Charge32777.0Building Improvement
1030/05/2016ALDLALDL Charges32317.0Could not classify
1110/06/2016Wavetek LtdKelvin Hall87589.0Could not classify
1210/06/2016John Graham Construction LtdCausewayside Refurbishment381803.0Building Improvement
1328/06/2016ECG Facilities ServiceFacilities Management Charge32832.0Building Improvement
1430/06/2016Glasgow City CouncilKelvin Hall1700000.0Building Improvement
1511/07/2016Wavetek LtdKelvin Hall65692.0Could not classify
1611/07/2016John Graham Construction LtdCausewayside Refurbishment139845.0Building Improvement
1715/07/2016Sotheby'SLiterary & Archival Items28500.0Literature & Archive
1818/07/2016ChristiesLiterary & Archival Items33800.0Literature & Archive
1925/07/2016A McGillivrayCausewayside Refurbishment30113.0Building Improvement
2031/07/2016ALDLALDL Charges32317.0Could not classify
2108/08/2016ECG Facilities ServiceFacilities Management Charge32795.0Building Improvement
2215/08/2016Creative Video Productions LtdKelvin Hall26866.0Could not classify
2315/08/2016John Graham Construction LtdCausewayside Refurbishment196807.0Building Improvement
2424/08/2016ECG Facilities ServiceFacilities Management Charge32795.0Building Improvement
\n", + "
" + ], + "text/plain": [ + " Date Supplier Description \\\n", + "0 21/04/2016 M & J Ballantyne Ltd George IV Bridge Work \n", + "1 26/04/2016 Private Sale Literary & Archival Items \n", + "2 30/04/2016 City Of Edinburgh Council Non Domestic Rates \n", + "3 09/05/2016 Computacenter Uk Kelvin Hall \n", + "4 09/05/2016 John Graham Construction Ltd Causewayside Refurbishment \n", + "5 09/05/2016 A McGillivray Causewayside Refurbishment \n", + "6 16/05/2016 John Graham Construction Ltd Causewayside Refurbishment \n", + "7 23/05/2016 Computacenter Uk Kelvin Hall \n", + "8 23/05/2016 ECG Facilities Service Facilities Management Charge \n", + "9 23/05/2016 ECG Facilities Service Facilities Management Charge \n", + "10 30/05/2016 ALDL ALDL Charges \n", + "11 10/06/2016 Wavetek Ltd Kelvin Hall \n", + "12 10/06/2016 John Graham Construction Ltd Causewayside Refurbishment \n", + "13 28/06/2016 ECG Facilities Service Facilities Management Charge \n", + "14 30/06/2016 Glasgow City Council Kelvin Hall \n", + "15 11/07/2016 Wavetek Ltd Kelvin Hall \n", + "16 11/07/2016 John Graham Construction Ltd Causewayside Refurbishment \n", + "17 15/07/2016 Sotheby'S Literary & Archival Items \n", + "18 18/07/2016 Christies Literary & Archival Items \n", + "19 25/07/2016 A McGillivray Causewayside Refurbishment \n", + "20 31/07/2016 ALDL ALDL Charges \n", + "21 08/08/2016 ECG Facilities Service Facilities Management Charge \n", + "22 15/08/2016 Creative Video Productions Ltd Kelvin Hall \n", + "23 15/08/2016 John Graham Construction Ltd Causewayside Refurbishment \n", + "24 24/08/2016 ECG Facilities Service Facilities Management Charge \n", + "\n", + " Transaction value (£) Classification \n", + "0 35098.0 Building Improvement \n", + "1 30000.0 Literature & Archive \n", + "2 40800.0 Utility Bills \n", + "3 72835.0 Software/IT \n", + "4 64361.0 Building Improvement \n", + "5 53690.0 Building Improvement \n", + "6 365344.0 Building Improvement \n", + "7 26506.0 Software/IT \n", + "8 32777.0 
Building Improvement \n", + "9 32777.0 Building Improvement \n", + "10 32317.0 Could not classify \n", + "11 87589.0 Could not classify \n", + "12 381803.0 Building Improvement \n", + "13 32832.0 Building Improvement \n", + "14 1700000.0 Building Improvement \n", + "15 65692.0 Could not classify \n", + "16 139845.0 Building Improvement \n", + "17 28500.0 Literature & Archive \n", + "18 33800.0 Literature & Archive \n", + "19 30113.0 Building Improvement \n", + "20 32317.0 Could not classify \n", + "21 32795.0 Building Improvement \n", + "22 26866.0 Could not classify \n", + "23 196807.0 Building Improvement \n", + "24 32795.0 Building Improvement " + ] + }, + "execution_count": 293, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_transactions.head(25)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initial results are pretty good even with no labelled examples! The ones that it could not classify were tougher cases with few clues as to their topic, but maybe if we clean up the labelled dataset to give more examples we can get better performance." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classification with Embeddings\n", + "\n", + "Let's create embeddings from the small set that we've classified so far - we've made a set of labelled examples by running the zero-shot classifier on 101 transactions from our dataset and manually correcting the 15 **Could not classify** results that we got.\n", + "\n", + "### Create embeddings\n", + "\n", + "This initial section reuses the approach from the [Obtain_dataset Notebook](Obtain_dataset.ipynb) to create embeddings from a combined field concatenating all of our features" + ] + }, + { + "cell_type": "code", + "execution_count": 317, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateSupplierDescriptionTransaction value (£)Classification
015/08/2016Creative Video Productions LtdKelvin Hall26866Other
129/05/2017John Graham Construction LtdCausewayside Refurbishment74806Building Improvement
229/05/2017Morris & Spottiswood LtdGeorge IV Bridge Work56448Building Improvement
331/05/2017John Graham Construction LtdCausewayside Refurbishment164691Building Improvement
424/07/2017John Graham Construction LtdCausewayside Refurbishment27926Building Improvement
\n", + "
" + ], + "text/plain": [ + " Date Supplier Description \\\n", + "0 15/08/2016 Creative Video Productions Ltd Kelvin Hall \n", + "1 29/05/2017 John Graham Construction Ltd Causewayside Refurbishment \n", + "2 29/05/2017 Morris & Spottiswood Ltd George IV Bridge Work \n", + "3 31/05/2017 John Graham Construction Ltd Causewayside Refurbishment \n", + "4 24/07/2017 John Graham Construction Ltd Causewayside Refurbishment \n", + "\n", + " Transaction value (£) Classification \n", + "0 26866 Other \n", + "1 74806 Building Improvement \n", + "2 56448 Building Improvement \n", + "3 164691 Building Improvement \n", + "4 27926 Building Improvement " + ] + }, + "execution_count": 317, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('./data/labelled_transactions.csv')\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 318, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateSupplierDescriptionTransaction value (£)Classificationcombined
015/08/2016Creative Video Productions LtdKelvin Hall26866OtherSupplier: Creative Video Productions Ltd; Desc...
129/05/2017John Graham Construction LtdCausewayside Refurbishment74806Building ImprovementSupplier: John Graham Construction Ltd; Descri...
\n", + "
" + ], + "text/plain": [ + " Date Supplier Description \\\n", + "0 15/08/2016 Creative Video Productions Ltd Kelvin Hall \n", + "1 29/05/2017 John Graham Construction Ltd Causewayside Refurbishment \n", + "\n", + " Transaction value (£) Classification \\\n", + "0 26866 Other \n", + "1 74806 Building Improvement \n", + "\n", + " combined \n", + "0 Supplier: Creative Video Productions Ltd; Desc... \n", + "1 Supplier: John Graham Construction Ltd; Descri... " + ] + }, + "execution_count": 318, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['combined'] = \"Supplier: \" + df['Supplier'].str.strip() + \"; Description: \" + df['Description'].str.strip() + \"; Value: \" + str(df['Transaction value (£)']).strip()\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 319, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "101" + ] + }, + "execution_count": 319, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from transformers import GPT2TokenizerFast\n", + "tokenizer = GPT2TokenizerFast.from_pretrained(\"gpt2\")\n", + "\n", + "df['n_tokens'] = df.combined.apply(lambda x: len(tokenizer.encode(x)))\n", + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 320, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_path = './data/transactions_with_embeddings_100.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 321, + "metadata": {}, + "outputs": [], + "source": [ + "from openai.embeddings_utils import get_embedding\n", + "\n", + "df['babbage_similarity'] = df.combined.apply(lambda x: get_embedding(x, engine='text-similarity-babbage-001'))\n", + "df['babbage_search'] = df.combined.apply(lambda x: get_embedding(x, engine='text-search-babbage-doc-001'))\n", + "df.to_csv(embedding_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use embeddings for classification\n", + "\n", + "Now that we have our 
embeddings, let's see if classifying these into the categories we've named gives us any more success.\n", + "\n", + "For this we'll use a template from the [Classification_using_embeddings](Classification_using_embeddings.ipynb) notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 309, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0DateSupplierDescriptionTransaction value (£)Classificationcombinedn_tokensbabbage_similaritybabbage_search
0015/08/2016Creative Video Productions LtdKelvin Hall26866OtherSupplier: Creative Video Productions Ltd; Desc...136[-0.009802100248634815, 0.022551486268639565, ...[-0.00232666521333158, 0.019198870286345482, 0...
1129/05/2017John Graham Construction LtdCausewayside Refurbishment74806Building ImprovementSupplier: John Graham Construction Ltd; Descri...140[-0.009065819904208183, 0.012094118632376194, ...[0.005169447045773268, 0.00473341578617692, -0...
2229/05/2017Morris & Spottiswood LtdGeorge IV Bridge Work56448Building ImprovementSupplier: Morris & Spottiswood Ltd; Descriptio...141[-0.009000026620924473, 0.02405017428100109, -...[0.0028343256562948227, 0.021166473627090454, ...
3331/05/2017John Graham Construction LtdCausewayside Refurbishment164691Building ImprovementSupplier: John Graham Construction Ltd; Descri...140[-0.009065819904208183, 0.012094118632376194, ...[0.005169447045773268, 0.00473341578617692, -0...
4424/07/2017John Graham Construction LtdCausewayside Refurbishment27926Building ImprovementSupplier: John Graham Construction Ltd; Descri...140[-0.009065819904208183, 0.012094118632376194, ...[0.005169447045773268, 0.00473341578617692, -0...
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 Date Supplier \\\n", + "0 0 15/08/2016 Creative Video Productions Ltd \n", + "1 1 29/05/2017 John Graham Construction Ltd \n", + "2 2 29/05/2017 Morris & Spottiswood Ltd \n", + "3 3 31/05/2017 John Graham Construction Ltd \n", + "4 4 24/07/2017 John Graham Construction Ltd \n", + "\n", + " Description Transaction value (£) Classification \\\n", + "0 Kelvin Hall 26866 Other \n", + "1 Causewayside Refurbishment 74806 Building Improvement \n", + "2 George IV Bridge Work 56448 Building Improvement \n", + "3 Causewayside Refurbishment 164691 Building Improvement \n", + "4 Causewayside Refurbishment 27926 Building Improvement \n", + "\n", + " combined n_tokens \\\n", + "0 Supplier: Creative Video Productions Ltd; Desc... 136 \n", + "1 Supplier: John Graham Construction Ltd; Descri... 140 \n", + "2 Supplier: Morris & Spottiswood Ltd; Descriptio... 141 \n", + "3 Supplier: John Graham Construction Ltd; Descri... 140 \n", + "4 Supplier: John Graham Construction Ltd; Descri... 140 \n", + "\n", + " babbage_similarity \\\n", + "0 [-0.009802100248634815, 0.022551486268639565, ... \n", + "1 [-0.009065819904208183, 0.012094118632376194, ... \n", + "2 [-0.009000026620924473, 0.02405017428100109, -... \n", + "3 [-0.009065819904208183, 0.012094118632376194, ... \n", + "4 [-0.009065819904208183, 0.012094118632376194, ... \n", + "\n", + " babbage_search \n", + "0 [-0.00232666521333158, 0.019198870286345482, 0... \n", + "1 [0.005169447045773268, 0.00473341578617692, -0... \n", + "2 [0.0028343256562948227, 0.021166473627090454, ... \n", + "3 [0.005169447045773268, 0.00473341578617692, -0... \n", + "4 [0.005169447045773268, 0.00473341578617692, -0... 
" + ] + }, + "execution_count": 309, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import classification_report, accuracy_score\n", + "\n", + "fs_df = pd.read_csv(embedding_path)\n", + "fs_df[\"babbage_similarity\"] = fs_df.babbage_similarity.apply(eval).apply(np.array)\n", + "fs_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 310, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + "Building Improvement 0.92 1.00 0.96 11\n", + "Literature & Archive 1.00 1.00 1.00 3\n", + " Other 0.00 0.00 0.00 1\n", + " Software/IT 1.00 1.00 1.00 1\n", + " Utility Bills 1.00 1.00 1.00 5\n", + "\n", + " accuracy 0.95 21\n", + " macro avg 0.78 0.80 0.79 21\n", + " weighted avg 0.91 0.95 0.93 21\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + } + ], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(\n", + " list(fs_df.babbage_similarity.values), fs_df.Classification, test_size=0.2, random_state=42\n", + ")\n", + "\n", + "clf = RandomForestClassifier(n_estimators=100)\n", + "clf.fit(X_train, y_train)\n", + "preds = clf.predict(X_test)\n", + "probas = clf.predict_proba(X_test)\n", + "\n", + "report = classification_report(y_test, preds)\n", + "print(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Performance for this model is pretty strong, so creating embeddings and using even a simpler classifier looks like an effective approach as well, with the zero-shot classifier helping us do the initial classification of the unlabelled dataset.\n", + "\n", + "Lets take it one step further and see if a fine-tuned model trained on this same labelled datasets gives us comparable results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fine-tuned Transaction Classification\n", + "\n", + "For this use case we're going to try to improve on the few-shot classification from above by training a fine-tuned model on the same labelled set of 101 transactions and applying this fine-tuned model on group of unseen transactions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Building Fine-tuned Classifier\n", + "\n", + "We'll need to do some data prep first to get our data ready. 
This will take the following steps:\n", + "- First we'll list out our classes and replace them with numeric identifiers. Making the model predict a single token rather than multiple consecutive ones like 'Building Improvement' should give us better results\n", + "- We also need to add a common prefix and suffix to each example to aid the model in making predictions - in our case our text already starts with 'Supplier' and we'll add a suffix of '\\n\\n###\\n\\n'\n", + "- Lastly we'll add a leading whitespace onto each of our target classes for classification, again to aid the model" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "101" + ] + }, + "execution_count": 210, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ft_prep_df = fs_df.copy()\n", + "len(ft_prep_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0DateSupplierDescriptionTransaction value (£)Classificationcombinedn_tokensbabbage_similaritybabbage_search
0015/08/2016Creative Video Productions LtdKelvin Hall26866OtherSupplier: Creative Video Productions Ltd; Desc...12[-0.009630300104618073, 0.009887108579277992, ...[-0.008217384107410908, 0.025170527398586273, ...
1129/05/2017John Graham Construction LtdCausewayside Refurbishment74806Building ImprovementSupplier: John Graham Construction Ltd; Descri...16[-0.006144719664007425, -0.0018709596479311585...[-0.007424891460686922, 0.008475713431835175, ...
2229/05/2017Morris & Spottiswood LtdGeorge IV Bridge Work56448Building ImprovementSupplier: Morris & Spottiswood Ltd; Descriptio...17[-0.005225738976150751, 0.015156379900872707, ...[-0.007611643522977829, 0.030322374776005745, ...
3331/05/2017John Graham Construction LtdCausewayside Refurbishment164691Building ImprovementSupplier: John Graham Construction Ltd; Descri...16[-0.006144719664007425, -0.0018709596479311585...[-0.007424891460686922, 0.008475713431835175, ...
4424/07/2017John Graham Construction LtdCausewayside Refurbishment27926Building ImprovementSupplier: John Graham Construction Ltd; Descri...16[-0.006144719664007425, -0.0018709596479311585...[-0.007424891460686922, 0.008475713431835175, ...
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 Date Supplier \\\n", + "0 0 15/08/2016 Creative Video Productions Ltd \n", + "1 1 29/05/2017 John Graham Construction Ltd \n", + "2 2 29/05/2017 Morris & Spottiswood Ltd \n", + "3 3 31/05/2017 John Graham Construction Ltd \n", + "4 4 24/07/2017 John Graham Construction Ltd \n", + "\n", + " Description Transaction value (£) Classification \\\n", + "0 Kelvin Hall 26866 Other \n", + "1 Causewayside Refurbishment 74806 Building Improvement \n", + "2 George IV Bridge Work 56448 Building Improvement \n", + "3 Causewayside Refurbishment 164691 Building Improvement \n", + "4 Causewayside Refurbishment 27926 Building Improvement \n", + "\n", + " combined n_tokens \\\n", + "0 Supplier: Creative Video Productions Ltd; Desc... 12 \n", + "1 Supplier: John Graham Construction Ltd; Descri... 16 \n", + "2 Supplier: Morris & Spottiswood Ltd; Descriptio... 17 \n", + "3 Supplier: John Graham Construction Ltd; Descri... 16 \n", + "4 Supplier: John Graham Construction Ltd; Descri... 16 \n", + "\n", + " babbage_similarity \\\n", + "0 [-0.009630300104618073, 0.009887108579277992, ... \n", + "1 [-0.006144719664007425, -0.0018709596479311585... \n", + "2 [-0.005225738976150751, 0.015156379900872707, ... \n", + "3 [-0.006144719664007425, -0.0018709596479311585... \n", + "4 [-0.006144719664007425, -0.0018709596479311585... \n", + "\n", + " babbage_search \n", + "0 [-0.008217384107410908, 0.025170527398586273, ... \n", + "1 [-0.007424891460686922, 0.008475713431835175, ... \n", + "2 [-0.007611643522977829, 0.030322374776005745, ... \n", + "3 [-0.007424891460686922, 0.008475713431835175, ... \n", + "4 [-0.007424891460686922, 0.008475713431835175, ... 
" + ] + }, + "execution_count": 211, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ft_prep_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 212, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "( class_id class\n", + " 0 0 Literature & Archive\n", + " 1 1 Utility Bills\n", + " 2 2 Building Improvement\n", + " 3 3 Software/IT\n", + " 4 4 Other,\n", + " 5)" + ] + }, + "execution_count": 212, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classes = list(set(ft_prep_df['Classification']))\n", + "class_df = pd.DataFrame(classes).reset_index()\n", + "class_df.columns = ['class_id','class']\n", + "class_df , len(class_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 215, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0DateSupplierDescriptionTransaction value (£)Classificationcombinedn_tokensbabbage_similaritybabbage_searchclass_idprompt
0015/08/2016Creative Video Productions LtdKelvin Hall26866OtherSupplier: Creative Video Productions Ltd; Desc...12[-0.009630300104618073, 0.009887108579277992, ...[-0.008217384107410908, 0.025170527398586273, ...4Supplier: Creative Video Productions Ltd; Desc...
15131/03/2017NLS FoundationGrant Payment177500OtherSupplier: NLS Foundation; Description: Grant P...11[-0.022305507212877274, 0.008543581701815128, ...[-0.020519884303212166, 0.01993306167423725, -...4Supplier: NLS Foundation; Description: Grant P...
27026/06/2017British LibraryLegal Deposit Services50056OtherSupplier: British Library; Description: Legal ...11[-0.01019938476383686, 0.015277703292667866, -...[-0.01843327097594738, 0.03343546763062477, -0...4Supplier: British Library; Description: Legal ...
37124/07/2017ALDLLegal Deposit Services27067OtherSupplier: ALDL; Description: Legal Deposit Ser...11[-0.008471488021314144, 0.004098685923963785, ...[-0.012966590002179146, 0.01299362163990736, 0...4Supplier: ALDL; Description: Legal Deposit Ser...
410024/07/2017AM PhillipVehicle Purchase26604OtherSupplier: AM Phillip; Description: Vehicle Pur...10[-0.003459023078903556, 0.004626389592885971, ...[-0.0010945454705506563, 0.008626140654087067,...4Supplier: AM Phillip; Description: Vehicle Pur...
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 Date Supplier \\\n", + "0 0 15/08/2016 Creative Video Productions Ltd \n", + "1 51 31/03/2017 NLS Foundation \n", + "2 70 26/06/2017 British Library \n", + "3 71 24/07/2017 ALDL \n", + "4 100 24/07/2017 AM Phillip \n", + "\n", + " Description Transaction value (£) Classification \\\n", + "0 Kelvin Hall 26866 Other \n", + "1 Grant Payment 177500 Other \n", + "2 Legal Deposit Services 50056 Other \n", + "3 Legal Deposit Services 27067 Other \n", + "4 Vehicle Purchase 26604 Other \n", + "\n", + " combined n_tokens \\\n", + "0 Supplier: Creative Video Productions Ltd; Desc... 12 \n", + "1 Supplier: NLS Foundation; Description: Grant P... 11 \n", + "2 Supplier: British Library; Description: Legal ... 11 \n", + "3 Supplier: ALDL; Description: Legal Deposit Ser... 11 \n", + "4 Supplier: AM Phillip; Description: Vehicle Pur... 10 \n", + "\n", + " babbage_similarity \\\n", + "0 [-0.009630300104618073, 0.009887108579277992, ... \n", + "1 [-0.022305507212877274, 0.008543581701815128, ... \n", + "2 [-0.01019938476383686, 0.015277703292667866, -... \n", + "3 [-0.008471488021314144, 0.004098685923963785, ... \n", + "4 [-0.003459023078903556, 0.004626389592885971, ... \n", + "\n", + " babbage_search class_id \\\n", + "0 [-0.008217384107410908, 0.025170527398586273, ... 4 \n", + "1 [-0.020519884303212166, 0.01993306167423725, -... 4 \n", + "2 [-0.01843327097594738, 0.03343546763062477, -0... 4 \n", + "3 [-0.012966590002179146, 0.01299362163990736, 0... 4 \n", + "4 [-0.0010945454705506563, 0.008626140654087067,... 4 \n", + "\n", + " prompt \n", + "0 Supplier: Creative Video Productions Ltd; Desc... \n", + "1 Supplier: NLS Foundation; Description: Grant P... \n", + "2 Supplier: British Library; Description: Legal ... \n", + "3 Supplier: ALDL; Description: Legal Deposit Ser... \n", + "4 Supplier: AM Phillip; Description: Vehicle Pur... 
" + ] + }, + "execution_count": 215, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ft_df_with_class = ft_prep_df.merge(class_df,left_on='Classification',right_on='class',how='inner')\n", + "\n", + "# Adding a leading whitespace onto each completion to help the model\n", + "ft_df_with_class['class_id'] = ft_df_with_class.apply(lambda x: ' ' + str(x['class_id']),axis=1)\n", + "ft_df_with_class = ft_df_with_class.drop('class', axis=1)\n", + "\n", + "# Adding a common separator onto the end of each prompt so the model knows when a prompt is terminating\n", + "ft_df_with_class['prompt'] = ft_df_with_class.apply(lambda x: x['combined'] + '\\n\\n###\\n\\n',axis=1)\n", + "ft_df_with_class.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 236, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
promptcompletion
ordering
0Supplier: Sothebys; Description: Literary & Ar...0
1Supplier: Sotheby'S; Description: Literary & A...0
2Supplier: City Of Edinburgh Council; Descripti...1
2Supplier: John Graham Construction Ltd; Descri...2
3Supplier: John Graham Construction Ltd; Descri...2
\n", + "
" + ], + "text/plain": [ + " prompt completion\n", + "ordering \n", + "0 Supplier: Sothebys; Description: Literary & Ar... 0\n", + "1 Supplier: Sotheby'S; Description: Literary & A... 0\n", + "2 Supplier: City Of Edinburgh Council; Descripti... 1\n", + "2 Supplier: John Graham Construction Ltd; Descri... 2\n", + "3 Supplier: John Graham Construction Ltd; Descri... 2" + ] + }, + "execution_count": 236, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This step is unnecessary if you have a number of observations in each class\n", + "# In our case we don't, so we shuffle the data to give us a better chance of getting equal classes in our train and validation sets\n", + "# Our fine-tuned model will error if we have less classes in the validation set, so this is a necessary step\n", + "\n", + "import random \n", + "\n", + "labels = [x for x in ft_df_with_class['class_id']]\n", + "text = [x for x in ft_df_with_class['prompt']]\n", + "ft_df = pd.DataFrame(zip(text, labels), columns = ['prompt','class_id']) #[:300]\n", + "ft_df.columns = ['prompt','completion']\n", + "ft_df['ordering'] = ft_df.apply(lambda x: random.randint(0,len(ft_df)), axis = 1)\n", + "ft_df.set_index('ordering',inplace=True)\n", + "ft_df_sorted = ft_df.sort_index(ascending=True)\n", + "ft_df_sorted.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This step is to remove any existing files if we've already produced training/validation sets for this classifier\n", + "#!rm transactions_grouped*\n", + "\n", + "# We output our shuffled dataframe to a .jsonl file and run the prepare_data function to get us our input files\n", + "ft_df_sorted.to_json(\"transactions_grouped.jsonl\", orient='records', lines=True)\n", + "!openai tools fine_tunes.prepare_data -f transactions_grouped.jsonl -q" + ] + }, + { + "cell_type": "code", + "execution_count": 322, + "metadata": {}, + "outputs": [ + { + "name": "stdout", 
+ "output_type": "stream", + "text": [ + "31\n", + "8\n", + "All good\n" + ] + } + ], + "source": [ + "# This functions checks that your classes all appear in both prepared files\n", + "# If they don't, the fine-tuned model creation will fail\n", + "check_classes('transactions_grouped_prepared_train.jsonl','transactions_grouped_prepared_valid.jsonl')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This step creates your model\n", + "!openai api fine_tunes.create -t \"transactions_grouped_prepared_train.jsonl\" -v \"transactions_grouped_prepared_valid.jsonl\" --compute_classification_metrics --classification_n_classes 5 -m curie" + ] + }, + { + "cell_type": "code", + "execution_count": 323, + "metadata": {}, + "outputs": [], + "source": [ + "# Congrats, you've got a fine-tuned model!\n", + "# Copy/paste the name provided into the variable below and we'll take it for a spin\n", + "fine_tuned_model = 'curie:ft-personal-2022-10-20-10-42-56'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Applying Fine-tuned Classifier\n", + "\n", + "Now we'll apply our classifier to see how it performs. We only had 31 unique observations in our training set and 8 in our validation set, so lets see how the performance is" + ] + }, + { + "cell_type": "code", + "execution_count": 324, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
promptcompletion
0Supplier: Wavetek Ltd; Description: Kelvin Hal...2
1Supplier: ECG Facilities Service; Description:...1
2Supplier: M & J Ballantyne Ltd; Description: G...2
3Supplier: Private Sale; Description: Literary ...0
4Supplier: Ex Libris; Description: IT equipment...3
\n", + "
" + ], + "text/plain": [ + " prompt completion\n", + "0 Supplier: Wavetek Ltd; Description: Kelvin Hal... 2\n", + "1 Supplier: ECG Facilities Service; Description:... 1\n", + "2 Supplier: M & J Ballantyne Ltd; Description: G... 2\n", + "3 Supplier: Private Sale; Description: Literary ... 0\n", + "4 Supplier: Ex Libris; Description: IT equipment... 3" + ] + }, + "execution_count": 324, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_set = pd.read_json('transactions_grouped_prepared_valid.jsonl', lines=True)\n", + "test_set.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 325, + "metadata": {}, + "outputs": [], + "source": [ + "test_set['predicted_class'] = test_set.apply(lambda x: openai.Completion.create(model=fine_tuned_model, prompt=x['prompt'], max_tokens=1, temperature=0, logprobs=5),axis=1)\n", + "test_set['pred'] = test_set.apply(lambda x : x['predicted_class']['choices'][0]['text'],axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 326, + "metadata": {}, + "outputs": [], + "source": [ + "test_set['result'] = test_set.apply(lambda x: str(x['pred']).strip() == str(x['completion']).strip(), axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 327, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True 4\n", + "False 4\n", + "Name: result, dtype: int64" + ] + }, + "execution_count": 327, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_set['result'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Performance is not great - unfortunately this is expected. With only a few examples of each class, the above approach with embeddings and a traditional classifier worked better.\n", + "\n", + "A fine-tuned model works best with a great number of labelled observations. 
If we had a few hundred or thousand we may get better results, but let's do one last test on a holdout set to confirm that it doesn't generalise well to a new set of observations" + ] + }, + { + "cell_type": "code", + "execution_count": 330, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateSupplierDescriptionTransaction value (£)
10123/10/2017City Building LLPCausewayside Refurbishment53147.0
10230/10/2017ECG Facilities ServiceFacilities Management Charge35758.0
10330/10/2017ECG Facilities ServiceFacilities Management Charge35758.0
10406/11/2017John Graham Construction LtdCausewayside Refurbishment134208.0
10506/11/2017ALDLLegal Deposit Services27067.0
\n", + "
" + ], + "text/plain": [ + " Date Supplier Description \\\n", + "101 23/10/2017 City Building LLP Causewayside Refurbishment \n", + "102 30/10/2017 ECG Facilities Service Facilities Management Charge \n", + "103 30/10/2017 ECG Facilities Service Facilities Management Charge \n", + "104 06/11/2017 John Graham Construction Ltd Causewayside Refurbishment \n", + "105 06/11/2017 ALDL Legal Deposit Services \n", + "\n", + " Transaction value (£) \n", + "101 53147.0 \n", + "102 35758.0 \n", + "103 35758.0 \n", + "104 134208.0 \n", + "105 27067.0 " + ] + }, + "execution_count": 330, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "holdout_df = transactions.copy().iloc[101:]\n", + "holdout_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 332, + "metadata": {}, + "outputs": [], + "source": [ + "holdout_df['combined'] = \"Supplier: \" + holdout_df['Supplier'].str.strip() + \"; Description: \" + holdout_df['Description'].str.strip() + '\\n\\n###\\n\\n' # + \"; Value: \" + str(df['Transaction value (£)']).strip()\n", + "holdout_df['prediction_result'] = holdout_df.apply(lambda x: openai.Completion.create(model=fine_tuned_model, prompt=x['combined'], max_tokens=1, temperature=0, logprobs=5),axis=1)\n", + "holdout_df['pred'] = holdout_df.apply(lambda x : x['prediction_result']['choices'][0]['text'],axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 333, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateSupplierDescriptionTransaction value (£)combinedprediction_resultpred
10123/10/2017City Building LLPCausewayside Refurbishment53147.0Supplier: City Building LLP; Description: Caus...{'id': 'cmpl-63YDadbYLo8xKsGY2vReOFCMgTOvG', '...2
10230/10/2017ECG Facilities ServiceFacilities Management Charge35758.0Supplier: ECG Facilities Service; Description:...{'id': 'cmpl-63YDbNK1D7UikDc3xi5ATihg5kQEt', '...2
10330/10/2017ECG Facilities ServiceFacilities Management Charge35758.0Supplier: ECG Facilities Service; Description:...{'id': 'cmpl-63YDbwfiHjkjMWsfTKNt6naeqPzOe', '...2
10406/11/2017John Graham Construction LtdCausewayside Refurbishment134208.0Supplier: John Graham Construction Ltd; Descri...{'id': 'cmpl-63YDbWAndtsRqPTi2ZHZtPodZvOwr', '...2
10506/11/2017ALDLLegal Deposit Services27067.0Supplier: ALDL; Description: Legal Deposit Ser...{'id': 'cmpl-63YDbDu7WM3svYWsRAMdDUKtSFDBu', '...2
10627/11/2017Maggs Bros LtdLiterary & Archival Items26500.0Supplier: Maggs Bros Ltd; Description: Literar...{'id': 'cmpl-63YDbxNNI8ZH5CJJNxQ0IF9Zf925C', '...0
10730/11/2017Glasgow City CouncilKelvin Hall42345.0Supplier: Glasgow City Council; Description: K...{'id': 'cmpl-63YDb8R1FWu4bjwM2xE775rouwneV', '...2
10811/12/2017ECG Facilities ServiceFacilities Management Charge35758.0Supplier: ECG Facilities Service; Description:...{'id': 'cmpl-63YDcAPsp37WhbPs9kwfUX0kBk7Hv', '...2
10911/12/2017John Graham Construction LtdCausewayside Refurbishment159275.0Supplier: John Graham Construction Ltd; Descri...{'id': 'cmpl-63YDcML2welrC3wF0nuKgcNmVu1oQ', '...2
11008/01/2018ECG Facilities ServiceFacilities Management Charge35758.0Supplier: ECG Facilities Service; Description:...{'id': 'cmpl-63YDc95SSdOHnIliFB2cjMEEm7Z2u', '...2
\n", + "
" + ], + "text/plain": [ + " Date Supplier Description \\\n", + "101 23/10/2017 City Building LLP Causewayside Refurbishment \n", + "102 30/10/2017 ECG Facilities Service Facilities Management Charge \n", + "103 30/10/2017 ECG Facilities Service Facilities Management Charge \n", + "104 06/11/2017 John Graham Construction Ltd Causewayside Refurbishment \n", + "105 06/11/2017 ALDL Legal Deposit Services \n", + "106 27/11/2017 Maggs Bros Ltd Literary & Archival Items \n", + "107 30/11/2017 Glasgow City Council Kelvin Hall \n", + "108 11/12/2017 ECG Facilities Service Facilities Management Charge \n", + "109 11/12/2017 John Graham Construction Ltd Causewayside Refurbishment \n", + "110 08/01/2018 ECG Facilities Service Facilities Management Charge \n", + "\n", + " Transaction value (£) combined \\\n", + "101 53147.0 Supplier: City Building LLP; Description: Caus... \n", + "102 35758.0 Supplier: ECG Facilities Service; Description:... \n", + "103 35758.0 Supplier: ECG Facilities Service; Description:... \n", + "104 134208.0 Supplier: John Graham Construction Ltd; Descri... \n", + "105 27067.0 Supplier: ALDL; Description: Legal Deposit Ser... \n", + "106 26500.0 Supplier: Maggs Bros Ltd; Description: Literar... \n", + "107 42345.0 Supplier: Glasgow City Council; Description: K... \n", + "108 35758.0 Supplier: ECG Facilities Service; Description:... \n", + "109 159275.0 Supplier: John Graham Construction Ltd; Descri... \n", + "110 35758.0 Supplier: ECG Facilities Service; Description:... \n", + "\n", + " prediction_result pred \n", + "101 {'id': 'cmpl-63YDadbYLo8xKsGY2vReOFCMgTOvG', '... 2 \n", + "102 {'id': 'cmpl-63YDbNK1D7UikDc3xi5ATihg5kQEt', '... 2 \n", + "103 {'id': 'cmpl-63YDbwfiHjkjMWsfTKNt6naeqPzOe', '... 2 \n", + "104 {'id': 'cmpl-63YDbWAndtsRqPTi2ZHZtPodZvOwr', '... 2 \n", + "105 {'id': 'cmpl-63YDbDu7WM3svYWsRAMdDUKtSFDBu', '... 2 \n", + "106 {'id': 'cmpl-63YDbxNNI8ZH5CJJNxQ0IF9Zf925C', '... 0 \n", + "107 {'id': 'cmpl-63YDb8R1FWu4bjwM2xE775rouwneV', '... 
2 \n", + "108 {'id': 'cmpl-63YDcAPsp37WhbPs9kwfUX0kBk7Hv', '... 2 \n", + "109 {'id': 'cmpl-63YDcML2welrC3wF0nuKgcNmVu1oQ', '... 2 \n", + "110 {'id': 'cmpl-63YDc95SSdOHnIliFB2cjMEEm7Z2u', '... 2 " + ] + }, + "execution_count": 333, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "holdout_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 334, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " 2 231\n", + " 0 27\n", + "Name: pred, dtype: int64" + ] + }, + "execution_count": 334, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "holdout_df['pred'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Well those results were similarly underwhelming - so we've learned that with a dataset with a small number of labelled observations, either zero-shot classification or traditional classification with embeddings return better results than a fine-tuned model.\n", + "\n", + "A fine-tuned model is still a great tool, but is more effective when you have a larger number of labelled examples for each class that you're looking to classify" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/data/25000_spend_dataset_current.csv b/examples/data/25000_spend_dataset_current.csv new file mode 100644 index 0000000..36e2c17 --- /dev/null +++ b/examples/data/25000_spend_dataset_current.csv @@ -0,0 +1,362 @@ +Date,Supplier,Description,Transaction value (£) +21/04/2016,M & J Ballantyne Ltd,George IV Bridge Work,35098 +26/04/2016,Private Sale,Literary & Archival 
Items,30000 +30/04/2016,City Of Edinburgh Council,Non Domestic Rates ,40800 +09/05/2016,Computacenter Uk,Kelvin Hall,72835 +09/05/2016,John Graham Construction Ltd,Causewayside Refurbishment,64361 +09/05/2016,A McGillivray,Causewayside Refurbishment,53690 +16/05/2016,John Graham Construction Ltd,Causewayside Refurbishment,365344 +23/05/2016,Computacenter Uk,Kelvin Hall,26506 +23/05/2016,ECG Facilities Service,Facilities Management Charge,32777 +23/05/2016,ECG Facilities Service,Facilities Management Charge,32777 +30/05/2016,ALDL,ALDL Charges,32317 +10/06/2016,Wavetek Ltd,Kelvin Hall,87589 +10/06/2016,John Graham Construction Ltd,Causewayside Refurbishment,381803 +28/06/2016,ECG Facilities Service,Facilities Management Charge,32832 +30/06/2016,Glasgow City Council,Kelvin Hall,1700000 +11/07/2016,Wavetek Ltd,Kelvin Hall,65692 +11/07/2016,John Graham Construction Ltd,Causewayside Refurbishment,139845 +15/07/2016,Sotheby'S,Literary & Archival Items,28500 +18/07/2016,Christies,Literary & Archival Items,33800 +25/07/2016,A McGillivray,Causewayside Refurbishment,30113 +31/07/2016,ALDL,ALDL Charges,32317 +08/08/2016,ECG Facilities Service,Facilities Management Charge,32795 +15/08/2016,Creative Video Productions Ltd,Kelvin Hall,26866 +15/08/2016,John Graham Construction Ltd,Causewayside Refurbishment,196807 +24/08/2016,ECG Facilities Service,Facilities Management Charge,32795 +05/09/2016,John Graham Construction Ltd,Causewayside Refurbishment,36359 +12/09/2016,Flexiform,Kelvin Hall,42623 +12/09/2016,City Of Edinburgh Council,Non Domestic Rates ,144330 +12/09/2016,City Of Edinburgh Council,Non Domestic Rates ,49827 +12/09/2016,John Graham Construction Ltd,Causewayside Refurbishment,228689 +19/09/2016,Jisc Services Ltd Subscription Account,Literary & Archival Items,42629 +26/09/2016,Senator International,Kelvin Hall,35706 +26/09/2016,ECG Facilities Service,Facilities Management Charge,32795 +26/09/2016,John Graham Construction Ltd,Causewayside Refurbishment,28378 
+30/09/2016,A McGillivray,Causewayside Refurbishment,44392 +10/10/2016,Cengage Learning (Emea )Ltd,Literary & Archival Items,86604 +10/10/2016,John Graham Construction Ltd,Causewayside Refurbishment,303999 +24/10/2016,ECG Facilities Service,Facilities Management Charge,32795 +24/10/2016,ALDL,ALDL Charges,32317 +31/10/2016,John Graham Construction Ltd,Causewayside Refurbishment,74245 +07/11/2016,CBRE,Kelvin Hall,83736 +14/11/2016,University Of Glasgow,Kelvin Hall,188682 +14/11/2016,John Graham Construction Ltd,Causewayside Refurbishment,362326 +08/12/2016,Sothebys,Literary & Archival Items,166000 +08/12/2016,Private Sale,Literary & Archival Items,87500 +08/12/2016,ECG Facilities Service,Facilities Management Charge,32795 +12/12/2016,John Graham Construction Ltd,Causewayside Refurbishment,385310 +30/12/2016,ECG Facilities Service,Facilities Management Charge,32795 +30/12/2016,John Graham Construction Ltd,Causewayside Refurbishment,253618 +30/12/2016,John Graham Construction Ltd,Causewayside Refurbishment,45127 +23/01/2017,ALDL,ALDL Charges,27730 +07/02/2017,ECG Facilities Service,Facilities Management Charge,32795 +07/02/2017,John Graham Construction Ltd,Causewayside Refurbishment,52404 +13/02/2017,John Graham Construction Ltd,Causewayside Refurbishment,272390 +27/02/2017,Cengage Learning (Emea )Ltd,Literary & Archival Items,43302 +27/02/2017,ECG Facilities Service,Facilities Management Charge,32795 +06/03/2017,Private Sale,Literary & Archival Items,72500 +06/03/2017,John Graham Construction Ltd,Causewayside Refurbishment,31781 +06/03/2017,John Graham Construction Ltd,Causewayside Refurbishment,198048 +27/03/2017,ECG Facilities Service,Facilities Management Charge,32795 +31/03/2017,NLS Foundation,Grant Payment,177500 +31/03/2017,Private Sale,Literary & Archival Items,3422500 +31/03/2017,Nicholson Bros(Electrical Contractors) Ltd,Causewayside Refurbishment,33666 +31/03/2017,John Graham Construction Ltd,Causewayside Refurbishment,222090 +31/03/2017,John Graham 
Construction Ltd,Causewayside Refurbishment,63971 +31/03/2017,XMA Scotland Ltd,IT equipment,33450 +31/03/2017,XMA Scotland Ltd,IT equipment,84524 +24/04/2017,Cengage Learning (Emea )Ltd,Literary & Archival Items,43302 +24/04/2017,Scottish Historic Buildings Trust,Lawnmarket Work,50057 +24/04/2017,Insight Direct (UK) Ltd,IT equipment,56768 +30/04/2017,Morris & Spottiswood Ltd,George IV Bridge Work,63716 +08/05/2017,Anglian Water Business,Water,26832 +15/05/2017,John Graham Construction Ltd,Causewayside Refurbishment,245381 +22/05/2017,ECG Facilities Service,Facilities Management Charge,33386 +22/05/2017,ALDL,Legal Deposit Services,27067 +29/05/2017,ECG Facilities Service,Facilities Management Charge,33386 +29/05/2017,John Graham Construction Ltd,Causewayside Refurbishment,74806 +29/05/2017,Morris & Spottiswood Ltd,George IV Bridge Work,56448 +31/05/2017,John Graham Construction Ltd,Causewayside Refurbishment,164691 +26/06/2017,ECG Facilities Service,Facilities Management Charge,33386 +26/06/2017,British Library,Legal Deposit Services,50056 +24/07/2017,John Graham Construction Ltd,Causewayside Refurbishment,27926 +24/07/2017,John Graham Construction Ltd,Causewayside Refurbishment,212690 +24/07/2017,ALDL,Legal Deposit Services,27067 +24/07/2017,AM Phillip,Vehicle Purchase,26604 +16/08/2017,ECG Facilities Service,Facilities Management Charge,33386 +16/08/2017,John Graham Construction Ltd,Causewayside Refurbishment,59021 +16/08/2017,John Graham Construction Ltd,Causewayside Refurbishment,136379 +16/08/2017,Ex Libris,IT equipment,76610 +23/08/2017,Culture And Sport Glasgow,Kelvin Hall,60503 +23/08/2017,XMA Scotland Ltd,Kelvin Hall,31830 +23/08/2017,ECG Facilities Service,Facilities Management Charge,33386 +31/08/2017,John Graham Construction Ltd,Causewayside Refurbishment,36313 +31/08/2017,Insight Direct (UK) Ltd,Causewayside Refurbishment,68222 +31/08/2017,Mark Finn Laboratory,George IV Bridge Work,53884 +11/09/2017,John Graham Construction Ltd,Causewayside 
Refurbishment,189483 +15/09/2017,City Of Edinburgh Council,Non Domestic Rates ,57662 +15/09/2017,City Of Edinburgh Council,Non Domestic Rates ,142680 +09/10/2017,Frost And Sullivan Ltd,Literary & Archival Items,28125 +09/10/2017,JISC Services Ltd ,Literary & Archival Items,43481 +23/10/2017,John Graham Construction Ltd,Causewayside Refurbishment,151659 +23/10/2017,City Building LLP,Causewayside Refurbishment,53147 +30/10/2017,ECG Facilities Service,Facilities Management Charge,35758 +30/10/2017,ECG Facilities Service,Facilities Management Charge,35758 +06/11/2017,John Graham Construction Ltd,Causewayside Refurbishment,134208 +06/11/2017,ALDL,Legal Deposit Services,27067 +27/11/2017,Maggs Bros Ltd,Literary & Archival Items,26500 +30/11/2017,Glasgow City Council,Kelvin Hall,42345 +11/12/2017,ECG Facilities Service,Facilities Management Charge,35758 +11/12/2017,John Graham Construction Ltd,Causewayside Refurbishment,159275 +08/01/2018,ECG Facilities Service,Facilities Management Charge,35758 +15/01/2018,Proquest Information And Learn,Literary & Archival Items,42199 +15/01/2018,John Graham Construction Ltd,Causewayside Refurbishment,123244 +29/01/2018,ECG Facilities Service,Facilities Management Charge,35758 +05/02/2018,John Graham Construction Ltd,Causewayside Refurbishment,102659 +27/02/2018,ALDL,Legal Deposit Services,27067 +07/03/2018,John Graham Construction Ltd,Causewayside Refurbishment,89559 +14/03/2018,Bernard Quaritch Ltd,Literary & Archival Items,372500 +14/03/2018,ECG Facilities Service,Facilities Management Charge,35758 +21/03/2018,Site Sealants Ltd,Causewayside Refurbishment,27747 +30/03/2018,Private Sale,Literary & Archival Items,100000 +30/03/2018,ECG Facilities Service,Facilities Management Charge,35758 +30/04/2018,ECG FACILITIES SERVICE,Causewayside IT Work,25634.7 +30/04/2018,ECG FACILITIES SERVICE,Facilities Management Charge,35757.91 +14/05/2018,GLASGOW CITY COUNCIL,Kelvin Hall,90946 +11/06/2018,ALDL,ALDL Charges,27067 +11/06/2018,JOHN GRAHAM 
CONSTRUCTION LTD,Causewayisde Refurbishment,127753.31 +22/06/2018,BONHAMS - LONDON,Literary & Archival Items,25025 +22/06/2018,ECG FACILITIES SERVICE,Facilities Management Charge,35757.91 +22/06/2018,EX LIBRIS,IT equipment,39000 +30/06/2018,ECG FACILITIES SERVICE,Facilities Management Charge,35757.91 +16/07/2018,EX LIBRIS,IT equipment,80057.83 +18/07/2018,ECG FACILITIES SERVICE,Facilities Management Charge,35757.91 +18/07/2018,Sotheby's,Literary & Archival Items,41600 +31/08/2018,AUTOMATED DOCUMENT SERVICES,IT equipment,84480 +31/08/2018,XMA SCOTLAND LTD,IT equipment,313000 +13/09/2018,ECG FACILITIES SERVICE,Facilities Management Charge,35757.91 +13/09/2018,CITY OF EDINBURGH COUNCIL,Non Domestic Rates,59303.2 +13/09/2018,CITY OF EDINBURGH COUNCIL,Non Domestic Rates,146740 +20/09/2018,FROST AND SULLIVAN LTD,Literary & Archival Items,28125 +20/09/2018,SJS Property Services,George IV Bridge Work,44684.2 +20/09/2018,CENGAGE LEARNING (EMEA )LTD,Literary & Archival Items,64791 +30/09/2018,ECG FACILITIES SERVICE,Facilities Management Charge,35757.91 +30/09/2018,SJS Property Services,George IV Bridge Work,51635.35 +24/10/2018,XMA SCOTLAND LTD,IT equipment,35313.48 +24/10/2018,ECG FACILITIES SERVICE,Facilities Management Charge,35757.91 +21/11/2018,EX LIBRIS,IT equipment,39000 +21/11/2018,EX LIBRIS,IT equipment,53327.09 +26/11/2018,ECG FACILITIES SERVICE,Facilities Management Charge,35757.91 +26/11/2018,SJS Property Services,George IV Bridge Work,66818.25 +11/12/2018,CALEDONIAN LIFT SERVICES LTD,Causewayside Work,47944.8 +31/12/2018,SOFTCAT,IT equipment,37064.3 +14/01/2019,m-hance,IT Work,33164.4 +14/01/2019,ECG FACILITIES SERVICE,Facilities Management Charge,35757.91 +24/01/2019,ARTHUR MCKAY BUILDING SERVICES,Causewayside Work,100235.17 +31/01/2019,ECG FACILITIES SERVICE,Causewayside Work,32517.45 +31/01/2019,ECG FACILITIES SERVICE,Facilities Management Charge,35757.91 +31/01/2019,CENGAGE LEARNING (EMEA )LTD,Literary & Archival Items,66443 +14/02/2019,Private Sale,Literary 
& Archival Items,50000 +27/02/2019,ECG FACILITIES SERVICE,Facilities Management Charge,35757.91 +31/03/2019,ECG FACILITIES SERVICE,Facilities Management Charge,35757.91 +31/03/2019,ECG FACILITIES SERVICE,George IV Bridge Work,37320.15 +31/03/2019,HP INC UK LTD,IT equipment,40746 +31/03/2019,INSIGHT DIRECT (UK) LTD,IT equipment,56223.35 +23/04/2019,EX LIBRIS,"IT equipment +",129584.58 +30/04/2019,ECG FACILITIES SERVICE,Facilities Management Charge,36907.14 +30/04/2019,COMPUTACENTER UK,"IT equipment +",139571.14 +13/05/2019,GLASGOW LIFE,Kelvin Hall Service Charge,120335 +04/06/2019,ECG FACILITIES SERVICE,Facilities Management Charge,36907.14 +24/06/2019,Private Sale,Literary & Archival Items,34400 +25/06/2019,ECG FACILITIES SERVICE,Facilities Management Charge,36907.14 +31/07/2019,ECG FACILITIES SERVICE,Facilities Management Charge,36907.14 +26/08/2019,MICROBOX GmbH,Digital equipment,65881.58 +27/08/2019,ECG FACILITIES SERVICE,Facilities Management Charge,36907.14 +27/08/2019,FROST AND SULLIVAN LTD,Literary & Archival Items,28687.5 +18/09/2019,CITY OF EDINBURGH COUNCIL,Annual Property Rates 2019/20 for three buildings,221467.2 +25/09/2019,LOTHIAN HEATING SERVICES LTD,Payment 1 - GB Boiler replacement ,57114.18 +25/09/2019,ECG FACILITIES SERVICE,Facilities Management Charge,34021.61 +25/09/2019,EDF Energy,Electricity,33122.06 +18/09/2019,INSTITUTE OF CONSERVATION,Bursary Recruitment and Professional Services costs for intern,26805.2 +10/10/2019,ECG FACILITIES SERVICE,"CB Bolier Replacement (1),USP Batteries,Gutter Works & Cleaning of pigeon fouling",112794 +23/10/2019,ECG FACILITIES SERVICE,"CB Bolier Replacement (2),Facilities Management Charge October 19, intumescent strips & unblocking toilets",103462.39 +23/10/2019,Private Sale,Purchase of Manuscripts,45000 +04/10/2019,ECG FACILITIES SERVICE,Facilities Management Charge September 19,44288.57 +10/10/2019,GLASGOW LIFE,Service Charges Kelvin Hall,39100.16 +15/10/2019,EDF ENERGY,Electricity,26805.74 +04/10/2019,JISC 
SERVICES LTD SUBSCRIPTION ACCOUNT,Annual Subscription,25731 +23/10/2019,ALDL,Oct19-Dec19 charge from Agency for Legal Deposit Libraries,25155.6 +27/11/2019,ECG FACILITIES SERVICE,"Paymnet for 31 invoices including Facilities Managemenr Charge Nov 19, Lift Repairs, replacement refrigerant gas detection system & data cabling and install of WIFI devices",104526.09 +05/11/2019,LOTHIAN HEATING SERVICES LTD,GB Bolier Replacement - application 2,45728.9 +27/11/2019,GLASGOW LIFE,Service Charges Kelvin Hall 01/07/19-30/09/19,41541.47 +19/11/2019,EDF ENERGY,Electricity Oct 2019 3 buildings,26660.9 +10/12/2019,PRIVATE SALE,Collection of papers of an individual,125000 +06/12/2019,PROQUEST,Purchase of 9 subscriptions 01/11/19-31/10/20,61638 +18/12/2019,ECG,"Payment of 19 separate invoice including for service of chiller, re-route return pipes, data cabling and install of WifI devices, sprinkler work",44556.15 +22/01/2020,ECG,"Payment of 28 separate invoices including for supply and fit aluminium screen, upgrade boilerhouse electrical panels,CCTV components, pump casting & lift repairs",89297.94 +09/01/2020,ECG,Payment of 18 separate invoices including for December facilities services and boiler replacement CB,78585.73 +14/01/2020,LM Information Delivery UK LTD,Payment of 18 separate invoice for Online/Print subscriptions Jan 20-Dec 20,27822.54 +14/01/2020,EDF,Electricity,25172.34 +14/01/2020,ALDL,Jan20-Mar 20 charge from Agency for Legal Deposit Libraries,25155.6 +06/02/2020,XMA Scotland,Scality Ring Maintenance,68464.62 +06/02/2020,Trustmarque,Miscrosoft Software Licenses,38069.66 +11/02/2020,Studio MB,Concept Design Semi-Permanent Exhibtion,27000 +11/02/2020,EDF,Electricity,25484.03 +06/03/2020,British Library,Governance and Management Costs,27766.6 +10/03/2020,Proquest,Subscriptions,50309.81 +10/03/2020,ECG,Two months maintance contracts,80041.02 +17/03/2020,BSI,Subscription,30951.6 +17/03/2020,Glasgow Life,Kelvin Hall Service Charges,55857.04 +17/03/2020,Private 
Collection,Collection of literary papers,60000 +20/03/2020,EDF,Electricity,25829.65 +20/03/2020,ECG,This payment covers 16 invoices including upgrade to boiler control panel & remedial works following 5 year test,32025.98 +06/04/2020,Gardiner and Theobald,GB Feasibility Study,49508 +06/04/2020,ECG,This payment covers 8 invocies including monthly facilities management fees & site inspection fees,51822.68 +23/04/2020,OCLC UK,Cataloging and Metadata subscription,26251.2 +23/04/2020,John Graham,Stonework Retention Payment,25104.56 +23/04/2020,EDF,Electricity,25025.89 +23/04/2020,Studio MB,Exhibition design,63000 +23/04/2020,ECG,"This payment covers 5 invocies including monthly facilities management fees, software and hardware maintenance & Lighting Upgrades",65200.11 +14/05/2020,GARDINER AND THEOBALD LLP,GB Feasibility Study,26291.48 +14/05/2020,HP INC UK LTD,IT equipment purchase,30640.32 +14/05/2020,XMA SCOTLAND LTD,Purchase of IT equipment and renewal of maintenance agreement. This payment covers 2 invoices,139167.6 +14/05/2020,CENGAGE LEARNING EMEA LTD,Annual hosting fee,28800 +21/05/2020,ECG FACILITIES SERVICE,CB Boiler replacement plus monthly maintenance fee. This payment covers 2 invoices,47899.83 +29/05/2020,EDF ENERGY,Electricity for April in Causewayside and George IV Bridge buildings. This payment covers 2 invoices.,30175.09 +29/05/2020,SOFTCAT,Software Licence,42866.5 +09/06/2020,Ex Libris,Annual subsriptions. 
This payment covers 2 invoices.,189036.11 +09/06/2020,Glasgow Life,Service Charges,49509.2 +09/06/2020,XMA Scotland Ltd,IT equipment,25371.84 +18/06/2020,JISC SERVICES LTD SUBSCRIPTION ACCOUNT,Annual subscription,25896 +25/06/2020,ECG FACILITIES SERVICE,Facility Management fees,49000 +25/06/2020,GARDINER AND THEOBALD LLP,GB Feasibility Study,26291.48 +25/06/2020,THE LEARNING POOL,E-Learning Resources,25344 +07/07/2020,Agency for the Legal Deposit Libraries,Agency services,26007.95 +07/07/2020,Lyon and Turnball,Various collection items,54094 +09/07/2020,XMA Scotland Ltd,Computer equipment,33327 +14/07/2020,EDF Energy,Utilities,25768.85 +23/07/2020,Computer Centre UK Ltd,Computer equipment,27750.79 +23/07/2020,ECG Facility Services,Facility Management fees,49000 +23/07/2020,GARDINER AND THEOBALD LLP,GB Feasibility Study,26291.48 +13/08/2020,EDF Energy,Utilities. This transaction is made up of 3 invoices.,26688.27 +13/08/2020,Frost & Sullivan Ltd,Annual subscription,34425 +27/08/2020,Agency for Legal Deposit Libaries,Agency services,26007.95 +27/08/2020,ECG Facilities Services,Facility Management fees,49000 +27/08/2020,Gardiner and Theobald LLP,GB Feasibility Study,26291.48 +17/09/2020,EDF Energy,This payment covers 3 invoices for utility services,34283.03 +17/09/2020,JISC Services Ltd,Subscription,26179.72 +17/09/2020,XMA Scotland Ltd,IT equipment,26533.92 +24/09/2020,ECG Facilities Services,Facility Management fees,55450.58 +24/09/2020,Glasgow Life,Service charges,25211.17 +08/10/2020,EDF Energy,This payment covers 5 invoices for utility services,27625.53 +08/10/2020,ALDL,Agency services,26007.95 +08/10/2020,Institute of Conservation,This payment covers 2 invoices for student bursary costs,31654 +08/10/2020,Studio MB,Exhibition build works,36000 +22/10/2020,ECG Facilities,This payment covers 11 invoices for facility Management fees,55672.9 +22/10/2020,Glasgow City Council,Capital works,34802.4 +19/11/2020,DTEK DIGITAL SOLUTIONS LTD,Computer equipment,39348 
+19/11/2020,ECG FACILITIES SERVICE,This payment covers multiple invoices for facility Management fees,31888.51 +19/11/2020,GLASGOW LIFE,Builidng service charges,47690.16 +26/11/2020,ECG FACILITIES SERVICE,This payment covers multiple invoices for facility Management fees,55299.92 +26/11/2020,LEE BOYD LIMITED,This payment covers 7 invoices for project management fees,26440.98 +03/12/2020,PROQUEST INFORMATION AND LEARN,This payment covers multiple invoices for collection items,50232.54 +10/12/2020,STUDIO MB,This payment covers 2 invoices for exhibition services and equipment,55902 +17/12/2020,ECG FACILITIES SERVICE,Facility Management Fees,49000 +17/12/2020,LEE BOYD LIMITED,This payment covers multiple invoices for project management fees,28922.8 +07/01/2021,ECG FACILITIES SERVICE,This payment covers multiple invoices for facility management fees,39150.26 +14/01/2021,EDF ENERGY,This payment covers multiple invoices for electricity,28711.17 +14/01/2021,ALDL,Legal deposit services,26007.95 +14/01/2021,EXCHANGE COMMUNICATIONS INSTALLATIONS LTD,Telecom services,31878 +21/01/2021,ECG FACILITIES SERVICE,This payment covers multiple invoices for facility management fees,28797.1 +28/01/2021,ECG FACILITIES SERVICE,This payment covers multiple invoices for facility management fees,54875.74 +04/02/2021,PROQUEST INFORMATION AND LEARN,One invoice for collection items,40000 +18/02/2021,ECG FACILITIES SERVICE,This payment covers multiple invoices for facility management fees,54931.68 +25/02/2021,ECG FACILITIES SERVICE,This payment covers multiple invoices for facility management fees,51283.39 +25/02/2021,HP INC UK LTD,IT Equipment,37868.04 +10/03/2021,BSI,BSOL Modular Subscription,30510 +16/03/2021,PHOENIX SOFTWARE LTD,IT Hardware plus 5 year licence,74432.04 +16/03/2021,ECG FACILITIES SERVICE,This payment covers multiple invoices for facility management fees,134758.64 +23/03/2021,ECG FACILITIES SERVICE,Maintenance Contract - March,49000 +23/03/2021,ICAM ARCHIVE SYSTEMS,Camera 
System - phase 1,39120 +25/03/2021,ECG FACILITIES SERVICE,This payment covers multiple invoices for facility management fees,108450.85 +31/03/2021,GLASGOW LIFE,Oct 20 to Dec 20 service charge - Kelvin Hall,54840.53 +31/03/2021,ECG FACILITIES SERVICE,Replacement Humidifer units,76751 +31/03/2021,ECG FACILITIES SERVICE,Cooling and Humidifer system upgrade,26943.84 +31/03/2021,ECG FACILITIES SERVICE,Installation of CCTV,29404.62 +29/04/2021,ECG FACILITIES SERVICE,This payment covers April 21 Maintenance Contract and the installation of battery rack and batteries plus smaller maintenance invoices,71604.07 +29/04/2021,GLASGOW LIFE,Jan 21 to Mar 21 service charge - Kelvin Hall,46657.33 +20/05/2021,ECG FACILITIES SERVICE,Routine inspection and maintenance of all NLS properties,52584.2 +27/05/2021,XMA SCOTLAND LTD,2 invoices one for the replacement of obsolete hardware and the other for a new laptop,28587.59 +13/05/2021,ALDL,"Claiming, receipting and onward distribution of legal deposit on behalf of NLS",26376.68 +27/05/2021,LYON AND TURNBULL,Purchase of a manuscript,26000 +27/05/2021,ARNOLD CLARK,Purchase of an electric van,25949.5 +28/06/2021,XMA Scotland Ltd,Purchase of IT hardware for cloud and maintenance of hardware,72061.92 +08/07/2021,EX LIBRIS,Subscription April to Oct 21 cloud based library services,95045.31 +08/07/2021,ECG FACILITIES SERVICE,Maintenance contract - June 21 period,52459.25 +08/07/2021,XMA SCOTLAND LTD,IT hardware equipment,37620.86 +22/07/2021,ALDL,Quarterly invoice legal deposit materials - July to Sept 21,26400.68 +12/08/2021,ECG FACILITIES SERVICE,Maintenance contract - July 21 period,52459.25 +27/08/2021,ECG FACILITIES SERVICE,Maintenance contract - August 21 period,52459.25 +27/08/2021,ECG FACILITIES SERVICE,Water penetration works - part 2,28350 +27/08/2021,ECG FACILITIES SERVICE,Water penetration works - part 3,28350 +22/09/2021,GLASGOW LIFE,Kelvin Hall Service Charge - April to June 21,35420.45 +29/09/2021,ECG FACILITIES 
SERVICE,Maintenance contract - all properties,52459.25 +29/09/2021,FROST AND SULLIVAN LTD,Annual Subscription - Sept 21 to Oct 22,35147.09 +21/10/2021,ECG FACILITIES SERVICE,Maintenance contract - October,52459.25 +31/10/2021,SOFTCAT,It purchases for server,42282.72 +14/10/2021,ALDL,"Claiming, receipting and onward distribution for quarter Oct to Dec 21",26400.68 +04/11/2021,Web of Science JISC SHEDL subs ,Subscription 2021 to 2021 SHEDL,28361.78 +11/11/2021,M and J Kelman Ltd,Literary and personal papers of James Kelman,40000 +11/11/2021,John Graham Constrution Ltd,External fabric repairs - Causeway Side building,75262.75 +11/11/2021,Robert Harland,Correspondance and Literary papers - Thomas Carlyle,94000 +11/11/2021,Jisc Services Ltd,IT Subscription and router service charge,25896 +25/11/2021,ECG Facilities,Maintenance Contract - November,52459.25 +25/11/2021,Ex Libris,IT Subscription ,81729.02 +31/12/2021,ECG FACILITIES SERVICE,Electrical and mechanical works,28071.17 +16/12/2021,JAMES BRECK LTD,Re-slating of roof LB,28572.28 +23/12/2021,CENGAGE LEARNING EMEA LTD,Subscription - Historical Archive,32460 +31/12/2021,GLASGOW LIFE,Quarterly service charge KH,45541.34 +31/12/2021,ECG FACILITIES SERVICE,Maintenance Contract - December,52459.25 +16/12/2021,ECG FACILITIES SERVICE,"Electrical, mechanical and building works",82227.96 +27/01/2022,ECG FACILITIES SERVICE,January maintenance contract,52459.25 +31/01/2022,ALDL,1st January to 31st March 22 - receipting and onward distribution of UK legal deposit materials on behalf of National Library of Scotland,26388.68 +03/02/2022,ECG FACILITIES SERVICE,"Monthly maintenance contract, drainage jetting and cctv remedials, patio roofing wash",62411.69 +10/02/2022,JAMES BRECK LTD,Roof uplifting and re-slating,31890.41 +10/02/2022,LEE BOYD LIMITED,Various invoices smoke extract system and rateable value review,30552 +17/02/2022,LEE BOYD LIMITED,"Various invoices for CB smoke extract system, project work - FM maintenance 
framework, sprinkler system",57766.9 +24/02/2022,ECG FACILITIES SERVICE,"Carry out tanking works, supply and fit mini drive unit, balustrade repairs",27723.16 +24/02/2022,ADAM MATTHEW DIGITAL LTD,Resource - slavery abolution and social justice,37080 +10/03/2022,ECG FACILITIES SERVICE,Maintenance contract - March,52459.25 +10/03/2022,XMA SCOTLAND LTD,It equipment,61885.56 +17/03/2022,EDF ENERGY,Electricity bill for various sites,57220.55 +17/03/2022,ECG FACILITIES SERVICE,Maintenance contract - Feb plus various smaller invoices for maintenance jobs,71653.47 +17/03/2022,XMA010,IT equipment,77208.77 +17/03/2022,OXFORD UNIVERSITY PRESS,Annual subscription,28576.89 +24/03/2022,ECG FACILITIES SERVICE,Various small maintenance jobs around library sites,34055.73 +24/03/2022,GLASGOW LIFE,Kelvin Hall quarterly service charge,41637.96 +24/03/2022,LEE BOYD LIMITED,Sprinkler system project and lift refurb George IV,55234 +24/03/2022,BSI,Annual subscription,31425 +31/03/2022,ECG FACILITIES SERVICE,Various small maintenance jobs around library sites,28760.32 +31/03/2022,XMA SCOTLAND LTD,It equipment,47461.25 +31/03/2022,JAMES BRECK LTD,Roof uplift and reslating,28230.64 +31/03/2022,LEE BOYD LIMITED,Various small maintenance jobs around library sites,26396.1 +31/03/2022,UNIVERSITY OF DUNDEE,Salary costs for SCURL Scottish Universities press project,39726.44 +30/04/2022,JISC Services Ltd,Managed router service charge annual subscription 01/04/22 to 31/03/23,25896 +30/04/2022,EX Libris,Subscription Alma and Primo 01/04/22 to 31/10/22,114420.65 +11/05/2022,KENNYS BOOKSHOP&ART GALLERIES,Purchase of Smillie Archive,30000 +12/05/2022,ECG FACILITIES SERVICE,Inspection and Maintenance of all Library properties,55711.72 +19/05/2022,CAE TECHNOLOGY SERVICES LIMITED,Subscription renewal,25041.31 +19/05/2022,GLASGOW LIFE,Kelvin Hall service charge Jan to Mar 22,59084.95 +31/05/2022,ECG FACILITIES SERVICE,Fit pre-purchased humidifiers,29710.8 +31/05/2022,ECG FACILITIES SERVICE,Routine 
inspection and maintenance May 22,55711.72 +31/05/2022,ALDL,Legal deposit materials April to July 22,27013.18 +09/06/2022,LEE BOYD LIMITED,Architectural Works,93690 +16/06/2022,CITY OF EDINBURGH COUNCIL,Rates for 33 Salisbury Place,136240 +16/06/2022,CITY OF EDINBURGH COUNCIL,Rates 57 George IV Bridge,41920 +23/06/2022,ECG FACILITIES SERVICE,Maintenance contract - June 22,55711.72 +21/07/2022,ALDL,"Claiming,receipting and onward distribution of UK legal deposit materials July to Sept 22",27013.16 +21/07/2022,RICK GEKOSKI,Papers 1970's to 2019 Alisdair Gray,125000 +28/07/2022,SONYA LEONARD,Literary and personal papers of Tom Leonard 1961 to 2018,40000 diff --git a/examples/data/labelled_transactions.csv b/examples/data/labelled_transactions.csv new file mode 100644 index 0000000..6f97a3e --- /dev/null +++ b/examples/data/labelled_transactions.csv @@ -0,0 +1,102 @@ +Date,Supplier,Description,Transaction value (£),Classification +15/08/2016,Creative Video Productions Ltd,Kelvin Hall,26866,Other +29/05/2017,John Graham Construction Ltd,Causewayside Refurbishment,74806,Building Improvement +29/05/2017,Morris & Spottiswood Ltd,George IV Bridge Work,56448,Building Improvement +31/05/2017,John Graham Construction Ltd,Causewayside Refurbishment,164691,Building Improvement +24/07/2017,John Graham Construction Ltd,Causewayside Refurbishment,27926,Building Improvement +24/07/2017,John Graham Construction Ltd,Causewayside Refurbishment,212690,Building Improvement +16/08/2017,John Graham Construction Ltd,Causewayside Refurbishment,59021,Building Improvement +16/08/2017,John Graham Construction Ltd,Causewayside Refurbishment,136379,Building Improvement +23/08/2017,Culture And Sport Glasgow,Kelvin Hall,60503,Building Improvement +23/08/2017,XMA Scotland Ltd,Kelvin Hall,31830,Building Improvement +31/08/2017,John Graham Construction Ltd,Causewayside Refurbishment,36313,Building Improvement +31/08/2017,Insight Direct (UK) Ltd,Causewayside Refurbishment,68222,Building Improvement 
+31/08/2017,Mark Finn Laboratory,George IV Bridge Work,53884,Building Improvement +11/09/2017,John Graham Construction Ltd,Causewayside Refurbishment,189483,Building Improvement +23/10/2017,John Graham Construction Ltd,Causewayside Refurbishment,151659,Building Improvement +23/10/2017,City Building LLP,Causewayside Refurbishment,53147,Building Improvement +07/02/2017,John Graham Construction Ltd,Causewayside Refurbishment,52404,Building Improvement +13/02/2017,John Graham Construction Ltd,Causewayside Refurbishment,272390,Building Improvement +06/03/2017,John Graham Construction Ltd,Causewayside Refurbishment,31781,Building Improvement +06/03/2017,John Graham Construction Ltd,Causewayside Refurbishment,198048,Building Improvement +31/03/2017,Nicholson Bros(Electrical Contractors) Ltd,Causewayside Refurbishment,33666,Building Improvement +31/03/2017,John Graham Construction Ltd,Causewayside Refurbishment,222090,Building Improvement +31/03/2017,John Graham Construction Ltd,Causewayside Refurbishment,63971,Building Improvement +24/04/2017,Scottish Historic Buildings Trust,Lawnmarket Work,50057,Building Improvement +30/04/2017,Morris & Spottiswood Ltd,George IV Bridge Work,63716,Building Improvement +15/05/2017,John Graham Construction Ltd,Causewayside Refurbishment,245381,Building Improvement +12/09/2016,Flexiform,Kelvin Hall,42623,Building Improvement +12/09/2016,John Graham Construction Ltd,Causewayside Refurbishment,228689,Building Improvement +26/09/2016,Senator International,Kelvin Hall,35706,Building Improvement +26/09/2016,John Graham Construction Ltd,Causewayside Refurbishment,28378,Building Improvement +30/09/2016,A McGillivray,Causewayside Refurbishment,44392,Building Improvement +10/10/2016,John Graham Construction Ltd,Causewayside Refurbishment,303999,Building Improvement +31/10/2016,John Graham Construction Ltd,Causewayside Refurbishment,74245,Building Improvement +07/11/2016,CBRE,Kelvin Hall,83736,Building Improvement +14/11/2016,University Of 
Glasgow,Kelvin Hall,188682,Building Improvement +14/11/2016,John Graham Construction Ltd,Causewayside Refurbishment,362326,Building Improvement +12/12/2016,John Graham Construction Ltd,Causewayside Refurbishment,385310,Building Improvement +30/12/2016,John Graham Construction Ltd,Causewayside Refurbishment,253618,Building Improvement +30/12/2016,John Graham Construction Ltd,Causewayside Refurbishment,45127,Building Improvement +21/04/2016,M & J Ballantyne Ltd,George IV Bridge Work,35098,Building Improvement +09/05/2016,John Graham Construction Ltd,Causewayside Refurbishment,64361,Building Improvement +09/05/2016,A McGillivray,Causewayside Refurbishment,53690,Building Improvement +16/05/2016,John Graham Construction Ltd,Causewayside Refurbishment,365344,Building Improvement +10/06/2016,Wavetek Ltd,Kelvin Hall,87589,Building Improvement +10/06/2016,John Graham Construction Ltd,Causewayside Refurbishment,381803,Building Improvement +30/06/2016,Glasgow City Council,Kelvin Hall,1700000,Building Improvement +11/07/2016,Wavetek Ltd,Kelvin Hall,65692,Building Improvement +11/07/2016,John Graham Construction Ltd,Causewayside Refurbishment,139845,Building Improvement +25/07/2016,A McGillivray,Causewayside Refurbishment,30113,Building Improvement +15/08/2016,John Graham Construction Ltd,Causewayside Refurbishment,196807,Building Improvement +06/11/2017,John Graham Construction Ltd,Causewayside Refurbishment,134208,Building Improvement +31/03/2017,NLS Foundation,Grant Payment,177500,Other +09/10/2017,Frost And Sullivan Ltd,Literary & Archival Items,28125,Literature & Archive +09/10/2017,JISC Services Ltd ,Literary & Archival Items,43481,Literature & Archive +27/02/2017,Cengage Learning (Emea )Ltd,Literary & Archival Items,43302,Literature & Archive +06/03/2017,Private Sale,Literary & Archival Items,72500,Literature & Archive +31/03/2017,Private Sale,Literary & Archival Items,3422500,Literature & Archive +24/04/2017,Cengage Learning (Emea )Ltd,Literary & Archival 
Items,43302,Literature & Archive +22/05/2017,ALDL,Legal Deposit Services,27067,Literature & Archive +19/09/2016,Jisc Services Ltd Subscription Account,Literary & Archival Items,42629,Literature & Archive +10/10/2016,Cengage Learning (Emea )Ltd,Literary & Archival Items,86604,Literature & Archive +24/10/2016,ALDL,ALDL Charges,32317,Literature & Archive +26/04/2016,Private Sale,Literary & Archival Items,30000,Literature & Archive +30/05/2016,ALDL,ALDL Charges,32317,Literature & Archive +15/07/2016,Sotheby'S,Literary & Archival Items,28500,Literature & Archive +18/07/2016,Christies,Literary & Archival Items,33800,Literature & Archive +31/07/2016,ALDL,ALDL Charges,32317,Literature & Archive +08/12/2016,Sothebys,Literary & Archival Items,166000,Literature & Archive +08/12/2016,Private Sale,Literary & Archival Items,87500,Literature & Archive +26/06/2017,ECG Facilities Service,Facilities Management Charge,33386,Utility Bills +26/06/2017,British Library,Legal Deposit Services,50056,Other +24/07/2017,ALDL,Legal Deposit Services,27067,Other +16/08/2017,ECG Facilities Service,Facilities Management Charge,33386,Utility Bills +23/08/2017,ECG Facilities Service,Facilities Management Charge,33386,Utility Bills +07/02/2017,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills +27/02/2017,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills +27/03/2017,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills +22/05/2017,ECG Facilities Service,Facilities Management Charge,33386,Utility Bills +26/09/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills +24/10/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills +08/12/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills +30/12/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills +23/05/2016,ECG Facilities Service,Facilities Management Charge,32777,Utility Bills +23/05/2016,ECG Facilities 
Service,Facilities Management Charge,32777,Utility Bills +28/06/2016,ECG Facilities Service,Facilities Management Charge,32832,Utility Bills +08/08/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills +24/08/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills +30/10/2017,ECG Facilities Service,Facilities Management Charge,35758,Utility Bills +16/08/2017,Ex Libris,IT equipment,76610,Software/IT +31/03/2017,XMA Scotland Ltd,IT equipment,33450,Software/IT +31/03/2017,XMA Scotland Ltd,IT equipment,84524,Software/IT +24/04/2017,Insight Direct (UK) Ltd,IT equipment,56768,Software/IT +09/05/2016,Computacenter Uk,Kelvin Hall,72835,Software/IT +23/05/2016,Computacenter Uk,Kelvin Hall,26506,Software/IT +15/09/2017,City Of Edinburgh Council,Non Domestic Rates ,57662,Utility Bills +15/09/2017,City Of Edinburgh Council,Non Domestic Rates ,142680,Utility Bills +08/05/2017,Anglian Water Business,Water,26832,Utility Bills +30/04/2016,City Of Edinburgh Council,Non Domestic Rates ,40800,Utility Bills +12/09/2016,City Of Edinburgh Council,Non Domestic Rates ,144330,Utility Bills +12/09/2016,City Of Edinburgh Council,Non Domestic Rates ,49827,Utility Bills +24/07/2017,AM Phillip,Vehicle Purchase,26604,Other \ No newline at end of file