{ "cells": [ { "cell_type": "markdown", "id": "c2b98618", "metadata": {}, "source": [ "# Intro\n", "This notebook is an example on how you can use SingleStoreDB vector storage and functions to build an interactive Q&A application with ChatGPT. If you start a [Trial](https://www.singlestore.com/cloud-trial/) in SingleStoreDB, you can find the same notebook in our sample notebooks with native connection." ] }, { "cell_type": "markdown", "id": "55b58478", "metadata": {}, "source": [ "## First let's talk directly to ChatGPT and try and get back a response" ] }, { "cell_type": "code", "execution_count": 2, "id": "661cd7c3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n" ] } ], "source": [ "!pip install openai --quiet\n" ] }, { "cell_type": "code", "execution_count": 54, "id": "61468873", "metadata": {}, "outputs": [], "source": [ "import openai\n", "\n", "EMBEDDING_MODEL = \"text-embedding-3-small\"\n", "GPT_MODEL = \"gpt-3.5-turbo\"\n" ] }, { "cell_type": "markdown", "id": "3778d23e", "metadata": {}, "source": [ "## Let's connect to OpenAI and see the result we get when asking for a date beyond 2021" ] }, { "cell_type": "code", "execution_count": 5, "id": "3f654b3f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "I'm sorry, I cannot provide information about events that have not occurred yet. The Winter Olympics 2022 will be held in Beijing, China from February 4 to 20, 2022. The curling events will take place during this time and the results will not be known until after the competition has concluded.\n" ] } ], "source": [ "openai.api_key = 'OPENAI API KEY'\n", "\n", "response = openai.ChatCompletion.create(\n", " model=GPT_MODEL,\n", " messages=[\n", " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", " {\"role\": \"user\", \"content\": \"Who won the gold medal for curling in Olymics 2022?\"},\n", " ]\n", ")\n", "\n", "print(response['choices'][0]['message']['content'])\n" ] }, { "cell_type": "markdown", "id": "a9c15d6d", "metadata": {}, "source": [ "# Get the data about Winter Olympics and provide the information to ChatGPT as context" ] }, { "cell_type": "markdown", "id": "c5247835", "metadata": {}, "source": [ "## 1. Setup" ] }, { "cell_type": "code", "execution_count": 33, "id": "0948696c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n" ] } ], "source": [ "!pip install matplotlib plotly.express scikit-learn tabulate tiktoken wget --quiet\n" ] }, { "cell_type": "code", "execution_count": 34, "id": "1e36f5d8", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "import wget\n", "import ast\n" ] }, { "cell_type": "markdown", "id": "ba9b8ae2", "metadata": {}, "source": [ "## Step 1 - Grab the data from CSV and prepare it" ] }, { "cell_type": "code", "execution_count": 8, "id": "ce3897b4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File downloaded successfully.\n" ] } ], "source": [ "# download pre-chunked text and pre-computed embeddings\n", "# this file is ~200 MB, so may take a minute depending on your connection speed\n", "embeddings_path = \"https://cdn.openai.com/API/examples/data/winter_olympics_2022.csv\"\n", "file_path = \"winter_olympics_2022.csv\"\n", "\n", "if not os.path.exists(file_path):\n", " wget.download(embeddings_path, file_path)\n", " print(\"File downloaded successfully.\")\n", "else:\n", " print(\"File already exists in the local file system.\")\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "082e9545", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\n", " \"winter_olympics_2022.csv\"\n", ")\n", "\n", "# convert embeddings from CSV str type back to list type\n", "df['embedding'] = df['embedding'].apply(ast.literal_eval)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "1768fa60", "metadata": {}, "outputs": [], "source": [ "df\n" ] }, { "cell_type": "code", "execution_count": 11, "id": "37791a10", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 6059 entries, 0 to 6058\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 text 6059 non-null object\n", " 1 embedding 6059 non-null object\n", "dtypes: object(2)\n", "memory usage: 94.8+ KB\n" ] } ], "source": [ "df.info(show_counts=True)\n" ] }, { "cell_type": "markdown", "id": "c4e7feb6", "metadata": {}, "source": [ "## 2. Set up SingleStore DB" ] }, { "cell_type": "code", "execution_count": 62, "id": "81571781", "metadata": {}, "outputs": [], "source": [ "import singlestoredb as s2\n", "\n", "conn = s2.connect(\":@:3306/\")\n", "\n", "cur = conn.cursor()\n" ] }, { "cell_type": "code", "execution_count": 70, "id": "e1b3fc6f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create database\n", "stmt = \"\"\"\n", " CREATE DATABASE IF NOT EXISTS winter_wikipedia2;\n", "\"\"\"\n", "\n", "cur.execute(stmt)\n" ] }, { "cell_type": "code", "execution_count": 71, "id": "e49c728c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#create table\n", "stmt = \"\"\"\n", "CREATE TABLE IF NOT EXISTS winter_wikipedia2.winter_olympics_2022 (\n", " id INT PRIMARY KEY,\n", " text TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,\n", " embedding BLOB\n", ");\"\"\"\n", "\n", "cur.execute(stmt)\n" ] }, { "cell_type": "markdown", "id": "8f10e57e", "metadata": {}, "source": [ "## 3. Populate the Table with our dataframe df and use JSON_ARRAY_PACK to compact it" ] }, { "cell_type": "code", "execution_count": 72, "id": "98424a33", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 8.79 s, sys: 4.63 s, total: 13.4 s\n", "Wall time: 11min 4s\n" ] } ], "source": [ "%%time\n", "\n", "# Prepare the statement\n", "stmt = \"\"\"\n", " INSERT INTO winter_wikipedia2.winter_olympics_2022 (\n", " id,\n", " text,\n", " embedding\n", " )\n", " VALUES (\n", " %s,\n", " %s,\n", " JSON_ARRAY_PACK_F64(%s)\n", " )\n", "\"\"\"\n", "\n", "# Convert the DataFrame to a NumPy record array\n", "record_arr = df.to_records(index=True)\n", "\n", "# Set the batch size\n", "batch_size = 1000\n", "\n", "# Iterate over the rows of the record array in batches\n", "for i in range(0, len(record_arr), batch_size):\n", " batch = record_arr[i:i+batch_size]\n", " values = [(row[0], row[1], str(row[2])) for row in batch]\n", " cur.executemany(stmt, values)\n" ] }, { "cell_type": "markdown", "id": "3afeb4ec", "metadata": {}, "source": [ "## 4. Do a semantic search with the same question from above and use the response to send to OpenAI again\n" ] }, { "cell_type": "code", "execution_count": 73, "id": "b2b79750", "metadata": {}, "outputs": [], "source": [ "from utils.embeddings_utils import get_embedding\n", "\n", "def strings_ranked_by_relatedness(\n", " query: str,\n", " df: pd.DataFrame,\n", " relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),\n", " top_n: int = 100\n", ") -> tuple:\n", " \"\"\"Returns a list of strings and relatednesses, sorted from most related to least.\"\"\"\n", "\n", " # Get the embedding of the query.\n", " query_embedding_response = get_embedding(query, EMBEDDING_MODEL)\n", "\n", " # Create the SQL statement.\n", " stmt = \"\"\"\n", " SELECT\n", " text,\n", " DOT_PRODUCT_F64(JSON_ARRAY_PACK_F64(%s), embedding) AS score\n", " FROM winter_wikipedia2.winter_olympics_2022\n", " ORDER BY score DESC\n", " LIMIT %s\n", " \"\"\"\n", "\n", " # Execute the SQL statement.\n", " results = cur.execute(stmt, [str(query_embedding_response), top_n])\n", "\n", " # Fetch the results\n", " results = cur.fetchall()\n", "\n", " strings = []\n", " relatednesses = []\n", "\n", " for row in results:\n", " strings.append(row[0])\n", " relatednesses.append(row[1])\n", "\n", " # Return the results.\n", " return strings[:top_n], relatednesses[:top_n]\n" ] }, { "cell_type": "code", "execution_count": null, "id": "804f2659", "metadata": {}, "outputs": [], "source": [ "from tabulate import tabulate\n", "\n", "strings, relatednesses = strings_ranked_by_relatedness(\n", " \"curling gold medal\",\n", " df,\n", " top_n=5\n", ")\n", "\n", "for string, relatedness in zip(strings, relatednesses):\n", " print(f\"{relatedness=:.3f}\")\n", " print(tabulate([[string]], headers=['Result'], tablefmt='fancy_grid'))\n" ] }, { "cell_type": "markdown", "id": "3a03fd7f", "metadata": {}, "source": [ "## 5. Send the right context to ChatGPT for a more accurate answer" ] }, { "cell_type": "code", "execution_count": 75, "id": "13265651", "metadata": {}, "outputs": [], "source": [ "import tiktoken\n", "\n", "def num_tokens(text: str, model: str = GPT_MODEL) -> int:\n", " \"\"\"Return the number of tokens in a string.\"\"\"\n", " encoding = tiktoken.encoding_for_model(model)\n", " return len(encoding.encode(text))\n", "\n", "\n", "def query_message(\n", " query: str,\n", " df: pd.DataFrame,\n", " model: str,\n", " token_budget: int\n", ") -> str:\n", " \"\"\"Return a message for GPT, with relevant source texts pulled from SingleStoreDB.\"\"\"\n", " strings, relatednesses = strings_ranked_by_relatedness(query, df, \"winter_olympics_2022\")\n", " introduction = 'Use the below articles on the 2022 Winter Olympics to answer the subsequent question. If the answer cannot be found in the articles, write \"I could not find an answer.\"'\n", " question = f\"\\n\\nQuestion: {query}\"\n", " message = introduction\n", " for string in strings:\n", " next_article = f'\\n\\nWikipedia article section:\\n\"\"\"\\n{string}\\n\"\"\"'\n", " if (\n", " num_tokens(message + next_article + question, model=model)\n", " > token_budget\n", " ):\n", " break\n", " else:\n", " message += next_article\n", " return message + question\n", "\n", "\n", "def ask(\n", " query: str,\n", " df: pd.DataFrame = df,\n", " model: str = GPT_MODEL,\n", " token_budget: int = 4096 - 500,\n", " print_message: bool = False,\n", ") -> str:\n", " \"\"\"Answers a query using GPT and a table of relevant texts and embeddings in SingleStoreDB.\"\"\"\n", " message = query_message(query, df, model=model, token_budget=token_budget)\n", " if print_message:\n", " print(message)\n", " messages = [\n", " {\"role\": \"system\", \"content\": \"You answer questions about the 2022 Winter Olympics.\"},\n", " {\"role\": \"user\", \"content\": message},\n", " ]\n", " response = openai.ChatCompletion.create(\n", " model=model,\n", " messages=messages,\n", " temperature=0\n", " )\n", " response_message = response[\"choices\"][0][\"message\"][\"content\"]\n", " return response_message\n" ] }, { "cell_type": "markdown", "id": "c9128b90", "metadata": {}, "source": [ "## 6. Get an answer from Chat GPT" ] }, { "cell_type": "code", "execution_count": 76, "id": "d295286a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(\"There were three curling events at the 2022 Winter Olympics: men's, women's, \"\n", " 'and mixed doubles. The gold medalists for each event are:\\n'\n", " '\\n'\n", " \"- Men's: Sweden (Niklas Edin, Oskar Eriksson, Rasmus Wranå, Christoffer \"\n", " 'Sundgren, Daniel Magnusson)\\n'\n", " \"- Women's: Great Britain (Eve Muirhead, Vicky Wright, Jennifer Dodds, Hailey \"\n", " 'Duff, Mili Smith)\\n'\n", " '- Mixed doubles: Italy (Stefania Constantini, Amos Mosaner)')\n" ] } ], "source": [ "from pprint import pprint\n", "\n", "answer = ask('Who won the gold medal for curling in Olymics 2022?')\n", "\n", "pprint(answer)\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.11.0 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" }, "vscode": { "interpreter": { "hash": "5c7b89af1651d0b8571dde13640ecdccf7d5a6204171d6ab33e7c296e100e08a" } } }, "nbformat": 4, "nbformat_minor": 5 }