diff --git a/examples/vector_databases/SingleStoreDB/OpenAI_wikipedia_semantic_search.ipynb b/examples/vector_databases/SingleStoreDB/OpenAI_wikipedia_semantic_search.ipynb
new file mode 100644
index 00000000..cba337c5
--- /dev/null
+++ b/examples/vector_databases/SingleStoreDB/OpenAI_wikipedia_semantic_search.ipynb
@@ -0,0 +1,564 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "c2b98618",
+   "metadata": {},
+   "source": [
+    "# Intro\n",
+    "This notebook shows how you can use SingleStoreDB vector storage and vector functions to build an interactive Q&A application with ChatGPT. If you start a [Trial](https://www.singlestore.com/cloud-trial/) of SingleStoreDB, you can find the same notebook among our sample notebooks, with a native connection."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "55b58478",
+   "metadata": {},
+   "source": [
+    "## First, let's talk directly to ChatGPT and try to get a response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "661cd7c3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install openai --quiet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "id": "61468873",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openai\n",
+    "\n",
+    "# models\n",
+    "EMBEDDING_MODEL = \"text-embedding-ada-002\"\n",
+    "GPT_MODEL = \"gpt-3.5-turbo\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3778d23e",
+   "metadata": {},
+   "source": [
+    "## Let's connect to OpenAI and see how it responds when asked about an event beyond its 2021 training cutoff"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "3f654b3f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "I'm sorry, I cannot provide information about events that have not occurred yet. The Winter Olympics 2022 will be held in Beijing, China from February 4 to 20, 2022. The curling events will take place during this time and the results will not be known until after the competition has concluded.\n"
+     ]
+    }
+   ],
+   "source": [
+    "openai.api_key = 'OPENAI API KEY'  # paste your OpenAI API key here\n",
+    "\n",
+    "response = openai.ChatCompletion.create(\n",
+    "    model=GPT_MODEL,\n",
+    "    messages=[\n",
+    "        {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
+    "        {\"role\": \"user\", \"content\": \"Who won the gold medal for curling in Olympics 2022?\"},\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "print(response['choices'][0]['message']['content'])"
+   ]
+  },
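+  {
+   "cell_type": "markdown",
+   "id": "a0e1f2d3",
+   "metadata": {},
+   "source": [
+    "Tip: rather than hard-coding the key above, you can read it from an environment variable. This is a minimal sketch that assumes you have exported `OPENAI_API_KEY` in your shell beforehand; the variable name is a common convention, not a requirement of this notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1f2a3c4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# Assumes OPENAI_API_KEY was exported in the environment (e.g. `export OPENAI_API_KEY=...`)\n",
+    "if os.environ.get(\"OPENAI_API_KEY\"):\n",
+    "    openai.api_key = os.environ[\"OPENAI_API_KEY\"]"
+   ]
+  },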
Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "0948696c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install matplotlib plotly.express scikit-learn tabulate tiktoken wget --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "1e36f5d8", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import os\n", + "import wget\n", + "import ast" + ] + }, + { + "cell_type": "markdown", + "id": "ba9b8ae2", + "metadata": {}, + "source": [ + "## Step 1 - Grab the data from CSV and prepare it" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ce3897b4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File downloaded successfully.\n" + ] + } + ], + "source": [ + "# download pre-chunked text and pre-computed embeddings\n", + "# this file is ~200 MB, so may take a minute depending on your connection speed\n", + "embeddings_path = \"https://cdn.openai.com/API/examples/data/winter_olympics_2022.csv\"\n", + "file_path = \"winter_olympics_2022.csv\"\n", + "\n", + "if not os.path.exists(file_path):\n", + " wget.download(embeddings_path, file_path)\n", + " print(\"File downloaded successfully.\")\n", + "else:\n", + " print(\"File already exists in the local file system.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "082e9545", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\n", + " \"winter_olympics_2022.csv\"\n", + ")\n", + "\n", + "# convert embeddings from CSV str type back to list type\n", + "df['embedding'] = df['embedding'].apply(ast.literal_eval)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1768fa60", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "37791a10", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 6059 entries, 0 to 6058\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 text 6059 non-null object\n", + " 1 embedding 6059 non-null object\n", + "dtypes: object(2)\n", + "memory usage: 94.8+ KB\n" + ] + } + ], + "source": [ + "df.info(show_counts=True)" + ] + }, + { + "cell_type": "markdown", + "id": "c4e7feb6", + "metadata": {}, + "source": [ + "## 2. 
+  {
+   "cell_type": "markdown",
+   "id": "c4e7feb6",
+   "metadata": {},
+   "source": [
+    "## 2. Set up SingleStoreDB"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "id": "81571781",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import singlestoredb as s2\n",
+    "\n",
+    "# fill in your connection details: \"user:password@host:3306/database\"\n",
+    "conn = s2.connect(\":@:3306/\")\n",
+    "\n",
+    "cur = conn.cursor()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "id": "e1b3fc6f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1"
+      ]
+     },
+     "execution_count": 70,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Create the database\n",
+    "stmt = \"\"\"\n",
+    "    CREATE DATABASE IF NOT EXISTS winter_wikipedia2;\n",
+    "\"\"\"\n",
+    "\n",
+    "cur.execute(stmt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "id": "e49c728c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 71,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Create the table\n",
+    "stmt = \"\"\"\n",
+    "CREATE TABLE IF NOT EXISTS winter_wikipedia2.winter_olympics_2022 (\n",
+    "    id INT PRIMARY KEY,\n",
+    "    text TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,\n",
+    "    embedding BLOB\n",
+    ");\"\"\"\n",
+    "\n",
+    "cur.execute(stmt)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8f10e57e",
+   "metadata": {},
+   "source": [
+    "## 3. Populate the table with our DataFrame and use JSON_ARRAY_PACK_F64 to pack the embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "id": "98424a33",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 8.79 s, sys: 4.63 s, total: 13.4 s\n",
+      "Wall time: 11min 4s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "# Prepare the statement\n",
+    "stmt = \"\"\"\n",
+    "    INSERT INTO winter_wikipedia2.winter_olympics_2022 (\n",
+    "        id,\n",
+    "        text,\n",
+    "        embedding\n",
+    "    )\n",
+    "    VALUES (\n",
+    "        %s,\n",
+    "        %s,\n",
+    "        JSON_ARRAY_PACK_F64(%s)\n",
+    "    )\n",
+    "\"\"\"\n",
+    "\n",
+    "# Convert the DataFrame to a NumPy record array (index, text, embedding)\n",
+    "record_arr = df.to_records(index=True)\n",
+    "\n",
+    "# Set the batch size\n",
+    "batch_size = 1000\n",
+    "\n",
+    "# Iterate over the rows of the record array in batches\n",
+    "for i in range(0, len(record_arr), batch_size):\n",
+    "    batch = record_arr[i:i+batch_size]\n",
+    "    values = [(row[0], row[1], str(row[2])) for row in batch]\n",
+    "    cur.executemany(stmt, values)"
+   ]
+  },
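+  {
+   "cell_type": "markdown",
+   "id": "e4f5a6b7",
+   "metadata": {},
+   "source": [
+    "A quick check (a sketch, using the same cursor as above) that the batched insert loaded every row:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f5a6b7c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Verify that every DataFrame row made it into the table\n",
+    "cur.execute(\"SELECT COUNT(*) FROM winter_wikipedia2.winter_olympics_2022\")\n",
+    "print(cur.fetchone()[0], \"rows loaded;\", len(df), \"rows expected\")"
+   ]
+  },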
+  {
+   "cell_type": "markdown",
+   "id": "3afeb4ec",
+   "metadata": {},
+   "source": [
+    "## 4. Run a semantic search with the same question from above, and use the results as context for OpenAI\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "id": "b2b79750",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from scipy import spatial\n",
+    "\n",
+    "from openai.embeddings_utils import get_embedding\n",
+    "\n",
+    "def strings_ranked_by_relatedness(\n",
+    "    query: str,\n",
+    "    df: pd.DataFrame,\n",
+    "    # relatedness_fn is kept for API compatibility; the ranking below is pushed down to SQL via DOT_PRODUCT_F64\n",
+    "    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),\n",
+    "    top_n: int = 100\n",
+    ") -> tuple:\n",
+    "    \"\"\"Returns a list of strings and relatednesses, sorted from most related to least.\"\"\"\n",
+    "\n",
+    "    # Get the embedding of the query.\n",
+    "    query_embedding_response = get_embedding(query, EMBEDDING_MODEL)\n",
+    "\n",
+    "    # Create the SQL statement.\n",
+    "    stmt = \"\"\"\n",
+    "        SELECT\n",
+    "            text,\n",
+    "            DOT_PRODUCT_F64(JSON_ARRAY_PACK_F64(%s), embedding) AS score\n",
+    "        FROM winter_wikipedia2.winter_olympics_2022\n",
+    "        ORDER BY score DESC\n",
+    "        LIMIT %s\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Execute the SQL statement.\n",
+    "    cur.execute(stmt, [str(query_embedding_response), top_n])\n",
+    "\n",
+    "    # Fetch the results.\n",
+    "    results = cur.fetchall()\n",
+    "\n",
+    "    strings = []\n",
+    "    relatednesses = []\n",
+    "\n",
+    "    for row in results:\n",
+    "        strings.append(row[0])\n",
+    "        relatednesses.append(row[1])\n",
+    "\n",
+    "    # Return the results.\n",
+    "    return strings[:top_n], relatednesses[:top_n]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "804f2659",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tabulate import tabulate\n",
+    "\n",
+    "strings, relatednesses = strings_ranked_by_relatedness(\n",
+    "    \"curling gold medal\",\n",
+    "    df,\n",
+    "    top_n=5\n",
+    ")\n",
+    "\n",
+    "for string, relatedness in zip(strings, relatednesses):\n",
+    "    print(f\"{relatedness=:.3f}\")\n",
+    "    print(tabulate([[string]], headers=['Result'], tablefmt='fancy_grid'))"
+   ]
+  },
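+  {
+   "cell_type": "markdown",
+   "id": "a6b7c8d9",
+   "metadata": {},
+   "source": [
+    "A note on scoring: OpenAI's `text-embedding-ada-002` embeddings are normalized to unit length, so the `DOT_PRODUCT_F64` score above is equivalent to cosine similarity. A quick local sanity check (sketch):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b7c8d9e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "# ada-002 embeddings are unit-length, so their dot product equals cosine similarity\n",
+    "print(np.linalg.norm(np.array(df['embedding'][0])))  # expect ~1.0"
+   ]
+  },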
+  {
+   "cell_type": "markdown",
+   "id": "3a03fd7f",
+   "metadata": {},
+   "source": [
+    "## 5. Send the right context to ChatGPT for a more accurate answer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "id": "13265651",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tiktoken\n",
+    "\n",
+    "def num_tokens(text: str, model: str = GPT_MODEL) -> int:\n",
+    "    \"\"\"Return the number of tokens in a string.\"\"\"\n",
+    "    encoding = tiktoken.encoding_for_model(model)\n",
+    "    return len(encoding.encode(text))\n",
+    "\n",
+    "\n",
+    "def query_message(\n",
+    "    query: str,\n",
+    "    df: pd.DataFrame,\n",
+    "    model: str,\n",
+    "    token_budget: int\n",
+    ") -> str:\n",
+    "    \"\"\"Return a message for GPT, with relevant source texts pulled from SingleStoreDB.\"\"\"\n",
+    "    strings, relatednesses = strings_ranked_by_relatedness(query, df)\n",
+    "    introduction = 'Use the below articles on the 2022 Winter Olympics to answer the subsequent question. If the answer cannot be found in the articles, write \"I could not find an answer.\"'\n",
+    "    question = f\"\\n\\nQuestion: {query}\"\n",
+    "    message = introduction\n",
+    "    for string in strings:\n",
+    "        next_article = f'\\n\\nWikipedia article section:\\n\"\"\"\\n{string}\\n\"\"\"'\n",
+    "        if (\n",
+    "            num_tokens(message + next_article + question, model=model)\n",
+    "            > token_budget\n",
+    "        ):\n",
+    "            break\n",
+    "        else:\n",
+    "            message += next_article\n",
+    "    return message + question\n",
+    "\n",
+    "\n",
+    "def ask(\n",
+    "    query: str,\n",
+    "    df: pd.DataFrame = df,\n",
+    "    model: str = GPT_MODEL,\n",
+    "    token_budget: int = 4096 - 500,\n",
+    "    print_message: bool = False,\n",
+    ") -> str:\n",
+    "    \"\"\"Answers a query using GPT and a table of relevant texts and embeddings in SingleStoreDB.\"\"\"\n",
+    "    message = query_message(query, df, model=model, token_budget=token_budget)\n",
+    "    if print_message:\n",
+    "        print(message)\n",
+    "    messages = [\n",
+    "        {\"role\": \"system\", \"content\": \"You answer questions about the 2022 Winter Olympics.\"},\n",
+    "        {\"role\": \"user\", \"content\": message},\n",
+    "    ]\n",
+    "    response = openai.ChatCompletion.create(\n",
+    "        model=model,\n",
+    "        messages=messages,\n",
+    "        temperature=0\n",
+    "    )\n",
+    "    response_message = response[\"choices\"][0][\"message\"][\"content\"]\n",
+    "    return response_message"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c9128b90",
+   "metadata": {},
+   "source": [
+    "## 6. Get an answer from ChatGPT"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "id": "d295286a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(\"There were three curling events at the 2022 Winter Olympics: men's, women's, \"\n",
+      " 'and mixed doubles. The gold medalists for each event are:\\n'\n",
+      " '\\n'\n",
+      " \"- Men's: Sweden (Niklas Edin, Oskar Eriksson, Rasmus Wranå, Christoffer \"\n",
+      " 'Sundgren, Daniel Magnusson)\\n'\n",
+      " \"- Women's: Great Britain (Eve Muirhead, Vicky Wright, Jennifer Dodds, Hailey \"\n",
+      " 'Duff, Mili Smith)\\n'\n",
+      " '- Mixed doubles: Italy (Stefania Constantini, Amos Mosaner)')\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pprint import pprint\n",
+    "\n",
+    "answer = ask('Who won the gold medal for curling in Olympics 2022?')\n",
+    "\n",
+    "pprint(answer)"
+   ]
+  },
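+  {
+   "cell_type": "markdown",
+   "id": "e8f9a0b1",
+   "metadata": {},
+   "source": [
+    "To see exactly which article sections were retrieved from SingleStoreDB and sent to the model, call `ask` with `print_message=True` (an optional extra cell, not executed here):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f9a0b1c2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Print the full prompt (retrieved context + question) before the model answers\n",
+    "answer = ask('Who won the gold medal for curling in Olympics 2022?', print_message=True)"
+   ]
+  }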
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.11.0 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.3"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "5c7b89af1651d0b8571dde13640ecdccf7d5a6204171d6ab33e7c296e100e08a"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/vector_databases/SingleStoreDB/README.md b/examples/vector_databases/SingleStoreDB/README.md
new file mode 100644
index 00000000..062c2239
--- /dev/null
+++ b/examples/vector_databases/SingleStoreDB/README.md
@@ -0,0 +1,15 @@
+**[SingleStoreDB](https://singlestore.com)** has first-class support for vector search through our [Vector Functions](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions.html). Our vector database subsystem, first made available in 2017 and enhanced continuously since, enables extremely fast nearest-neighbor search for semantically similar objects, all through standard SQL.
+
+SingleStoreDB supports vectors and vector similarity search using the `dot_product` (cosine similarity for normalized vectors) and `euclidean_distance` functions. Our customers use these functions for applications including face recognition, product photo search, and text-based semantic search. With the explosion of generative AI technology, these capabilities form a firm foundation for text-based AI chatbots.
+
+But remember, SingleStoreDB is a high-performance, scalable, modern SQL DBMS that supports multiple data models, including structured data, semi-structured data based on JSON, time-series, full-text, spatial, key-value, and of course vector data. Start powering your next intelligent application with SingleStoreDB today!
+
+![SingleStore Open AI](https://user-images.githubusercontent.com/8846480/236985121-48980956-fdc5-49c8-b006-f3a412142676.png)
+
+## Example
+
+This folder contains examples of using SingleStoreDB and OpenAI together. We will keep adding more scenarios, so stay tuned!
+
+| Name | Description |
+| --- | --- |
+| [OpenAI wikipedia semantic search](./OpenAI_wikipedia_semantic_search.ipynb) | Improve ChatGPT accuracy for Q&A with semantic search over Wikipedia data stored in SingleStoreDB |
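+
+## Quick example
+
+For a taste of what a vector search looks like, here is a minimal sketch using the Python client, mirroring the query shape in the notebook above. The connection string, table name, and query vector are hypothetical placeholders; see the notebook for a complete, working walkthrough.
+
+```python
+import singlestoredb as s2
+
+conn = s2.connect("user:password@host:3306/db")  # hypothetical connection string
+cur = conn.cursor()
+
+# Rank rows by dot product between a packed query vector and the stored embeddings
+cur.execute(
+    """
+    SELECT text, DOT_PRODUCT_F64(JSON_ARRAY_PACK_F64(%s), embedding) AS score
+    FROM winter_olympics_2022
+    ORDER BY score DESC
+    LIMIT 5
+    """,
+    ["[0.1, 0.2, 0.3]"],  # replace with a real embedding
+)
+print(cur.fetchall())
+```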