From c2959fd60b71a47d7be61e59742b750b16ebb649 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sim=C3=B3n=20Fishman?= Date: Wed, 27 Sep 2023 16:12:31 -0700 Subject: [PATCH] [tiktoken_counting] fix tokenizer name (#741) --- .../How_to_count_tokens_with_tiktoken.ipynb | 68 ++++++++++++++++--- 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/examples/How_to_count_tokens_with_tiktoken.ipynb b/examples/How_to_count_tokens_with_tiktoken.ipynb index 68af32a7..74e3923a 100644 --- a/examples/How_to_count_tokens_with_tiktoken.ipynb +++ b/examples/How_to_count_tokens_with_tiktoken.ipynb @@ -58,6 +58,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -68,14 +69,50 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: tiktoken in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (0.5.1)\n", + "Requirement already satisfied: requests>=2.26.0 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from tiktoken) (2.31.0)\n", + "Requirement already satisfied: regex>=2022.1.18 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from tiktoken) (2023.8.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (2023.7.22)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (3.4)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (3.2.0)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (2.0.5)\n", + "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.2.1 is available.\n", + "You should consider upgrading via the '/Users/simon/.virtualenvs/openai/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", + "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n", + "Requirement already satisfied: openai in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (0.28.1)\n", + "Requirement already satisfied: aiohttp in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from openai) (3.8.5)\n", + "Requirement already satisfied: requests>=2.20 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from openai) (2.31.0)\n", + "Requirement already satisfied: tqdm in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from openai) (4.66.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.20->openai) (3.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.20->openai) (2023.7.22)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.20->openai) (3.2.0)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.20->openai) (2.0.5)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (1.4.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (1.3.1)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (4.0.3)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (1.9.2)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (6.0.4)\n", + "Requirement already satisfied: attrs>=17.3.0 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (23.1.0)\n", + "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.2.1 is available.\n", + "You should consider upgrading via the '/Users/simon/.virtualenvs/openai/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", + "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ - "%pip install --upgrade tiktoken" + "%pip install --upgrade tiktoken\n", + "%pip install --upgrade openai" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -88,10 +125,11 @@ "metadata": {}, "outputs": [], "source": [ - "import tiktoken\n" + "import tiktoken" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -112,6 +150,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -128,6 +167,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -136,6 +176,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -163,6 +204,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -203,6 +245,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -210,6 +253,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -237,6 +281,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -244,6 +289,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -271,6 +317,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -278,6 +325,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -297,7 +345,7 @@ " # print the example string\n", " print(f'\\nExample string: \"{example_string}\"')\n", " # for each encoding, print the # of tokens, the token integers, and the token bytes\n", - " for encoding_name in [\"gpt2\", \"p50k_base\", \"cl100k_base\"]:\n", + " for encoding_name in [\"r50k_base\", \"p50k_base\", \"cl100k_base\"]:\n", " encoding = tiktoken.get_encoding(encoding_name)\n", " token_integers = encoding.encode(example_string)\n", " num_tokens = len(token_integers)\n", @@ -321,7 +369,7 @@ "\n", "Example string: \"antidisestablishmentarianism\"\n", "\n", - "gpt2: 5 tokens\n", + "r50k_base: 5 tokens\n", "token integers: [415, 29207, 44390, 3699, 1042]\n", "token bytes: [b'ant', b'idis', b'establishment', b'arian', b'ism']\n", "\n", @@ -351,7 +399,7 @@ "\n", "Example string: \"2 + 2 = 4\"\n", "\n", - "gpt2: 5 tokens\n", + "r50k_base: 5 tokens\n", "token integers: [17, 1343, 362, 796, 604]\n", "token bytes: [b'2', b' +', b' 2', b' =', b' 4']\n", "\n", @@ -381,7 +429,7 @@ "\n", "Example string: \"お誕生日おめでとう\"\n", "\n", - "gpt2: 14 tokens\n", + "r50k_base: 14 tokens\n", "token integers: [2515, 232, 45739, 243, 37955, 33768, 98, 2515, 232, 1792, 223, 30640, 30201, 29557]\n", "token bytes: [b'\\xe3\\x81', b'\\x8a', b'\\xe8\\xaa', b'\\x95', b'\\xe7\\x94\\x9f', b'\\xe6\\x97', b'\\xa5', b'\\xe3\\x81', b'\\x8a', b'\\xe3\\x82', b'\\x81', b'\\xe3\\x81\\xa7', b'\\xe3\\x81\\xa8', b'\\xe3\\x81\\x86']\n", "\n", @@ -482,7 +530,7 @@ "gpt-3.5-turbo\n", "Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.\n", "129 prompt tokens counted by num_tokens_from_messages().\n", - "127 prompt tokens counted by the OpenAI API.\n", + "129 prompt tokens counted by the OpenAI API.\n", "\n", "gpt-4-0314\n", "129 prompt tokens counted by num_tokens_from_messages().\n", @@ -575,7 +623,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.9.13" }, "vscode": { "interpreter": {