[tiktoken_counting] fix tokenizer name (#741)

pull/742/head
Simón Fishman 8 months ago committed by GitHub
parent 4631e1b74a
commit c2959fd60b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -58,6 +58,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -68,14 +69,50 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: tiktoken in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (0.5.1)\n",
"Requirement already satisfied: requests>=2.26.0 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from tiktoken) (2.31.0)\n",
"Requirement already satisfied: regex>=2022.1.18 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from tiktoken) (2023.8.8)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (2023.7.22)\n",
"Requirement already satisfied: idna<4,>=2.5 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (3.4)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (3.2.0)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (2.0.5)\n",
"\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.2.1 is available.\n",
"You should consider upgrading via the '/Users/simon/.virtualenvs/openai/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
"\u001b[0mNote: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: openai in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (0.28.1)\n",
"Requirement already satisfied: aiohttp in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from openai) (3.8.5)\n",
"Requirement already satisfied: requests>=2.20 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from openai) (2.31.0)\n",
"Requirement already satisfied: tqdm in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from openai) (4.66.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.20->openai) (3.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.20->openai) (2023.7.22)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.20->openai) (3.2.0)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from requests>=2.20->openai) (2.0.5)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (1.4.0)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (1.3.1)\n",
"Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (4.0.3)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (1.9.2)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (6.0.4)\n",
"Requirement already satisfied: attrs>=17.3.0 in /Users/simon/.virtualenvs/openai/lib/python3.9/site-packages (from aiohttp->openai) (23.1.0)\n",
"\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.2.1 is available.\n",
"You should consider upgrading via the '/Users/simon/.virtualenvs/openai/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
"\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install --upgrade tiktoken"
"%pip install --upgrade tiktoken\n",
"%pip install --upgrade openai"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -88,10 +125,11 @@
"metadata": {},
"outputs": [],
"source": [
"import tiktoken\n"
"import tiktoken"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -112,6 +150,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -128,6 +167,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -136,6 +176,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -163,6 +204,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -203,6 +245,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -210,6 +253,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -237,6 +281,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -244,6 +289,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -271,6 +317,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -278,6 +325,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -297,7 +345,7 @@
" # print the example string\n",
" print(f'\\nExample string: \"{example_string}\"')\n",
" # for each encoding, print the # of tokens, the token integers, and the token bytes\n",
" for encoding_name in [\"gpt2\", \"p50k_base\", \"cl100k_base\"]:\n",
" for encoding_name in [\"r50k_base\", \"p50k_base\", \"cl100k_base\"]:\n",
" encoding = tiktoken.get_encoding(encoding_name)\n",
" token_integers = encoding.encode(example_string)\n",
" num_tokens = len(token_integers)\n",
@ -321,7 +369,7 @@
"\n",
"Example string: \"antidisestablishmentarianism\"\n",
"\n",
"gpt2: 5 tokens\n",
"r50k_base: 5 tokens\n",
"token integers: [415, 29207, 44390, 3699, 1042]\n",
"token bytes: [b'ant', b'idis', b'establishment', b'arian', b'ism']\n",
"\n",
@ -351,7 +399,7 @@
"\n",
"Example string: \"2 + 2 = 4\"\n",
"\n",
"gpt2: 5 tokens\n",
"r50k_base: 5 tokens\n",
"token integers: [17, 1343, 362, 796, 604]\n",
"token bytes: [b'2', b' +', b' 2', b' =', b' 4']\n",
"\n",
@ -381,7 +429,7 @@
"\n",
"Example string: \"お誕生日おめでとう\"\n",
"\n",
"gpt2: 14 tokens\n",
"r50k_base: 14 tokens\n",
"token integers: [2515, 232, 45739, 243, 37955, 33768, 98, 2515, 232, 1792, 223, 30640, 30201, 29557]\n",
"token bytes: [b'\\xe3\\x81', b'\\x8a', b'\\xe8\\xaa', b'\\x95', b'\\xe7\\x94\\x9f', b'\\xe6\\x97', b'\\xa5', b'\\xe3\\x81', b'\\x8a', b'\\xe3\\x82', b'\\x81', b'\\xe3\\x81\\xa7', b'\\xe3\\x81\\xa8', b'\\xe3\\x81\\x86']\n",
"\n",
@ -482,7 +530,7 @@
"gpt-3.5-turbo\n",
"Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.\n",
"129 prompt tokens counted by num_tokens_from_messages().\n",
"127 prompt tokens counted by the OpenAI API.\n",
"129 prompt tokens counted by the OpenAI API.\n",
"\n",
"gpt-4-0314\n",
"129 prompt tokens counted by num_tokens_from_messages().\n",
@ -575,7 +623,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.9.13"
},
"vscode": {
"interpreter": {

Loading…
Cancel
Save