From 50ae26c0e442ed97eacbaaebc6748b9a4ca38173 Mon Sep 17 00:00:00 2001 From: Ted Sanders <95656834+ted-at-openai@users.noreply.github.com> Date: Tue, 11 Jul 2023 17:00:38 -0700 Subject: [PATCH] fixes token counting in translate_latex_book.ipynb (#579) * fixes token counting in translate_latex_book.ipynb * adds back comment --- .../translate_latex_book.ipynb | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/examples/book_translation/translate_latex_book.ipynb b/examples/book_translation/translate_latex_book.ipynb index c68cdb8b..8f7a8cc3 100644 --- a/examples/book_translation/translate_latex_book.ipynb +++ b/examples/book_translation/translate_latex_book.ipynb @@ -110,27 +110,37 @@ } ], "source": [ - "def group_chunks(chunks, ntokens, max_len=1000):\n", + "def group_chunks(chunks, ntokens, max_len=1000, hard_max_len=3000):\n", " \"\"\"\n", - " Group very short chunks, to form approximately a page long chunks.\n", + " Group very short chunks, to form approximately page long chunks.\n", " \"\"\"\n", " batches = []\n", " cur_batch = \"\"\n", " cur_tokens = 0\n", - "\n", + " \n", " # iterate over chunks, and group the short ones together\n", " for chunk, ntoken in zip(chunks, ntokens):\n", - " cur_tokens += ntoken + 2 # +2 for the newlines between chunks\n", + " # discard chunks that exceed hard max length\n", + " if ntoken > hard_max_len:\n", + " print(f\"Warning: Chunk discarded for being too long ({ntoken} tokens > {hard_max_len} token limit). Preview: '{chunk[:50]}...'\")\n", + " continue\n", "\n", - " # if adding this chunk would exceed the max length, finalize the current batch and start a new one\n", - " if ntoken + cur_tokens > max_len:\n", + " # if room in current batch, add new chunk\n", + " if cur_tokens + 1 + ntoken <= max_len:\n", + " cur_batch += \"\\n\\n\" + chunk\n", + " cur_tokens += 1 + ntoken # adds 1 token for the two newlines\n", + " # otherwise, record the batch and start a new one\n", + " else:\n", " batches.append(cur_batch)\n", " cur_batch = chunk\n", - " else:\n", - " cur_batch += \"\\n\\n\" + chunk\n", - " batches.append(cur_batch)\n", + " cur_tokens = ntoken\n", + " \n", + " if cur_batch: # add the last batch if it's not empty\n", + " batches.append(cur_batch)\n", + " \n", " return batches\n", "\n", + "\n", "chunks = group_chunks(chunks, ntokens)\n", "len(chunks)" ]