fixes token counting in translate_latex_book.ipynb (#579)

* fixes token counting in translate_latex_book.ipynb

* adds back comment
pull/581/head
Ted Sanders 11 months ago committed by GitHub
parent 07c0351216
commit 17858f204f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -110,27 +110,37 @@
}
],
"source": [
def group_chunks(chunks, ntokens, max_len=1000, hard_max_len=3000):
    """
    Group very short chunks, to form approximately page long chunks.
    """
    batches = []
    cur_batch = ""
    cur_tokens = 0

    # iterate over chunks, and group the short ones together
    for chunk, ntoken in zip(chunks, ntokens):
        # discard chunks that exceed hard max length
        if ntoken > hard_max_len:
            print(f"Warning: Chunk discarded for being too long ({ntoken} tokens > {hard_max_len} token limit). Preview: '{chunk[:50]}...'")
            continue

        # if the current batch is empty, start it with this chunk directly —
        # prepending "\n\n" here would leave a spurious separator at the start
        # of every batch and overcount tokens by one
        if not cur_batch:
            cur_batch = chunk
            cur_tokens = ntoken
        # if room in current batch, add new chunk
        elif cur_tokens + 1 + ntoken <= max_len:
            cur_batch += "\n\n" + chunk
            cur_tokens += 1 + ntoken  # adds 1 token for the two newlines
        # otherwise, record the batch and start a new one
        else:
            batches.append(cur_batch)
            cur_batch = chunk
            cur_tokens = ntoken

    if cur_batch:  # add the last batch if it's not empty
        batches.append(cur_batch)

    return batches
"\n",
"\n",
"chunks = group_chunks(chunks, ntokens)\n",
"len(chunks)"
]

Loading…
Cancel
Save