From 50ae26c0e442ed97eacbaaebc6748b9a4ca38173 Mon Sep 17 00:00:00 2001 From: Ted Sanders <95656834+ted-at-openai@users.noreply.github.com> Date: Tue, 11 Jul 2023 17:00:38 -0700 Subject: [PATCH] fixes token counting in translate_latex_book.ipynb (#579) * fixes token counting in translate_latex_book.ipynb * adds back comment --- .../translate_latex_book.ipynb | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/examples/book_translation/translate_latex_book.ipynb b/examples/book_translation/translate_latex_book.ipynb index c68cdb8b..8f7a8cc3 100644 --- a/examples/book_translation/translate_latex_book.ipynb +++ b/examples/book_translation/translate_latex_book.ipynb @@ -110,27 +110,37 @@ } ], "source": [ - "def group_chunks(chunks, ntokens, max_len=1000):\n", + "def group_chunks(chunks, ntokens, max_len=1000, hard_max_len=3000):\n", " \"\"\"\n", - " Group very short chunks, to form approximately a page long chunks.\n", + " Group very short chunks, to form approximately page long chunks.\n", " \"\"\"\n", " batches = []\n", " cur_batch = \"\"\n", " cur_tokens = 0\n", - "\n", + " \n", " # iterate over chunks, and group the short ones together\n", " for chunk, ntoken in zip(chunks, ntokens):\n", - " cur_tokens += ntoken + 2 # +2 for the newlines between chunks\n", + " # discard chunks that exceed hard max length\n", + " if ntoken > hard_max_len:\n", + " print(f\"Warning: Chunk discarded for being too long ({ntoken} tokens > {hard_max_len} token limit). Preview: '{chunk[:50]}...'\")\n", + " continue\n", "\n", - " # if adding this chunk would exceed the max length, finalize the current batch and start a new one\n", - " if ntoken + cur_tokens > max_len:\n", + " # if room in current batch, add new chunk\n", + " if cur_tokens + 1 + ntoken <= max_len:\n", + " cur_batch += \"\\n\\n\" + chunk\n", + " cur_tokens += 1 + ntoken # adds 1 token for the two newlines\n", + " # otherwise, record the batch and start a new one\n", + " else:\n", " batches.append(cur_batch)\n", " cur_batch = chunk\n", - " else:\n", - " cur_batch += \"\\n\\n\" + chunk\n", - " batches.append(cur_batch)\n", + " cur_tokens = ntoken\n", + " \n", + " if cur_batch: # add the last batch if it's not empty\n", + " batches.append(cur_batch)\n", + " \n", " return batches\n", "\n", + "\n", "chunks = group_chunks(chunks, ntokens)\n", "len(chunks)" ]