fixes token counting in translate_latex_book.ipynb (#579)

* fixes token counting in translate_latex_book.ipynb

* adds back comment
pull/581/head
Ted Sanders 11 months ago committed by GitHub
parent 07c0351216
commit 17858f204f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -110,27 +110,37 @@
}
],
"source": [
def group_chunks(chunks, ntokens, max_len=1000, hard_max_len=3000):
    """
    Group very short chunks, to form approximately page long chunks.
    """
    batches = []
    cur_batch = ""
    cur_tokens = 0

    # iterate over chunks, and group the short ones together
    for chunk, ntoken in zip(chunks, ntokens):
        # discard chunks that exceed hard max length
        if ntoken > hard_max_len:
            print(f"Warning: Chunk discarded for being too long ({ntoken} tokens > {hard_max_len} token limit). Preview: '{chunk[:50]}...'")
            continue

        # if the current batch is empty, start it with this chunk directly —
        # prepending "\n\n" here would leave a spurious separator at the start
        # of every batch and overcount tokens by one
        if not cur_batch:
            cur_batch = chunk
            cur_tokens = ntoken
        # if room in current batch, add new chunk
        elif cur_tokens + 1 + ntoken <= max_len:
            cur_batch += "\n\n" + chunk
            cur_tokens += 1 + ntoken  # adds 1 token for the two newlines
        # otherwise, record the batch and start a new one
        else:
            batches.append(cur_batch)
            cur_batch = chunk
            cur_tokens = ntoken

    if cur_batch:  # add the last batch if it's not empty
        batches.append(cur_batch)

    return batches
"\n",
"\n",
"chunks = group_chunks(chunks, ntokens)\n",
"len(chunks)"
]

Loading…
Cancel
Save