issue#4082: base_language had a wrong code comment saying it was using GPT-3 to tokenize text instead of GPT-2 (#4084)

Co-authored-by: Pulkit <pulkit.mehta@catylex.com>
This commit is contained in:
Pulkit Mehta 2023-05-04 10:28:29 +05:30 committed by GitHub
parent 6caba8e759
commit bbbca10704
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -25,7 +25,7 @@ def _get_num_tokens_default_method(text: str) -> int:
     # create a GPT-2 tokenizer instance
     tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-    # tokenize the text using the GPT-3 tokenizer
+    # tokenize the text using the GPT-2 tokenizer
     tokenized_text = tokenizer.tokenize(text)
     # calculate the number of tokens in the tokenized text