From bbbca1070476679a3f6c8cd42e4c490342245bcd Mon Sep 17 00:00:00 2001 From: Pulkit Mehta Date: Thu, 4 May 2023 10:28:29 +0530 Subject: [PATCH] =?UTF-8?q?issue#4082=20base=5Flanguage=20had=20wrong=20co?= =?UTF-8?q?de=20comment=20that=20it=20was=20using=20gpt=E2=80=A6=20(#4084)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …3 to tokenize text instead of gpt-2 Co-authored-by: Pulkit --- langchain/base_language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langchain/base_language.py b/langchain/base_language.py index 3c524ef4..d29eecce 100644 --- a/langchain/base_language.py +++ b/langchain/base_language.py @@ -25,7 +25,7 @@ def _get_num_tokens_default_method(text: str) -> int: # create a GPT-2 tokenizer instance tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") - # tokenize the text using the GPT-3 tokenizer + # tokenize the text using the GPT-2 tokenizer tokenized_text = tokenizer.tokenize(text) # calculate the number of tokens in the tokenized text