@ -3,7 +3,17 @@ from __future__ import annotations
import logging
import logging
import sys
import sys
from typing import Any , Callable , Dict , List , Mapping , Optional , Tuple , Union
from typing import (
TYPE_CHECKING ,
Any ,
Callable ,
Dict ,
List ,
Mapping ,
Optional ,
Tuple ,
Union ,
)
from pydantic import Extra , Field , root_validator
from pydantic import Extra , Field , root_validator
from tenacity import (
from tenacity import (
@ -30,9 +40,24 @@ from langchain.schema import (
)
)
from langchain . utils import get_from_dict_or_env
from langchain . utils import get_from_dict_or_env
# tiktoken is imported lazily at runtime (see _import_tiktoken); this
# import exists only so annotations like `tiktoken.Encoding` type-check.
if TYPE_CHECKING:
    import tiktoken

# Module-level logger, keyed by module name per logging convention.
logger = logging.getLogger(__name__)
def _import_tiktoken ( ) - > Any :
try :
import tiktoken
except ImportError :
raise ValueError (
" Could not import tiktoken python package. "
" This is needed in order to calculate get_token_ids. "
" Please install it with `pip install tiktoken`. "
)
return tiktoken
def _create_retry_decorator ( llm : ChatOpenAI ) - > Callable [ [ Any ] , Any ] :
def _create_retry_decorator ( llm : ChatOpenAI ) - > Callable [ [ Any ] , Any ] :
import openai
import openai
@ -354,42 +379,8 @@ class ChatOpenAI(BaseChatModel):
""" Return type of chat model. """
""" Return type of chat model. """
return " openai-chat "
return " openai-chat "
def _get_encoding_model(self) -> Tuple[str, tiktoken.Encoding]:
    """Resolve the effective model name and its tiktoken encoding.

    Maps the floating aliases ``gpt-3.5-turbo`` and ``gpt-4`` to pinned
    snapshot names so token counts stay stable over time.

    Returns:
        Tuple of (resolved model name, tiktoken ``Encoding`` for it).
        Falls back to the ``cl100k_base`` encoding when tiktoken does
        not recognize the model.
    """
    tiktoken_ = _import_tiktoken()
    model = self.model_name
    if model == "gpt-3.5-turbo":
        # gpt-3.5-turbo may change over time.
        # Returning num tokens assuming gpt-3.5-turbo-0301.
        model = "gpt-3.5-turbo-0301"
    elif model == "gpt-4":
        # gpt-4 may change over time.
        # Returning num tokens assuming gpt-4-0314.
        model = "gpt-4-0314"
    # Look up the tokenizer for the resolved model name.
    try:
        encoding = tiktoken_.encoding_for_model(model)
    except KeyError:
        logger.warning("Warning: model not found. Using cl100k_base encoding.")
        model = "cl100k_base"
        encoding = tiktoken_.get_encoding(model)
    return model, encoding
def get_token_ids(self, text: str) -> List[int]:
    """Get the token IDs present in the text with the tiktoken package.

    Args:
        text: The string to tokenize.

    Returns:
        The list of token IDs produced by the model's tiktoken encoding.
    """
    # tiktoken is NOT supported for Python 3.7 or below; fall back to the
    # base-class (GPT-2 tokenizer) implementation there.
    if sys.version_info[1] <= 7:
        return super().get_token_ids(text)
    _, encoding_model = self._get_encoding_model()
    return encoding_model.encode(text)
def get_num_tokens_from_messages ( self , messages : List [ BaseMessage ] ) - > int :
""" Calculate num tokens for gpt-3.5-turbo and gpt-4 with tiktoken package.
Official documentation : https : / / github . com / openai / openai - cookbook / blob /
main / examples / How_to_format_inputs_to_ChatGPT_models . ipynb """
if sys . version_info [ 1 ] < = 7 :
return super ( ) . get_num_tokens_from_messages ( messages )
model , encoding = self . _get_encoding_model ( )
if model == " gpt-3.5-turbo-0301 " :
if model == " gpt-3.5-turbo-0301 " :
# every message follows <im_start>{role/name}\n{content}<im_end>\n
# every message follows <im_start>{role/name}\n{content}<im_end>\n
tokens_per_message = 4
tokens_per_message = 4