add openai tokenizer (#355)

This commit is contained in:
Harrison Chase 2022-12-15 22:35:42 -08:00 committed by GitHub
parent c1b50b7b13
commit 2dd895d98c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 2504 additions and 2296 deletions

View File

@ -49,7 +49,7 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"'\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side.'" "'\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side!'"
] ]
}, },
"execution_count": 3, "execution_count": 3,
@ -110,7 +110,7 @@
"data": { "data": {
"text/plain": [ "text/plain": [
"[Generation(text='\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side.'),\n", "[Generation(text='\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side.'),\n",
" Generation(text='\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side.')]" " Generation(text='\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side!')]"
] ]
}, },
"execution_count": 6, "execution_count": 6,
@ -132,7 +132,7 @@
"data": { "data": {
"text/plain": [ "text/plain": [
"[Generation(text=\"\\n\\nA rose by the side of the road\\n\\nIs all I need to find my way\\n\\nTo the place I've been searching for\\n\\nAnd my heart is singing with joy\\n\\nWhen I look at this rose\\n\\nIt reminds me of the love I've found\\n\\nAnd I know that wherever I go\\n\\nI'll always find my rose by the side of the road.\"),\n", "[Generation(text=\"\\n\\nA rose by the side of the road\\n\\nIs all I need to find my way\\n\\nTo the place I've been searching for\\n\\nAnd my heart is singing with joy\\n\\nWhen I look at this rose\\n\\nIt reminds me of the love I've found\\n\\nAnd I know that wherever I go\\n\\nI'll always find my rose by the side of the road.\"),\n",
" Generation(text=\"\\n\\nA rose by the side of the road\\n\\nIs all I need to find my way\\n\\nTo the place I've been searching for\\n\\nAnd my heart is singing with joy\\n\\nWhen I look at this rose\\n\\nIt tells me that true love is nigh\\n\\nAnd I know that this is the day\\n\\nWhen I look at this rose\\n\\nI am sure of what I am doing\\n\\nWhen I look at this rose\\n\\nI am confident in my love for you\\n\\nAnd I know that I am in love with you\\n\\nSo let it be, the rose by the side of the road\\n\\nAnd let it be what you do, what you are\\n\\nAnd you do it well, for this is what we want\\n\\nAnd we want to be with you\\n\\nAnd we want to be with you\\n\\nAnd we want to be with you\\n\\nWhen we find our way home\")]" " Generation(text=\"\\n\\nWhen I was younger\\nI thought that love\\nI was something like a fairytale\\nI would find my prince and they would be my people\\nI was naïve\\nI thought that\\n\\nLove was a something that happened\\nWhen I was younger\\nI was it for my fairytale prince\\nNow I realize\\nThat love is something that waits\\nFor when my prince comes\\nAnd when I am ready to be his wife\\nI'll tell you a poem\\n\\nWhen I was younger\\nI thought that love\\nI was something like a fairytale\\nI would find my prince and they would be my people\\nI was naïve\\nI thought that\\n\\nLove was a something that happened\\nAnd I would be happy\\nWhen my prince came\\nAnd I was ready to be his wife\")]"
] ]
}, },
"execution_count": 7, "execution_count": 7,
@ -153,9 +153,9 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"{'token_usage': {'completion_tokens': 4108,\n", "{'token_usage': {'completion_tokens': 3722,\n",
" 'prompt_tokens': 120,\n", " 'prompt_tokens': 120,\n",
" 'total_tokens': 4228}}" " 'total_tokens': 3842}}"
] ]
}, },
"execution_count": 8, "execution_count": 8,
@ -184,13 +184,6 @@
"id": "b623c774", "id": "b623c774",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
]
},
{ {
"data": { "data": {
"text/plain": [ "text/plain": [

View File

@ -76,7 +76,7 @@ class LLM(BaseModel, ABC):
except ImportError: except ImportError:
raise ValueError( raise ValueError(
"Could not import transformers python package. " "Could not import transformers python package. "
"This is needed in order to calculate max_tokens_for_prompt. " "This is needed in order to calculate get_num_tokens. "
"Please it install it with `pip install transformers`." "Please it install it with `pip install transformers`."
) )
# create a GPT-3 tokenizer instance # create a GPT-3 tokenizer instance

View File

@ -186,6 +186,25 @@ class OpenAI(LLM, BaseModel):
""" """
return self.generate([prompt], stop=stop).generations[0][0].text return self.generate([prompt], stop=stop).generations[0][0].text
def get_num_tokens(self, text: str) -> int:
    """Calculate the number of tokens in ``text`` using the tiktoken package.

    Args:
        text: The string to tokenize.

    Returns:
        The number of GPT-3 (``gpt2`` encoding) tokens in the text.

    Raises:
        ValueError: If the ``tiktoken`` package is not installed.
    """
    try:
        import tiktoken
    except ImportError:
        # Keep ValueError (not ImportError) for consistency with the
        # transformers-based fallback in the LLM base class.
        raise ValueError(
            "Could not import tiktoken python package. "
            "This is needed in order to calculate get_num_tokens. "
            "Please install it with `pip install tiktoken`."
        )
    # Create a GPT-3 encoder instance and count tokens in the encoded text.
    enc = tiktoken.get_encoding("gpt2")
    return len(enc.encode(text))
def modelname_to_contextsize(self, modelname: str) -> int: def modelname_to_contextsize(self, modelname: str) -> int:
"""Calculate the maximum number of tokens possible to generate for a model. """Calculate the maximum number of tokens possible to generate for a model.

View File

@ -49,7 +49,7 @@ class TextSplitter(ABC):
@classmethod @classmethod
def from_huggingface_tokenizer(cls, tokenizer: Any, **kwargs: Any) -> TextSplitter: def from_huggingface_tokenizer(cls, tokenizer: Any, **kwargs: Any) -> TextSplitter:
"""Text splitter than uses HuggingFace tokenizer to count length.""" """Text splitter that uses HuggingFace tokenizer to count length."""
try: try:
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
@ -68,6 +68,27 @@ class TextSplitter(ABC):
) )
return cls(length_function=_huggingface_tokenizer_length, **kwargs) return cls(length_function=_huggingface_tokenizer_length, **kwargs)
@classmethod
def from_tiktoken_encoder(
    cls, encoding_name: str = "gpt2", **kwargs: Any
) -> TextSplitter:
    """Create a text splitter that uses a tiktoken encoder to count length.

    Args:
        encoding_name: Name of the tiktoken encoding to use
            (defaults to ``"gpt2"``, the GPT-3 encoding).
        **kwargs: Additional arguments forwarded to the splitter constructor.

    Returns:
        A splitter whose length function counts tiktoken tokens.

    Raises:
        ValueError: If the ``tiktoken`` package is not installed.
    """
    try:
        import tiktoken
    except ImportError:
        raise ValueError(
            "Could not import tiktoken python package. "
            "This is needed in order to calculate length with a "
            "tiktoken encoder. "
            "Please install it with `pip install tiktoken`."
        )
    # Create the encoder once; the closure below reuses it per call.
    enc = tiktoken.get_encoding(encoding_name)

    def _tiktoken_encoder(text: str) -> int:
        # Length is measured in tokens, not characters.
        return len(enc.encode(text))

    return cls(length_function=_tiktoken_encoder, **kwargs)
class CharacterTextSplitter(TextSplitter): class CharacterTextSplitter(TextSplitter):
"""Implementation of splitting text that looks at characters.""" """Implementation of splitting text that looks at characters."""

4736
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -22,6 +22,7 @@ spacy = {version = "^3", optional = true}
nltk = {version = "^3", optional = true} nltk = {version = "^3", optional = true}
transformers = {version = "^4", optional = true} transformers = {version = "^4", optional = true}
beautifulsoup4 = {version = "^4", optional = true} beautifulsoup4 = {version = "^4", optional = true}
tiktoken = {version = "^0", optional = true, python="^3.9"}
[tool.poetry.group.test.dependencies] [tool.poetry.group.test.dependencies]
pytest = "^7.2.0" pytest = "^7.2.0"
@ -49,7 +50,7 @@ playwright = "^1.28.0"
[tool.poetry.extras] [tool.poetry.extras]
llms = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml"] llms = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml"]
all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4"] all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken"]
[tool.isort] [tool.isort]
profile = "black" profile = "black"