add openai tokenizer (#355)

2024-11-04 06:00:26 +00:00 · 2022-12-15 22:35:42 -08:00 · 2022-12-15 22:35:42 -08:00 · 2dd895d98c
commit 2dd895d98c
parent c1b50b7b13
6 changed files with 2504 additions and 2296 deletions
--- a/docs/examples/prompts/llm_functionality.ipynb
+++ b/docs/examples/prompts/llm_functionality.ipynb
@ -49,7 +49,7 @@
    {
     "data": {
      "text/plain": [
-       "'\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side.'"
+       "'\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side!'"
      ]
     },
     "execution_count": 3,
@ -110,7 +110,7 @@
     "data": {
      "text/plain": [
       "[Generation(text='\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side.'),\n",
-       " Generation(text='\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side.')]"
+       " Generation(text='\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side!')]"
      ]
     },
     "execution_count": 6,
@ -132,7 +132,7 @@
     "data": {
      "text/plain": [
       "[Generation(text=\"\\n\\nA rose by the side of the road\\n\\nIs all I need to find my way\\n\\nTo the place I've been searching for\\n\\nAnd my heart is singing with joy\\n\\nWhen I look at this rose\\n\\nIt reminds me of the love I've found\\n\\nAnd I know that wherever I go\\n\\nI'll always find my rose by the side of the road.\"),\n",
-       " Generation(text=\"\\n\\nA rose by the side of the road\\n\\nIs all I need to find my way\\n\\nTo the place I've been searching for\\n\\nAnd my heart is singing with joy\\n\\nWhen I look at this rose\\n\\nIt tells me that true love is nigh\\n\\nAnd I know that this is the day\\n\\nWhen I look at this rose\\n\\nI am sure of what I am doing\\n\\nWhen I look at this rose\\n\\nI am confident in my love for you\\n\\nAnd I know that I am in love with you\\n\\nSo let it be, the rose by the side of the road\\n\\nAnd let it be what you do, what you are\\n\\nAnd you do it well, for this is what we want\\n\\nAnd we want to be with you\\n\\nAnd we want to be with you\\n\\nAnd we want to be with you\\n\\nWhen we find our way home\")]"
+       " Generation(text=\"\\n\\nWhen I was younger\\nI thought that love\\nI was something like a fairytale\\nI would find my prince and they would be my people\\nI was naïve\\nI thought that\\n\\nLove was a something that happened\\nWhen I was younger\\nI was it for my fairytale prince\\nNow I realize\\nThat love is something that waits\\nFor when my prince comes\\nAnd when I am ready to be his wife\\nI'll tell you a poem\\n\\nWhen I was younger\\nI thought that love\\nI was something like a fairytale\\nI would find my prince and they would be my people\\nI was naïve\\nI thought that\\n\\nLove was a something that happened\\nAnd I would be happy\\nWhen my prince came\\nAnd I was ready to be his wife\")]"
      ]
     },
     "execution_count": 7,
@ -153,9 +153,9 @@
    {
     "data": {
      "text/plain": [
-       "{'token_usage': {'completion_tokens': 4108,\n",
+       "{'token_usage': {'completion_tokens': 3722,\n",
       "  'prompt_tokens': 120,\n",
-       "  'total_tokens': 4228}}"
+       "  'total_tokens': 3842}}"
      ]
     },
     "execution_count": 8,
@ -184,13 +184,6 @@
   "id": "b623c774",
   "metadata": {},
   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
-     ]
-    },
    {
     "data": {
      "text/plain": [
--- a/langchain/llms/base.py
+++ b/langchain/llms/base.py
@ -76,7 +76,7 @@ class LLM(BaseModel, ABC):
        except ImportError:
            raise ValueError(
                "Could not import transformers python package. "
-                "This is needed in order to calculate max_tokens_for_prompt. "
+                "This is needed in order to calculate get_num_tokens. "
                "Please it install it with `pip install transformers`."
            )
        # create a GPT-3 tokenizer instance
--- a/langchain/llms/openai.py
+++ b/langchain/llms/openai.py
@ -186,6 +186,25 @@ class OpenAI(LLM, BaseModel):
        """
        return self.generate([prompt], stop=stop).generations[0][0].text

+    def get_num_tokens(self, text: str) -> int:
+        """Calculate num tokens with tiktoken package."""
+        try:
+            import tiktoken
+        except ImportError:
+            raise ValueError(
+                "Could not import tiktoken python package. "
+                "This is needed in order to calculate get_num_tokens. "
+                "Please install it with `pip install tiktoken`."
+            )
+        # create a GPT-3 encoder instance
+        enc = tiktoken.get_encoding("gpt2")
+
+        # encode the text using the GPT-3 encoder
+        tokenized_text = enc.encode(text)
+
+        # calculate the number of tokens in the encoded text
+        return len(tokenized_text)
+
    def modelname_to_contextsize(self, modelname: str) -> int:
        """Calculate the maximum number of tokens possible to generate for a model.

--- a/langchain/text_splitter.py
+++ b/langchain/text_splitter.py
@ -49,7 +49,7 @@ class TextSplitter(ABC):

    @classmethod
    def from_huggingface_tokenizer(cls, tokenizer: Any, **kwargs: Any) -> TextSplitter:
-        """Text splitter than uses HuggingFace tokenizer to count length."""
+        """Text splitter that uses HuggingFace tokenizer to count length."""
        try:
            from transformers import PreTrainedTokenizerBase

@ -68,6 +68,27 @@ class TextSplitter(ABC):
            )
        return cls(length_function=_huggingface_tokenizer_length, **kwargs)

+    @classmethod
+    def from_tiktoken_encoder(
+        cls, encoding_name: str = "gpt2", **kwargs: Any
+    ) -> TextSplitter:
+        """Text splitter that uses tiktoken encoder to count length."""
+        try:
+            import tiktoken
+        except ImportError:
+            raise ValueError(
+                "Could not import tiktoken python package. "
+                "This is needed in order to count tokens for the text splitter. "
+                "Please install it with `pip install tiktoken`."
+            )
+        # create an encoder instance for the requested encoding name
+        enc = tiktoken.get_encoding(encoding_name)
+
+        def _tiktoken_encoder(text: str) -> int:
+            return len(enc.encode(text))
+
+        return cls(length_function=_tiktoken_encoder, **kwargs)
+

 class CharacterTextSplitter(TextSplitter):
    """Implementation of splitting text that looks at characters."""
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -22,6 +22,7 @@ spacy = {version = "^3", optional = true}
 nltk = {version = "^3", optional = true}
 transformers = {version = "^4", optional = true}
 beautifulsoup4 = {version = "^4", optional = true}
+tiktoken = {version = "^0", optional = true, python="^3.9"}

 [tool.poetry.group.test.dependencies]
 pytest = "^7.2.0"
@ -49,7 +50,7 @@ playwright = "^1.28.0"

 [tool.poetry.extras]
 llms = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml"]
-all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4"]
+all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken"]

 [tool.isort]
 profile = "black"