langchain/tests/integration_tests/test_schema.py
Zander Chase 785502edb3
Add 'get_token_ids' method (#4784)
Let users inspect the token ids in addition to getting the number of tokens

---------

Co-authored-by: Zach Schillaci <40636930+zachschillaci27@users.noreply.github.com>
2023-05-22 13:17:26 +00:00

20 lines
743 B
Python

"""Test formatting functionality."""
from langchain.base_language import _get_token_ids_default_method
class TestTokenCountingWithGPT2Tokenizer:
    """Checks on the token ids returned by ``_get_token_ids_default_method``."""

    def test_tokenization(self) -> None:
        """Token ids for a short sentence should match the GPT-2 tokenizer."""
        expected_ids = [1212, 318, 257, 1332]
        token_ids = _get_token_ids_default_method("This is a test")
        assert token_ids == expected_ids

    def test_empty_token(self) -> None:
        """An empty string should yield zero token ids."""
        token_ids = _get_token_ids_default_method("")
        assert len(token_ids) == 0

    def test_multiple_tokens(self) -> None:
        """Three space-separated letters should yield three token ids."""
        token_ids = _get_token_ids_default_method("a b c")
        assert len(token_ids) == 3

    def test_special_tokens(self) -> None:
        """Pin the token count for text containing ``:`` and ``_``.

        Serves as a consistency check if the default tokenizer is changed.
        """
        token_ids = _get_token_ids_default_method("a:b_c d")
        assert len(token_ids) == 6