2023-04-29 03:42:24 +00:00
|
|
|
"""Test formatting functionality."""
|
|
|
|
|
2023-07-07 20:09:10 +00:00
|
|
|
from langchain.schema.language_model import _get_token_ids_default_method
|
2023-04-29 03:42:24 +00:00
|
|
|
|
|
|
|
|
|
|
|
class TestTokenCountingWithGPT2Tokenizer:
    """Checks on the token IDs returned by ``_get_token_ids_default_method``."""

    def test_tokenization(self) -> None:
        """A known sentence must map to a fixed sequence of token IDs."""
        # Check that the tokenization is consistent with the GPT-2 tokenizer
        expected_ids = [1212, 318, 257, 1332]
        actual_ids = _get_token_ids_default_method("This is a test")
        assert actual_ids == expected_ids

    def test_empty_token(self) -> None:
        """An empty string must produce no token IDs."""
        token_ids = _get_token_ids_default_method("")
        assert len(token_ids) == 0

    def test_multiple_tokens(self) -> None:
        """Three space-separated letters must produce three token IDs."""
        token_ids = _get_token_ids_default_method("a b c")
        assert len(token_ids) == 3

    def test_special_tokens(self) -> None:
        """Text containing punctuation must produce the pinned token count."""
        # test for consistency when the default tokenizer is changed
        token_ids = _get_token_ids_default_method("a:b_c d")
        assert len(token_ids) == 6
|