mirror of https://github.com/hwchase17/langchain
huggingface tokenizer (#75)
parent
b542941234
commit
d87e73ddb1
@ -0,0 +1,23 @@
|
||||
"""Test text splitters that require an integration."""
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.text_splitter import CharacterTextSplitter
|
||||
|
||||
|
||||
def test_huggingface_type_check() -> None:
|
||||
"""Test that type checks are done properly on input."""
|
||||
with pytest.raises(ValueError):
|
||||
CharacterTextSplitter.from_huggingface_tokenizer("foo")
|
||||
|
||||
|
||||
def test_huggingface_tokenizer() -> None:
|
||||
"""Test text splitter that uses a HuggingFace tokenizer."""
|
||||
from transformers import GPT2TokenizerFast
|
||||
|
||||
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
|
||||
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
|
||||
tokenizer, separator=" ", chunk_size=1, chunk_overlap=0
|
||||
)
|
||||
output = text_splitter.split_text("foo bar")
|
||||
assert output == ["foo", "bar"]
|
Loading…
Reference in New Issue