diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index 27b21925..acda49e7 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
@@ -11,6 +11,22 @@
 from .test_helpers import ENCODING_FACTORIES, MAX_EXAMPLES
 
 
+@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
+def test_extremely_big_encoding(make_enc: Callable[[], tiktoken.Encoding]):
+    """Round-trip very long single-pattern strings through each encoding."""
+    enc = make_enc()
+    for c in ["^", "0", "a", " ", "\n", "'s"]:
+        print(f"Validating `{c}`")
+
+        # Repeat the pattern under test itself, not a fixed character.
+        big_value = c * 1000000
+        assert big_value == enc.decode(enc.encode(big_value))
+
+        # Same input again with a single leading space prepended.
+        big_value = " " + big_value
+        assert big_value == enc.decode(enc.encode(big_value))
+
+
 def test_simple():
     enc = tiktoken.get_encoding("gpt2")
     assert enc.encode("hello world") == [31373, 995]