From 4bf478bf345c52022fd21e2a159605f84f28ef12 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?=
Date: Sun, 11 Feb 2024 22:48:30 +0100
Subject: [PATCH] Add tests for humongous encodings

---
 tests/test_encoding.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index 27b21925..acda49e7 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
@@ -11,6 +11,25 @@
 from .test_helpers import ENCODING_FACTORIES, MAX_EXAMPLES
 
 
+@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
+def test_extremely_big_encoding(make_enc: Callable[[], tiktoken.Encoding]):
+    """Round-trip very long repetitions of short strings through the encoder.
+
+    Each string in the list below is repeated one million times, with and
+    without a leading space, and must decode back to the exact input.
+    """
+    enc = make_enc()
+    for c in ["^", "0", "a", " ", "\n", "'s"]:
+        print(f"Validating `{c}`")
+
+        # Build the input from the loop value so every listed string is tested.
+        big_value = c * 1_000_000
+        assert big_value == enc.decode(enc.encode(big_value))
+
+        big_value = " " + big_value
+        assert big_value == enc.decode(enc.encode(big_value))
+
+
 def test_simple():
     enc = tiktoken.get_encoding("gpt2")
     assert enc.encode("hello world") == [31373, 995]