From 190413d97b5a75b167a88e2edf5933a3f015fb0d Mon Sep 17 00:00:00 2001
From: Pablo Gonzalez
Date: Thu, 25 Jan 2024 09:57:00 -0500
Subject: [PATCH] Add Llama2 checks and log additional values (#1578)

---
 loadgen/bindings/python_api.cc         |  2 ++
 loadgen/results.cc                     | 16 ++++++++++++--
 loadgen/test_settings.h                |  3 +++
 loadgen/test_settings_internal.cc      | 18 +++++++++++++++-
 loadgen/test_settings_internal.h       |  2 ++
 mlperf.conf                            |  1 +
 tools/submission/submission_checker.py | 29 ++++++++++++++++++++++++++
 7 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc
index 816f7a4f6..cfe24bd3c 100644
--- a/loadgen/bindings/python_api.cc
+++ b/loadgen/bindings/python_api.cc
@@ -336,6 +336,8 @@ PYBIND11_MODULE(mlperf_loadgen, m) {
                      &TestSettings::test05_sample_index_rng_seed)
       .def_readwrite("test05_schedule_rng_seed",
                      &TestSettings::test05_schedule_rng_seed)
       .def_readwrite("use_token_latencies", &TestSettings::use_token_latencies)
+      .def_readwrite("ttft_latency", &TestSettings::server_ttft_latency)
+      .def_readwrite("tpot_latency", &TestSettings::server_tpot_latency)
       .def("FromConfig", &TestSettings::FromConfig, "FromConfig.");
   pybind11::enum_<LoggingMode>(m, "LoggingMode")
diff --git a/loadgen/results.cc b/loadgen/results.cc
index 21fd2c90a..5048ac72e 100644
--- a/loadgen/results.cc
+++ b/loadgen/results.cc
@@ -636,16 +636,28 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) {
     MLPERF_LOG(detail, "result_first_token_mean_latency_ns", first_token_latency_mean);
     for (auto& lp : token_latency_percentiles) {
       MLPERF_LOG(detail,
-                 "result_" + DoubleToString(lp.percentile * 100) +
+                 "result_first_token_" + DoubleToString(lp.percentile * 100) +
                      "_percentile_latency_ns",
                  lp.sample_latency);
+      if ((lp.percentile == .999) && (lp.sample_latency > settings.server_ttft_latency)) {
+        MLPERF_LOG_WARNING(detail, "warning_generic_message",
+            "Value for result_first_token_" + DoubleToString(lp.percentile * 100) +
+            "_percentile_latency_ns greater than target time_to_first_token"
+        );
+      }
     }
     double tps_w_lg = ((double)token_count) / pr.final_query_issued_time;
     double tps_wo_lg= ((double)token_count) / (sample_latency_mean * sample_count);
     MLPERF_LOG(detail, "result_token_throughput_with_loadgen_overhead", tps_w_lg);
     MLPERF_LOG(detail, "result_token_throughput", tps_wo_lg);
-    double tpot = sample_count * (sample_latency_mean - first_token_latency_mean) / ((double)token_count);
+    uint64_t tpot = sample_count * (sample_latency_mean - first_token_latency_mean) / (token_count);
     MLPERF_LOG(detail, "result_time_to_output_token", tpot);
+    if (tpot > settings.server_tpot_latency) {
+      MLPERF_LOG_WARNING(detail, "warning_generic_message",
+          "Value for result_time_to_output_token "
+          "greater than target time_to_output_token"
+      );
+    }
   } else {
     double tokens_per_second = token_count / pr.max_latency;
     MLPERF_LOG(detail, "result_tokens_per_second", tokens_per_second);
diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h
index ccae5327a..b0018380d 100644
--- a/loadgen/test_settings.h
+++ b/loadgen/test_settings.h
@@ -264,6 +264,9 @@ struct TestSettings {
   uint64_t performance_sample_count_override = 0;
   /// \brief Measure token latencies
   bool use_token_latencies = false;
+  /// Token latency parameters
+  uint64_t server_ttft_latency = 100000000;
+  uint64_t server_tpot_latency = 100000000;
 
   /**@}*/
 };
diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc
index 0a4b9af6e..95e53b731 100644
--- a/loadgen/test_settings_internal.cc
+++ b/loadgen/test_settings_internal.cc
@@ -49,7 +49,9 @@ TestSettingsInternal::TestSettingsInternal(
       performance_issue_same_index(requested.performance_issue_same_index),
       performance_sample_count(0),
       sample_concatenate_permutation(false),
-      use_token_latencies(requested.use_token_latencies){
+      use_token_latencies(requested.use_token_latencies),
+      server_ttft_latency(requested.server_ttft_latency),
+      server_tpot_latency(requested.server_tpot_latency){
   // Target QPS, target latency, and max_async_queries.
   switch (requested.scenario) {
     case TestScenario::SingleStream:
@@ -330,6 +332,14 @@ void LogRequestedTestSettings(const TestSettings &s) {
              s.performance_issue_same_index);
   MLPERF_LOG(detail, "requested_performance_sample_count_override",
              s.performance_sample_count_override);
+  // Token latencies specific values
+  if (s.use_token_latencies){
+    MLPERF_LOG(detail, "requested_use_token_latencies", s.use_token_latencies);
+    if (s.scenario != TestScenario::Offline){
+      MLPERF_LOG(detail, "requested_server_ttft_latency", s.server_ttft_latency);
+      MLPERF_LOG(detail, "requested_server_tpot_latency", s.server_tpot_latency);
+    }
+  }
 #else
   detail("");
   detail("Requested Settings:");
@@ -673,8 +683,14 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model,
   lookupkv(model, scenario, "test05_sample_index_rng_seed",
            &test05_sample_index_rng_seed, nullptr);
   lookupkv(model, scenario, "test05_schedule_rng_seed",
           &test05_schedule_rng_seed, nullptr);
+
+  // keys that apply to token metrics
   if (lookupkv(model, scenario, "use_token_latencies", &val, nullptr))
     use_token_latencies = (val == 1) ? true : false;
+  if (use_token_latencies){
+    lookupkv(model, "Server", "ttft_latency", &server_ttft_latency, nullptr, 1000 * 1000);
+    lookupkv(model, "Server", "tpot_latency", &server_tpot_latency, nullptr, 1000 * 1000);
+  }
 
   // keys that apply to SingleStream
   lookupkv(model, "SingleStream", "target_latency_percentile", nullptr,
diff --git a/loadgen/test_settings_internal.h b/loadgen/test_settings_internal.h
index 6bc1c3cf3..3392dd59d 100644
--- a/loadgen/test_settings_internal.h
+++ b/loadgen/test_settings_internal.h
@@ -83,6 +83,8 @@ struct TestSettingsInternal {
   bool sample_concatenate_permutation;
 
   bool use_token_latencies = false;
+  uint64_t server_ttft_latency;
+  uint64_t server_tpot_latency;
 };
 
 /// \brief A namespace of collections of FindPeakPerformance helper functions,
diff --git a/mlperf.conf b/mlperf.conf
index fe743a8dd..8c6f53849 100644
--- a/mlperf.conf
+++ b/mlperf.conf
@@ -58,6 +58,7 @@ rnnt.Server.target_latency = 1000
 gptj.Server.target_latency = 20000
 stable-diffusion-xl.Server.target_latency = 20000
 # Falcon Server scenario requires two latency constraints
+llama2-70b.*.use_token_latencies = 1
 llama2-70b.Server.target_latency = 2000
 llama2-70b.Server.ttft_latency = 2000
 llama2-70b.Server.tpot_latency = 200
diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py
index 2d00b6a0e..bd716fbd0 100755
--- a/tools/submission/submission_checker.py
+++ b/tools/submission/submission_checker.py
@@ -1389,6 +1389,17 @@
     }
 }
 
+LLAMA2_LATENCY_LIMITS = {
+    "interactive": {
+        "ttft": 500,
+        "tpot": 50
+    },
+    "conversational": {
+        "ttft": 2000,
+        "tpot": 200
+    }
+}
+
 ACC_PATTERN = {
     "acc": r"^accuracy=([\d\.]+).*",
     "AUC": r"^AUC=([\d\.]+).*",
@@ -1891,6 +1902,19 @@ def check_accuracy_dir(config, model, path, verbose):
     return is_valid, result_acc
 
 
+def extra_check_llama2(mlperf_log, scenario):
+    if (mlperf_log["use_token_latencies"]):
+        if scenario == "Offline":
+            # For offline no further checks are necessary
+            return None, True
+        else:
+            for constraint, limits in LLAMA2_LATENCY_LIMITS.items():
+                # Limits are in milliseconds; logged latencies are in nanoseconds
+                if mlperf_log["result_first_token_99.9_percentile_latency_ns"] < limits["ttft"] * 1000 * 1000 and mlperf_log["result_time_to_output_token"] < limits["tpot"] * 1000 * 1000:
+                    return constraint, True
+    return None, False
+
+
 def get_performance_metric(
     config, model, path, scenario_fixed, division, system_json, has_power=False
 ):
@@ -1911,6 +1935,8 @@ def get_performance_metric(
     )
 
     res = float(mlperf_log[RESULT_FIELD_NEW[config.version][scenario_for_res]])
+    if model in RESULT_FIELD_BENCHMARK_OVERWRITE and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[model]:
+        res = float(mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[model][scenario_for_res]])
 
     inferred = False
     if scenario_fixed != scenario:
@@ -1946,6 +1972,9 @@ def check_performance_dir(
     res = float(mlperf_log[RESULT_FIELD_NEW[config.version][scenario_for_res]])
     if model in RESULT_FIELD_BENCHMARK_OVERWRITE and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[model]:
         res = float(mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[model][scenario_for_res]])
+
+    if model in ["llama2-70b-99", "llama2-70b-99.9"]:
+        llama_constraint, is_valid = extra_check_llama2(mlperf_log, scenario_fixed)
 
     latency_99_percentile = mlperf_log["result_99.00_percentile_latency_ns"]
     latency_mean = mlperf_log["result_mean_latency_ns"]
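
Reviewer note (not part of the patch): a minimal sketch, assuming the mlperf_loadgen module is built from this change, of how the new ttft_latency / tpot_latency bindings could be set directly from Python instead of through mlperf.conf. The scenario, mode, and target values below are illustrative assumptions; the fields hold nanoseconds (FromConfig scales the millisecond values in mlperf.conf by 1000 * 1000).

    # Illustrative sketch only; values and scenario are assumptions, not part of the patch.
    import mlperf_loadgen as lg

    settings = lg.TestSettings()
    settings.scenario = lg.TestScenario.Server        # token-latency targets apply outside Offline
    settings.mode = lg.TestMode.PerformanceOnly
    settings.use_token_latencies = True
    # New bindings added by this patch; values are in nanoseconds.
    settings.ttft_latency = 2000 * 1000 * 1000        # 2000 ms time-to-first-token target
    settings.tpot_latency = 200 * 1000 * 1000         # 200 ms time-per-output-token target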