Add Llama2 checks and log additional values (#1578)
pgmpablo157321 authored Jan 25, 2024
1 parent 901ce67 commit 190413d
Showing 7 changed files with 68 additions and 3 deletions.
2 changes: 2 additions & 0 deletions loadgen/bindings/python_api.cc
@@ -336,6 +336,8 @@ PYBIND11_MODULE(mlperf_loadgen, m) {
&TestSettings::test05_sample_index_rng_seed)
.def_readwrite("test05_schedule_rng_seed", &TestSettings::test05_schedule_rng_seed)
.def_readwrite("use_token_latencies", &TestSettings::use_token_latencies)
.def_readwrite("ttft_latency", &TestSettings::server_ttft_latency)
.def_readwrite("tpot_latency", &TestSettings::server_tpot_latency)
.def("FromConfig", &TestSettings::FromConfig, "FromConfig.");

pybind11::enum_<LoggingMode>(m, "LoggingMode")
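The new bindings expose `TestSettings::server_ttft_latency` and `TestSettings::server_tpot_latency` to Python as `ttft_latency` and `tpot_latency`. A minimal usage sketch, assuming the standard `mlperf_loadgen` module and nanosecond units; the surrounding scenario setup is illustrative, not part of this change:

```python
import mlperf_loadgen as lg

settings = lg.TestSettings()
settings.scenario = lg.TestScenario.Server
settings.use_token_latencies = True           # enable per-token latency tracking
settings.ttft_latency = 2000 * 1000 * 1000    # time-to-first-token target, ns (2000 ms)
settings.tpot_latency = 200 * 1000 * 1000     # time-per-output-token target, ns (200 ms)
```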
16 changes: 14 additions & 2 deletions loadgen/results.cc
@@ -636,16 +636,28 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) {
MLPERF_LOG(detail, "result_first_token_mean_latency_ns", first_token_latency_mean);
for (auto& lp : token_latency_percentiles) {
MLPERF_LOG(detail,
"result_" + DoubleToString(lp.percentile * 100) +
"result_first_token_" + DoubleToString(lp.percentile * 100) +
"_percentile_latency_ns",
lp.sample_latency);
if ((lp.percentile == .999) & (lp.sample_latency > settings.server_ttft_latency)){
MLPERF_LOG_WARNING(detail, "warning_generic_message",
"Value for result_first_token_" + DoubleToString(lp.percentile * 100) +
"_percentile_latency_ns greater than target time_to_first_token"
);
}
}
double tps_w_lg = ((double)token_count) / pr.final_query_issued_time;
double tps_wo_lg= ((double)token_count) / (sample_latency_mean * sample_count);
MLPERF_LOG(detail, "result_token_throughput_with_loadgen_overhead", tps_w_lg);
MLPERF_LOG(detail, "result_token_throughput", tps_wo_lg);
double tpot = sample_count * (sample_latency_mean - first_token_latency_mean) / ((double)token_count);
uint64_t tpot = sample_count * (sample_latency_mean - first_token_latency_mean) / (token_count);
MLPERF_LOG(detail, "result_time_to_output_token", tpot);
if (tpot > settings.server_tpot_latency){
MLPERF_LOG_WARNING(detail, "warning_generic_message",
"Value for result_time_to_output_token "
"greater than target time_to_output_token"
);
}
} else {
double tokens_per_second = token_count / pr.max_latency;
MLPERF_LOG(detail, "result_tokens_per_second", tokens_per_second);
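This hunk renames the token-latency percentile keys to `result_first_token_*`, warns when the 99.9th-percentile first-token latency exceeds the TTFT target, and logs `result_time_to_output_token` with a matching warning against the TPOT target. A rough Python restatement of the arithmetic, assuming nanosecond latencies throughout; the function and argument names are illustrative, not part of loadgen:

```python
def token_latency_checks(first_token_p999_ns, sample_latency_mean_ns,
                         first_token_latency_mean_ns, sample_count, token_count,
                         server_ttft_latency_ns, server_tpot_latency_ns):
    warnings = []
    # The 99.9th-percentile first-token latency is compared against the TTFT target.
    if first_token_p999_ns > server_ttft_latency_ns:
        warnings.append("result_first_token_99.9_percentile_latency_ns greater than "
                        "target time_to_first_token")
    # Time per output token: total completion latency minus first-token latency,
    # summed over all samples and averaged over every generated token.
    tpot_ns = sample_count * (sample_latency_mean_ns - first_token_latency_mean_ns) / token_count
    if tpot_ns > server_tpot_latency_ns:
        warnings.append("result_time_to_output_token greater than target time_to_output_token")
    return tpot_ns, warnings
```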
3 changes: 3 additions & 0 deletions loadgen/test_settings.h
@@ -264,6 +264,9 @@ struct TestSettings {
uint64_t performance_sample_count_override = 0;
/// \brief Measure token latencies
bool use_token_latencies = false;
/// Token latency parameters
uint64_t server_ttft_latency = 100000000;
uint64_t server_tpot_latency = 100000000;
/**@}*/
};

18 changes: 17 additions & 1 deletion loadgen/test_settings_internal.cc
@@ -49,7 +49,9 @@ TestSettingsInternal::TestSettingsInternal(
performance_issue_same_index(requested.performance_issue_same_index),
performance_sample_count(0),
sample_concatenate_permutation(false),
use_token_latencies(requested.use_token_latencies){
use_token_latencies(requested.use_token_latencies),
server_ttft_latency(requested.server_ttft_latency),
server_tpot_latency(requested.server_tpot_latency){
// Target QPS, target latency, and max_async_queries.
switch (requested.scenario) {
case TestScenario::SingleStream:
@@ -330,6 +332,14 @@ void LogRequestedTestSettings(const TestSettings &s) {
s.performance_issue_same_index);
MLPERF_LOG(detail, "requested_performance_sample_count_override",
s.performance_sample_count_override);
// Token latencies specific values
if (s.use_token_latencies){
MLPERF_LOG(detail, "requested_use_token_latencies", s.use_token_latencies);
if (s.scenario != TestScenario::Offline){
MLPERF_LOG(detail, "requested_server_ttft_latency", s.server_ttft_latency);
MLPERF_LOG(detail, "requested_server_tpot_latency", s.server_tpot_latency);
}
}
#else
detail("");
detail("Requested Settings:");
@@ -673,8 +683,14 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model,
lookupkv(model, scenario, "test05_sample_index_rng_seed", &test05_sample_index_rng_seed,
nullptr);
lookupkv(model, scenario, "test05_schedule_rng_seed", &test05_schedule_rng_seed, nullptr);

// keys that apply to token metrics
if (lookupkv(model, scenario, "use_token_latencies", &val, nullptr))
use_token_latencies = (val == 1) ? true : false;
if (use_token_latencies){
lookupkv(model, "Server", "ttft_latency", &server_ttft_latency, nullptr, 1000 * 1000);
lookupkv(model, "Server", "tpot_latency", &server_tpot_latency, nullptr, 1000 * 1000);
}

// keys that apply to SingleStream
lookupkv(model, "SingleStream", "target_latency_percentile", nullptr,
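`lookupkv` is called with a `1000 * 1000` multiplier, so `ttft_latency` and `tpot_latency` in the config file are read as milliseconds and stored in `TestSettings` as nanoseconds (the 100 ms defaults from `test_settings.h` apply when the keys are absent). A small sketch of that scaling, using the llama2-70b values added to mlperf.conf below:

```python
# Config values are in milliseconds (from mlperf.conf); the lookupkv multiplier of
# 1000 * 1000 converts them into the nanosecond fields of TestSettings.
conf_ttft_ms = 2000
conf_tpot_ms = 200

server_ttft_latency = conf_ttft_ms * 1000 * 1000   # 2_000_000_000 ns
server_tpot_latency = conf_tpot_ms * 1000 * 1000   #   200_000_000 ns
print(server_ttft_latency, server_tpot_latency)
```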
2 changes: 2 additions & 0 deletions loadgen/test_settings_internal.h
@@ -83,6 +83,8 @@ struct TestSettingsInternal {

bool sample_concatenate_permutation;
bool use_token_latencies = false;
uint64_t server_ttft_latency;
uint64_t server_tpot_latency;
};

/// \brief A namespace of collections of FindPeakPerformance helper functions,
1 change: 1 addition & 0 deletions mlperf.conf
@@ -58,6 +58,7 @@ rnnt.Server.target_latency = 1000
gptj.Server.target_latency = 20000
stable-diffusion-xl.Server.target_latency = 20000
# Falcon Server scenario requires two latency constraints
llama2-70b.*.use_token_latencies = 1
llama2-70b.Server.target_latency = 2000
llama2-70b.Server.ttft_latency = 2000
llama2-70b.Server.tpot_latency = 200
29 changes: 29 additions & 0 deletions tools/submission/submission_checker.py
@@ -1389,6 +1389,17 @@
}
}

LLAMA2_LATENCY_LIMITS = {
"interactive": {
"ttft": 500,
"tpot": 50
},
"conversational": {
"ttft": 2000,
"tpot": 200
}
}

ACC_PATTERN = {
"acc": r"^accuracy=([\d\.]+).*",
"AUC": r"^AUC=([\d\.]+).*",
@@ -1891,6 +1902,19 @@ def check_accuracy_dir(config, model, path, verbose):
return is_valid, result_acc


def extra_check_llama2(mlperf_log, scenario):
if (mlperf_log["use_token_latencies"]):
if scenario == "Offline":
# For offline no further checks are necessary
return None, True
else:
for constraint, limits in LLAMA2_LATENCY_LIMITS.items():
if mlperf_log["result_first_token_99.9_percentile_latency_ns"] < limits["ttft"] and mlperf_log["result_time_to_output_token"] < limits["tpot"]:
return constraint, True
else:
return None, False


def get_performance_metric(
config, model, path, scenario_fixed, division, system_json, has_power=False
):
@@ -1911,6 +1935,8 @@ def get_performance_metric(
)

res = float(mlperf_log[RESULT_FIELD_NEW[config.version][scenario_for_res]])
if model in RESULT_FIELD_BENCHMARK_OVERWRITE and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[model]:
res = float(mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[model][scenario_for_res]])

inferred = False
if scenario_fixed != scenario:
@@ -1946,6 +1972,9 @@ def check_performance_dir(
res = float(mlperf_log[RESULT_FIELD_NEW[config.version][scenario_for_res]])
if model in RESULT_FIELD_BENCHMARK_OVERWRITE and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[model]:
res = float(mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[model][scenario_for_res]])

if model in ["llama2-70b-99", "llama2-70b-99.9"]:
llama_constraint, is_valid = extra_check_llama2(mlperf_log, scenario_fixed)

latency_99_percentile = mlperf_log["result_99.00_percentile_latency_ns"]
latency_mean = mlperf_log["result_mean_latency_ns"]
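`extra_check_llama2` returns a `(constraint, is_valid)` pair: `(None, True)` for Offline (no further checks are needed), a constraint name from `LLAMA2_LATENCY_LIMITS` when both the 99.9th-percentile first-token latency and the time-per-output-token fall under that constraint's budgets, and `(None, False)` when the checks fail. A minimal sketch of the classification idea behind the limits table; the function name is illustrative, and the measured values are assumed to be in the same units as the table:

```python
def classify_llama2(ttft, tpot, limits=LLAMA2_LATENCY_LIMITS):
    """Return the first constraint set whose TTFT and TPOT budgets both hold."""
    for name, budget in limits.items():
        if ttft < budget["ttft"] and tpot < budget["tpot"]:
            return name
    return None  # neither "interactive" nor "conversational" budgets were met
```

Because "interactive" is checked first and has the tighter budgets, a result with TTFT = 400 and TPOT = 40 would classify as "interactive", while TTFT = 1500 and TPOT = 150 would fall through to "conversational".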
