From 190413d97b5a75b167a88e2edf5933a3f015fb0d Mon Sep 17 00:00:00 2001
From: Pablo Gonzalez
Date: Thu, 25 Jan 2024 09:57:00 -0500
Subject: [PATCH] Add Llama2 checks and log additional values (#1578)

---
 loadgen/bindings/python_api.cc         |  2 ++
 loadgen/results.cc                     | 16 ++++++++++++--
 loadgen/test_settings.h                |  3 +++
 loadgen/test_settings_internal.cc      | 18 +++++++++++++++-
 loadgen/test_settings_internal.h       |  2 ++
 mlperf.conf                            |  1 +
 tools/submission/submission_checker.py | 29 ++++++++++++++++++++++++++
 7 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc
index 816f7a4f6..cfe24bd3c 100644
--- a/loadgen/bindings/python_api.cc
+++ b/loadgen/bindings/python_api.cc
@@ -336,6 +336,8 @@ PYBIND11_MODULE(mlperf_loadgen, m) {
                      &TestSettings::test05_sample_index_rng_seed)
       .def_readwrite("test05_schedule_rng_seed",
                      &TestSettings::test05_schedule_rng_seed)
       .def_readwrite("use_token_latencies", &TestSettings::use_token_latencies)
+      .def_readwrite("ttft_latency", &TestSettings::server_ttft_latency)
+      .def_readwrite("tpot_latency", &TestSettings::server_tpot_latency)
       .def("FromConfig", &TestSettings::FromConfig, "FromConfig.");
   pybind11::enum_<LoggingMode>(m, "LoggingMode")
diff --git a/loadgen/results.cc b/loadgen/results.cc
index 21fd2c90a..5048ac72e 100644
--- a/loadgen/results.cc
+++ b/loadgen/results.cc
@@ -636,16 +636,28 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) {
     MLPERF_LOG(detail, "result_first_token_mean_latency_ns", first_token_latency_mean);
     for (auto& lp : token_latency_percentiles) {
       MLPERF_LOG(detail,
-                 "result_" + DoubleToString(lp.percentile * 100) +
+                 "result_first_token_" + DoubleToString(lp.percentile * 100) +
                      "_percentile_latency_ns",
                  lp.sample_latency);
+      if ((lp.percentile == .999) && (lp.sample_latency > settings.server_ttft_latency)) {
+        MLPERF_LOG_WARNING(detail, "warning_generic_message",
+            "Value for result_first_token_" + DoubleToString(lp.percentile * 100) +
+            "_percentile_latency_ns greater than target time_to_first_token"
+        );
+      }
     }
     double tps_w_lg = ((double)token_count) / pr.final_query_issued_time;
     double tps_wo_lg= ((double)token_count) / (sample_latency_mean * sample_count);
     MLPERF_LOG(detail, "result_token_throughput_with_loadgen_overhead", tps_w_lg);
     MLPERF_LOG(detail, "result_token_throughput", tps_wo_lg);
-    double tpot = sample_count * (sample_latency_mean - first_token_latency_mean) / ((double)token_count);
+    uint64_t tpot = sample_count * (sample_latency_mean - first_token_latency_mean) / (token_count);
     MLPERF_LOG(detail, "result_time_to_output_token", tpot);
+    if (tpot > settings.server_tpot_latency) {
+      MLPERF_LOG_WARNING(detail, "warning_generic_message",
+          "Value for result_time_to_output_token "
+          "greater than target time_to_output_token"
+      );
+    }
   } else {
     double tokens_per_second = token_count / pr.max_latency;
     MLPERF_LOG(detail, "result_tokens_per_second", tokens_per_second);
diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h
index ccae5327a..b0018380d 100644
--- a/loadgen/test_settings.h
+++ b/loadgen/test_settings.h
@@ -264,6 +264,9 @@ struct TestSettings {
   uint64_t performance_sample_count_override = 0;
   /// \brief Measure token latencies
   bool use_token_latencies = false;
+  /// Token latency parameters
+  uint64_t server_ttft_latency = 100000000;
+  uint64_t server_tpot_latency = 100000000;
 
   /**@}*/
 };
diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc
index 0a4b9af6e..95e53b731 100644
--- a/loadgen/test_settings_internal.cc
+++ b/loadgen/test_settings_internal.cc
@@ -49,7 +49,9 @@ TestSettingsInternal::TestSettingsInternal(
       performance_issue_same_index(requested.performance_issue_same_index),
       performance_sample_count(0),
       sample_concatenate_permutation(false),
-      use_token_latencies(requested.use_token_latencies){
+      use_token_latencies(requested.use_token_latencies),
+      server_ttft_latency(requested.server_ttft_latency),
+      server_tpot_latency(requested.server_tpot_latency){
   // Target QPS, target latency, and max_async_queries.
   switch (requested.scenario) {
     case TestScenario::SingleStream:
@@ -330,6 +332,14 @@ void LogRequestedTestSettings(const TestSettings &s) {
              s.performance_issue_same_index);
   MLPERF_LOG(detail, "requested_performance_sample_count_override",
              s.performance_sample_count_override);
+  // Token latencies specific values
+  if (s.use_token_latencies){
+    MLPERF_LOG(detail, "requested_use_token_latencies", s.use_token_latencies);
+    if (s.scenario != TestScenario::Offline){
+      MLPERF_LOG(detail, "requested_server_ttft_latency", s.server_ttft_latency);
+      MLPERF_LOG(detail, "requested_server_tpot_latency", s.server_tpot_latency);
+    }
+  }
 #else
   detail("");
   detail("Requested Settings:");
@@ -673,8 +683,14 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model,
   lookupkv(model, scenario, "test05_sample_index_rng_seed",
            &test05_sample_index_rng_seed, nullptr);
   lookupkv(model, scenario, "test05_schedule_rng_seed",
           &test05_schedule_rng_seed, nullptr);
+
+  // keys that apply to token metrics
   if (lookupkv(model, scenario, "use_token_latencies", &val, nullptr))
     use_token_latencies = (val == 1) ? true : false;
+  if (use_token_latencies){
+    lookupkv(model, "Server", "ttft_latency", &server_ttft_latency, nullptr, 1000 * 1000);
+    lookupkv(model, "Server", "tpot_latency", &server_tpot_latency, nullptr, 1000 * 1000);
+  }
 
   // keys that apply to SingleStream
   lookupkv(model, "SingleStream", "target_latency_percentile", nullptr,
diff --git a/loadgen/test_settings_internal.h b/loadgen/test_settings_internal.h
index 6bc1c3cf3..3392dd59d 100644
--- a/loadgen/test_settings_internal.h
+++ b/loadgen/test_settings_internal.h
@@ -83,6 +83,8 @@ struct TestSettingsInternal {
   bool sample_concatenate_permutation;
 
   bool use_token_latencies = false;
+  uint64_t server_ttft_latency;
+  uint64_t server_tpot_latency;
 };
 
 /// \brief A namespace of collections of FindPeakPerformance helper functions,
diff --git a/mlperf.conf b/mlperf.conf
index fe743a8dd..8c6f53849 100644
--- a/mlperf.conf
+++ b/mlperf.conf
@@ -58,6 +58,7 @@ rnnt.Server.target_latency = 1000
 gptj.Server.target_latency = 20000
 stable-diffusion-xl.Server.target_latency = 20000
 # Falcon Server scenario requires two latency constraints
+llama2-70b.*.use_token_latencies = 1
 llama2-70b.Server.target_latency = 2000
 llama2-70b.Server.ttft_latency = 2000
 llama2-70b.Server.tpot_latency = 200
diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py
index 2d00b6a0e..bd716fbd0 100755
--- a/tools/submission/submission_checker.py
+++ b/tools/submission/submission_checker.py
@@ -1389,6 +1389,17 @@
     }
 }
 
+LLAMA2_LATENCY_LIMITS = {
+    "interactive": {
+        "ttft": 500,
+        "tpot": 50
+    },
+    "conversational": {
+        "ttft": 2000,
+        "tpot": 200
+    }
+}
+
 ACC_PATTERN = {
     "acc": r"^accuracy=([\d\.]+).*",
     "AUC": r"^AUC=([\d\.]+).*",
@@ -1891,6 +1902,19 @@ def check_accuracy_dir(config, model, path, verbose):
     return is_valid, result_acc
 
 
+def extra_check_llama2(mlperf_log, scenario):
+    if (mlperf_log["use_token_latencies"]):
+        if scenario == "Offline":
+            # For offline no further checks are necessary
+            return None, True
+        else:
+            for constraint, limits in LLAMA2_LATENCY_LIMITS.items():
+                # Limits are in milliseconds; logged latencies are in nanoseconds
+                if mlperf_log["result_first_token_99.9_percentile_latency_ns"] < limits["ttft"] * 1000 * 1000 and mlperf_log["result_time_to_output_token"] < limits["tpot"] * 1000 * 1000:
+                    return constraint, True
+    return None, False
+
+
 def get_performance_metric(
     config, model, path, scenario_fixed, division, system_json, has_power=False
 ):
@@ -1911,6 +1935,8 @@ def get_performance_metric(
     )
 
     res = float(mlperf_log[RESULT_FIELD_NEW[config.version][scenario_for_res]])
+    if model in RESULT_FIELD_BENCHMARK_OVERWRITE and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[model]:
+        res = float(mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[model][scenario_for_res]])
 
     inferred = False
     if scenario_fixed != scenario:
@@ -1946,6 +1972,9 @@ def check_performance_dir(
     res = float(mlperf_log[RESULT_FIELD_NEW[config.version][scenario_for_res]])
     if model in RESULT_FIELD_BENCHMARK_OVERWRITE and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[model]:
         res = float(mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[model][scenario_for_res]])
+
+    if model in ["llama2-70b-99", "llama2-70b-99.9"]:
+        llama_constraint, is_valid = extra_check_llama2(mlperf_log, scenario_fixed)
 
     latency_99_percentile = mlperf_log["result_99.00_percentile_latency_ns"]
     latency_mean = mlperf_log["result_mean_latency_ns"]
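
Reviewer note (not part of the patch): a minimal sketch, assuming the mlperf_loadgen module is built from this change, of how the new ttft_latency / tpot_latency bindings could be set directly from Python instead of through mlperf.conf. The scenario, mode, and target values below are illustrative assumptions; the fields hold nanoseconds (FromConfig scales the millisecond values in mlperf.conf by 1000 * 1000).

    # Illustrative sketch only; values and scenario are assumptions, not part of the patch.
    import mlperf_loadgen as lg

    settings = lg.TestSettings()
    settings.scenario = lg.TestScenario.Server        # token-latency targets apply outside Offline
    settings.mode = lg.TestMode.PerformanceOnly
    settings.use_token_latencies = True
    # New bindings added by this patch; values are in nanoseconds.
    settings.ttft_latency = 2000 * 1000 * 1000        # 2000 ms time-to-first-token target
    settings.tpot_latency = 200 * 1000 * 1000         # 200 ms time-per-output-token target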