diff --git a/src/c++/perf_analyzer/perf_utils.h b/src/c++/perf_analyzer/perf_utils.h
index 7166936a9..a3dc7c3a1 100644
--- a/src/c++/perf_analyzer/perf_utils.h
+++ b/src/c++/perf_analyzer/perf_utils.h
@@ -36,6 +36,8 @@
 #include <iomanip>
 #include <iostream>
 #include <memory>
+#include <numeric>
+#include <optional>
 #include <random>
 
 #include "client_backend/client_backend.h"
@@ -83,6 +85,21 @@ class Range {
   T step;
 };
 
+// Calculates the arithmetic mean of the given values.
+// \param data The values to average.
+// \return The mean of the values, or std::nullopt if data is empty.
+template <typename T>
+std::optional<T>
+CalculateAverage(const std::vector<T>& data)
+{
+  if (data.empty()) {
+    return std::nullopt;
+  }
+  T sum = std::reduce(data.begin(), data.end());
+  // Divide in T so a signed sum is not converted to unsigned size_t.
+  return sum / static_cast<T>(data.size());
+}
+
 // Converts the datatype from tensorflow to perf analyzer space
 // \param tf_dtype The data type string returned from the model metadata.
 // \param datatype Returns the datatype in perf_analyzer space.
diff --git a/src/c++/perf_analyzer/report_writer.cc b/src/c++/perf_analyzer/report_writer.cc
index deacb1eef..0bf47d399 100644
--- a/src/c++/perf_analyzer/report_writer.cc
+++ b/src/c++/perf_analyzer/report_writer.cc
@@ -406,11 +406,20 @@ void
 ReportWriter::WriteLLMMetrics(std::ostream& ofs)
 {
   auto [avg_first_token_latency, avg_t2t_latency] = CalculateLLMMetrics();
-  ofs << "," << avg_first_token_latency;
-  ofs << "," << avg_t2t_latency;
+
+  if (avg_first_token_latency.has_value()) {
+    ofs << "," << avg_first_token_latency.value();
+  } else {
+    ofs << ",n/a";
+  }
+  if (avg_t2t_latency.has_value()) {
+    ofs << "," << avg_t2t_latency.value();
+  } else {
+    ofs << ",n/a";
+  }
 }
 
-std::tuple<double, double>
+std::tuple<std::optional<double>, std::optional<double>>
 ReportWriter::CalculateLLMMetrics()
 {
   if (collector_->IsEmpty()) {
@@ -440,13 +449,8 @@ ReportWriter::CalculateLLMMetrics()
     }
   }
 
-  auto avg_first_token_latency =
-      std::reduce(first_token_latencies.begin(), first_token_latencies.end()) /
-      first_token_latencies.size();
-  auto avg_t2t_latency =
-      std::reduce(t2t_latencies.begin(), t2t_latencies.end()) /
-      t2t_latencies.size();
-
+  auto avg_first_token_latency = CalculateAverage(first_token_latencies);
+  auto avg_t2t_latency = CalculateAverage(t2t_latencies);
   return std::make_tuple(avg_first_token_latency, avg_t2t_latency);
 }
 
diff --git a/src/c++/perf_analyzer/report_writer.h b/src/c++/perf_analyzer/report_writer.h
index 544036f9f..ecd063cbb 100644
--- a/src/c++/perf_analyzer/report_writer.h
+++ b/src/c++/perf_analyzer/report_writer.h
@@ -97,7 +97,8 @@ class ReportWriter {
 
   /// Calculate LLM metrics (e.g., average first token latency) using the
   /// profile data collected for decoupled model.
-  std::tuple<double, double> CalculateLLMMetrics();
+  std::tuple<std::optional<double>, std::optional<double>>
+  CalculateLLMMetrics();
 
   const std::string& filename_{""};
 
diff --git a/src/c++/perf_analyzer/test_report_writer.cc b/src/c++/perf_analyzer/test_report_writer.cc
index 2bec0dc8a..f2441df8b 100644
--- a/src/c++/perf_analyzer/test_report_writer.cc
+++ b/src/c++/perf_analyzer/test_report_writer.cc
@@ -132,18 +132,61 @@ TEST_CASE("report_writer: WriteLLMMetrics")
       pa::ProfileDataCollector::Create(&collector),
       "failed to create profile data collector");
 
-  InferenceLoadMode infer_mode{10, 20.0};  // dummy values
+  InferenceLoadMode infer_mode{};
 
   SUBCASE("request with zero response")
   {
-    // TODO
-    CHECK(false);
+    uint64_t sequence_id1{123};
+    uint64_t request_timestamp1{1};
+    std::vector<uint64_t> response_timestamps1{};
+    RequestRecord rr1 = GenerateRequestRecord(
+        sequence_id1, request_timestamp1, response_timestamps1);
+
+    uint64_t sequence_id2{456};
+    uint64_t request_timestamp2{2};
+    std::vector<uint64_t> response_timestamps2{};
+    RequestRecord rr2 = GenerateRequestRecord(
+        sequence_id2, request_timestamp2, response_timestamps2);
+
+    std::vector<RequestRecord> request_records{rr1, rr2};
+    collector->AddData(infer_mode, std::move(request_records));
+
+    // Avg first token latency = n/a
+    // Avg token-to-token latency = n/a
+    TestReportWriter trw(collector);
+    std::ostringstream actual_output{};
+    trw.WriteLLMMetrics(actual_output);
+    const std::string expected_output{",n/a,n/a"};
+    CHECK(actual_output.str() == expected_output);
   }
 
-  SUBCASE("request with single response")
+  SUBCASE("requests with single response")
   {
-    // TODO
-    CHECK(false);
+    uint64_t sequence_id1{123};
+    uint64_t request_timestamp1{1};
+    std::vector<uint64_t> response_timestamps1{2};
+    RequestRecord rr1 = GenerateRequestRecord(
+        sequence_id1, request_timestamp1, response_timestamps1);
+
+    uint64_t sequence_id2{456};
+    uint64_t request_timestamp2{2};
+    std::vector<uint64_t> response_timestamps2{9};
+    RequestRecord rr2 = GenerateRequestRecord(
+        sequence_id2, request_timestamp2, response_timestamps2);
+
+    std::vector<RequestRecord> request_records{rr1, rr2};
+    collector->AddData(infer_mode, std::move(request_records));
+
+    // Avg first token latency
+    // = ((response1[0] - request1) + (response2[0] - request2)) / 2
+    // = ((2 - 1) + (9 - 2)) / 2 = 4 us
+    //
+    // Avg token-to-token latency = n/a
+    TestReportWriter trw(collector);
+    std::ostringstream actual_output{};
+    trw.WriteLLMMetrics(actual_output);
+    const std::string expected_output{",4,n/a"};
+    CHECK(actual_output.str() == expected_output);
   }
 
   SUBCASE("requests with multiple responses")
@@ -164,7 +207,7 @@ TEST_CASE("report_writer: WriteLLMMetrics")
     collector->AddData(infer_mode, std::move(request_records));
 
     // Avg first token latency
-    // = ((response1[0] - request1) + (response2[0] - request2) + ...) / 3
+    // = ((response1[0] - request1) + (response2[0] - request2)) / 2
     // = ((4 - 1) + (6 - 2)) / 2 = 3.5 us
     //
     // Avg token-to-token latency