diff --git a/cmd/log2sql/main.go b/cmd/log2sql/main.go index 5bdae65..9392513 100644 --- a/cmd/log2sql/main.go +++ b/cmd/log2sql/main.go @@ -96,6 +96,7 @@ func writeHeader(f io.Writer) { activeThreadsMax int NULL, -- Active threads (max in last 10 secs) pausedThreads int NULL, -- Paused threads pausedThreadsMax int NULL, -- Paused threads (max in last 10 secs) + pausedErrorCount int NULL, -- Commands exited in error due to pause thresholds being exceeded pauseRateCPU int NULL, -- Pause rate CPU (percentage 0-100) pauseRateMem int NULL, -- Pause rate Mem (percentage 0-100) cpuPressureState int NULL, -- CPU pressure (0 low, 1 med, 2 high) @@ -158,10 +159,10 @@ func getProcessStatement() string { func getEventsStatement() string { return `INSERT INTO events (lineNumber, eventTime, - activeThreads, activeThreadsMax, pausedThreads, pausedThreadsMax, + activeThreads, activeThreadsMax, pausedThreads, pausedThreadsMax, pausedErrorCount, pauseRateCPU, pauseRateMem, cpuPressureState, memPressureState) - VALUES (?,?,?,?,?,?,?,?,?,?)` + VALUES (?,?,?,?,?,?,?,?,?,?,?)` } func getTableUseStatement() string { @@ -226,7 +227,7 @@ func preparedInsert(logger *logrus.Logger, stmtProcess, stmtTableuse *sqlite3.St func preparedInsertServerEvents(logger *logrus.Logger, stmtEvents *sqlite3.Stmt, evt *p4dlog.ServerEvent) int64 { rows := 1 err := stmtEvents.Exec( - evt.LineNo, dateStr(evt.EventTime), evt.ActiveThreads, evt.ActiveThreadsMax, evt.PausedThreads, evt.PausedThreadsMax, + evt.LineNo, dateStr(evt.EventTime), evt.ActiveThreads, evt.ActiveThreadsMax, evt.PausedThreads, evt.PausedThreadsMax, evt.PausedErrorCount, evt.PauseRateCPU, evt.PauseRateMem, evt.CPUPressureState, evt.MemPressureState) if err != nil { logger.Errorf("Events insert: %v lineNo %d, %s", @@ -237,8 +238,8 @@ func preparedInsertServerEvents(logger *logrus.Logger, stmtEvents *sqlite3.Stmt, func writeSQLServerEvents(f io.Writer, evt *p4dlog.ServerEvent) int64 { rows := 1 - fmt.Fprintf(f, `INSERT INTO events VALUES (%d,"%s",%d,%d,%d,%d,%d,%d,%d,%d);`+"\n", - evt.LineNo, dateStr(evt.EventTime), evt.ActiveThreads, evt.ActiveThreadsMax, evt.PausedThreads, evt.PausedThreadsMax, + fmt.Fprintf(f, `INSERT INTO events VALUES (%d,"%s",%d,%d,%d,%d,%d,%d,%d,%d,%d);`+"\n", + evt.LineNo, dateStr(evt.EventTime), evt.ActiveThreads, evt.ActiveThreadsMax, evt.PausedThreads, evt.PausedThreadsMax, evt.PausedErrorCount, evt.PauseRateCPU, evt.PauseRateMem, evt.CPUPressureState, evt.MemPressureState) return int64(rows) } diff --git a/metrics/metrics.go b/metrics/metrics.go index 2f5c07b..2866574 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -63,6 +63,7 @@ type P4DMetrics struct { cmdsRunningMax int64 cmdsPaused int64 // Server Events cmdsPausedMax int64 // Server Events + cmdsPausedErrorCount int64 // ditto pauseRateCPU int64 // ditto pauseRateMem int64 // ditto cpuPressureState int64 // ditto @@ -301,6 +302,7 @@ func (p4m *P4DMetrics) getCumulativeMetrics() string { p4m.outputMetric(metrics, "p4_cmds_running_max", "The max number of running commands at any one time since last metric", "gauge", fmt.Sprintf("%d", p4m.cmdsRunningMax), fixedLabels) p4m.outputMetric(metrics, "p4_cmds_paused", "The number of (resource pressure) paused commands at any one time", "gauge", fmt.Sprintf("%d", p4m.cmdsPaused), fixedLabels) p4m.outputMetric(metrics, "p4_cmds_paused_max", "The max number of (resource pressure) paused commands since last metric", "gauge", fmt.Sprintf("%d", p4m.cmdsPausedMax), fixedLabels) + p4m.outputMetric(metrics, "p4_cmds_paused_errors", "The number of commands exited with error due to resource pressure thresholds being exceeded", "counter", fmt.Sprintf("%d", p4m.cmdsPausedErrorCount), fixedLabels) p4m.outputMetric(metrics, "p4_pause_rate_cpu", "The (resource pressure) pause rate for CPU", "gauge", fmt.Sprintf("%d", p4m.pauseRateCPU), fixedLabels) p4m.outputMetric(metrics, "p4_pause_rate_mem", "The (resource pressure) pause rate for Mem", "gauge", fmt.Sprintf("%d", p4m.pauseRateMem), fixedLabels) p4m.outputMetric(metrics, "p4_pause_state_cpu", "The (resource pressure) pause state for CPU (0-2)", "gauge", fmt.Sprintf("%d", p4m.cpuPressureState), fixedLabels) @@ -518,6 +520,7 @@ func (p4m *P4DMetrics) publishSvrEvent(evt p4dlog.ServerEvent) { p4m.cmdsRunningMax = evt.ActiveThreadsMax p4m.cmdsPaused = evt.PausedThreads p4m.cmdsPausedMax = evt.PausedThreadsMax + p4m.cmdsPausedErrorCount = evt.PausedErrorCount p4m.pauseRateCPU = evt.PauseRateCPU p4m.pauseRateMem = evt.PauseRateMem p4m.cpuPressureState = evt.CPUPressureState diff --git a/metrics/metrics_test.go b/metrics/metrics_test.go index 56be2d1..1bcc20b 100644 --- a/metrics/metrics_test.go +++ b/metrics/metrics_test.go @@ -847,3 +847,58 @@ p4_pause_state_mem{serverid="myserverid"} 1 p4_prom_svr_events_processed{serverid="myserverid"} 3`, -1) compareOutput(t, expected, output) } + +func TestServerEventsPauseErrors(t *testing.T) { + cfg := &Config{ + ServerID: "myserverid", + UpdateInterval: 10 * time.Millisecond, + OutputCmdsByUser: true} + input := ` +Perforce server info: + 2024/06/19 12:25:31 pid 1056864 perforce@ip-10-0-0-106 127.0.0.1 [p4/2024.1.TEST-TEST_ONLY/LINUX26X86_64/2611120] 'user-fstat -Ob //...' + +Perforce server error: + Date 2024/06/19 12:25:31: + Pid 1056860 + Operation: user-fstat + Operation 'user-fstat' failed. + Too many commands paused; terminated. + +Perforce server info: + 2024/06/19 12:25:39 pid 1056864 completed 8.39s 598+67us 304+0io 0+0net 68864k 0pf +Perforce server info: + 2024/06/19 12:25:31 pid 1056864 perforce@ip-10-0-0-106 127.0.0.1 [p4/2024.1.TEST-TEST_ONLY/LINUX26X86_64/2611120] 'user-fstat -Ob //...' +--- exited on fatal server error +--- lapse 8.39s +--- usage 598+67us 304+0io 0+0net 68864k 0pf +--- memory cmd/proc 74mb/74mb +--- rpc msgs/size in+out 2+84225/0mb+45mb himarks 795416/795272 snd/rcv 5.64s/.002s + +2024/06/19 12:25:39 004246895 pid 1056103: Server under resource pressure. Pause rate CPU 59%, mem 20%, CPU pressure 2, mem pressure 1 +` + historical := false + output := basicTest(cfg, input, historical) + + expected := eol.Split(`p4_cmd_counter{serverid="myserverid",cmd="user-fstat"} 1 +p4_cmd_cpu_system_cumulative_seconds{serverid="myserverid",cmd="user-fstat"} 0.067 +p4_cmd_cpu_user_cumulative_seconds{serverid="myserverid",cmd="user-fstat"} 0.598 +p4_cmd_cumulative_seconds{serverid="myserverid",cmd="user-fstat"} 8.390 +p4_cmd_error_counter{serverid="myserverid",cmd="user-fstat"} 1 +p4_cmd_mem_mb{serverid="myserverid"} 74 +p4_cmd_mem_peak_mb{serverid="myserverid"} 74 +p4_cmd_program_counter{serverid="myserverid",program="p4/2024.1.TEST-TEST_ONLY/LINUX26X86_64/2611120"} 1 +p4_cmd_program_cumulative_seconds{serverid="myserverid",program="p4/2024.1.TEST-TEST_ONLY/LINUX26X86_64/2611120"} 8.390 +p4_cmd_running{serverid="myserverid"} 1 +p4_cmd_user_counter{serverid="myserverid",user="perforce"} 1 +p4_cmd_user_cumulative_seconds{serverid="myserverid",user="perforce"} 8.390 +p4_cmds_paused_errors{serverid="myserverid"} 1 +p4_cmds_running{serverid="myserverid"} 1 +p4_pause_rate_cpu{serverid="myserverid"} 59 +p4_pause_rate_mem{serverid="myserverid"} 20 +p4_pause_state_cpu{serverid="myserverid"} 2 +p4_pause_state_mem{serverid="myserverid"} 1 +p4_prom_cmds_processed{serverid="myserverid"} 1 +p4_prom_log_lines_read{serverid="myserverid"} 23 +p4_prom_svr_events_processed{serverid="myserverid"} 1`, -1) + compareOutput(t, expected, output) +} diff --git a/p4dlog.go b/p4dlog.go index c34935f..0a32e0a 100644 --- a/p4dlog.go +++ b/p4dlog.go @@ -132,6 +132,7 @@ type ServerEvent struct { ActiveThreadsMax int64 `json:"activeThreadsMax"` PausedThreads int64 `json:"pausedThreads"` PausedThreadsMax int64 `json:"pausedThreadsMax"` + PausedErrorCount int64 `json:"pausedErrorCount"` PauseRateCPU int64 `json:"pauseRateCPU"` // Percentage 1-100 PauseRateMem int64 `json:"pauseRateMem"` // Percentage 1-100 CPUPressureState int64 `json:"cpuPressureState"` // 0-2 @@ -584,6 +585,7 @@ func (s *ServerEvent) MarshalJSON() ([]byte, error) { ActiveThreadsMax int64 `json:"activeThreadsMax"` PausedThreads int64 `json:"pausedThreads"` PausedThreadsMax int64 `json:"pausedThreadsMax"` + PausedErrorCount int64 `json:"pausedErrorCount"` PauseRateCPU int64 `json:"pauseRateCPU"` // Percentage 1-100 PauseRateMem int64 `json:"pauseRateMem"` // Percentage 1-100 CPUPressureState int64 `json:"cpuPressureState"` // 0-2 @@ -595,6 +597,7 @@ func (s *ServerEvent) MarshalJSON() ([]byte, error) { ActiveThreadsMax: s.ActiveThreadsMax, PausedThreads: s.PausedThreads, PausedThreadsMax: s.PausedThreadsMax, + PausedErrorCount: s.PausedErrorCount, PauseRateCPU: s.PauseRateCPU, PauseRateMem: s.PauseRateMem, CPUPressureState: s.CPUPressureState, @@ -1091,6 +1094,7 @@ type P4dFileParser struct { cmdsRunningMax int64 // Max No of currently running threads cmdsPaused int64 // No of paused threads cmdsPausedMax int64 // Max no of paused threads + cmdsPausedErrorCount int64 // Count of commands paused due to resource pressure errors pauseRateCPU int64 // Resource pressure pauseRateMem int64 // ditto cpuPressureState int64 // ditto @@ -1365,6 +1369,7 @@ func (fp *P4dFileParser) processTrackRecords(cmd *Command, lines []string) { if strings.HasPrefix(line, trackFatalError) { cmd.CmdError = true hasTrackInfo = true + fp.cmdsPausedErrorCount += 1 continue } if strings.HasPrefix(line, trackDB) { @@ -1712,6 +1717,7 @@ func (fp *P4dFileParser) outputSvrEvent(timeStr string, lineNo int64) { ActiveThreadsMax: fp.cmdsRunningMax, PausedThreads: fp.cmdsPaused, PausedThreadsMax: fp.cmdsPausedMax, + PausedErrorCount: fp.cmdsPausedErrorCount, PauseRateCPU: fp.pauseRateCPU, PauseRateMem: fp.pauseRateMem, CPUPressureState: fp.cpuPressureState,