From 1b4def1c8496f2bedbc31d57820bfe0f0f5160cb Mon Sep 17 00:00:00 2001 From: Ahmet Findikci Date: Sun, 5 Jan 2025 20:51:59 +0100 Subject: [PATCH] watchdog: improve the behavior how watchdog is handled * dlt_daemon_process_user_messages * dlt_daemon_send_ringbuffer_to_client * dlt_daemon_client_send_all_multiple * dlt_daemon_process_systemd_timer * dlt_daemon_handle_event * dlt_daemon_socket_sendreliable These methods can take longer than the systemd watchdog timeout allowed, depending on the number of clients, which leads that the systemd watchdog kills dlt-daemon. Add a timeout to these methods so that they may only run for as long as the watchdog timer is set. In addition, set the default WatchdogTimeout to an appropriate value in case the env variable is not set. Signed-off-by: Ahmet Findikci --- src/daemon/dlt-daemon.c | 37 ++++++++++++++++++++------- src/daemon/dlt_daemon_client.c | 14 +++++++--- src/daemon/dlt_daemon_common.c | 22 +++++++++++----- src/daemon/dlt_daemon_common.h | 8 +++++- src/daemon/dlt_daemon_event_handler.c | 4 ++- 5 files changed, 64 insertions(+), 21 deletions(-) diff --git a/src/daemon/dlt-daemon.c b/src/daemon/dlt-daemon.c index f0d5a1657..9b320dbc8 100644 --- a/src/daemon/dlt-daemon.c +++ b/src/daemon/dlt-daemon.c @@ -1549,14 +1549,25 @@ int main(int argc, char *argv[]) #ifdef DLT_SYSTEMD_WATCHDOG_ENABLE { char *watchdogUSec = getenv("WATCHDOG_USEC"); - int watchdogTimeoutSeconds = 0; + // set a sensible default, in case the environment variable is not set + int watchdogTimeoutSeconds = 30; dlt_log(LOG_DEBUG, "Systemd watchdog initialization\n"); - if (watchdogUSec) + if (watchdogUSec) { + // WATCHDOG_USEC is the timeout in micrsoseconds + // divide this by 2*10^6 to get the interval in seconds + // 2 * because we notify systemd after half the timeout watchdogTimeoutSeconds = atoi(watchdogUSec) / 2000000; + } + + if (watchdogTimeoutSeconds == 0) { + dlt_log(LOG_WARNING, "Watchdog timeout is too small, need at least 1s, setting 1s timeout\n"); + watchdogTimeoutSeconds = 1; + } daemon.watchdog_trigger_interval = watchdogTimeoutSeconds; + daemon.watchdog_last_trigger_time = 0U; create_timer_fd(&daemon_local, watchdogTimeoutSeconds, watchdogTimeoutSeconds, @@ -3152,6 +3163,17 @@ int dlt_daemon_process_user_messages(DltDaemon *daemon, /* look through buffer as long as data is in there */ while ((receiver->bytesRcvd >= min_size) && run_loop) { +#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE + /* this loop may be running long, so we have to exit it at some point to be able to + * to process other events, like feeding the watchdog + */ + bool watchdog_triggered= dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon); + if (watchdog_triggered) { + dlt_vlog(LOG_WARNING, "%s yields due to watchdog.\n", __func__); + run_loop = 0; // exit loop in next iteration + } +#endif + dlt_daemon_process_user_message_func func = NULL; offset = 0; @@ -3798,12 +3820,9 @@ int dlt_daemon_process_user_message_log(DltDaemon *daemon, while (1) { #ifdef DLT_SYSTEMD_WATCHDOG_ENABLE - const unsigned int uptime = dlt_uptime(); - if ((uptime - start_time) / 10000 > daemon->watchdog_trigger_interval) { - dlt_vlog(LOG_WARNING, - "spent already 1 watchdog trigger interval in %s, yielding to process other events.\n", __func__); - if (sd_notify(0, "WATCHDOG=1") < 0) - dlt_vlog(LOG_CRIT, "Could not reset systemd watchdog from %s\n", __func__); + bool watchdog_triggered = dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon); + if (watchdog_triggered) { + dlt_vlog(LOG_WARNING, "%s yields due to watchdog.\n", __func__); break; } #endif @@ -4162,7 +4181,7 @@ int dlt_daemon_send_ringbuffer_to_client(DltDaemon *daemon, DltDaemonLocal *daem while ((length = dlt_buffer_copy(&(daemon->client_ringbuffer), data, sizeof(data))) > 0) { #ifdef DLT_SYSTEMD_WATCHDOG_ENABLE - dlt_daemon_trigger_systemd_watchdog_if_necessary(&curr_time, daemon->watchdog_trigger_interval); + dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon); #endif if ((ret = diff --git a/src/daemon/dlt_daemon_client.c b/src/daemon/dlt_daemon_client.c index 5b6fd5bcc..26f7fdd15 100644 --- a/src/daemon/dlt_daemon_client.c +++ b/src/daemon/dlt_daemon_client.c @@ -110,7 +110,7 @@ static int dlt_daemon_client_send_all_multiple(DltDaemon *daemon, int verbose) { int sent = 0; - unsigned int i = 0; + nfds_t i = 0; int ret = 0; DltConnection *temp = NULL; int type_mask = @@ -125,6 +125,13 @@ static int dlt_daemon_client_send_all_multiple(DltDaemon *daemon, for (i = 0; i < daemon_local->pEvent.nfds; i++) { +#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE + bool watchdog_triggered = dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon); + if (watchdog_triggered) { + dlt_vlog(LOG_WARNING, "%s notified watchdog, processed %lu/%lu fds already.\n", + __func__, i, daemon_local->pEvent.nfds); + } +#endif temp = dlt_event_handler_find_connection(&(daemon_local->pEvent), daemon_local->pEvent.pfd[i].fd); @@ -152,7 +159,7 @@ static int dlt_daemon_client_send_all_multiple(DltDaemon *daemon, if (ret != DLT_DAEMON_ERROR_OK) dlt_vlog(LOG_WARNING, "%s: send dlt message failed\n", __func__); else - /* If sent to at least one client, + /* If sent to at least one client, * then do not store in ring buffer */ sent = 1; @@ -2398,8 +2405,7 @@ int dlt_daemon_process_systemd_timer(DltDaemon *daemon, daemon->received_message_since_last_watchdog_interval = 0; #endif - if (sd_notify(0, "WATCHDOG=1") < 0) - dlt_log(LOG_CRIT, "Could not reset systemd watchdog\n"); + dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon); dlt_log(LOG_DEBUG, "Timer watchdog\n"); diff --git a/src/daemon/dlt_daemon_common.c b/src/daemon/dlt_daemon_common.c index f3aa277b9..86018ba48 100644 --- a/src/daemon/dlt_daemon_common.c +++ b/src/daemon/dlt_daemon_common.c @@ -2002,14 +2002,24 @@ void dlt_daemon_change_state(DltDaemon *daemon, DltDaemonState newState) } #ifdef DLT_SYSTEMD_WATCHDOG_ENABLE -void dlt_daemon_trigger_systemd_watchdog_if_necessary(unsigned int* last_trigger_time, unsigned int watchdog_trigger_interval) { - unsigned int uptime = dlt_uptime(); - if ((uptime - *last_trigger_time) / 10000 >= watchdog_trigger_interval) { - if (sd_notify(0, "WATCHDOG=1") < 0) - dlt_vlog(LOG_WARNING, "%s: Could not reset systemd watchdog\n", __func__); - *last_trigger_time = uptime; +bool dlt_daemon_trigger_systemd_watchdog_if_necessary(DltDaemon *daemon) { + if (daemon->watchdog_trigger_interval == 0) { + return false; } + + const unsigned int uptime_seconds = dlt_uptime() / 10000; + const unsigned int seconds_since_last_trigger = uptime_seconds - daemon->watchdog_last_trigger_time; + if (seconds_since_last_trigger < daemon->watchdog_trigger_interval) { + return false; + } + if (sd_notify(0, "WATCHDOG=1") < 0) + dlt_vlog(LOG_WARNING, "%s: Could not reset systemd watchdog\n", __func__); + else + daemon->watchdog_last_trigger_time = uptime_seconds; + + return true; } + #endif #ifdef DLT_TRACE_LOAD_CTRL_ENABLE diff --git a/src/daemon/dlt_daemon_common.h b/src/daemon/dlt_daemon_common.h index fc91ee934..c46151387 100644 --- a/src/daemon/dlt_daemon_common.h +++ b/src/daemon/dlt_daemon_common.h @@ -207,6 +207,7 @@ typedef struct #endif #ifdef DLT_SYSTEMD_WATCHDOG_ENABLE unsigned int watchdog_trigger_interval; /* watchdog trigger interval in [s] */ + unsigned int watchdog_last_trigger_time; /* when the watchdog was last triggered in [s] */ #endif #ifdef DLT_LOG_LEVEL_APP_CONFIG DltDaemonContextLogSettings *app_id_log_level_settings; /**< Settings for app id specific log levels */ @@ -606,7 +607,12 @@ void dlt_daemon_control_reset_to_factory_default(DltDaemon *daemon, void dlt_daemon_change_state(DltDaemon *daemon, DltDaemonState newState); #ifdef DLT_SYSTEMD_WATCHDOG_ENABLE -void dlt_daemon_trigger_systemd_watchdog_if_necessary(unsigned int* last_trigger_time, unsigned int watchdog_trigger_interval); +/** + * Trigger the systemd watchdog when the timeout has been reached + * @param daemon pointer to dlt daemon structure + * @return true if the watchdog has been triggered + */ +bool dlt_daemon_trigger_systemd_watchdog_if_necessary(DltDaemon *daemon); #endif # ifdef __cplusplus diff --git a/src/daemon/dlt_daemon_event_handler.c b/src/daemon/dlt_daemon_event_handler.c index 0dec007bf..5fdefd953 100644 --- a/src/daemon/dlt_daemon_event_handler.c +++ b/src/daemon/dlt_daemon_event_handler.c @@ -268,7 +268,9 @@ int dlt_daemon_handle_event(DltEventHandler *pEvent, return -1; } #ifdef DLT_SYSTEMD_WATCHDOG_ENABLE - dlt_daemon_trigger_systemd_watchdog_if_necessary(&start_time, daemon->watchdog_trigger_interval); + // no need to yield here, it will be called in a loop anyways. + // therefore we also do not log. + dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon); #endif }