Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

watchdog: improve the behavior how watchdog is handled #727

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 28 additions & 9 deletions src/daemon/dlt-daemon.c
Original file line number Diff line number Diff line change
Expand Up @@ -1549,14 +1549,25 @@ int main(int argc, char *argv[])
#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
{
char *watchdogUSec = getenv("WATCHDOG_USEC");
int watchdogTimeoutSeconds = 0;
// set a sensible default, in case the environment variable is not set
int watchdogTimeoutSeconds = 30;

dlt_log(LOG_DEBUG, "Systemd watchdog initialization\n");

if (watchdogUSec)
if (watchdogUSec) {
// WATCHDOG_USEC is the timeout in microseconds
// divide this by 2*10^6 to get the interval in seconds
// 2 * because we notify systemd after half the timeout
watchdogTimeoutSeconds = atoi(watchdogUSec) / 2000000;
}

if (watchdogTimeoutSeconds == 0) {
dlt_log(LOG_WARNING, "Watchdog timeout is too small, need at least 1s, setting 1s timeout\n");
watchdogTimeoutSeconds = 1;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we set the default value to 30 in this case?

}

daemon.watchdog_trigger_interval = watchdogTimeoutSeconds;
daemon.watchdog_last_trigger_time = 0U;
create_timer_fd(&daemon_local,
watchdogTimeoutSeconds,
watchdogTimeoutSeconds,
Expand Down Expand Up @@ -3152,6 +3163,17 @@ int dlt_daemon_process_user_messages(DltDaemon *daemon,

/* look through buffer as long as data is in there */
while ((receiver->bytesRcvd >= min_size) && run_loop) {
#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
/* this loop may run for a long time, so we have to exit it at some point to be able
 * to process other events, like feeding the watchdog
*/
bool watchdog_triggered= dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon);
if (watchdog_triggered) {
dlt_vlog(LOG_WARNING, "%s yields due to watchdog.\n", __func__);
run_loop = 0; // exit loop in next iteration
}
#endif

dlt_daemon_process_user_message_func func = NULL;

offset = 0;
Expand Down Expand Up @@ -3798,12 +3820,9 @@ int dlt_daemon_process_user_message_log(DltDaemon *daemon,

while (1) {
#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
const unsigned int uptime = dlt_uptime();
if ((uptime - start_time) / 10000 > daemon->watchdog_trigger_interval) {
dlt_vlog(LOG_WARNING,
"spent already 1 watchdog trigger interval in %s, yielding to process other events.\n", __func__);
if (sd_notify(0, "WATCHDOG=1") < 0)
dlt_vlog(LOG_CRIT, "Could not reset systemd watchdog from %s\n", __func__);
bool watchdog_triggered = dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon);
if (watchdog_triggered) {
dlt_vlog(LOG_WARNING, "%s yields due to watchdog.\n", __func__);
break;
}
#endif
Expand Down Expand Up @@ -4162,7 +4181,7 @@ int dlt_daemon_send_ringbuffer_to_client(DltDaemon *daemon, DltDaemonLocal *daem

while ((length = dlt_buffer_copy(&(daemon->client_ringbuffer), data, sizeof(data))) > 0) {
#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
dlt_daemon_trigger_systemd_watchdog_if_necessary(&curr_time, daemon->watchdog_trigger_interval);
dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon);
#endif

if ((ret =
Expand Down
14 changes: 10 additions & 4 deletions src/daemon/dlt_daemon_client.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ static int dlt_daemon_client_send_all_multiple(DltDaemon *daemon,
int verbose)
{
int sent = 0;
unsigned int i = 0;
nfds_t i = 0;
int ret = 0;
DltConnection *temp = NULL;
int type_mask =
Expand All @@ -125,6 +125,13 @@ static int dlt_daemon_client_send_all_multiple(DltDaemon *daemon,

for (i = 0; i < daemon_local->pEvent.nfds; i++)
{
#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
bool watchdog_triggered = dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon);
if (watchdog_triggered) {
dlt_vlog(LOG_WARNING, "%s notified watchdog, processed %lu/%lu fds already.\n",
__func__, i, daemon_local->pEvent.nfds);
}
#endif
temp = dlt_event_handler_find_connection(&(daemon_local->pEvent),
daemon_local->pEvent.pfd[i].fd);

Expand Down Expand Up @@ -152,7 +159,7 @@ static int dlt_daemon_client_send_all_multiple(DltDaemon *daemon,
if (ret != DLT_DAEMON_ERROR_OK)
dlt_vlog(LOG_WARNING, "%s: send dlt message failed\n", __func__);
else
/* If sent to at least one client,
/* If sent to at least one client,
* then do not store in ring buffer
*/
sent = 1;
Expand Down Expand Up @@ -2398,8 +2405,7 @@ int dlt_daemon_process_systemd_timer(DltDaemon *daemon,
daemon->received_message_since_last_watchdog_interval = 0;
#endif

if (sd_notify(0, "WATCHDOG=1") < 0)
dlt_log(LOG_CRIT, "Could not reset systemd watchdog\n");
dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon);

dlt_log(LOG_DEBUG, "Timer watchdog\n");

Expand Down
22 changes: 16 additions & 6 deletions src/daemon/dlt_daemon_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -2002,14 +2002,24 @@ void dlt_daemon_change_state(DltDaemon *daemon, DltDaemonState newState)
}

#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
void dlt_daemon_trigger_systemd_watchdog_if_necessary(unsigned int* last_trigger_time, unsigned int watchdog_trigger_interval) {
unsigned int uptime = dlt_uptime();
if ((uptime - *last_trigger_time) / 10000 >= watchdog_trigger_interval) {
if (sd_notify(0, "WATCHDOG=1") < 0)
dlt_vlog(LOG_WARNING, "%s: Could not reset systemd watchdog\n", __func__);
*last_trigger_time = uptime;
bool dlt_daemon_trigger_systemd_watchdog_if_necessary(DltDaemon *daemon) {
if (daemon->watchdog_trigger_interval == 0) {
return false;
}

const unsigned int uptime_seconds = dlt_uptime() / 10000;
const unsigned int seconds_since_last_trigger = uptime_seconds - daemon->watchdog_last_trigger_time;
if (seconds_since_last_trigger < daemon->watchdog_trigger_interval) {
return false;
}
if (sd_notify(0, "WATCHDOG=1") < 0)
dlt_vlog(LOG_WARNING, "%s: Could not reset systemd watchdog\n", __func__);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should return false if the systemd watchdog could not be reset.

else
daemon->watchdog_last_trigger_time = uptime_seconds;

return true;
}

#endif

#ifdef DLT_TRACE_LOAD_CTRL_ENABLE
Expand Down
8 changes: 7 additions & 1 deletion src/daemon/dlt_daemon_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ typedef struct
#endif
#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
unsigned int watchdog_trigger_interval; /* watchdog trigger interval in [s] */
unsigned int watchdog_last_trigger_time; /* when the watchdog was last triggered in [s] */
#endif
#ifdef DLT_LOG_LEVEL_APP_CONFIG
DltDaemonContextLogSettings *app_id_log_level_settings; /**< Settings for app id specific log levels */
Expand Down Expand Up @@ -606,7 +607,12 @@ void dlt_daemon_control_reset_to_factory_default(DltDaemon *daemon,
void dlt_daemon_change_state(DltDaemon *daemon, DltDaemonState newState);

#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
void dlt_daemon_trigger_systemd_watchdog_if_necessary(unsigned int* last_trigger_time, unsigned int watchdog_trigger_interval);
/**
* Trigger the systemd watchdog when the timeout has been reached
* @param daemon pointer to dlt daemon structure
* @return true if the watchdog has been triggered
*/
bool dlt_daemon_trigger_systemd_watchdog_if_necessary(DltDaemon *daemon);
#endif

# ifdef __cplusplus
Expand Down
4 changes: 3 additions & 1 deletion src/daemon/dlt_daemon_event_handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,9 @@ int dlt_daemon_handle_event(DltEventHandler *pEvent,
return -1;
}
#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
dlt_daemon_trigger_systemd_watchdog_if_necessary(&start_time, daemon->watchdog_trigger_interval);
// no need to yield here, it will be called in a loop anyways.
// therefore we also do not log.
dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon);
#endif
}

Expand Down
Loading