From 340ee75509903d13261d3c264b53b6ce5f58a201 Mon Sep 17 00:00:00 2001 From: Damien Ciabrini Date: Fri, 20 Dec 2024 16:08:43 +0100 Subject: [PATCH] Improve tracking and error reporting of startup probe Currently startup probes are scheduled with defaults from k8s (scheduled every 10s, failure threshold of 3). As galera joiner nodes can take a long time to start, this generates unnecessary unhealthy events. Rework how the startup probe works by allowing a single, long probe which internally loops while probing the startup state. Throughout the startup process, keep track of the specific startup phase so in case the startup times out, the probe can log a precise error. Also rework how joiner nodes are tracked, to fail early in case galera cannot join a primary partition, to avoid the server being stuck indefinitely until the startup probe times out. A subsequent commit will provide the ability to override probe settings and timeouts. Jira: OSPRH-11392 --- pkg/mariadb/const.go | 3 + pkg/mariadb/statefulset.go | 11 +- templates/galera/bin/mysql_probe.sh | 203 ++++++++++++++++++++-- templates/galera/config/galera.cnf.in | 2 +- templates/galera/config/galera_tls.cnf.in | 2 +- 5 files changed, 205 insertions(+), 16 deletions(-) diff --git a/pkg/mariadb/const.go b/pkg/mariadb/const.go index 6e505770..efc30366 100644 --- a/pkg/mariadb/const.go +++ b/pkg/mariadb/const.go @@ -6,4 +6,7 @@ const ( // ActivePodSelectorKey - Selector key used to configure A/P service behavior ActivePodSelectorKey = "statefulset.kubernetes.io/pod-name" + + // StartupProbeTimeout - Time allowed for the startup probe to complete (in seconds) + StartupProbeTimeout = 240 ) diff --git a/pkg/mariadb/statefulset.go b/pkg/mariadb/statefulset.go index 786278c6..f0603925 100644 --- a/pkg/mariadb/statefulset.go +++ b/pkg/mariadb/statefulset.go @@ -1,6 +1,8 @@ package mariadb import ( + "strconv" + common "github.com/openstack-k8s-operators/lib-common/modules/common"
"github.com/openstack-k8s-operators/lib-common/modules/common/affinity" mariadbv1 "github.com/openstack-k8s-operators/mariadb-operator/api/v1beta1" @@ -112,6 +114,7 @@ func getGaleraInitContainers(g *mariadbv1.Galera) []corev1.Container { } func getGaleraContainers(g *mariadbv1.Galera, configHash string) []corev1.Container { + timeout := strconv.Itoa(StartupProbeTimeout) containers := []corev1.Container{{ Image: g.Spec.ContainerImage, Name: "galera", @@ -144,11 +147,13 @@ func getGaleraContainers(g *mariadbv1.Galera, configHash string) []corev1.Contai StartupProbe: &corev1.Probe{ ProbeHandler: corev1.ProbeHandler{ Exec: &corev1.ExecAction{ - Command: []string{"/bin/bash", "/var/lib/operator-scripts/mysql_probe.sh", "startup"}, + Command: []string{"/bin/bash", "/var/lib/operator-scripts/mysql_probe.sh", "startup", timeout}, }, }, - PeriodSeconds: 10, - FailureThreshold: 30, + // extra seconds so that the script is not preempted by k8s + TimeoutSeconds: StartupProbeTimeout + 10, + // the current probe implementation assumes a single failure threshold + FailureThreshold: 1, }, LivenessProbe: &corev1.Probe{ ProbeHandler: corev1.ProbeHandler{ diff --git a/templates/galera/bin/mysql_probe.sh b/templates/galera/bin/mysql_probe.sh index a1b262f3..2fafcc04 100755 --- a/templates/galera/bin/mysql_probe.sh +++ b/templates/galera/bin/mysql_probe.sh @@ -6,31 +6,212 @@ read -s -u 3 3< /var/lib/secrets/dbpassword MYSQL_PWD || true export MYSQL_PWD PROBE_USER=root -function mysql_status_check { + +MYSQL_SOCKET=/var/lib/mysql/mysql.sock +SST_IN_PROGRESS=/var/lib/mysql/sst_in_progress + +CHECK_RETRY=10 +CHECK_WAIT=0.5 +STARTUP_WAIT=2 + +LAST_STATE="" +function log_state { + local state="$1" + # do not duplicate error logs in the probe, to minimize the + # output in k8s events in case the probe fails + if [ "${LAST_STATE}" != "${state}" ]; then + LAST_STATE="${state}" + fi +} + +function log_last_state { + if [ -n "${LAST_STATE}" ]; then + echo "${LAST_STATE}" + fi +} +trap 
log_last_state EXIT + +function get_mysql_status { + local status=$1 + local i + local out + for i in $(seq $CHECK_RETRY); do + out=$(mysql -u${PROBE_USER} -sNEe "show status like '${status}';" 2>&1) + if [ $? -eq 0 ]; then + echo "${out}" | tail -1 + return 0 + else + sleep ${CHECK_WAIT} + fi + done + # if we pass here, log the last error from mysql + echo "${out}" >&2 + return 1 +} + +function check_mysql_status { local status=$1 local expect=$2 - set -x - mysql -u${PROBE_USER} -sNEe "show status like '${status}';" | tail -1 | grep -w -e "${expect}" + local val + local rc + + val=$(get_mysql_status "${status}") + test "${val}" = "${expect}" + rc=$? + if [ $rc -ne 0 ]; then + log_state "${status} (${val}) differs from ${expect}" + fi + return $rc } -# Consider the pod has "started" once mysql is reachable -# and is part of the primary partition -if [ "$1" = "startup" ]; then - mysql_status_check wsrep_cluster_status Primary - exit $? -fi +function check_sst_in_progress { + local i + # retry to give some time to mysql to set up the SST + for i in $(seq $CHECK_RETRY); do + if [ -e ${MYSQL_SOCKET} ]; then + return 1 + elif [ -e ${SST_IN_PROGRESS} ]; then + return 0 + else + sleep ${CHECK_WAIT} + fi + done + return 1 +} +function check_mysql_ready { + local i + # retry to give some time to mysql to create its socket + for i in $(seq $CHECK_RETRY); do + if [ -e ${MYSQL_SOCKET} ] && mysqladmin -s -u${PROBE_USER} ping >/dev/null; then + return 0 + else + sleep ${CHECK_WAIT} + fi + done + return 1 +} + +# Monitor the startup sequence until the galera node is connected +# to a primary component and synced +# NOTE: as of mariadb 10.5, if mysql connects to a non-primary +# partition, it never creates any socket and gets stuck indefinitely. +# In that case, in order to not wait until the startup times out +# (very long), we error out of the probe so that the pod can restart +# and mysql reconnect to a primary partition if possible.
+function check_mysql_startup { + # mysql initialization sequence: + # . mysql connects to a remote galera node over port 4567 + # . mysql optionally runs a SST (port 4444), SST marker created on disk + # . only at this point, InnoDB is initialized, mysql pidfile and + # mysql socket are created on disk + + if pgrep -f detect_gcomm_and_start.sh >/dev/null ; then + log_state "waiting for gcomm URI" + return 1 + fi + # pidfile is not written on disk until mysql is ready, + # so look for the mysqld process instead + if ! pgrep -f /usr/libexec/mysqld >/dev/null ; then + log_state "waiting for mysql to start" + return 1 + fi + + # a bootstrap node must be reachable from the CLI to finish startup + if pgrep -f -- '--wsrep-cluster-address=gcomm://(\W|$)' >/dev/null; then + check_mysql_ready + return $? + # a joiner node must have an established socket connection before testing further + elif pgrep -f -- '--wsrep-cluster-address=gcomm://\w' >/dev/null; then + local connections + connections=$(ss -tnH state established src :4567 or dst :4567 | wc -l)
+ if ! test "${connections}" -gt 0; then + log_state "waiting for mysql to join a galera cluster" + return 1 + fi + else + log_state "could not determine galera startup mode" + exit 1 + fi + + # a joiner node requires additional startup checks + if [ -e ${MYSQL_SOCKET} ]; then + # good case, mysql is ready to be probed from the CLI + # check WSREP status like the regular liveness probe + local status + local comment + status=$(get_mysql_status wsrep_cluster_status) + comment=$(get_mysql_status wsrep_local_state_comment) + if [ "${status}" = "Primary" -a "${comment}" = "Synced" ]; then + return 0 + elif [ "${status}" = "Primary" ]; then + log_state "waiting to be synced with the cluster" + return 1 + elif [ "${status}" = "Non-primary" -a "${comment}" = "Synced" ]; then + log_state "mysql is connected to a non-primary partition, server stopped" + exit 1 + else + log_state "waiting for connection to a primary partition" + return 1 + fi + else + # if there is no socket, mysql may be running an SST... + if check_sst_in_progress; then + log_state "waiting for SST to finish" + return 1 + fi + + # ... if no SST was detected, it may have finished before + # we probed it. Check a last time whether we can connect to mysql + if check_mysql_ready; then + return 0 + fi + + # At this stage, mysql is either trying to connect to a bootstrap node + # that resolved to an old pod IP, or it is connected to a + # non-primary partition. Either way, this is not recoverable, so + # make the probe fail and let k8s kill the mysql server.
+ + log_state "could not find a primary partition to connect to" + exit 1 + fi + return 1 +} + + +# the startup probe loops until the node has started or joined a galera cluster # readiness and liveness probes are run by k8s only after start probe succeeded case "$1" in + startup) + if [ -z "$2" ]; then + echo "startup timeout option missing" + exit 1 + fi + TIME_TIMEOUT=$2 + + # Run the entire check in a single startup probe to prevent spurious + # "Unhealthy" k8s events from being logged. The probe exits in error + # if the startup timeout is reached + rc=1 + while [ $rc -ne 0 ]; do + if check_mysql_startup; then + exit 0 + else + sleep ${STARTUP_WAIT}; + [ $SECONDS -ge $TIME_TIMEOUT ] && exit 1 + fi + done + exit $rc + ;; readiness) # If the node is e.g. a donor, it cannot serve traffic - mysql_status_check wsrep_local_state_comment Synced + check_mysql_status wsrep_local_state_comment Synced ;; liveness) # If the node is not in the primary partition, the failed liveness probe # will make k8s restart this pod - mysql_status_check wsrep_cluster_status Primary + check_mysql_status wsrep_cluster_status Primary ;; *) echo "Invalid probe option '$1'" diff --git a/templates/galera/config/galera.cnf.in b/templates/galera/config/galera.cnf.in index 744c344a..edf2b64e 100644 --- a/templates/galera/config/galera.cnf.in +++ b/templates/galera/config/galera.cnf.in @@ -47,7 +47,7 @@ wsrep_debug = 0 wsrep_drupal_282555_workaround = 0 wsrep_on = ON wsrep_provider = /usr/lib64/galera/libgalera_smm.so -wsrep_provider_options = gmcast.listen_addr=tcp://{ PODIP }:4567 +wsrep_provider_options = pc.wait_prim=FALSE;gcache.recover=no;gmcast.listen_addr=tcp://{ PODIP }:4567 wsrep_retry_autocommit = 1 wsrep_slave_threads = 1 wsrep_sst_method = rsync diff --git a/templates/galera/config/galera_tls.cnf.in b/templates/galera/config/galera_tls.cnf.in index 69b831c5..63fc1971 100644 --- a/templates/galera/config/galera_tls.cnf.in +++ b/templates/galera/config/galera_tls.cnf.in @@ -4,7 +4,7 @@ ssl-cert =
/etc/pki/tls/certs/galera.crt ssl-key = /etc/pki/tls/private/galera.key ssl-ca = /etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem ssl-cipher = !SSLv2:kEECDH:kRSA:kEDH:kPSK:+3DES:!aNULL:!eNULL:!MD5:!EXP:!RC4:!SEED:!IDEA:!DES:!SSLv3:!TLSv1 -wsrep_provider_options = gcache.recover=no;gmcast.listen_addr=tcp://{ PODIP }:4567;socket.ssl_key=/etc/pki/tls/private/galera.key;socket.ssl_cert=/etc/pki/tls/certs/galera.crt;socket.ssl_cipher={ SSL_CIPHER };socket.ssl_ca=/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem; +wsrep_provider_options = pc.wait_prim=FALSE;gcache.recover=no;gmcast.listen_addr=tcp://{ PODIP }:4567;socket.ssl_key=/etc/pki/tls/private/galera.key;socket.ssl_cert=/etc/pki/tls/certs/galera.crt;socket.ssl_cipher={ SSL_CIPHER };socket.ssl_ca=/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem; [sst] sockopt = cipher=!SSLv2:kEECDH:kRSA:kEDH:kPSK:+3DES:!aNULL:!eNULL:!MD5:!EXP:!RC4:!SEED:!IDEA:!DES:!SSLv3:!TLSv1