Skip to content

Commit

Permalink
Improve tracking and error reporting of startup probe
Browse files Browse the repository at this point in the history
Currently, startup probes are scheduled with defaults from k8s
(scheduled every 10s, failure threshold of 3). As galera joiner
nodes can take a long time to start, this generates unnecessary
unhealthy events.

Rework how the startup probe works by allowing a single, long
probe which internally loops while probing the startup state.
Throughout the startup process, keep track of the specific
startup phase so that, in case the startup times out, the probe
can log a precise error.

Also rework how joiner nodes are tracked, to fail early in case
galera cannot join a primary partition, to avoid the server
being stuck indefinitely until the startup probe times out.

A subsequent commit will provide the ability to override probe
settings and timeouts.

Jira: OSPRH-11392
  • Loading branch information
dciabrin committed Dec 23, 2024
1 parent 1709c6e commit 340ee75
Show file tree
Hide file tree
Showing 5 changed files with 205 additions and 16 deletions.
3 changes: 3 additions & 0 deletions pkg/mariadb/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,7 @@ const (

// ActivePodSelectorKey - Selector key used to configure A/P service behavior
ActivePodSelectorKey = "statefulset.kubernetes.io/pod-name"

// StartupProbeTimeout - Time allowed for the startup probe (in seconds)
StartupProbeTimeout = 240
)
11 changes: 8 additions & 3 deletions pkg/mariadb/statefulset.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package mariadb

import (
"strconv"

common "github.com/openstack-k8s-operators/lib-common/modules/common"
"github.com/openstack-k8s-operators/lib-common/modules/common/affinity"
mariadbv1 "github.com/openstack-k8s-operators/mariadb-operator/api/v1beta1"
Expand Down Expand Up @@ -112,6 +114,7 @@ func getGaleraInitContainers(g *mariadbv1.Galera) []corev1.Container {
}

func getGaleraContainers(g *mariadbv1.Galera, configHash string) []corev1.Container {
timeout := strconv.Itoa(StartupProbeTimeout)
containers := []corev1.Container{{
Image: g.Spec.ContainerImage,
Name: "galera",
Expand Down Expand Up @@ -144,11 +147,13 @@ func getGaleraContainers(g *mariadbv1.Galera, configHash string) []corev1.Contai
StartupProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
Exec: &corev1.ExecAction{
Command: []string{"/bin/bash", "/var/lib/operator-scripts/mysql_probe.sh", "startup"},
Command: []string{"/bin/bash", "/var/lib/operator-scripts/mysql_probe.sh", "startup", timeout},
},
},
PeriodSeconds: 10,
FailureThreshold: 30,
// extra seconds so that the script is not preempted by k8s
TimeoutSeconds: StartupProbeTimeout + 10,
// the current probe implementation assumes a single failure threshold
FailureThreshold: 1,
},
LivenessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
Expand Down
203 changes: 192 additions & 11 deletions templates/galera/bin/mysql_probe.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,31 +6,212 @@ read -s -u 3 3< /var/lib/secrets/dbpassword MYSQL_PWD || true
export MYSQL_PWD

PROBE_USER=root
function mysql_status_check {

MYSQL_SOCKET=/var/lib/mysql/mysql.sock
SST_IN_PROGRESS=/var/lib/mysql/sst_in_progress

CHECK_RETRY=10
CHECK_WAIT=0.5
STARTUP_WAIT=2

LAST_STATE=""
# Remember the given startup phase description in LAST_STATE.
#   $1: human-readable description of the current state
# Nothing is printed here: the state is only stored, which keeps the
# probe output (and thus the k8s events) minimal if the probe fails.
function log_state {
    local phase="$1"
    # same phase as the one already recorded: nothing to update
    [ "${LAST_STATE}" = "${phase}" ] && return 0
    LAST_STATE="${phase}"
}

# Print the most recently recorded state (LAST_STATE) on stdout.
# Prints nothing when no state has been recorded.
function log_last_state {
    [ -z "${LAST_STATE}" ] || echo "${LAST_STATE}"
}
trap log_last_state EXIT

# Query a mysql status variable, retrying up to CHECK_RETRY times.
#   $1: name of the status variable passed to "show status like"
# On success, prints the last line of the mysql output and returns 0.
# If every attempt fails, prints the output of the last attempt on
# stderr and returns 1.
function get_mysql_status {
    local name=$1
    local attempt
    local output
    for attempt in $(seq $CHECK_RETRY); do
        output=$(mysql -u${PROBE_USER} -sNEe "show status like '${name}';" 2>&1)
        if [ $? -ne 0 ]; then
            # mysql could not be queried, wait a bit before the next attempt
            sleep ${CHECK_WAIT}
            continue
        fi
        echo "${output}" | tail -1
        return 0
    done
    # all attempts failed: report the last error from mysql
    echo "${output}" >&2
    return 1
}

# Check that a mysql status variable has the expected value.
#   $1: name of the status variable to query
#   $2: expected value
# Returns 0 when the value reported by get_mysql_status equals the
# expected value. Otherwise, records the mismatch with log_state and
# returns the non-zero status of the comparison.
function check_mysql_status {
    local status=$1
    local expect=$2
    local val
    local rc

    # get_mysql_status retries internally; no shell tracing is enabled
    # here so that the probe output only contains the recorded state
    val=$(get_mysql_status "${status}")
    test "${val}" = "${expect}"
    rc=$?
    if [ $rc -ne 0 ]; then
        log_state "${status} (${val}) differs from ${expect}"
    fi
    return $rc
}

# Consider the pod has "started" once mysql is reachable
# and is part of the primary partition
if [ "$1" = "startup" ]; then
mysql_status_check wsrep_cluster_status Primary
exit $?
fi
# Tell whether an SST appears to be running.
# Polls up to CHECK_RETRY times, sleeping CHECK_WAIT between attempts:
#   - returns 1 as soon as the mysql socket exists
#   - returns 0 as soon as the SST marker file exists
#   - returns 1 if neither showed up after all attempts
function check_sst_in_progress {
    local attempt
    # retry to give some time to mysql to set up the SST
    for attempt in $(seq $CHECK_RETRY); do
        [ -e ${MYSQL_SOCKET} ] && return 1
        [ -e ${SST_IN_PROGRESS} ] && return 0
        sleep ${CHECK_WAIT}
    done
    return 1
}

# Check that mysql can be reached from the CLI.
# Polls up to CHECK_RETRY times, sleeping CHECK_WAIT between attempts.
# Returns 0 as soon as the mysql socket exists and "mysqladmin ping"
# succeeds, 1 if that never happened after all attempts.
function check_mysql_ready {
    local i
    # retry to give some time to mysql to create its socket
    for i in $(seq $CHECK_RETRY); do
        # discard the ping output to /dev/null (absolute path): a relative
        # "dev/null" would depend on the current working directory
        if [ -e "${MYSQL_SOCKET}" ] && mysqladmin -s -u${PROBE_USER} ping >/dev/null; then
            return 0
        else
            sleep ${CHECK_WAIT}
        fi
    done
    return 1
}

# Monitor the startup sequence until the galera node is connected
# to a primary component and synced
# NOTE: as of mariadb 10.5, if mysql connects to a non-primary
# partition, it never creates any socket and gets stuck indefinitely.
# In that case, in order to not wait until the startup times out
# (very long), we error out of the probe so that the pod can restart
# and mysql reconnect to a primary partition if possible.
# Evaluate the current phase of the galera startup sequence.
# Return/exit behavior:
#   - returns 0 when a joiner node reports Primary/Synced, or when mysql
#     turns out to be reachable after no SST was detected
#   - for a bootstrap node, returns the result of check_mysql_ready
#   - returns 1 while the startup is still in progress; the phase is
#     recorded with log_state
#   - exits the probe with status 1 when the startup mode cannot be
#     determined or no primary partition can be joined
function check_mysql_startup {
    # mysql initialization sequence:
    #  . mysql connects to a remote galera node over port 4567
    #  . mysql optionally runs a SST (port 4444), SST marker created on disk
    #  . only at this point, InnoDB is initialized, mysql pidfile and
    #    mysql socket are created on disk

    if pgrep -f detect_gcomm_and_start.sh >/dev/null ; then
        log_state "waiting for gcomm URI"
        return 1
    fi
    # pidfile is not written on disk until mysql is ready,
    # so look for the mysqld process instead
    if ! pgrep -f /usr/libexec/mysqld >/dev/null ; then
        log_state "waiting for mysql to start"
        return 1
    fi

    # a bootstrap node must be reachable from the CLI to finish startup
    if pgrep -f -- '--wsrep-cluster-address=gcomm://(\W|$)' >/dev/null; then
        check_mysql_ready
        return $?
    # a joiner node must have an established socket connection before testing further
    elif pgrep -f -- '--wsrep-cluster-address=gcomm://\w' >/dev/null; then
        local connections
        connections=$(ss -tnH state established src :4567 or dst :4567 | wc -l)
        # at least one established connection is required: a count of
        # zero means mysql has not reached any galera node yet
        if ! test "${connections}" -gt 0; then
            log_state "waiting for mysql to join a galera cluster"
            return 1
        fi
    else
        log_state "could not determine galera startup mode"
        exit 1
    fi

    # a joiner node requires additional startup checks
    if [ -e "${MYSQL_SOCKET}" ]; then
        # good case, mysql is ready to be probed from the CLI
        # check WSREP status like the regular liveness probe
        local status
        local comment
        status=$(get_mysql_status wsrep_cluster_status)
        comment=$(get_mysql_status wsrep_local_state_comment)
        if [ "${status}" = "Primary" ] && [ "${comment}" = "Synced" ]; then
            return 0
        elif [ "${status}" = "Primary" ]; then
            log_state "waiting to be synced with the cluster"
            return 1
        elif [ "${status}" = "Non-primary" ] && [ "${comment}" = "Synced" ]; then
            log_state "mysql is connected to a non-primary partition, server stopped"
            exit 1
        else
            log_state "waiting for connection to a primary partition"
            return 1
        fi
    else
        # if there is no socket, mysql may be running an SST...
        if check_sst_in_progress; then
            log_state "waiting for SST to finish"
            return 1
        fi

        # ... if no SST was detected, it may have finished before
        # we probed it. Check a last time whether we can connect to mysql
        if check_mysql_ready; then
            return 0
        fi

        # At this stage, mysql is either trying to connect to a bootstrap node
        # that resolved to an old pod IP, or it is connected to a
        # non-primary partition. Either way, this is not recoverable, so
        # make the probe fail and let k8s kill the mysql server.

        log_state "could not find a primary partition to connect to"
        exit 1
    fi
    # defensive fallback: every branch above already returns or exits
    return 1
}


# startup probe loops until the node started or joined a galera cluster
# readiness and liveness probes are run by k8s only after start probe succeeded

case "$1" in
startup)
if [ -z "$2" ]; then
echo "startup timeout option missing"
exit 1
fi
TIME_TIMEOUT=$2

# Run the entire check in a single startup probe to avoid spurious
# "Unhealthy" k8s events to be logged. The probe stops in error
# if the startup timeout is reached
rc=1
while [ $rc -ne 0 ]; do
if check_mysql_startup; then
exit 0
else
sleep ${STARTUP_WAIT};
[ $SECONDS -ge $TIME_TIMEOUT ] && exit 1
fi
done
exit $rc
;;
readiness)
# If the node is e.g. a donor, it cannot serve traffic
mysql_status_check wsrep_local_state_comment Synced
check_mysql_status wsrep_local_state_comment Synced
;;
liveness)
# If the node is not in the primary partition, the failed liveness probe
# will make k8s restart this pod
mysql_status_check wsrep_cluster_status Primary
check_mysql_status wsrep_cluster_status Primary
;;
*)
echo "Invalid probe option '$1'"
Expand Down
2 changes: 1 addition & 1 deletion templates/galera/config/galera.cnf.in
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ wsrep_debug = 0
wsrep_drupal_282555_workaround = 0
wsrep_on = ON
wsrep_provider = /usr/lib64/galera/libgalera_smm.so
wsrep_provider_options = gmcast.listen_addr=tcp://{ PODIP }:4567
wsrep_provider_options = pc.wait_prim=FALSE;gcache.recover=no;gmcast.listen_addr=tcp://{ PODIP }:4567
wsrep_retry_autocommit = 1
wsrep_slave_threads = 1
wsrep_sst_method = rsync
Expand Down
2 changes: 1 addition & 1 deletion templates/galera/config/galera_tls.cnf.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ ssl-cert = /etc/pki/tls/certs/galera.crt
ssl-key = /etc/pki/tls/private/galera.key
ssl-ca = /etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem
ssl-cipher = !SSLv2:kEECDH:kRSA:kEDH:kPSK:+3DES:!aNULL:!eNULL:!MD5:!EXP:!RC4:!SEED:!IDEA:!DES:!SSLv3:!TLSv1
wsrep_provider_options = gcache.recover=no;gmcast.listen_addr=tcp://{ PODIP }:4567;socket.ssl_key=/etc/pki/tls/private/galera.key;socket.ssl_cert=/etc/pki/tls/certs/galera.crt;socket.ssl_cipher={ SSL_CIPHER };socket.ssl_ca=/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem;
wsrep_provider_options = pc.wait_prim=FALSE;gcache.recover=no;gmcast.listen_addr=tcp://{ PODIP }:4567;socket.ssl_key=/etc/pki/tls/private/galera.key;socket.ssl_cert=/etc/pki/tls/certs/galera.crt;socket.ssl_cipher={ SSL_CIPHER };socket.ssl_ca=/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem;

[sst]
sockopt = cipher=!SSLv2:kEECDH:kRSA:kEDH:kPSK:+3DES:!aNULL:!eNULL:!MD5:!EXP:!RC4:!SEED:!IDEA:!DES:!SSLv3:!TLSv1
Expand Down

0 comments on commit 340ee75

Please sign in to comment.