# ----------------------------------------------------------------------
# NOTE: the text below is a unified git diff of the shell script
# `check_psqlserver` (self_APPVERSION 0.5 -> 0.6), not a runnable script.
# In this copy the patch's line breaks appear collapsed into spaces, so
# hunks have to be read from the `@@ -a,b +c,d @@` markers and the
# leading `+` (added) / `-` (removed) signs.
#
# Visible hunks:
#   1. @@ -17,12 +17,13 @@  adds the v0.6 changelog entry and sets
#      self_APPVERSION=0.6.
#   2. @@ -232,11 +233,11 @@  in the case arm that builds the
#      "Running total: ..." description, the code that filled `out` with
#      at most iMax=20 lines of query output is commented out; the
#      ph.perfadd calls that follow are unchanged context.
#   3. @@ -302,13 +303,52 @@  "replication" arm:
#      - the psql query now selects named columns of pg_stat_replication
#        plus GREATEST(write_lag,flush_lag,replay_lag) as max_lag;
#      - if the last output line contains "(0 rows)" only `descr` changes;
#      - otherwise `data` is the output from line 3 on, with all spaces
#        removed and lines starting with "(" dropped (presumably this
#        strips psql's header, separator and row-count footer - verify);
#      - iTotal = line count of `data`, iStateStreaming = lines whose 3rd
#        '|' field contains 'streaming'; if the difference is > 0,
#        ph.setStatus "warning" is called and a state legend is appended;
#      - MaxLag = last value (after a plain `sort`) of the 7th '|' field,
#        converted by awk from ':'-separated fields to
#        $1*3600 + $2*60 + $3, defaulting to "0" when empty; it is added
#        as perfdata "max-lag", and ph.setStatus "warning" is called
#        unless the value matches "^0[\.]*";
#      - finally the raw psql output is appended with '|' turned into ':'.
#
# Review notes (hedged - confirm against the full script before acting):
#   - NOTE(review): in "^0[\.]*" the `*` also allows zero dots, so the test
#     is effectively "MaxLag starts with 0".
#   - NOTE(review): awk's `print` formats non-integer numbers with OFMT
#     (default "%.6g" per POSIX), so a lag below 0.0001 s would presumably
#     be printed in exponent form (e.g. 1.2e-05), fail the "^0" test and
#     raise the lag warning; the same string goes into perfdata - confirm.
#   - NOTE(review): `sort` compares text; this presumably assumes every
#     lag is formatted HH:MM:SS[.ffffff] with no "N days" part - verify.
#   - NOTE(review): `wc -l <<< "${data}"` reads a here-string, which always
#     ends in a newline, so iTotal would be 1 even for empty `data`.
#   - The added comments spell the column `max_leg` while the SQL alias is
#     `max_lag`; the lag message contains "experimantal" and a stray `'.`.
# ----------------------------------------------------------------------
diff --git a/check_psqlserver b/check_psqlserver index 56ee58ddff022b0a8516b85187ee574f43e3652c..cae1640bdfaa433f834d718db094eb2cc329aeff 100755 --- a/check_psqlserver +++ b/check_psqlserver @@ -17,12 +17,13 @@ # 2023-06-08 v0.3 <axel.hahn@unibe.ch> show unknown if database connection fails # 2023-06-08 v0.4 <axel.hahn@unibe.ch> get summary for cronflicts and problems # 2023-06-09 v0.5 <axel.hahn@unibe.ch> deltaunit can be set as parameter +# 2023-06-13 v0.6 <axel.hahn@unibe.ch> no output on activity; update replication check # ====================================================================== . $(dirname $0)/inc_pluginfunctions self_APPNAME=$( basename $0 | tr [:lower:] [:upper:] ) -self_APPVERSION=0.5 +self_APPVERSION=0.6 # --- other vars... cfgfile=/etc/icingaclient/.psql.conf @@ -232,11 +233,11 @@ case "${sMode}" in typeset -i iQOther; iQOther=$iQTotal-$iQActive-$iQIdle-$iQFastpath descr="Running total: $iQTotal ... active: $iQActive idle: $iQIdle fastpath: $iQFastpath other: $iQOther" - typeset -i iMax; iMax=20 - out=$( grep "^.[a-z]" <<< "${_out}" | cut -c 1-150 | tr '|' ':' | head -${iMax} ) - if [ ${iQTotal} -gt ${iMax} ]; then - out+="${NL}(showing ${iMax} of ${iQTotal} lines)${NL}" - fi + # typeset -i iMax; iMax=20 + # out=$( grep "^.[a-z]" <<< "${_out}" | cut -c 1-150 | tr '|' ':' | head -${iMax} ) + # if [ ${iQTotal} -gt ${iMax} ]; then + # out+="${NL}(showing ${iMax} of ${iQTotal} lines)${NL}" + # fi ph.perfadd "running-total" "${iQTotal}" ph.perfadd "running-active" "${iQActive}" ph.perfadd "running-idle" "${iQIdle}" @@ -302,13 +303,52 @@ case "${sMode}" in ;; "replication") - _out=$( psql -c "select * from pg_stat_replication" 2>&1 ) || _queryFailed + # _out=$( psql -c "select * from pg_stat_replication" 2>&1 ) || _queryFailed + _out=$( psql -c "select application_name,client_addr,state,write_lag,flush_lag,replay_lag,GREATEST(write_lag,flush_lag,replay_lag) as max_lag,sync_state from pg_stat_replication" 2>&1 ) || _queryFailed if tail -1 <<< 
"$_out" | grep "(0 rows)" >/dev/null ; then - descr="No data in pg_stat_replication = no replication here." + descr="None (no data in pg_stat_replication)" out="" else descr="status (from pg_stat_replication)" - out=$( echo "${_out}${NL}" | tr '|' ':' ) + data=$( sed -n "3,\$p" <<< "${_out}" | tr -d ' ' | grep -v '^(' ) + + # --- check: state + typeset -i iTotal; iTotal=$( wc -l <<< "${data}" ) + typeset -i iStateStreaming; iStateStreaming=$( cut -f 3 -d '|' <<< "${data}" | grep -c 'streaming' ) + typeset -i iStateOther; iStateOther=$iTotal-$iStateStreaming + + ph.perfadd "total" "${iTotal}" + ph.perfadd "state-streaming" "${iStateStreaming}" + ph.perfadd "state-other" "${iStateOther}" + + if [ $iStateOther -gt 0 ]; then + ph.setStatus "warning" + descr+=" ... state warning" + out+="WARNING: a replication doesn't have the state 'streaming'.${NL}" + out+=" - startup: This WAL sender is starting up.${NL}" + out+=" - catchup: This WAL sender's connected standby is catching up with the primary.${NL}" + out+=" - streaming: This WAL sender is streaming changes after its connected standby server has caught up with the primary.${NL}" + out+=" - backup: This WAL sender is sending a backup.${NL}" + out+=" - stopping: This WAL sender is stopping.${NL}" + out+="" + fi + + # --- check: max_leg + # max_leg is the maximum value of write_lag,flush_lag,replay_lag + # value as sec + "." + millisec ... or empty + MaxLag=$( cut -f 7 -d '|' <<< "${data}" | sort | tail -1 | awk -F: '{ print ($1 * 3600) + ($2 * 60) + $3 }' ) + test -z "$MaxLag" && MaxLag="0" + + ph.perfadd "max-lag" "${MaxLag}" + + if ! grep "^0[\.]*" <<< "$MaxLag" >/dev/null; then + ph.setStatus "warning" + descr+=" ... lag warning" + out+="WARNING: !!! experimantal !!! a lag larger 1 sec was detected. Maybe a target server is out of sync.'.${NL}" + fi + + out+=$( echo "${_out}${NL}" | tr '|' ':' ) + fi ;;