#!/bin/bash # ====================================================================== # # Check PSQL SERVER # # requirements: # - psql client # # installation: # - execute check_psqlserver -i # # ---------------------------------------------------------------------- # Postgresql docs: # https://www.postgresql.org/docs/current/monitoring-stats.html # ---------------------------------------------------------------------- # 2023-06-07 v0.2 <axel.hahn@unibe.ch> # 2023-06-08 v0.3 <axel.hahn@unibe.ch> show unknown if database connection fails # 2023-06-08 v0.4 <axel.hahn@unibe.ch> get summary for cronflicts and problems # 2023-06-09 v0.5 <axel.hahn@unibe.ch> deltaunit can be set as parameter # 2023-06-13 v0.6 <axel.hahn@unibe.ch> no output on activity; update replication check # 2023-06-16 v0.7 <axel.hahn@unibe.ch> update help text # 2023-08-30 v0.8 <axel.hahn@unibe.ch> reverse return code in ph.hasParamoption to unix like return codes: 0=true; <>0 = false # 2023-09-18 v0.9 <axel.hahn@unibe.ch> prevent broken pipe message in journallog # ====================================================================== . $(dirname $0)/inc_pluginfunctions self_APPNAME=$( basename $0 | tr [:lower:] [:upper:] ) self_APPVERSION=0.9 # --- other vars... cfgfile=/etc/icingaclient/.psql.conf myuser=icingamonitor # new line NL=" " out="" # ---------------------------------------------------------------------- # FUNCTIONS # ---------------------------------------------------------------------- # exit with status unknown if a database query failed function _queryFailed(){ ph.setStatus "unknown" echo "UNKNOWN: the database query failed" echo "Quick troubleshooter:" echo "* install a database user $myuser with command $(basename $0) -i" echo "* check pg_hba.conf login fails if it set to ident (eg. change it to md5)" ph.exit } # uninstall database user function _uninstall(){ echo "UNINSTALL ..." su - postgres -c "psql -c \"DROP USER ${myuser};\"" 2>&1 unset PGHOST unset PGUSER unset PGPASSWORD unset PGDATABASE } # (re)install database user for monitoring function _install(){ echo "INSTALLING ..." local pwlength=64 echo "- check psql connection..." su postgres -c "psql -V postgres" if [ $? -ne 0 ]; then echo "ERROR: psql connect without password failed." exit 1 fi echo "- creating database user $myuser with random password ($pwlength chars)..." mypw=$( head /dev/urandom | tr -dc A-Za-z0-9 | head -c $pwlength ) SQL1="CREATE USER ${myuser} WITH PASSWORD '${mypw}' CONNECTION LIMIT 5"; SQL2="GRANT pg_monitor TO ${myuser};" # SQL2="GRANT MONITOR_QUERIES TO ${myuser};" if ! su - postgres -c "psql -c \"${SQL1}\" 2>&1"; then echo "ERROR: psql command to create user failed." exit 1 fi echo "- grant ..." if ! su - postgres -c "psql -c \"${SQL2}\" 2>&1" ; then echo "ERROR: psql command to grant permissions failed." # exit 1 fi echo "- creating config file $cfgfile ... " cat >$cfgfile <<EOF # # generated on $(date) # export PGUSER=${myuser} export PGPASSWORD=${mypw} export PGHOST=localhost # set default database because a user db won't be created export PGDATABASE=postgres EOF if ! ls -l $cfgfile; then echo "ERROR: creation of config file failed." exit 1 fi } # show help function _usage(){ local _self=$( basename $0 ) cat <<EOH ______________________________________________________________________ ${self_APPNAME} :: v${self_APPVERSION} (c) Institute for Medical Education - University of Bern Licence: GNU GPL 3 ______________________________________________________________________ USAGE: $_self [OPTIONS] -m METHOD OPTIONS: -h this help -i install monitoring user (must be executed as root) -u uninstall monitoring user (must be executed as root) PARAMETERS: -m method; valid methods are: activity Count running processes and queries conflicts Count of detected conflicts dbrows Count of database row actions diskblock Count of diskblocks physically read or coming from cache problems Count of problems and troublemakers replication Replication status and lag time transactions Count of transactions over all databases EXAMPLES: $_self -i $_self -m activity EOH } # render incremental counters from integer results of a given sql query # global string out output of check # param string database query # param string time unit; one of sec|min; default: sec function renderCounters(){ local _query; _query="$1" local _sDeltaUnit; _sDeltaUnit=${2:-sec} local _out; _out=$( psql -c "${_query} " 2>&1) || _queryFailed local _iCounter; typeset -i _iCounter=0 local _header; _header=$( echo "${_out}" | head -1 | tr -d ' ') local _data; _data=$( echo "${_out}" | head -3 | tail -1 | tr -d ' ') local _iSpeed; typeset -i _iSpeed=0 local _sStoreid; _sStoreid=$( md5sum <<< "${_query}" | awk '{ print $1 }' ) local _iValue; typeset -i _iValue # read psql result and put columns and values to an array IFS="|" read -ra aCols <<< "$_header" read -ra aVals <<< "$_data" for sColumn in ${aCols[*]} do _iValue=${aVals[$_iCounter]} _iSpeed=$( ph.perfdeltaspeed "psql-${sColumn}-${_sStoreid}" ${_iValue} sec $_sDeltaUnit) out=${out}$( printf "%-25s: %15s %s \n" "${sColumn}" "${_iValue}" "... delta = ${_iSpeed} per $_sDeltaUnit${NL}" ) ph.perfadd "${sColumn}" "${_iSpeed}" _iCounter+=1 done } # ---------------------------------------------------------------------- # MAIN # ---------------------------------------------------------------------- ph.hasParamoption "i" "$@"; bOptInstall=$? ph.hasParamoption "u" "$@"; bOptUninstall=$? ph.hasParamoption "h" "$@"; bOptHelp=$? if [ $bOptHelp -eq 0 -o $# -lt 1 ]; then _usage exit 0 fi # --- check required tools # ph.require mysql # --- install if [ $bOptInstall -eq 0 -a "$( whoami )" = "root" ]; then if [ -f $cfgfile ]; then ph.status "SKIP installation. config file already exists: $cfgfile." ph.exit fi _uninstall _install ph.status "Installation was done" ph.exit fi # --- uninstall if [ $bOptUninstall -eq 0 -a "$( whoami )" = "root" ]; then _uninstall rm -f $cfgfile ph.status "Uninstalled." ph.exit fi # --- check installation if ! grep $myuser $cfgfile >/dev/null 2>&1; then ph.abort "PSQL access not possible yet. You need to install the monitoring user first: as root execute `basename $0` -i" fi . "${cfgfile}" # ---------------------------------------------------------------------- sMode=$(ph.getValueWithParam '' m "$@") case "${sMode}" in "activity") _out=$( psql -c "select backend_type,state,query from pg_stat_activity" 2>&1 ) || _queryFailed typeset -i iQTotal; iQTotal=$( tail -1 <<< "$_out" | cut -f 1 -d ' ' | tr -d '(' ) typeset -i iQActive; iQActive=$( awk '{ print $4 }' <<< "$_out" | grep -c "active" ) typeset -i iQIdle; iQIdle=$( awk '{ print $4 }' <<< "$_out" | grep -c "idle" ) typeset -i iQFastpath; iQFastpath=$(awk '{ print $4 }' <<< "$_out" | grep -c "fastpath" ) typeset -i iQOther; iQOther=$iQTotal-$iQActive-$iQIdle-$iQFastpath descr="Running total: $iQTotal ... active: $iQActive idle: $iQIdle fastpath: $iQFastpath other: $iQOther" out+="${NL}" out+="Hint: Query for details:${NL}" out+="select * from pg_stat_activity.${NL}" # typeset -i iMax; iMax=20 # out=$( grep "^.[a-z]" <<< "${_out}" | cut -c 1-150 | tr '|' ':' | head -${iMax} ) # if [ ${iQTotal} -gt ${iMax} ]; then # out+="${NL}(showing ${iMax} of ${iQTotal} lines)${NL}" # fi ph.perfadd "running-total" "${iQTotal}" ph.perfadd "running-active" "${iQActive}" ph.perfadd "running-idle" "${iQIdle}" ph.perfadd "running-fastpath" "${iQFastpath}" ph.perfadd "running-other" "${iQOther}" ;; "conflicts") deltaunit=min descr="" renderCounters "select \ sum(confl_tablespace) as confl_tablespace, \ sum(confl_lock) as confl_lock, \ sum(confl_snapshot) as confl_snapshot, \ sum(confl_bufferpin) as confl_bufferpin, \ sum(confl_deadlock) as confl_deadlock \ from pg_stat_database_conflicts " "${deltaunit}" # sum of 7th column (with delta per sec) typeset -i iTotal; iTotal=$( echo "$out" | awk '{ sum+=$7} END { print sum;}' ) if [ $iTotal -gt 0 ]; then ph.setStatus "critical" descr+=" ... $iTotal per min conflicts detected" else descr+=" ... OK, nothing was found" fi out+="${NL}" out+="Hint: Query for details per database:${NL}" out+="select * from pg_stat_database_conflicts.${NL}" ;; "dbrows") descr="Count of database row actions" renderCounters "select \ sum(tup_returned) as tup_returned, \ sum(tup_fetched) as tup_fetched, \ sum(tup_inserted) as tup_inserted, \ sum(tup_updated) as tup_updated, \ sum(tup_deleted) as tup_deleted \ from pg_stat_database " out+="${NL}" out+="Hint: Query for details per database:${NL}" out+="select * from pg_stat_database.${NL}" ;; "diskblock") descr="Count of diskblocks physically read or coming from cache (from pg_stat_database)" renderCounters "select \ sum(blks_read) as blks_read, \ sum(blks_hit) as blks_hit \ from pg_stat_database " out+="${NL}" out+="Hint: Query for details per database:${NL}" out+="select * from pg_stat_database.${NL}" ;; "problems") deltaunit=min descr="Problems and troublemakers (from pg_stat_database)" renderCounters "select \ sum(conflicts) as conflicts, \ sum(deadlocks) as deadlocks, \ sum(checksum_failures) as checksumfailures, \ sum(temp_files) as temp_files, \ sum(temp_bytes) as temp_bytes \ from pg_stat_database" "${deltaunit}" # sum of 7th column (with delta per sec) typeset -i iTotal; iTotal=$( echo "$out" | awk '{ sum+=$7} END { print sum;}' ) if [ $iTotal -gt 0 ]; then ph.setStatus "critical" descr+="... $iTotal (per min) Problems and troublemakers were found" else descr+=" ... OK, nothing was found" fi out+="${NL}" out+="Hint: Query for details per database:${NL}" out+="select * from pg_stat_database.${NL}" ;; "replication") # _out=$( psql -c "select * from pg_stat_replication" 2>&1 ) || _queryFailed _out=$( psql -c "select application_name,client_addr,state,write_lag,flush_lag,replay_lag,GREATEST(write_lag,flush_lag,replay_lag) as max_lag,sync_state from pg_stat_replication" 2>&1 ) || _queryFailed if tail -1 <<< "$_out" | grep "(0 rows)" >/dev/null ; then descr="None (no data in pg_stat_replication)" out="" else descr="status" data=$( sed -n "3,\$p" <<< "${_out}" | tr -d ' ' | grep -v '^(' ) # --- check: state typeset -i iTotal; iTotal=$( wc -l <<< "${data}" ) typeset -i iStateStreaming; iStateStreaming=$( cut -f 3 -d '|' <<< "${data}" | grep -c 'streaming' ) typeset -i iStateOther; iStateOther=$iTotal-$iStateStreaming ph.perfadd "total" "${iTotal}" ph.perfadd "state-streaming" "${iStateStreaming}" ph.perfadd "state-other" "${iStateOther}" if [ $iStateOther -gt 0 ]; then ph.setStatus "warning" descr+=" ... state warning" out+="WARNING: a replication doesn't have the state 'streaming'.${NL}" out+=" - startup: This WAL sender is starting up.${NL}" out+=" - catchup: This WAL sender's connected standby is catching up with the primary.${NL}" out+=" - streaming: This WAL sender is streaming changes after its connected standby server has caught up with the primary.${NL}" out+=" - backup: This WAL sender is sending a backup.${NL}" out+=" - stopping: This WAL sender is stopping.${NL}" out+="" else out+="OK: all replications have the state 'streaming'.${NL}" fi # --- check: max_leg # max_leg is the maximum value of write_lag,flush_lag,replay_lag # value as sec + "." + millisec ... or empty MaxLag=$( cut -f 7 -d '|' <<< "${data}" | sort 2>/dev/null | tail -1 | awk -F: '{ print ($1 * 3600) + ($2 * 60) + $3 }' ) test -z "$MaxLag" && MaxLag="0" ph.perfadd "max-lag" "${MaxLag}" if ! grep "^0[\.]*" <<< "$MaxLag" >/dev/null; then ph.setStatus "warning" descr+=" ... lag warning" out+="WARNING: !!! experimantal !!! a lag larger 1 sec was detected. Maybe a target server is out of sync.'.${NL}" else out+="OK: maximum lag is $MaxLag sec (below 1 sec).${NL}" fi out+=$( echo "${_out}${NL}" | tr '|' ':' ) out+="${NL}" out+="Hint: Query for details per replication:${NL}" out+="select * from pg_stat_replication.${NL}" fi ;; "transactions") descr="Count of transactions over all databases" renderCounters "select \ sum(xact_commit) as commit, \ sum(xact_rollback) as rollback \ from pg_stat_database " out+="${NL}" out+="Hint: Query for details per database:${NL}" out+="select * from pg_stat_database.${NL}" ;; *) echo "ERRROR: [${sMode}] is an INVALID mode" _usage ph.abort esac ph.status "Pgsql $sMode :: $descr" echo " $out" ph.exit # ----------------------------------------------------------------------