#!/bin/bash
# ======================================================================
#
# Check CPU usage
#
# Reads the cpu summary line from the 2nd iteration of "top" in batch
# mode, derives the non idle cpu usage and compares usage and io wait
# with the given limits. On higher load it additionally prints process
# lists that help to find the cause.
#
# requirements:
# - top
# - bc
#
# ----------------------------------------------------------------------
# 2020-03-10  v1.0   <axel.hahn@iml.unibe.ch>
# 2020-03-23  v1.1   <axel.hahn@iml.unibe.ch>  added more data
# 2020-07-08  v1.2   <axel.hahn@iml.unibe.ch>  FIX: set "ph." instead "ps."
# 2020-07-17  v1.3   <axel.hahn@iml.unibe.ch>  use ph.require to check binaries
# 2021-02-10  v1.4   <axel.hahn@iml.unibe.ch>  added critical io wait
# 2021-10-28  v1.5   <axel.hahn@iml.unibe.ch>  Use 2nd update of top
# 2021-12-10  v1.6   <axel.hahn@iml.unibe.ch>  show processes with status D to find cpu waits
# 2022-03-09  v1.7   <axel.hahn@iml.unibe.ch>  show most cpu intensive processes
# 2022-03-10  v1.8   <axel.hahn@iml.unibe.ch>  add cli param -p; update help
# 2022-03-22  v1.9   <axel.hahn@iml.unibe.ch>  fix syntax error on 100% idle
# 2022-04-14  v1.10  <axel.hahn@iml.unibe.ch>  show consuming cpu processes with top and ps
# 2022-08-29  v1.11  <axel.hahn@iml.unibe.ch>  replace pipe to prevent start of metrics section
# 2022-08-29  v1.12  <axel.hahn@iml.unibe.ch>  fix: replace pipe
# 2023-02-13  v1.13  <axel.hahn@iml.unibe.ch>  small shell fixes
# 2023-07-27  v1.14  <axel.hahn@unibe.ch>      update help page
# 2023-09-18  v1.15  <axel.hahn@unibe.ch>      prevent broken pipe message in journallog
# ======================================================================

. "$(dirname "$0")/inc_pluginfunctions"

export self_APPVERSION=1.15

# ----------------------------------------------------------------------
# functions
# ----------------------------------------------------------------------

# Show the help text of this check on stdout.
# It takes no parameters; the script name is detected from $0.
function showHelp(){
    local _self; _self=$(basename "$0")
cat <<EOF
$( ph.showImlHelpHeader )

check cpu usage and cpu wait
Cpu infos are taken from output of top command.

On higher cpu usage it can show processes that cause cpu waits and
with most cpu consumption.

SYNTAX:
$_self [-w WARN_LIMIT] [-c CRITICAL_LIMIT] [-i CRITICAL_IO_WAIT] [-p PROCESS_LIMIT]

OPTIONS:

    -w VALUE       cpu usage warning level   (default: 75)
    -c VALUE       cpu usage critical level  (default: 90)

    -i VALUE       io wait critical level    (default: 50)

    -p VALUE       show process info with highest cpu consumption if
                   usage is > NN %; default: 50

    -h or --help   show this help.

PARAMETERS:

    None.

EXAMPLE:

    $_self -w 60 -c 80 -p 40

EOF
}

# Get a single value from the splitted cpu line of top.
# The data must contain one "<value> <label>" pair per line. The label
# is compared exactly with the 2nd field, so no other line can match
# by accident. Nothing is printed if the label was not found.
#
# param  string  $1  label to search for, i.e. us, sy, ni, id, wa, hi, si or st
# param  string  $2  cpu data with one "<value> <label>" pair per line
function _getCpuValue(){
    local _label="$1"
    local _data="$2"
    awk -v label="$_label" '$2 == label { print $1; exit }' <<< "$_data"
}

# Add the leading zero that bc omits for values between -1 and 1,
# i.e. ".5" becomes "0.5". All other values are printed unchanged.
#
# param  string  $1  number as printed by bc
function _addLeadingZero(){
    local _value="$1"
    case "$_value" in
        .*)  echo "0${_value}" ;;
        -.*) echo "-0${_value#-}" ;;
        *)   echo "$_value" ;;
    esac
}

# ----------------------------------------------------------------------
# MAIN
# ----------------------------------------------------------------------

# --- check required tools
ph.require bc top

# --- check param -h
case "$1" in
    "--help"|"-h")
        showHelp
        exit 0
        ;;
    *)
esac

# ----------------------------------------------------------------------
# set default / override from command line params
typeset -i iWarnLimit=$(     ph.getValueWithParam 75 w "$@")
typeset -i iCriticalLimit=$( ph.getValueWithParam 90 c "$@")
typeset -i iCriticalWait=$(  ph.getValueWithParam 50 i "$@")
typeset -i iMinCpuUsageToShowProcesses=$(  ph.getValueWithParam 50 p "$@")

# ----------------------------------------------------------------------
# get data

# get cpu status i.e.
# %Cpu(s): 33.3 us,  9.5 sy,  0.0 ni, 57.1 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
# us, user    : time running un-niced user processes
# sy, system  : time running kernel processes
# ni, nice    : time running niced user processes
# id, idle    : time spent in the kernel idle handler
# wa, IO-wait : time waiting for I/O completion
# hi : time spent servicing hardware interrupts
# si : time spent servicing software interrupts
# st : time stolen from this vm by the hypervisor

# top -b -n 1 | head -5 | grep "^\%Cpu" >$tmpfile
# FIX read cpu from 2nd output of top
#
# LC_ALL=C forces the decimal point and the untranslated labels: the
# parsing below splits the line at "," and searches for the english
# two letter labels.
data=$( LC_ALL=C top -b -n 2 -d 0.1 | grep -i "^%Cpu" | tail -1 | cut -f 2- -d ':' | tr ',' "\n" )

cpuUser=$(   _getCpuValue "us" "$data" )
cpuSystem=$( _getCpuValue "sy" "$data" )
cpuNice=$(   _getCpuValue "ni" "$data" )
cpuIdle=$(   _getCpuValue "id" "$data" )
cpuWait=$(   _getCpuValue "wa" "$data" )
cpuHi=$(     _getCpuValue "hi" "$data" )
cpuSi=$(     _getCpuValue "si" "$data" )
cpuSt=$(     _getCpuValue "st" "$data" )

# without idle and wait values all calculations below would feed
# incomplete expressions into bc - stop here instead of reporting
# a wrong status
if [ -z "$cpuIdle" ] || [ -z "$cpuWait" ]; then
    # NOTE(review): assumes that ph.setStatus accepts "unknown" - confirm
    # in inc_pluginfunctions
    ph.setStatus "unknown"
    ph.status "CPU-USAGE [%] - unable to read cpu data from output of top"
    ph.exit
fi

# bc prints values below 1 without leading zero (i.e. ".5"); normalize
# it for the status line and the performance data
cpuNonIdle=$( _addLeadingZero "$( echo "100-${cpuIdle}" | bc )" )

sInfo="INFO : cpu is in normal ranges."
if [ "$(echo "${cpuWait} > ${iCriticalWait}" | bc)" -eq 1 ]; then
    # io wait is checked first: it wins over the cpu usage limits
    ph.setStatus "critical"
    sInfo="HINT : cpu WAIT is high - check hardware issues"
elif [ "$(echo "${cpuNonIdle} > ${iWarnLimit}" | bc)" -eq 1 ]; then
    if [ "$(echo "${cpuNonIdle} > ${iCriticalLimit}" | bc)" -eq 1 ]; then
        ph.setStatus "critical"
    else
        ph.setStatus "warning"
    fi
    sInfo="HINT : cpu usage is high - check processes"
fi

# ----------------------------------------------------------------------
# output

# --- status output
ph.status "CPU-USAGE [%] ${cpuNonIdle} ... user: ${cpuUser} - system: ${cpuSystem} - idle: ${cpuIdle} - wait: ${cpuWait}"

# v1.6: show processes of cpu wait (status D in proces list)
# The header line of ps always matches because of "STAT" but it has no
# digit. A digit in the result means: at least one process was found.
plist=$( ps aux | awk '$8 ~ /(D|STAT)/ { print $0 }' )
if [[ "$plist" =~ [0-9] ]]; then
    echo
    echo "For analysis of cpu waits - processes with status D:"
    # replace pipe to prevent start of metrics section
    tr '|' ':' <<< "$plist"
fi

# v1.7: show most consuming processes if usage is > nn %
typeset -i iUsed
iUsed=${cpuNonIdle%%.*}
if [ "$iUsed" -gt "$iMinCpuUsageToShowProcesses" ]; then
    echo
    echo "CPU usage is higher $iMinCpuUsageToShowProcesses percent ... showing most consuming processes"
    echo "output of top :"

    # print the header line of the process table and the next 5 lines;
    # LC_ALL=C keeps the column names that are searched for
    topout=$( LC_ALL=C top -b -n 1 -d 0.1 )
    awk '/PID.*USER/ { n = 6 } n-- > 0' <<< "$topout" | tr '|' ':'

    echo
    echo "output of ps:"
    # Run ps only once and keep its header line out of the sorting.
    # sed reads its complete input - so sort never writes into a closed
    # pipe and no broken pipe message can be created.
    psout=$( ps aux )
    head -n 1 <<< "$psout"
    tail -n +2 <<< "$psout" | sort -nrk 3,3 | sed -n '1,5p' | tr '|' ':'
    echo
fi

echo "
Limits: usage warn at ${iWarnLimit} .. critical at ${iCriticalLimit} .. io wait limit ${iCriticalWait}
$sInfo

Legend:
    hwi    - Time spent handling hardware interrupt routines. (Whenever a peripheral unit want attention form the CPU, it literally pulls a line, to signal the CPU to service it)
    swi    - Time spent handling software interrupt routines. (a piece of code, calls an interrupt routine...)
    st     - Time spent on involuntary waits by virtual cpu while hypervisor is servicing another processor (stolen from a virtual machine)
    nice   - Time spent running niced user processes (User defined priority)
    wait   - Time spent on waiting on IO peripherals (eg. disk)
    system - Time spent in kernel space
    user   - Time spent in user space
    idle   - Time spent in idle operations
"

# --- performance data usage
ph.perfadd "cpu-usage" "${cpuNonIdle}" "$iWarnLimit" "$iCriticalLimit" 0 100

# for graphite module: send limits
# ph.perfadd "cpu-warn" $iWarnLimit "" "" 0 100
# ph.perfadd "cpu-crit" $iCriticalLimit "" "" 0 100

ph.perfadd "cpu-wait" "${cpuWait}" "" "$iCriticalWait" 0 100

# --- performance data single values
ph.perfadd "cpu-system" "${cpuSystem}" "" "" 0 100
ph.perfadd "cpu-user"   "${cpuUser}"   "" "" 0 100
ph.perfadd "cpu-idle"   "${cpuIdle}"   "" "" 0 100
ph.perfadd "cpu-nice"   "${cpuNice}"   "" "" 0 100
ph.perfadd "cpu-hwi"    "${cpuHi}"     "" "" 0 100
ph.perfadd "cpu-swi"    "${cpuSi}"     "" "" 0 100
ph.perfadd "cpu-st"     "${cpuSt}"     "" "" 0 100

ph.exit

# ----------------------------------------------------------------------