#!/bin/bash
# ======================================================================
#
# Check CPU usage
#
# Reads the "%Cpu(s):" summary line of top and reports the cpu usage
# (100 - idle) and the io wait as Nagios/Icinga check result including
# performance data.
#
# requirements:
# - top
# - bc
#
# ----------------------------------------------------------------------
# 2020-03-10  v1.0  <axel.hahn@iml.unibe.ch>
# 2020-03-23  v1.1  <axel.hahn@iml.unibe.ch>  added more data
# 2020-07-08  v1.2  <axel.hahn@iml.unibe.ch>  FIX: set "ph." instead "ps."
# 2020-07-17  v1.3  <axel.hahn@iml.unibe.ch>  use ph.require to check binaries
# 2021-02-10  v1.4  <axel.hahn@iml.unibe.ch>  added critical io wait
# 2021-10-28  v1.5  <axel.hahn@iml.unibe.ch>  Use 2nd update of top
# 2021-12-10  v1.6  <axel.hahn@iml.unibe.ch>  show processes with status D to find cpu waits
# ======================================================================

# load the ph.* helper functions that live next to this script
. "$(dirname "$0")/inc_pluginfunctions"

# ----------------------------------------------------------------------
# functions
# ----------------------------------------------------------------------

# Print the help text to stdout.
# No parameters.
function showHelp(){
cat <<EOF
______________________________________________________________________

CHECK_CPU
check cpu usage and cpu wait
v1.6

(c) Institute for Medical Education - Univerity of Bern
Licence: GNU GPL 3
______________________________________________________________________

Cpu infos are taken from output of top command.

SYNTAX:
$(basename "$0") [-w WARN_LIMIT] [-c CRITICAL_LIMIT] [-i CRITICAL_IO_WAIT]

OPTIONS:

    -w VALUE       cpu usage warning level  (default: 75)
    -c VALUE       cpu usage critical level (default: 90)
    -i VALUE       io wait critical level   (default: 50)

    -h or --help   show this help.

PARAMETERS:

    None.

EXAMPLE:
$(basename "$0") -w 60 -c 80 -i 40

EOF
}

# Extract a single value from a "%Cpu(s):" line of top by its label.
# The lookup is done by label and not by field position because top
# prints "ni,100.0 id" (no blank after the comma) when a value reaches
# 100.0 - this shifts all whitespace separated fields.
# param  string  $1  cpu line, e.g. "%Cpu(s): 33.3 us, 9.5 sy, ... 0.0 st"
# param  string  $2  label of the wanted value: us|sy|ni|id|wa|hi|si|st
# output: the value on stdout; nothing if the label was not found
function _getCpuValue(){
    local sCpuLine="$1"
    local sLabel="$2"
    printf '%s\n' "$sCpuLine" | awk -v label="$sLabel" '
        {
            sub(/^[^:]*:/, "")                 # drop the leading "%Cpu(s):"
            iCount = split($0, aParts, ",")    # one "VALUE LABEL" pair per part
            for (i = 1; i <= iCount; i++) {
                if (split(aParts[i], aPair, " ") == 2 && aPair[2] == label) {
                    print aPair[1]
                    exit
                }
            }
        }'
}

# Compare two (decimal) numbers with bc.
# param  number  $1  left value
# param  number  $2  right value
# returns 0 if $1 > $2
function _isGreater(){
    [ "$(echo "$1 > $2" | bc)" -eq 1 ]
}

# ----------------------------------------------------------------------
# MAIN
# ----------------------------------------------------------------------

# --- check required tools
ph.require bc top

# --- check param -h
case "$1" in
    "--help"|"-h")
        showHelp
        exit 0
        ;;
    *)
esac

# set default / override from command line params
typeset -i iWarnLimit="$(     ph.getValueWithParam 75 w "$@")"
typeset -i iCriticalLimit="$( ph.getValueWithParam 90 c "$@")"
typeset -i iCriticalWait="$(  ph.getValueWithParam 50 i "$@")"

# get cpu status i.e.
# %Cpu(s): 33.3 us,  9.5 sy,  0.0 ni, 57.1 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
#   us, user    : time running un-niced user processes
#   sy, system  : time running kernel processes
#   ni, nice    : time running niced user processes
#   id, idle    : time spent in the kernel idle handler
#   wa, IO-wait : time waiting for I/O completion
#   hi : time spent servicing hardware interrupts
#   si : time spent servicing software interrupts
#   st : time stolen from this vm by the hypervisor
#
# - v1.5: the values are read from the 2nd update of top (tail -1)
# - LC_ALL=C forces "." as decimal separator; bc cannot handle "0,0"
# - the cpu line is kept in a variable; no temp file in /tmp is needed
cpuLine=$( LC_ALL=C top -b -n 2 -d 0.1 | grep -i '^%Cpu' | tail -1 )

cpuUser=$(   _getCpuValue "$cpuLine" "us" )
cpuSystem=$( _getCpuValue "$cpuLine" "sy" )
cpuNice=$(   _getCpuValue "$cpuLine" "ni" )
cpuIdle=$(   _getCpuValue "$cpuLine" "id" )
cpuWait=$(   _getCpuValue "$cpuLine" "wa" )
cpuHi=$(     _getCpuValue "$cpuLine" "hi" )
cpuSi=$(     _getCpuValue "$cpuLine" "si" )
cpuSt=$(     _getCpuValue "$cpuLine" "st" )

# idle and wait are used in calculations and comparisons below: stop
# here if top delivered nothing usable instead of reporting a wrong OK
reNumber='^[0-9]+([.][0-9]+)?$'
if ! [[ "$cpuIdle" =~ $reNumber && "$cpuWait" =~ $reNumber ]]; then
    echo "UNKNOWN: unable to parse cpu data from output of top: '${cpuLine}'"
    # 3 is the return code for status UNKNOWN of Nagios/Icinga plugins
    exit 3
fi

# awk instead of bc: bc prints values below 1 without leading zero (".5")
cpuNonIdle=$( LC_ALL=C awk -v idle="$cpuIdle" 'BEGIN { printf "%.1f", 100 - idle }' )

sInfo="INFO : cpu is in normal ranges."
if _isGreater "${cpuWait}" "${iCriticalWait}"; then
    ph.setStatus "critical"
    sInfo="HINT : cpu WAIT is high - check hardware issues"
else
    if _isGreater "${cpuNonIdle}" "${iWarnLimit}"; then
        if _isGreater "${cpuNonIdle}" "${iCriticalLimit}"; then
            ph.setStatus "critical"
            sInfo="HINT : cpu usage is high - check preocesses"
        else
            ph.setStatus "warning"
            sInfo="HINT : cpu usage is high - check preocesses"
        fi
    fi
fi

# --- status output
ph.status "CPU-USAGE [%] ${cpuNonIdle} ... user: ${cpuUser} - system: ${cpuSystem} - idle: ${cpuIdle} - wait: ${cpuWait}"

# v1.6: show processes of cpu wait (status D in proces list)
# The awk filter keeps the header line (STAT) and all lines with a "D" in
# the status column. The header contains no digit - so a digit in the
# result means that at least one process was found.
plist=$( ps aux | awk '$8 ~ /(D|STAT)/ { print $0 }' )
if grep -q "[0-9]" <<<"$plist"; then
    echo
    echo "For analysis of cpu waits - processes with status D:"
    echo "$plist"
fi

echo "
Limits: usage warn at ${iWarnLimit} .. critical at ${iCriticalLimit} .. io wait limit ${iCriticalWait}
$sInfo

Legend:
  hwi    - Time spent handling hardware interrupt routines. (Whenever a peripheral unit want attention form the CPU, it literally pulls a line, to signal the CPU to service it)
  swi    - Time spent handling software interrupt routines. (a piece of code, calls an interrupt routine...)
  st     - Time spent on involuntary waits by virtual cpu while hypervisor is servicing another processor (stolen from a virtual machine)

  nice   - Time spent running niced user processes (User defined priority)
  wait   - Time spent on waiting on IO peripherals (eg. disk)

  system - Time spent in kernel space
  user   - Time spent in user space
  idle   - Time spent in idle operations
"

# --- performance data usage
ph.perfadd "cpu-usage"  "${cpuNonIdle}" "$iWarnLimit" "$iCriticalLimit" 0 100

# for graphite module: send limits
# ph.perfadd "cpu-warn"   $iWarnLimit     "" "" 0 100
# ph.perfadd "cpu-crit"   $iCriticalLimit "" "" 0 100

ph.perfadd "cpu-wait"   "${cpuWait}"    "" "$iCriticalWait" 0 100

# --- performance data single values
ph.perfadd "cpu-system" "${cpuSystem}"  "" "" 0 100
ph.perfadd "cpu-user"   "${cpuUser}"    "" "" 0 100
ph.perfadd "cpu-idle"   "${cpuIdle}"    "" "" 0 100
ph.perfadd "cpu-nice"   "${cpuNice}"    "" "" 0 100
ph.perfadd "cpu-hwi"    "${cpuHi}"      "" "" 0 100
ph.perfadd "cpu-swi"    "${cpuSi}"      "" "" 0 100
ph.perfadd "cpu-st"     "${cpuSt}"      "" "" 0 100

ph.exit

# ----------------------------------------------------------------------