-
Hahn Axel (hahn) authored
check_cpu 7.33 KiB
#!/bin/bash
# ======================================================================
#
# Check CPU usage
#
# requirements:
# - top
# - bc
#
# ----------------------------------------------------------------------
# 2020-03-10 v1.0 <axel.hahn@iml.unibe.ch>
# 2020-03-23 v1.1 <axel.hahn@iml.unibe.ch> added more data
# 2020-07-08 v1.2 <axel.hahn@iml.unibe.ch> FIX: set "ph." instead "ps."
# 2020-07-17 v1.3 <axel.hahn@iml.unibe.ch> use ph.require to check binaries
# 2021-02-10 v1.4 <axel.hahn@iml.unibe.ch> added critical io wait
# 2021-10-28 v1.5 <axel.hahn@iml.unibe.ch> Use 2nd update of top
# 2021-12-10 v1.6 <axel.hahn@iml.unibe.ch> show processes with status D to find cpu waits
# 2022-03-09 v1.7 <axel.hahn@iml.unibe.ch> show most cpu intensive processes
# 2022-03-10 v1.8 <axel.hahn@iml.unibe.ch> add cli param -p; update help
# 2022-03-22 v1.9 <axel.hahn@iml.unibe.ch> fix syntax error on 100% idle
# 2022-04-14 v1.10 <axel.hahn@iml.unibe.ch> show consuming cpu processes with top and ps
# ======================================================================
# Load the shared plugin helpers stored next to this script
# (presumably the source of the ph.* functions used below - verify).
. "$(dirname "$0")/inc_pluginfunctions"

# Script name in upper case; it is printed by showHelp.
# The character classes are quoted so that the shell cannot glob-expand
# them against file names in the current directory.
self_APPNAME=$( basename "$0" | tr '[:lower:]' '[:upper:]' )

# Keep in sync with the newest changelog entry in the file header (v1.10).
self_APPVERSION=1.10
# ----------------------------------------------------------------------
# functions
# ----------------------------------------------------------------------
# Print name, version, syntax and options of this check to stdout.
#
# Globals:   self_APPNAME, self_APPVERSION (read)
# Arguments: none
# Outputs:   the help text on stdout
function showHelp(){
  # Name of the called script. $0 is quoted so that a script path with
  # blanks does not get word-split; it is resolved once for both usages.
  local sScript
  sScript=$( basename "$0" )
cat <<EOF
______________________________________________________________________
$self_APPNAME
v$self_APPVERSION
(c) Institute for Medical Education - University of Bern
Licence: GNU GPL 3
______________________________________________________________________
check cpu usage and cpu wait
Cpu infos are taken from output of top command.
SYNTAX:
$sScript [-w WARN_LIMIT] [-c CRITICAL_LIMIT] [-i CRITICAL_IO_WAIT] [-p PROCESS_LIMIT]
OPTIONS:
-w VALUE cpu usage warning level (default: 75)
-c VALUE cpu usage critical level (default: 90)
-i VALUE io wait critical level (default: 50)
-p VALUE show process info with highest cpu consumption if
usage is > NN %; default: 50
-h or --help show this help.
PARAMETERS:
None.
EXAMPLE:
$sScript -w 60 -c 80 -p 40
EOF
}
# ----------------------------------------------------------------------
# MAIN
# ----------------------------------------------------------------------
# --- the check needs bc and top; ph.require is handed both names
#     (per the changelog it is used to check binaries)
ph.require bc top

# --- help requested? Only the first command line argument is inspected.
if [[ "$1" == "--help" || "$1" == "-h" ]]; then
  showHelp
  exit 0
fi
# ----------------------------------------------------------------------
# Limits: every value has a built-in default that can be overridden on
# the command line. Each call passes the default, the option letter and
# all script arguments; defaults and letters match the help text.
# The variables are integers (-i).
declare -i iWarnLimit=$( ph.getValueWithParam 75 w "$@")
declare -i iCriticalLimit=$( ph.getValueWithParam 90 c "$@")
declare -i iCriticalWait=$( ph.getValueWithParam 50 i "$@")
declare -i iMinCpuUsageToShowProcesses=$( ph.getValueWithParam 50 p "$@")
# ----------------------------------------------------------------------
# get data
# The values are taken from the cpu summary line of top, i.e.
# %Cpu(s): 33.3 us, 9.5 sy, 0.0 ni, 57.1 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
# us, user : time running un-niced user processes
# sy, system : time running kernel processes
# ni, nice : time running niced user processes
# id, idle : time spent in the kernel idle handler
# wa, IO-wait : time waiting for I/O completion
# hi : time spent servicing hardware interrupts
# si : time spent servicing software interrupts
# st : time stolen from this vm by the hypervisor

# Print the value of a single field of the cpu summary.
#   $1 - cpu summary, one "<value> <label>" pair per line
#   $2 - label of the wanted field (us, sy, ni, id, wa, hi, si or st)
# The label must match the 2nd column exactly; nothing is printed if the
# label is not part of the summary.
function _getCpuValue(){
  printf '%s\n' "$1" | awk -v label="$2" '$2 == label { print $1; exit }'
}

# - LC_ALL=C keeps "." as decimal separator: a locale printing "33,3 us"
#   would break the split at "," below (and possibly the value of -d)
# - top writes 2 updates; the cpu line of the 2nd one is used (tail -1)
# - the pattern has no backslash in front of "%": it is not needed and
#   newer versions of GNU grep warn about stray backslashes
# - result: one "<value> <label>" pair per line
data=$( LC_ALL=C top -b -n 2 -d 0.1 | grep -i '^%Cpu' | tail -1 | cut -f 2- -d ':' | tr ',' '\n' )

cpuUser=$( _getCpuValue "$data" "us" )
cpuSystem=$( _getCpuValue "$data" "sy" )
cpuNice=$( _getCpuValue "$data" "ni" )
cpuIdle=$( _getCpuValue "$data" "id" )
cpuWait=$( _getCpuValue "$data" "wa" )
cpuHi=$( _getCpuValue "$data" "hi" )
cpuSi=$( _getCpuValue "$data" "si" )
cpuSt=$( _getCpuValue "$data" "st" )

# usage = everything that is not idle; bc is used because the values
# are decimal numbers
cpuNonIdle=$( echo "100-$cpuIdle" | bc )
# Default message; it is replaced as soon as one of the limits is exceeded.
sInfo="INFO : cpu is in normal ranges."

# The cpu values are decimal numbers, so bc does the comparisons;
# it prints 1 if a relation is true.
if [ "$( bc <<< "${cpuWait} > ${iCriticalWait}" )" -eq 1 ]; then
  # io wait is checked first; the usage limits are skipped in that case
  ph.setStatus "critical"
  sInfo="HINT : cpu WAIT is high - check hardware issues"
elif [ "$( bc <<< "${cpuNonIdle} > ${iWarnLimit}" )" -eq 1 ]; then
  # above the warning limit: critical only if the critical limit is
  # exceeded too; the hint is the same for both states
  if [ "$( bc <<< "${cpuNonIdle} > ${iCriticalLimit}" )" -eq 1 ]; then
    ph.setStatus "critical"
  else
    ph.setStatus "warning"
  fi
  sInfo="HINT : cpu usage is high - check processes"
fi
# ----------------------------------------------------------------------
# output
# --- status line with the most important values
ph.status "CPU-USAGE [%] ${cpuNonIdle} ... user: ${cpuUser} - system: ${cpuSystem} - idle: ${cpuIdle} - wait: ${cpuWait}"

# v1.6: show processes of cpu wait (status D in process list)
# Column 8 of "ps aux" is filtered: kept are lines with a "D" in it and
# the line matching STAT (the column header).
plist=$( ps aux | awk '$8 ~ /(D|STAT)/' )

# A digit in the list means that more than the header was found
# (assumes that the header line of ps contains no digit).
if echo "$plist" | grep "[0-9]" >/dev/null; then
  echo
  echo "For analysis of cpu waits - processes with status D:"
  echo "$plist"
fi
# v1.7: show most consuming processes if usage is > nn %

# Print the process table of a top batch output: the first line that
# contains PID ... USER (the table header) and the 5 lines following it.
# Prints nothing if the output has no such header line.
#   $1 - output of "top -b -n 1"
function _showTopProcesses(){
  printf '%s\n' "$1" | awk '
    !bFound && /PID.*USER/ { bFound = 1 }
    bFound { print; if (++iLines > 5) exit }
  '
}

# Integer part of the usage, i.e. "42.9" -> 42. An empty integer part
# (a value like ".5") is stored as 0 by the integer variable.
typeset -i iUsed=$( echo "$cpuNonIdle" | cut -f 1 -d '.' )
if (( iUsed > iMinCpuUsageToShowProcesses )); then
  echo
  echo "CPU usage is higher $iMinCpuUsageToShowProcesses percent ... showing most consuming processes"
  echo "output of top :"
  # a single update is fetched - no delay option is needed for it
  topout=$( top -b -n 1 )
  _showTopProcesses "$topout"
  echo
  echo "output of ps:"
  # Use one ps snapshot only: the first line (header) is printed as is,
  # the remaining lines are sorted by column 3 (%CPU), highest first.
  # LC_ALL=C: make "sort -n" read "." as decimal point in every locale.
  ps aux | {
    IFS= read -r sPsHeader
    printf '%s\n' "$sPsHeader"
    LC_ALL=C sort -nrk 3,3 | head -n 5
  }
  echo
fi
# Print the limits in use, the hint text and a legend of the cpu values.
# The empty first and last line of the here-document are part of the
# output.
cat <<EOF

Limits: usage warn at ${iWarnLimit} .. critical at ${iCriticalLimit} .. io wait limit ${iCriticalWait}
$sInfo
Legend:
hwi - Time spent handling hardware interrupt routines. (Whenever a peripheral unit want attention form the CPU, it literally pulls a line, to signal the CPU to service it)
swi - Time spent handling software interrupt routines. (a piece of code, calls an interrupt routine...)
st - Time spent on involuntary waits by virtual cpu while hypervisor is servicing another processor (stolen from a virtual machine)
nice - Time spent running niced user processes (User defined priority)
wait - Time spent on waiting on IO peripherals (eg. disk)
system - Time spent in kernel space
user - Time spent in user space
idle - Time spent in idle operations

EOF
# --- performance data with limits
#     (arguments of ph.perfadd are presumably: label, value, warning,
#     critical, minimum, maximum - verify in inc_pluginfunctions)
ph.perfadd "cpu-usage" "${cpuNonIdle}" $iWarnLimit $iCriticalLimit 0 100
# for graphite module: send limits
# ph.perfadd "cpu-warn" $iWarnLimit "" "" 0 100
# ph.perfadd "cpu-crit" $iCriticalLimit "" "" 0 100
ph.perfadd "cpu-wait" "${cpuWait}" "" "$iCriticalWait" 0 100

# --- performance data of the single values, without limits
#     each item is "<label suffix>:<name of the variable with the value>"
for sPerfItem in system:cpuSystem user:cpuUser idle:cpuIdle nice:cpuNice hwi:cpuHi swi:cpuSi st:cpuSt; do
  sPerfVar=${sPerfItem#*:}
  ph.perfadd "cpu-${sPerfItem%%:*}" "${!sPerfVar}" "" "" 0 100
done

ph.exit
# ----------------------------------------------------------------------