#!/bin/bash
# check_cpu - authored by Hahn Axel (hahn)
# ======================================================================
#
# Check CPU usage
#
# requirements:
# - top
# - bc
#
# ----------------------------------------------------------------------
# 2020-03-10 v1.0 <axel.hahn@iml.unibe.ch>
# 2020-03-23 v1.1 <axel.hahn@iml.unibe.ch> added more data
# 2020-07-08 v1.2 <axel.hahn@iml.unibe.ch> FIX: set "ph." instead "ps."
# 2020-07-17 v1.3 <axel.hahn@iml.unibe.ch> use ph.require to check binaries
# 2021-02-10 v1.4 <axel.hahn@iml.unibe.ch> added critical io wait
# 2021-10-28 v1.5 <axel.hahn@iml.unibe.ch> Use 2nd update of top
# 2021-12-10 v1.6 <axel.hahn@iml.unibe.ch> show processes with status D to find cpu waits
# 2022-03-09 v1.7 <axel.hahn@iml.unibe.ch> show most cpu intensive processes
# 2022-03-10 v1.8 <axel.hahn@iml.unibe.ch> add cli param -p; update help
# 2022-03-22 v1.9 <axel.hahn@iml.unibe.ch> fix syntax error on 100% idle
# 2022-04-14 v1.10 <axel.hahn@iml.unibe.ch> show consuming cpu processes with top and ps
# 2022-08-29 v1.11 <axel.hahn@iml.unibe.ch> replace pipe to prevent start of metrics section
# 2022-08-29 v1.12 <axel.hahn@iml.unibe.ch> fix: replace pipe
# 2023-02-13 v1.13 <axel.hahn@iml.unibe.ch> small shell fixes
# 2023-07-27 v1.14 <axel.hahn@unibe.ch> update help page
# 2023-09-18 v1.15 <axel.hahn@unibe.ch> prevent broken pipe message in journallog
# ======================================================================
# Load the helper library that is stored in the same directory as this
# script (presumably the provider of the ph.* functions used below).
# Both expansions are quoted so that an installation path containing
# spaces is passed to dirname and to "." as a single word.
. "$(dirname "$0")/inc_pluginfunctions"
# version of this check
export self_APPVERSION=1.15
# ----------------------------------------------------------------------
# functions
# ----------------------------------------------------------------------
# Print the help page to stdout: the header created by
# ph.showImlHelpHeader followed by description, syntax, options and an
# example. The script name shown in the SYNTAX and EXAMPLE lines is the
# basename of $0.
#
# no parameters
function showHelp(){
    local _self
    # detect the name once; "--" and the quotes keep a path with spaces
    # or a leading dash intact
    _self=$(basename -- "$0")
    cat <<EOF
$( ph.showImlHelpHeader )
check cpu usage and cpu wait
Cpu infos are taken from output of top command.
On higher cpu usage it can show processes that cause cpu waits and
with most cpu consumption.
SYNTAX:
${_self} [-w WARN_LIMIT] [-c CRITICAL_LIMIT] [-i CRITICAL_IO_WAIT] [-p PROCESS_LIMIT]
OPTIONS:
-w VALUE cpu usage warning level (default: 75)
-c VALUE cpu usage critical level (default: 90)
-i VALUE io wait critical level (default: 50)
-p VALUE show process info with highest cpu consumption if
usage is > NN %; default: 50
-h or --help show this help.
PARAMETERS:
None.
EXAMPLE:
${_self} -w 60 -c 80 -p 40
EOF
}
# ----------------------------------------------------------------------
# MAIN
# ----------------------------------------------------------------------
# --- the binaries bc and top must be available
ph.require bc top

# --- help requested with the first parameter? show it and leave with
#     exit code 0
if [[ "$1" == "--help" || "$1" == "-h" ]]; then
    showHelp
    exit 0
fi
# ----------------------------------------------------------------------
# limits: each one starts with its default value and can be overridden
# by the matching command line switch (-w, -c, -i, -p).
# All four variables are declared as integers.
declare -i iWarnLimit iCriticalLimit iCriticalWait iMinCpuUsageToShowProcesses
iWarnLimit=$( ph.getValueWithParam 75 w "$@")
iCriticalLimit=$( ph.getValueWithParam 90 c "$@")
iCriticalWait=$( ph.getValueWithParam 50 i "$@")
iMinCpuUsageToShowProcesses=$( ph.getValueWithParam 50 p "$@")
# ----------------------------------------------------------------------
# get data
# get cpu status i.e.
# %Cpu(s): 33.3 us, 9.5 sy, 0.0 ni, 57.1 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
# us, user : time running un-niced user processes
# sy, system : time running kernel processes
# ni, nice : time running niced user processes
# id, idle : time spent in the kernel idle handler
# wa, IO-wait : time waiting for I/O completion
# hi : time spent servicing hardware interrupts
# si : time spent servicing software interrupts
# st : time stolen from this vm by the hypervisor
# old variant, kept for reference:
# top -b -n 1 | head -5 | grep "^\%Cpu" >$tmpfile
# FIX read cpu from 2nd output of top

# Print the value that belongs to one label of the cpu summary.
# The label is compared with the complete 2nd column (no substring
# match); nothing is printed if the label does not exist.
#
# param  string  $1  cpu summary, one "<value> <label>" pair per line
# param  string  $2  label to look up, e.g. "us" or "id"
function _getCpuValue(){
    printf '%s\n' "$1" | awk -v label="$2" '$2 == label { print $1 }'
}

# Filter for stdin: put a "0" in front of a number that starts with the
# decimal point (".5" -> "0.5", "-.5" -> "-0.5"); bc prints values
# below 1 without the leading zero.
function _addLeadingZero(){
    sed -e 's/^\./0./' -e 's/^-\./-0./'
}

# LC_ALL=C forces the decimal point in top: the line is split at "," and
# the values are handed over to bc, so a locale with a decimal comma would
# break the parsing.
# top runs 2 iterations; tail -1 keeps the cpu line of the 2nd one.
# Result in $data: one "<value> <label>" pair per line.
data=$( LC_ALL=C top -b -n 2 -d 0.1 | grep -i "^%Cpu" | tail -1 | cut -f 2- -d ':' | tr ',' "\n" )

cpuUser=$(   _getCpuValue "$data" "us" )
cpuSystem=$( _getCpuValue "$data" "sy" )
cpuNice=$(   _getCpuValue "$data" "ni" )
cpuIdle=$(   _getCpuValue "$data" "id" )
cpuWait=$(   _getCpuValue "$data" "wa" )
cpuHi=$(     _getCpuValue "$data" "hi" )
cpuSi=$(     _getCpuValue "$data" "si" )
cpuSt=$(     _getCpuValue "$data" "st" )

# usage = everything that is not idle
cpuNonIdle=$( echo "100-${cpuIdle}" | bc | _addLeadingZero )
# Compare two numbers with bc (decimals are allowed).
#
# param  number  $1  value
# param  number  $2  limit
# returns 0 if bc prints 1 for "value > limit"
function _cpuIsAbove(){
    [ "$(echo "$1 > $2" | bc)" -eq 1 ]
}

# A high io wait is checked first and is always critical; otherwise the
# cpu usage is compared with the warning and then the critical limit.
# Without any hit the status is left untouched.
sInfo="INFO : cpu is in normal ranges."
if _cpuIsAbove "${cpuWait}" "${iCriticalWait}"; then
    ph.setStatus "critical"
    sInfo="HINT : cpu WAIT is high - check hardware issues"
elif _cpuIsAbove "${cpuNonIdle}" "${iWarnLimit}"; then
    if _cpuIsAbove "${cpuNonIdle}" "${iCriticalLimit}"; then
        ph.setStatus "critical"
    else
        ph.setStatus "warning"
    fi
    sInfo="HINT : cpu usage is high - check processes"
fi
# ----------------------------------------------------------------------
# output

# --- status line
ph.status "CPU-USAGE [%] ${cpuNonIdle} ... user: ${cpuUser} - system: ${cpuSystem} - idle: ${cpuIdle} - wait: ${cpuWait}"

# v1.6: cpu waits - keep those lines of the process list that have a "D"
# or "STAT" in the 8th column (processes with status D and the header)
plist=$( ps aux | awk '$8 ~ /(D|STAT)/ { print $0 }' )

# Show the list only if it has a line with a digit (the header line is
# expected to have none, so a digit means that a process was found).
# Pipes are replaced by colons to prevent start of metrics section.
if echo "$plist" | grep "[0-9]" >/dev/null; then
    echo
    echo "For analysis of cpu waits - processes with status D:"
    echo "$plist" | tr '|' ':'
fi
# v1.7: list the most cpu consuming processes if the usage is above the
# limit given with -p.
# iUsed is the part of the usage in front of the decimal point; an empty
# value counts as 0.
declare -i iUsed=${cpuNonIdle%%.*}
if (( iUsed > iMinCpuUsageToShowProcesses )); then
    echo
    echo "CPU usage is higher $iMinCpuUsageToShowProcesses percent ... showing most consuming processes"

    echo "output of top :"
    topout=$( top -b -n 1 -d 0.1 )
    # print the column header line (PID ... USER) and the 5 lines below;
    # pipes are replaced like in the other process lists
    declare -i iStart iEnd
    iStart=$( echo "$topout" | grep -n "PID.*USER" | cut -f 1 -d ':' )
    iEnd=$(( iStart + 5 ))
    echo "$topout" | sed -n "${iStart},${iEnd}p" | tr '|' ':'
    echo

    echo "output of ps:"
    # header first, then the first 5 lines sorted descending by column 3;
    # stderr of sort is dropped (broken pipe message when head stops reading)
    ps aux | head -1
    ps aux | sort -nrk 3,3 2>/dev/null | head -n 5 | tr '|' ':'
    echo
fi
# --- limits, hint and legend; the text is framed by an empty line above
#     and below
cat <<EOF

Limits: usage warn at ${iWarnLimit} .. critical at ${iCriticalLimit} .. io wait limit ${iCriticalWait}
$sInfo
Legend:
hwi - Time spent handling hardware interrupt routines. (Whenever a peripheral unit want attention form the CPU, it literally pulls a line, to signal the CPU to service it)
swi - Time spent handling software interrupt routines. (a piece of code, calls an interrupt routine...)
st - Time spent on involuntary waits by virtual cpu while hypervisor is servicing another processor (stolen from a virtual machine)
nice - Time spent running niced user processes (User defined priority)
wait - Time spent on waiting on IO peripherals (eg. disk)
system - Time spent in kernel space
user - Time spent in user space
idle - Time spent in idle operations

EOF

# --- performance data usage: total usage with warning and critical limit
ph.perfadd "cpu-usage" "${cpuNonIdle}" $iWarnLimit $iCriticalLimit 0 100
# for graphite module: send limits
# ph.perfadd "cpu-warn" $iWarnLimit "" "" 0 100
# ph.perfadd "cpu-crit" $iCriticalLimit "" "" 0 100

# io wait has a critical limit only
ph.perfadd "cpu-wait" "${cpuWait}" "" "$iCriticalWait" 0 100

# --- performance data single values (no limits, range 0..100)
#     each entry is <metric suffix>:<name of the variable with the value>
for sPair in system:cpuSystem user:cpuUser idle:cpuIdle nice:cpuNice hwi:cpuHi swi:cpuSi st:cpuSt; do
    sVarname=${sPair#*:}
    ph.perfadd "cpu-${sPair%%:*}" "${!sVarname}" "" "" 0 100
done

ph.exit
# ----------------------------------------------------------------------