Skip to content
Snippets Groups Projects
check_cpu 7.69 KiB
#!/bin/bash
# ======================================================================
#
# Check CPU usage
#
# requirements:
# - top
# - bc
#
# ----------------------------------------------------------------------
# 2020-03-10  v1.0  <axel.hahn@iml.unibe.ch>
# 2020-03-23  v1.1  <axel.hahn@iml.unibe.ch>  added more data
# 2020-07-08  v1.2  <axel.hahn@iml.unibe.ch>  FIX: set "ph." instead "ps."
# 2020-07-17  v1.3  <axel.hahn@iml.unibe.ch>  use ph.require to check binaries
# 2021-02-10  v1.4  <axel.hahn@iml.unibe.ch>  added critical io wait
# 2021-10-28  v1.5  <axel.hahn@iml.unibe.ch>  Use 2nd update of top
# 2021-12-10  v1.6  <axel.hahn@iml.unibe.ch>  show processes with status D to find cpu waits
# 2022-03-09  v1.7  <axel.hahn@iml.unibe.ch>  show most cpu intensive processes
# 2022-03-10  v1.8  <axel.hahn@iml.unibe.ch>  add cli param -p; update help
# 2022-03-22  v1.9  <axel.hahn@iml.unibe.ch>  fix syntax error on 100% idle
# 2022-04-14  v1.10 <axel.hahn@iml.unibe.ch>  show consuming cpu processes with top and ps
# 2022-08-29  v1.11 <axel.hahn@iml.unibe.ch>  replace pipe to prevent start of metrics section
# 2022-08-29  v1.12 <axel.hahn@iml.unibe.ch>  fix: replace pipe
# 2023-02-13  v1.13 <axel.hahn@iml.unibe.ch>  small shell fixes
# 2023-07-27  v1.14 <axel.hahn@unibe.ch>      update help page
# 2023-09-18  v1.15 <axel.hahn@unibe.ch>      prevent broken pipe message in journallog
# ======================================================================


# Load the shared plugin functions (ph.*) from the directory of this
# script. Both expansions are quoted so that a script location with
# whitespace in its path does not split into several words.
. "$(dirname "$0")/inc_pluginfunctions"

# version of this check; exported so it is visible outside this shell
export self_APPVERSION=1.15

# ----------------------------------------------------------------------
# functions
# ----------------------------------------------------------------------

# Show the help text of this check: description, syntax, options and
# an example call.
# Globals:   none
# Arguments: none
# Outputs:   help text on stdout; its first part is whatever
#            ph.showImlHelpHeader prints
function showHelp(){
    # name of the script for the SYNTAX and EXAMPLE lines; "$0" is
    # quoted so a path with whitespace does not break basename
    local _self; _self=$(basename "$0")
    cat <<EOF
$( ph.showImlHelpHeader )

check cpu usage and cpu wait
Cpu infos are taken from output of top command.

On higher cpu usage it can show processes that cause cpu waits and
with most cpu consumption.

SYNTAX:
$_self [-w WARN_LIMIT] [-c CRITICAL_LIMIT] [-i CRITICAL_IO_WAIT] [-p PROCESS_LIMIT]

OPTIONS:

    -w VALUE       cpu usage warning level  (default: 75)
    -c VALUE       cpu usage critical level (default: 90)

    -i VALUE       io wait critical level   (default: 50)

    -p VALUE       show process info with highest cpu consumption if 
                   usage is > NN %; default: 50

    -h or --help   show this help.

PARAMETERS:

    None.

EXAMPLE:
$_self -w 60 -c 80 -p 40

EOF
}
# ----------------------------------------------------------------------
# MAIN
# ----------------------------------------------------------------------

# --- make sure the needed binaries are available
ph.require bc top


# --- first parameter is -h or --help: print the help text and stop
if [[ "$1" == "--help" || "$1" == "-h" ]]; then
    showHelp
    exit 0
fi

# ----------------------------------------------------------------------
# limits as integers: ph.getValueWithParam gets the built-in default,
# the option letter and all command line parameters; its output is used
# as value (default or override from the command line)

# -w: warning level for cpu usage
declare -i iWarnLimit=$(ph.getValueWithParam 75 w "$@")
# -c: critical level for cpu usage
declare -i iCriticalLimit=$(ph.getValueWithParam 90 c "$@")
# -i: critical level for io wait
declare -i iCriticalWait=$(ph.getValueWithParam 50 i "$@")
# -p: cpu usage above which the process lists are shown
declare -i iMinCpuUsageToShowProcesses=$(ph.getValueWithParam 50 p "$@")

# ----------------------------------------------------------------------
# get data

# get cpu status i.e.
# %Cpu(s): 33.3 us,  9.5 sy,  0.0 ni, 57.1 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
#   us, user    : time running un-niced user processes
#   sy, system  : time running kernel processes
#   ni, nice    : time running niced user processes
#   id, idle    : time spent in the kernel idle handler
#   wa, IO-wait : time waiting for I/O completion
#   hi : time spent servicing hardware interrupts
#   si : time spent servicing software interrupts
#   st : time stolen from this vm by the hypervisor
#
# The summary line is read from the 2nd update of top (since v1.5).
# top is started with LC_ALL=C: in a locale with a decimal comma the
# values would be printed like "33,3" and the split on ',' below would
# tear them apart.

# Print the value that belongs to a label in the cpu data.
# Globals:   none
# Arguments: $1 - label as printed by top, i.e. "us" or "id"
#            $2 - cpu data with one "VALUE LABEL" pair per line
# Outputs:   the value on stdout; nothing if the label does not exist
function _cpuGetValue(){
    local _label=$1
    local _data=$2
    # compare the complete 2nd column instead of searching a substring
    echo "$_data" | awk -v label="$_label" '$2 == label { print $1 }'
}

# part behind "%Cpu(s):" of the last summary line, one pair per line
data=$( LC_ALL=C top -b -n 2 -d 0.1 | grep -i "^%Cpu" | tail -1 | cut -f 2- -d ':' | tr ',' "\n" )

cpuUser=$(   _cpuGetValue "us" "$data" )
cpuSystem=$( _cpuGetValue "sy" "$data" )
cpuNice=$(   _cpuGetValue "ni" "$data" )
cpuIdle=$(   _cpuGetValue "id" "$data" )
cpuWait=$(   _cpuGetValue "wa" "$data" )
cpuHi=$(     _cpuGetValue "hi" "$data" )
cpuSi=$(     _cpuGetValue "si" "$data" )
cpuSt=$(     _cpuGetValue "st" "$data" )

# usage is everything that is not idle
cpuNonIdle=$( echo "100-${cpuIdle}" | bc )

# bc prints a result between 0 and 1 without leading zero (".5");
# add the zero to get a regular number in status line and perfdata
case "$cpuNonIdle" in
    .*) cpuNonIdle="0${cpuNonIdle}" ;;
esac

# Compare two numbers (decimals allowed) with bc.
# Arguments: $1 - value
#            $2 - limit
# Returns:   0 if bc reports value > limit; non-zero otherwise
function _cpuIsAbove(){
    [ "$(echo "$1 > $2" | bc)" -eq 1 ]
}

# default text; replaced below when a limit was exceeded
sInfo="INFO  : cpu is in normal ranges."

if _cpuIsAbove "${cpuWait}" "${iCriticalWait}"; then
    # io wait is checked first: above its limit the check is critical
    # and cpu usage is not rated anymore
    ph.setStatus "critical"
    sInfo="HINT  : cpu WAIT is high - check hardware issues"
elif _cpuIsAbove "${cpuNonIdle}" "${iWarnLimit}"; then
    # usage is above the warning level - find out if it is critical too
    if _cpuIsAbove "${cpuNonIdle}" "${iCriticalLimit}"; then
        ph.setStatus "critical"
    else
        ph.setStatus "warning"
    fi
    # same hint for warning and critical
    sInfo="HINT  : cpu usage is high - check processes"
fi

# ----------------------------------------------------------------------
# output

# --- first line: status with total usage and the most important values
ph.status "CPU-USAGE [%] ${cpuNonIdle} ... user: ${cpuUser} - system: ${cpuSystem} - idle: ${cpuIdle} - wait: ${cpuWait}"

# v1.6: show processes of cpu wait (status D in process list)
# The filter keeps the header line of ps (STAT in column 8) and all
# processes with a D in their status column.
plist=$( ps aux | awk '$8 ~ /(D|STAT)/' )

# The header line has no digit - a digit means that at least one process
# was found. grep writes to /dev/null and reads its complete input.
if echo "$plist" | grep "[0-9]" >/dev/null; then
    echo
    echo "For analysis of cpu waits - processes with status D:"
    # replace pipe to prevent start of metrics section
    echo "$plist" | tr '|' ':'
fi

# v1.7: list the most cpu consuming processes if usage is > nn %
# (nn is the value of iMinCpuUsageToShowProcesses)

# integer part of the usage: cut everything from the first dot
typeset -i iUsed
iUsed=${cpuNonIdle%%.*}

if (( iUsed > iMinCpuUsageToShowProcesses )); then
  echo
  echo "CPU usage is higher $iMinCpuUsageToShowProcesses percent ... showing most consuming processes"

  # --- top: print the table header and the 5 lines that follow it
  echo "output of top :"
  topout=$( top -b -n 1 -d 0.1 )
  typeset -i iStart
  typeset -i iEnd
  # line number of the table header "PID USER ..."
  iStart=$( grep -n "PID.*USER" <<< "$topout" | cut -f 1 -d ':' )
  iEnd=$(( iStart + 5 ))
  # pipes are replaced to prevent the start of the metrics section
  sed -n "${iStart},${iEnd}p" <<< "$topout" | tr '|' ':'
  echo

  # --- ps: header line, then the first 5 lines after a reverse numeric
  #     sort by column 3
  echo "output of ps:"
  ps aux | head -1
  ps aux | sort -nrk 3,3 2>/dev/null | head -n 5 | tr '|' ':'
  echo
fi

# --- used limits, hint text and a legend for the cpu values
# The empty lines at the beginning and at the end of the here-document
# are part of the output.
cat <<EOF

Limits: usage warn at ${iWarnLimit} .. critical at ${iCriticalLimit} .. io wait limit ${iCriticalWait}
$sInfo

Legend:

   hwi - Time spent handling hardware interrupt routines. (Whenever a peripheral unit want attention form the CPU, it literally pulls a line, to signal the CPU to service it)
   swi - Time spent handling software interrupt routines. (a piece of code, calls an interrupt routine...)
    st - Time spent on involuntary waits by virtual cpu while hypervisor is servicing another processor (stolen from a virtual machine)

  nice - Time spent running niced user processes (User defined priority)
  wait - Time spent on waiting on IO peripherals (eg. disk)

system - Time spent in kernel space
  user - Time spent in user space
  idle - Time spent in idle operations

EOF


# --- performance data: total usage with warning and critical limit
ph.perfadd "cpu-usage" "${cpuNonIdle}" $iWarnLimit $iCriticalLimit 0 100

# for graphite module: send limits
# ph.perfadd "cpu-warn"     $iWarnLimit      "" "" 0 100
# ph.perfadd "cpu-crit"     $iCriticalLimit  "" "" 0 100

# --- performance data: io wait with its critical limit, no warning limit
ph.perfadd "cpu-wait" "${cpuWait}" "" "$iCriticalWait" 0 100

# --- performance data single values without limits
# each item is "LABEL:VARIABLE"; the value is read from the variable
for _perfItem in \
    system:cpuSystem \
    user:cpuUser \
    idle:cpuIdle \
    nice:cpuNice \
    hwi:cpuHi \
    swi:cpuSi \
    st:cpuSt
do
    _perfVar=${_perfItem#*:}
    ph.perfadd "cpu-${_perfItem%%:*}" "${!_perfVar}" "" "" 0 100
done

ph.exit

# ----------------------------------------------------------------------