#!/bin/bash
# ======================================================================
#
# Check CPU usage
#
# requirements:
# - top
# - bc
#
# ----------------------------------------------------------------------
# 2020-03-10  v1.0  <axel.hahn@iml.unibe.ch>
# 2020-03-23  v1.1  <axel.hahn@iml.unibe.ch> added more data
# 2020-07-08  v1.2  <axel.hahn@iml.unibe.ch> FIX: set "ph." instead "ps."
# 2020-07-17  v1.3  <axel.hahn@iml.unibe.ch> use ph.require to check binaries
# 2021-02-10  v1.4  <axel.hahn@iml.unibe.ch> added critical io wait
# 2021-10-28  v1.5  <axel.hahn@iml.unibe.ch> Use 2nd update of top
# 2021-12-10  v1.6  <axel.hahn@iml.unibe.ch> show processes with status D to find cpu waits
# ======================================================================


# Load the shared helper file that lives in the same directory as this script
# (presumably the provider of the ph.* functions used below - verify there).
# The path is quoted so that an installation directory with blanks works too.
. "$(dirname -- "$0")/inc_pluginfunctions"

# Name of a scratch file; the PID suffix makes it unique per running process.
# NOTE(review): the name is predictable - prefer mktemp if it gets written to.
tmpfile=/tmp/check_cpu_$$

# ----------------------------------------------------------------------
# functions
# ----------------------------------------------------------------------

# Print the usage text of this plugin to stdout.
#
# The script name shown in the SYNTAX and EXAMPLE sections is the basename
# of $0. It is detected once and with quoting, so a script path containing
# blanks does not break the output.
#
# no parameters
function showHelp(){
    local sSelf
    sSelf=$(basename -- "$0")
cat <<EOF
______________________________________________________________________

CHECK_CPU check cpu usage and cpu wait v1.6

(c) Institute for Medical Education - Univerity of Bern
Licence: GNU GPL 3
______________________________________________________________________

Cpu infos are taken from output of top command.


SYNTAX:
$sSelf [-w WARN_LIMIT] [-c CRITICAL_LIMIT] [-i CRITICAL_IO_WAIT]

OPTIONS:

    -w VALUE       cpu usage warning level  (default: 75)
    -c VALUE       cpu usage critical level (default: 90)

    -i VALUE       io wait critical level   (default: 50)

    -h or --help   show this help.

PARAMETERS:

    None.

EXAMPLE:
$sSelf -w 60 -c 80 -i 40

EOF
}
# ----------------------------------------------------------------------
# MAIN
# ----------------------------------------------------------------------

# --- required tools: bc and top
ph.require bc top


# --- help is shown only if -h or --help is the first parameter
if [[ "$1" == "--help" || "$1" == "-h" ]]; then
    showHelp
    exit 0
fi

# --- limits: default values 75 / 90 / 50; ph.getValueWithParam gets the
#     option letter (w, c, i) and all command line params to override them
declare -i iWarnLimit iCriticalLimit iCriticalWait
iWarnLimit=$(     ph.getValueWithParam 75 w "$@" )
iCriticalLimit=$( ph.getValueWithParam 90 c "$@" )
iCriticalWait=$(  ph.getValueWithParam 50 i "$@" )


# get cpu status i.e.
# %Cpu(s): 33.3 us,  9.5 sy,  0.0 ni, 57.1 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
#   us, user    : time running un-niced user processes
#   sy, system  : time running kernel processes
#   ni, nice    : time running niced user processes
#   id, idle    : time spent in the kernel idle handler
#   wa, IO-wait : time waiting for I/O completion
#   hi : time spent servicing hardware interrupts
#   si : time spent servicing software interrupts
#   st : time stolen from this vm by the hypervisor

# Extract the numeric values of a "%Cpu(s):" summary line of top.
#
# A full-width value such as 100.0 leaves no blank after the preceding
# separator ("ni,100.0 id" or "%Cpu(s):100.0 us"). Splitting on whitespace
# only would glue two fields together and shift all following values.
# Therefore ":" and "," are replaced by blanks before the fields are read.
#
# param  string  $1  cpu summary line with "." as decimal point
# output on stdout: values for "us sy ni id wa hi si st" divided by blanks
function _getCpuValues(){
    printf '%s\n' "$1" \
        | awk '{ gsub(/[:,]/, " "); print $2, $4, $6, $8, $10, $12, $14, $16 }'
}

# FIX read cpu from 2nd output of top
# - LC_ALL=C forces "." as decimal point; a decimal comma cannot be handled
#   by bc and would collide with the "," field separator
# - the "%" in the grep pattern needs no backslash
# - the line is kept in a variable; no scratch file in /tmp is written
sCpuLine=$( LC_ALL=C top -b -n 2 -d 0.1 | grep -i "^%Cpu" | tail -1 )

read -r cpuUser cpuSystem cpuNice cpuIdle cpuWait cpuHi cpuSi cpuSt \
    <<< "$( _getCpuValues "$sCpuLine" )"

# everything that is not idle time counts as usage
cpuNonIdle=$( echo "100-${cpuIdle}" | bc )

# Compare two numbers with bc (handles decimal values).
# param  number  $1  value to check
# param  number  $2  limit
# returns 0 if bc reports that $1 is greater than $2
function _isAbove(){
    [ "$(echo "$1 > $2" | bc)" -eq 1 ]
}

# The io wait check wins: cpu usage limits are evaluated only if the
# io wait is not above its critical limit.
sInfo="INFO  : cpu is in normal ranges."
if _isAbove "${cpuWait}" "${iCriticalWait}"; then
    sInfo="HINT  : cpu WAIT is high - check hardware issues"
    ph.setStatus "critical"
elif _isAbove "${cpuNonIdle}" "${iWarnLimit}"; then
    # above the warning limit: same hint for both levels, only status differs
    sInfo="HINT  : cpu usage is high - check preocesses"
    if _isAbove "${cpuNonIdle}" "${iCriticalLimit}"; then
        ph.setStatus "critical"
    else
        ph.setStatus "warning"
    fi
fi

# --- status line with usage and the most important single values
ph.status "CPU-USAGE [%] ${cpuNonIdle} ... user: ${cpuUser} - system: ${cpuSystem} - idle: ${cpuIdle} - wait: ${cpuWait}"

# v1.6: show processes of cpu wait (status D in process list)
# awk keeps the header line (STAT) and all rows with a "D" in column 8;
# the list is printed only if it contains a digit, i.e. more than the header
plist=$( ps aux | awk '$8 ~ /(D|STAT)/' )
if grep -q "[0-9]" <<< "$plist"; then
    echo
    echo "For analysis of cpu waits - processes with status D:"
    echo "$plist"
fi

# --- limits, hint and legend
cat <<EOF

Limits: usage warn at ${iWarnLimit} .. critical at ${iCriticalLimit} .. io wait limit ${iCriticalWait}
$sInfo

Legend:

   hwi - Time spent handling hardware interrupt routines. (Whenever a peripheral unit want attention form the CPU, it literally pulls a line, to signal the CPU to service it)
   swi - Time spent handling software interrupt routines. (a piece of code, calls an interrupt routine...)
    st - Time spent on involuntary waits by virtual cpu while hypervisor is servicing another processor (stolen from a virtual machine)

  nice - Time spent running niced user processes (User defined priority)
  wait - Time spent on waiting on IO peripherals (eg. disk)

system - Time spent in kernel space
  user - Time spent in user space
  idle - Time spent in idle operations

EOF


# --- performance data: total usage with warning and critical limit
ph.perfadd "cpu-usage" "${cpuNonIdle}" "$iWarnLimit" "$iCriticalLimit" 0 100

# for graphite module: send limits
# ph.perfadd "cpu-warn"     $iWarnLimit      "" "" 0 100
# ph.perfadd "cpu-crit"     $iCriticalLimit  "" "" 0 100

# --- performance data: io wait has a critical limit only
ph.perfadd "cpu-wait" "${cpuWait}" "" "$iCriticalWait" 0 100

# --- performance data single values without limits
#     each item is "<label suffix>:<name of the variable holding the value>"
for sPerfItem in system:cpuSystem user:cpuUser idle:cpuIdle nice:cpuNice hwi:cpuHi swi:cpuSi st:cpuSt; do
    sPerfVar=${sPerfItem#*:}
    ph.perfadd "cpu-${sPerfItem%%:*}" "${!sPerfVar}" "" "" 0 100
done

ph.exit

# ----------------------------------------------------------------------