Skip to content
Snippets Groups Projects
Select Git revision
  • 0e4c3e0945aa20cc7c25593cb195b798a7693057
  • master default protected
  • simple-task/7248-eol-check-add-node-22
  • 6877_check_iml_deployment
4 results

check_cpu

Blame
  • user avatar
    Hahn Axel (hahn) authored
    e75eb71b
    History
    check_cpu 7.33 KiB
    #!/bin/bash
    # ======================================================================
    #
    # Check CPU usage
    #
    # requirements:
    # - top
    # - bc
    #
    # ----------------------------------------------------------------------
    # 2020-03-10  v1.0  <axel.hahn@iml.unibe.ch>
    # 2020-03-23  v1.1  <axel.hahn@iml.unibe.ch> added more data
    # 2020-07-08  v1.2  <axel.hahn@iml.unibe.ch> FIX: set "ph." instead "ps."
    # 2020-07-17  v1.3  <axel.hahn@iml.unibe.ch> use ph.require to check binaries
    # 2021-02-10  v1.4  <axel.hahn@iml.unibe.ch> added critical io wait
    # 2021-10-28  v1.5  <axel.hahn@iml.unibe.ch> Use 2nd update of top
    # 2021-12-10  v1.6  <axel.hahn@iml.unibe.ch> show processes with status D to find cpu waits
    # 2022-03-09  v1.7  <axel.hahn@iml.unibe.ch> show most cpu intensive processes
    # 2022-03-10  v1.8  <axel.hahn@iml.unibe.ch> add cli param -p; update help
    # 2022-03-22  v1.9  <axel.hahn@iml.unibe.ch> fix syntax error on 100% idle
    # 2022-04-14  v1.10 <axel.hahn@iml.unibe.ch> show consuming cpu processes with top and ps
    # ======================================================================
    
    
    # Load the shared plugin helper library from the directory this script
    # lives in (presumably the source of the ph.* functions used below).
    # The path is quoted so an installation directory with spaces still works.
    . "$(dirname "$0")/inc_pluginfunctions"
    
    # Upper-cased script name for the help banner. The character classes are
    # quoted so the shell cannot glob them against single-letter file names in
    # the current working directory.
    self_APPNAME=$( basename "$0" | tr '[:lower:]' '[:upper:]' )
    # Keep in sync with the newest entry of the changelog in the file header.
    self_APPVERSION=1.10
    
    # ----------------------------------------------------------------------
    # functions
    # ----------------------------------------------------------------------
    
    # Print the usage/help text to stdout.
    #
    # Takes no arguments. Reads the globals $self_APPNAME and $self_APPVERSION
    # for the banner; the command name in the SYNTAX and EXAMPLE lines is
    # derived from $0. The here-document delimiter is unquoted, so these
    # variables and the $(basename $0) substitutions inside the text are
    # expanded each time the function is called.
    function showHelp(){
    cat <<EOF
    ______________________________________________________________________
    
    $self_APPNAME 
    v$self_APPVERSION
    
    (c) Institute for Medical Education - University of Bern
    Licence: GNU GPL 3
    ______________________________________________________________________
    
    check cpu usage and cpu wait
    Cpu infos are taken from output of top command.
    
    SYNTAX:
    $(basename $0) [-w WARN_LIMIT] [-c CRITICAL_LIMIT] [-i CRITICAL_IO_WAIT] [-p PROCESS_LIMIT]
    
    OPTIONS:
    
        -w VALUE       cpu usage warning level  (default: 75)
        -c VALUE       cpu usage critical level (default: 90)
    
        -i VALUE       io wait critical level   (default: 50)
    
        -p VALUE       show process info with highest cpu consumption if 
                       usage is > NN %; default: 50
    
        -h or --help   show this help.
    
    PARAMETERS:
    
        None.
    
    EXAMPLE:
    $(basename $0) -w 60 -c 80 -p 40
    
    EOF
    }
    # ----------------------------------------------------------------------
    # MAIN
    # ----------------------------------------------------------------------
    
    # --- make sure the external tools this check relies on are available
    ph.require bc top
    
    # --- help requested as first argument? print the usage text and leave
    if [[ "$1" == "--help" || "$1" == "-h" ]]; then
        showHelp
        exit 0
    fi
    
    # ----------------------------------------------------------------------
    # limits: built-in default first, then the command line switch that
    # overrides it
    #   -w  cpu usage warning level
    #   -c  cpu usage critical level
    #   -i  io wait critical level
    #   -p  cpu usage above which the most consuming processes are listed
    declare -i iWarnLimit iCriticalLimit iCriticalWait iMinCpuUsageToShowProcesses
    iWarnLimit=$( ph.getValueWithParam 75 w "$@" )
    iCriticalLimit=$( ph.getValueWithParam 90 c "$@" )
    iCriticalWait=$( ph.getValueWithParam 50 i "$@" )
    iMinCpuUsageToShowProcesses=$( ph.getValueWithParam 50 p "$@" )
    
    # ----------------------------------------------------------------------
    # get data
    
    # The cpu summary line of top looks like
    # %Cpu(s): 33.3 us,  9.5 sy,  0.0 ni, 57.1 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
    #   us, user    : time running un-niced user processes
    #   sy, system  : time running kernel processes
    #   ni, nice    : time running niced user processes
    #   id, idle    : time spent in the kernel idle handler
    #   wa, IO-wait : time waiting for I/O completion
    #   hi : time spent servicing hardware interrupts
    #   si : time spent servicing software interrupts
    #   st : time stolen from this vm by the hypervisor
    
    # Print the value of a single field of the cpu summary.
    #   param  string  summary, split into one "<value> <label>" pair per line
    #   param  string  label to look up, e.g. "us" or "wa"
    # The label has to match the second column exactly; only the first hit is
    # printed. Prints nothing if the label is not present.
    function _getCpuValue(){
        echo "$1" | awk -v sLabel="$2" '$2 == sLabel { print $1; exit }'
    }
    
    # - top runs 2 iterations in batch mode and only the last summary line is
    #   used (v1.5: read cpu from 2nd output of top)
    # - LC_ALL=C pins the decimal point: with a decimal comma the split on ","
    #   below would cut every value in two
    # - the pattern is a plain '^%Cpu': "%" is no special character and
    #   escaping it makes newer GNU grep print a "stray \" warning on stderr
    data=$( LC_ALL=C top -b -n 2 -d 0.1 | grep -i '^%Cpu' | tail -1 | cut -f 2- -d ':' | tr ',' "\n" )
    
    cpuUser=$(   _getCpuValue "$data" "us" )
    cpuSystem=$( _getCpuValue "$data" "sy" )
    cpuNice=$(   _getCpuValue "$data" "ni" )
    cpuIdle=$(   _getCpuValue "$data" "id" )
    cpuWait=$(   _getCpuValue "$data" "wa" )
    cpuHi=$(     _getCpuValue "$data" "hi" )
    cpuSi=$(     _getCpuValue "$data" "si" )
    cpuSt=$(     _getCpuValue "$data" "st" )
    
    # bc prints results below 1 without a leading zero (".5"); sed adds it so
    # the status text and the performance data carry a regular decimal number
    cpuNonIdle=$( echo "100-$cpuIdle" | bc | sed 's/^\./0./' )
    
    # Succeeds if the first number is greater than the second one.
    # The comparison is handed to bc because the cpu values carry decimals.
    function _isAbove(){
        [ "$(echo "${1} > ${2}" | bc)" -eq 1 ]
    }
    
    # io wait is checked first; the usage limits are only looked at when the
    # wait value is fine. The critical limit is tested only after the warning
    # limit was exceeded.
    sInfo="INFO  : cpu is in normal ranges."
    if _isAbove "${cpuWait}" "${iCriticalWait}"; then
        ph.setStatus "critical"
        sInfo="HINT  : cpu WAIT is high - check hardware issues"
    elif _isAbove "${cpuNonIdle}" "${iWarnLimit}"; then
        if _isAbove "${cpuNonIdle}" "${iCriticalLimit}"; then
            ph.setStatus "critical"
        else
            ph.setStatus "warning"
        fi
        sInfo="HINT  : cpu usage is high - check processes"
    fi
    
    # ----------------------------------------------------------------------
    # output
    
    # --- status line with the headline values
    ph.status "CPU-USAGE [%] ${cpuNonIdle} ... user: ${cpuUser} - system: ${cpuSystem} - idle: ${cpuIdle} - wait: ${cpuWait}"
    
    # v1.6: list processes with status D in the process list to find cpu waits.
    # awk keeps every row whose 8th column matches D or STAT, i.e. the header
    # row too. The list is printed only if it contains a digit - presumably
    # because the header row alone carries none.
    plist=$( ps aux | awk '$8 ~ /(D|STAT)/ { print $0 }' )
    if echo "$plist" | grep "[0-9]" >/dev/null; then
        echo
        echo "For analysis of cpu waits - processes with status D:"
        echo "$plist"
    fi
    
    # v1.7: show the most consuming processes if usage is > nn %
    # Only the part of the usage value before the decimal point is compared.
    declare -i iUsed=${cpuNonIdle%%.*}
    if (( iUsed > iMinCpuUsageToShowProcesses )); then
        echo
        echo "CPU usage is higher $iMinCpuUsageToShowProcesses percent ... showing most consuming processes"
        echo "output of top :"
        topout=$( top -b -n 1 -d 0.1 )
        # print the column header row of the process table and the 5 lines
        # that follow it
        declare -i iStart iEnd
        iStart=$( echo "$topout" | grep -n "PID.*USER" | cut -f 1 -d ':' )
        iEnd=$(( iStart + 5 ))
        echo "$topout" | sed -n "${iStart},${iEnd}p"
        echo
        echo "output of ps:"
        # first line of ps, then the top 5 lines sorted by the 3rd column
        ps aux | head -1
        ps aux | sort -nrk 3,3 | head -n 5
        echo
    fi
    
    # --- footer of the plugin output: the limits in effect, the hint text
    # chosen by the status evaluation ($sInfo) and a static legend of the cpu
    # time categories. It is one multi-line string; its line breaks and
    # spacing are printed as they are.
    echo "
    Limits: usage warn at ${iWarnLimit} .. critical at ${iCriticalLimit} .. io wait limit ${iCriticalWait}
    $sInfo
    
    Legend:
    
       hwi - Time spent handling hardware interrupt routines. (Whenever a peripheral unit want attention form the CPU, it literally pulls a line, to signal the CPU to service it)
       swi - Time spent handling software interrupt routines. (a piece of code, calls an interrupt routine...)
        st - Time spent on involuntary waits by virtual cpu while hypervisor is servicing another processor (stolen from a virtual machine)
    
      nice - Time spent running niced user processes (User defined priority)
      wait - Time spent on waiting on IO peripherals (eg. disk)
    
    system - Time spent in kernel space
      user - Time spent in user space
      idle - Time spent in idle operations
    "
    
    
    # --- performance data: total usage together with both usage limits
    ph.perfadd "cpu-usage" "${cpuNonIdle}" "$iWarnLimit" "$iCriticalLimit" 0 100
    
    # for graphite module: the limits could be sent as values of their own
    # (left disabled)
    # ph.perfadd "cpu-warn"     $iWarnLimit      "" "" 0 100
    # ph.perfadd "cpu-crit"     $iCriticalLimit  "" "" 0 100
    
    # io wait has a critical limit only
    ph.perfadd "cpu-wait" "${cpuWait}" "" "$iCriticalWait" 0 100
    
    # --- performance data single values, without limits
    # each item is "<suffix of the label>:<value>"
    for sMetric in \
        "system:${cpuSystem}" \
        "user:${cpuUser}" \
        "idle:${cpuIdle}" \
        "nice:${cpuNice}" \
        "hwi:${cpuHi}" \
        "swi:${cpuSi}" \
        "st:${cpuSt}"
    do
        ph.perfadd "cpu-${sMetric%%:*}" "${sMetric#*:}" "" "" 0 100
    done
    
    ph.exit
    
    # ----------------------------------------------------------------------