Skip to content
Snippets Groups Projects
Select Git revision
  • 43e913bebcee2f0c1769159e3e1259dafd39021a
  • master default protected
  • simple-task/7248-eol-check-add-node-22
  • 6877_check_iml_deployment
4 results

check_snmp_includes

Blame
  • check_cpu 7.69 KiB
    #!/bin/bash
    # ======================================================================
    #
    # Check CPU usage
    #
    # requirements:
    # - top
    # - bc
    #
    # ----------------------------------------------------------------------
    # 2020-03-10  v1.0  <axel.hahn@iml.unibe.ch>
    # 2020-03-23  v1.1  <axel.hahn@iml.unibe.ch>  added more data
    # 2020-07-08  v1.2  <axel.hahn@iml.unibe.ch>  FIX: set "ph." instead "ps."
    # 2020-07-17  v1.3  <axel.hahn@iml.unibe.ch>  use ph.require to check binaries
    # 2021-02-10  v1.4  <axel.hahn@iml.unibe.ch>  added critical io wait
    # 2021-10-28  v1.5  <axel.hahn@iml.unibe.ch>  Use 2nd update of top
    # 2021-12-10  v1.6  <axel.hahn@iml.unibe.ch>  show processes with status D to find cpu waits
    # 2022-03-09  v1.7  <axel.hahn@iml.unibe.ch>  show most cpu intensive processes
    # 2022-03-10  v1.8  <axel.hahn@iml.unibe.ch>  add cli param -p; update help
    # 2022-03-22  v1.9  <axel.hahn@iml.unibe.ch>  fix syntax error on 100% idle
    # 2022-04-14  v1.10 <axel.hahn@iml.unibe.ch>  show consuming cpu processes with top and ps
    # 2022-08-29  v1.11 <axel.hahn@iml.unibe.ch>  replace pipe to prevent start of metrics section
    # 2022-08-29  v1.12 <axel.hahn@iml.unibe.ch>  fix: replace pipe
    # 2023-02-13  v1.13 <axel.hahn@iml.unibe.ch>  small shell fixes
    # 2023-07-27  v1.14 <axel.hahn@unibe.ch>      update help page
    # 2023-09-18  v1.15 <axel.hahn@unibe.ch>      prevent broken pipe message in journallog
    # ======================================================================
    
    
    # Load the shared include file that lies next to this script
    # (presumably it provides the "ph.*" functions used below - verify there).
    # All expansions are quoted so that an installation path containing
    # spaces does not break the include.
    . "$(dirname "$0")/inc_pluginfunctions"
    
    # version of this check; exported so that child processes can read it
    export self_APPVERSION=1.15
    
    # ----------------------------------------------------------------------
    # functions
    # ----------------------------------------------------------------------
    
    # Print the help page to stdout: help header, description, syntax,
    # options and an example call.
    # The script name shown in the text is the basename of $0; it is
    # detected once and reused in the text.
    # No parameters.
    function showHelp(){
        local _self; _self=$(basename "$0")
        cat <<EOF
    $( ph.showImlHelpHeader )
    
    check cpu usage and cpu wait
    Cpu infos are taken from output of top command.
    
    On higher cpu usage it can show processes that cause cpu waits and
    with most cpu consumption.
    
    SYNTAX:
    ${_self} [-w WARN_LIMIT] [-c CRITICAL_LIMIT] [-i CRITICAL_IO_WAIT] [-p PROCESS_LIMIT]
    
    OPTIONS:
    
        -w VALUE       cpu usage warning level  (default: 75)
        -c VALUE       cpu usage critical level (default: 90)
    
        -i VALUE       io wait critical level   (default: 50)
    
        -p VALUE       show process info with highest cpu consumption if 
                       usage is > NN %; default: 50
    
        -h or --help   show this help.
    
    PARAMETERS:
    
        None.
    
    EXAMPLE:
    ${_self} -w 60 -c 80 -p 40
    
    EOF
    }
    # ----------------------------------------------------------------------
    # MAIN
    # ----------------------------------------------------------------------
    
    # --- the check needs these binaries
    ph.require bc top
    
    
    # --- a help flag as first argument: print the help page and stop
    if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then
        showHelp
        exit 0
    fi
    
    # ----------------------------------------------------------------------
    # limits: start with the default value; a command line param overrides it
    #   -w  warning level for cpu usage
    #   -c  critical level for cpu usage
    #   -i  critical level for io wait
    #   -p  cpu usage above which the process lists are shown
    declare -i iWarnLimit iCriticalLimit iCriticalWait iMinCpuUsageToShowProcesses
    iWarnLimit=$( ph.getValueWithParam 75 w "$@" )
    iCriticalLimit=$( ph.getValueWithParam 90 c "$@" )
    iCriticalWait=$( ph.getValueWithParam 50 i "$@" )
    iMinCpuUsageToShowProcesses=$( ph.getValueWithParam 50 p "$@" )
    
    # ----------------------------------------------------------------------
    # get data
    
    # get cpu status i.e.
    # %Cpu(s): 33.3 us,  9.5 sy,  0.0 ni, 57.1 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
    #   us, user    : time running un-niced user processes
    #   sy, system  : time running kernel processes
    #   ni, nice    : time running niced user processes
    #   id, idle    : time spent in the kernel idle handler
    #   wa, IO-wait : time waiting for I/O completion
    #   hi : time spent servicing hardware interrupts
    #   si : time spent servicing software interrupts
    #   st : time stolen from this vm by the hypervisor
    
    # Print the value that top reported for a single cpu state.
    # It reads the global $data which holds one "<value> <label>" pair per
    # line. The label must match exactly; nothing is printed if it is missing.
    # param  string  label of the cpu state, e.g. "us" or "id"
    function _getCpuValue(){
        echo "$data" | awk -v label="$1" '$2 == label { print $1; exit }'
    }
    
    # top runs 2 iterations and only the last "%Cpu" line is used
    # (v1.5: read cpu from 2nd output of top).
    # LC_ALL=C: the line is split at "," and the values are passed to bc, so
    # the numbers must be written with a decimal point.
    # NOTE(review): presumably top follows the numeric locale otherwise - confirm
    data=$( LC_ALL=C top -b -n 2 -d 0.1 | grep -i "^%Cpu" | tail -1 | cut -f 2- -d ':' | tr ',' "\n" )
    
    cpuUser=$(   _getCpuValue "us" )
    cpuSystem=$( _getCpuValue "sy" )
    cpuNice=$(   _getCpuValue "ni" )
    cpuIdle=$(   _getCpuValue "id" )
    cpuWait=$(   _getCpuValue "wa" )
    cpuHi=$(     _getCpuValue "hi" )
    cpuSi=$(     _getCpuValue "si" )
    cpuSt=$(     _getCpuValue "st" )
    
    # total usage is everything that is not idle
    cpuNonIdle=$( echo "100-${cpuIdle}" | bc )
    
    # Compare two numbers (decimals allowed) with bc.
    # param  number  value to check
    # param  number  limit
    # Returns 0 if the value is greater than the limit.
    function _isAbove(){
        [ "$(echo "$1 > $2" | bc)" -eq 1 ]
    }
    
    # default text; it is replaced by a hint as soon as a limit is exceeded
    sInfo="INFO  : cpu is in normal ranges."
    
    # io wait is checked first; the usage limits are only evaluated
    # if the wait value is not above its limit
    if _isAbove "${cpuWait}" "${iCriticalWait}"; then
        ph.setStatus "critical"
        sInfo="HINT  : cpu WAIT is high - check hardware issues"
    elif _isAbove "${cpuNonIdle}" "${iWarnLimit}"; then
        # above warning level ... is it above critical level too?
        if _isAbove "${cpuNonIdle}" "${iCriticalLimit}"; then
            ph.setStatus "critical"
        else
            ph.setStatus "warning"
        fi
        sInfo="HINT  : cpu usage is high - check processes"
    fi
    
    # ----------------------------------------------------------------------
    # output
    
    # --- status line with the main cpu values
    ph.status "CPU-USAGE [%] ${cpuNonIdle} ... user: ${cpuUser} - system: ${cpuSystem} - idle: ${cpuIdle} - wait: ${cpuWait}"
    
    # v1.6: show processes of cpu wait (status D in process list)
    # column 8 of "ps aux" is matched against D or STAT; the latter keeps
    # the header line of ps
    plist=$( ps aux | awk '$8 ~ /(D|STAT)/' )
    
    # The list is printed only if it contains a digit (presumably to skip a
    # list that holds nothing but the header line).
    # Pipes are replaced to prevent start of metrics section.
    if echo "$plist" | grep "[0-9]" >/dev/null; then
        echo
        echo "For analysis of cpu waits - processes with status D:"
        echo "$plist" | tr '|' ':'
    fi
    
    # v1.7: above a given cpu usage [%] show the most consuming processes
    # only the integer part of the usage is compared with the limit
    typeset -i iUsed
    iUsed=${cpuNonIdle%%.*}
    if [ "$iUsed" -gt "$iMinCpuUsageToShowProcesses" ]; then
      echo
      echo "CPU usage is higher $iMinCpuUsageToShowProcesses percent ... showing most consuming processes"
    
      # top: print the header line of the process table and the 5 lines after it
      echo "output of top :"
      topout=$( top -b -n 1 -d 0.1 )
      typeset -i iStart iEnd
      iStart=$( echo "$topout" | grep -n "PID.*USER" | cut -d ':' -f 1 )
      iEnd=$(( iStart + 5 ))
      echo "$topout" | sed -n "${iStart},${iEnd}p" | tr '|' ':'
      echo
    
      # ps: header line first, then the 5 top lines sorted by column 3;
      # error output of sort is dropped
      echo "output of ps:"
      ps aux | head -1
      ps aux | sort -nrk 3,3 2>/dev/null | head -n 5 | tr '|' ':'
      echo
    fi
    
    # --- footer of the output: the limits that were applied, the info/ hint
    #     text that was set while evaluating the status and a legend that
    #     describes the names of the cpu values
    echo "
    Limits: usage warn at ${iWarnLimit} .. critical at ${iCriticalLimit} .. io wait limit ${iCriticalWait}
    $sInfo
    
    Legend:
    
       hwi - Time spent handling hardware interrupt routines. (Whenever a peripheral unit want attention form the CPU, it literally pulls a line, to signal the CPU to service it)
       swi - Time spent handling software interrupt routines. (a piece of code, calls an interrupt routine...)
        st - Time spent on involuntary waits by virtual cpu while hypervisor is servicing another processor (stolen from a virtual machine)
    
      nice - Time spent running niced user processes (User defined priority)
      wait - Time spent on waiting on IO peripherals (eg. disk)
    
    system - Time spent in kernel space
      user - Time spent in user space
      idle - Time spent in idle operations
    "
    
    
    # --- performance data: total usage with warning and critical limit
    ph.perfadd "cpu-usage"    "${cpuNonIdle}" $iWarnLimit $iCriticalLimit 0 100
    
    # for graphite module: send limits (disabled)
    # ph.perfadd "cpu-warn"     $iWarnLimit      "" "" 0 100
    # ph.perfadd "cpu-crit"     $iCriticalLimit  "" "" 0 100
    
    # --- performance data: io wait has a critical limit only
    ph.perfadd "cpu-wait"     "${cpuWait}"     "" "$iCriticalWait" 0 100
    
    # --- performance data: single values without limits
    # each item is "<name>:<value>"; the name becomes the label "cpu-<name>"
    for sMetric in \
        "system:${cpuSystem}" \
        "user:${cpuUser}" \
        "idle:${cpuIdle}" \
        "nice:${cpuNice}" \
        "hwi:${cpuHi}" \
        "swi:${cpuSi}" \
        "st:${cpuSt}"
    do
        ph.perfadd "cpu-${sMetric%%:*}" "${sMetric#*:}" "" "" 0 100
    done
    
    ph.exit
    
    # ----------------------------------------------------------------------