#!/bin/bash
# ======================================================================
#
# Icinga/ Nagios Check
# CEPH OSD STATUS
#
# ----------------------------------------------------------------------
#
# REQUIREMENTS:
#   - sudo permissions on ceph command
#
# SYNTAX:
#   - check_ceph_osd [-w count for warning] [-c count for critical]
#     No parameter required
#
# RESULT:
#     OK        = all OSDs up
#     WARNING   = 1 OSD is down
#     CRITICAL  = min. 2 OSDs are down
#     UNKNOWN   = ceph osd tree is not executable
#
# ----------------------------------------------------------------------
# 2020-03-04  v1.0  <axel.hahn@iml.unibe.ch>
# 2020-03-05  v1.1  <axel.hahn@iml.unibe.ch>  added params -w -c 
# 2020-03-05  v1.2  <axel.hahn@iml.unibe.ch> switch to ph.* helper functions
# 2022-10-21  v1.3  <axel.hahn@unibe.ch>     remove grep: warning: stray \ before white space
# 2023-04-24  v1.4  <axel.hahn@unibe.ch>     update for newer ceph versions
# 2023-06-19  v1.5  <axel.hahn@unibe.ch>     add help and param support; no more tmpfile
# 2023-07-27  v1.6  <axel.hahn@unibe.ch>     shorten ceph exec; show output on error; shell fixes
# 2023-10-20  v1.7  <axel.hahn@unibe.ch>     harden sudo command execution
# ======================================================================

# Load inc_pluginfunctions from the directory of this script (presumably the
# source of the ph.* helpers used below). Both expansions are quoted so that
# a script path containing spaces is not split into several words.
. "$(dirname "$0")/inc_pluginfunctions"

export self_APPVERSION=1.7

# column number in output where to find the up/ down info
iColUpDown=5

# ----------------------------------------------------------------------
# FUNCTIONS
# ----------------------------------------------------------------------

# Print the help text of this check to stdout.
# The first line(s) come from ph.showImlHelpHeader, which is not defined in
# this file; the rest is the static usage text below with the script's
# base name filled in.
function showHelp(){
    # $0 must be quoted: an unquoted path containing spaces is split into
    # several arguments, basename fails and the usage lines lose their name.
    local _self; _self=$(basename "$0")
    cat <<EOF
$( ph.showImlHelpHeader )

Show cheph osd status: how many OSDs exist and how many are up/ down.
This check sends performance data.

On your cluster you might want to increase the values for warning and
critical level.

SYNTAX:
$_self [-w WARN_LIMIT] [-c CRITICAL_LIMIT]

OPTIONS:
    -h or --help   show this help.
    -w VALUE       warning level  (default: 1)
    -c VALUE       critical level (default: 2)

EXAMPLE:
$_self
    no parameters; normal usage to get the ceph osd status

$_self -c 10
    change to critical level if 10 osds are down.

EOF
}

# Reduce the captured "ceph osd tree" output to its OSD rows: only lines
# whose first non-blank character is a digit are written to stdout.
# global  string  data  output of command ceph osd tree
function getOsd(){
    printf '%s\n' "$data" | grep "^ *[0-9]"
}

# ----------------------------------------------------------------------
# MAIN
# ----------------------------------------------------------------------
# --- check param -h
# Only the first argument is inspected: -h or --help prints the help text
# and ends the script with exit code 0; anything else falls through.
if [[ "$1" == "--help" || "$1" == "-h" ]]; then
    showHelp
    exit 0
fi

# --- check required tools
ph.require ceph

# Run "ceph osd tree" once via non-interactive sudo and keep stdout and
# stderr together in $data; any non-zero exit status is reported via ph.abort.
data=$( sudo -n /bin/ceph osd tree 2>&1 ) \
    || ph.abort "UNKNOWN: No sudo permissions to execute ceph commands."

# set default / override from command line params
typeset -i iWarnLimit;     iWarnLimit=$(     ph.getValueWithParam 1 w "$@")
typeset -i iCriticalLimit; iCriticalLimit=$( ph.getValueWithParam 2 c "$@")

# count the OSD rows whose status column is exactly the given state
# global  integer  iColUpDown  number of the column holding the up/ down info
# param   string   $1          state to count, e.g. "up" or "down"
# output  integer  number of matching rows (0 if there are none)
function countOsdByState(){
    # Column number and state are handed to awk with -v. Inside a
    # single-quoted awk program the shell does not expand $iColUpDown; awk
    # would see an unset awk variable (0) and use the whole line, so any row
    # containing "up" or "down" anywhere (e.g. a device class "backup")
    # would be counted.
    getOsd | awk -v col="$iColUpDown" -v state="$1" '$col == state { n++ } END { print n+0 }'
}

typeset -i iOsdTotal;      iOsdTotal=$( getOsd | wc -l)
typeset -i iOsdDown;       iOsdDown=$(  countOsdByState "down")
typeset -i iOsdUp;         iOsdUp=$(    countOsdByState "up")

# Map the number of down OSDs to a check status:
#   below the warning limit       -> ok
#   at or above the critical limit -> critical
#   everything in between          -> warning
if (( iOsdDown < iWarnLimit )); then
    ph.setStatus "ok"
elif (( iOsdDown >= iCriticalLimit )); then
    ph.setStatus "critical"
else
    ph.setStatus "warning"
fi

# status line first, followed by the raw "ceph osd tree" output
ph.status "Check of available OSDs - $iOsdTotal OSDs total .. $iOsdUp up .. $iOsdDown down (Limits: warn at $iWarnLimit; critical $iCriticalLimit)"
echo "$data"

# one performance data entry per counter, written as label=value; each one
# is passed with 0 and the total number of OSDs as the last two arguments
for sPerfItem in "osd-total=$iOsdTotal" "osd-up=$iOsdUp" "osd-down=$iOsdDown"; do
    ph.perfadd "${sPerfItem%%=*}" "${sPerfItem#*=}" "" "" 0 "$iOsdTotal"
done

ph.exit

# ----------------------------------------------------------------------