#!/bin/bash
# ======================================================================
#
# Icinga/ Nagios Check
# CEPH OSD STATUS
#
# ----------------------------------------------------------------------
#
# REQUIREMENTS:
# - sudo permissions on ceph command
#
# SYNTAX:
# - check_ceph_status -w [count for warning] -c [count for critical]
#   No parameter required
#
# RESULT:
#   OK       = all OSDs up
#   WARNING  = 1 OSD is down
#   CRITICAL = min. 2 OSDs are down
#   UNKNOWN  = ceph osd tree is not executable
#
# ----------------------------------------------------------------------
# 2020-03-04  v1.0  <axel.hahn@iml.unibe.ch>
# 2020-03-05  v1.1  <axel.hahn@iml.unibe.ch>  added params -w -c
# 2020-03-05  v1.2  <axel.hahn@iml.unibe.ch>  switch to ph.* helper functions
# 2022-10-21  v1.3  <axel.hahn@unibe.ch>      remove grep: warning: stray \ before white space
# 2023-04-24  v1.4  <axel.hahn@unibe.ch>      update for newer ceph versions
# 2023-06-19  v1.5  <axel.hahn@unibe.ch>      add help and param support; no more tmpfile
# 2023-07-27  v1.6  <axel.hahn@unibe.ch>      shorten ceph exec; show output on error; shell fixes
# 2023-10-20  v1.7  <axel.hahn@unibe.ch>      harden sudo command execution
# ======================================================================

# The ph.* helper functions used below are expected to be defined in this
# include file, which lives next to the plugin script.
# shellcheck disable=SC1090,SC1091
. "$(dirname "$0")/inc_pluginfunctions"

export self_APPVERSION=1.7

# column number in output where to find the up/ down info
# (used as fallback if no "osd.N" name field is found on a line)
iColUpDown=5

# ----------------------------------------------------------------------
# FUNCTIONS
# ----------------------------------------------------------------------

# Print the help text to stdout.
function showHelp(){
    local _self; _self=$(basename "$0")
cat <<EOF
$( ph.showImlHelpHeader )

Show cheph osd status: how many OSDs exist and how many are up/ down.
This check sends performance data.

On your cluster you might want to increase the values for warning and
critical level.

SYNTAX:
  $_self [-w WARN_LIMIT] [-c CRITICAL_LIMIT]

OPTIONS:
  -h or --help   show this help.
  -w VALUE       warning level (default: 1)
  -c VALUE       critical level (default: 2)

EXAMPLE:
  $_self
    no parameters; normal usage to get the ceph osd status

  $_self -c 10
    change to critical level if 10 osds are down.

EOF
}

# fetch lines of OSD entries only from output of ceph osd tree
# (lines whose first non-blank character is a digit)
# global  string  data  output of command ceph osd tree
function getOsd(){
    grep "^ *[0-9]" <<< "$data"
}

# Print one status word per OSD line.
# The status is taken from the field that follows the "osd.N" name field;
# if a line has no such field the configured column is used instead.
# NOTE(review): anchoring on the name assumes the CLASS column of
# "ceph osd tree" can be blank and shift the columns - confirm on a cluster.
# global  integer  iColUpDown  fallback column number of the status
function getOsdStatus(){
    # the column must be handed over with -v: inside single quotes the
    # shell does not expand $iColUpDown and awk would print the whole line
    getOsd | awk -v col="$iColUpDown" '
        {
            status = $col
            for (i = 1; i < NF; i++) {
                if ($i ~ /^osd\.[0-9]+$/) {
                    status = $(i + 1)
                    break
                }
            }
            print status
        }'
}

# ----------------------------------------------------------------------
# MAIN
# ----------------------------------------------------------------------

# --- check param -h
case "$1" in
    "--help"|"-h")
        showHelp
        exit 0
        ;;
    *)
        ;;
esac

# --- check required tools
ph.require ceph

# stdout and stderr of the command are captured together in $data
if ! data=$( sudo -n /bin/ceph osd tree 2>&1 ); then
    ph.abort "UNKNOWN: No sudo permissions to execute ceph commands."
fi

# set default / override from command line params
typeset -i iWarnLimit;     iWarnLimit=$(     ph.getValueWithParam 1 w "$@")
typeset -i iCriticalLimit; iCriticalLimit=$( ph.getValueWithParam 2 c "$@")

# grep -c prints 0 (and returns 1) if nothing matches; only the printed
# number is used here
typeset -i iOsdTotal; iOsdTotal=$( getOsd       | grep -c "" )
typeset -i iOsdDown;  iOsdDown=$(  getOsdStatus | grep -c -x -- "down" )
typeset -i iOsdUp;    iOsdUp=$(    getOsdStatus | grep -c -x -- "up" )

if (( iOsdDown < iWarnLimit )); then
    ph.setStatus "ok"
elif (( iOsdDown >= iCriticalLimit )); then
    ph.setStatus "critical"
else
    ph.setStatus "warning"
fi

ph.status "Check of available OSDs - $iOsdTotal OSDs total .. $iOsdUp up .. $iOsdDown down (Limits: warn at $iWarnLimit; critical $iCriticalLimit)"
echo "$data"

ph.perfadd "osd-total" "${iOsdTotal}" "" "" 0 "${iOsdTotal}"
ph.perfadd "osd-up"    "${iOsdUp}"    "" "" 0 "${iOsdTotal}"
ph.perfadd "osd-down"  "${iOsdDown}"  "" "" 0 "${iOsdTotal}"

ph.exit

# ----------------------------------------------------------------------