#!/bin/bash
# ======================================================================
#
# Icinga/ Nagios Check
# CEPH STATUS / HEALTH
#
# ----------------------------------------------------------------------
#
# REQUIREMENTS:
#   - ceph and sudo permissions on it
#
# SYNTAX:
#   - check_ceph_status
#     No parameter required
#
# ----------------------------------------------------------------------
# 2020-03-04  v1.0  <axel.hahn@iml.unibe.ch>
# 2020-03-05  v1.1  <axel.hahn@iml.unibe.ch>  switch to ph.* helper functions
# 2021-03-31  v1.2  <axel.hahn@iml.unibe.ch>  estimate remaining time of ceph recovery
# 2021-04-12  v1.3  <axel.hahn@iml.unibe.ch>  if degraded items are 0 delete init file too
# 2023-04-24  v1.4  <axel.hahn@unibe.ch>      update for newer ceph versions
# ======================================================================

# load the shared plugin helper functions (ph.*) that are located next to
# this script; quoted to keep installation paths with spaces working
. "$(dirname "$0")/inc_pluginfunctions"

# file that keeps the start time and the object counts of a detected problem
initfile=/tmp/ceph-status-not-ok-start

# buffer file for the output of "ceph status"
# mktemp creates it with an unpredictable name; if mktemp is not usable the
# former name based on the process id is taken
tmpfile=$(mktemp /tmp/ceph-status.out_XXXXXX 2>/dev/null) || tmpfile=/tmp/ceph-status.out_$$

# seconds to wait between two refreshes (used if doSingleLoop is not 1)
iSleep=3
# flag: 1 = enter the status loop if the cluster is not healthy
doLoop=1
# flag: 1 = leave the status loop after the first pass
doSingleLoop=1

# printf format of a table row
# tbl="|%10s |%18s |%10s |%10s |%10s |%10s\n" 
tbl="|%10s |%10s |%10s |%10s | %s\n" 

# horizontal ruler below the table
line="____________________________________________________________________________________________________________"

# ----------------------------------------------------------------------
# FUNCTIONS
# ----------------------------------------------------------------------

# Fetch the current cluster state and buffer it for the get* functions.
# It runs "sudo ceph status" and writes its output into $tmpfile. If the
# command fails the buffer file is removed and ph.abort is called with an
# UNKNOWN message.
# global  string  tmpfile  file to write the output of "ceph status" to
function readCephStatus(){
        # test the command itself instead of inspecting $? afterwards
        if ! sudo ceph status >"$tmpfile"; then
                rm -f -- "$tmpfile"
                ph.abort "UNKNOWN: ceph is not available or no sudo permissions to execute ceph commands."
        fi
}
# Get the health state from the buffered "ceph status" output.
# It prints the 2nd field of each line containing "health:", i.e. a value
# like HEALTH_OK or HEALTH_WARN.
# global  string  tmpfile  file with the output of "ceph status"
function getCephStatus(){
        # a single awk replaces cat | grep | awk; the quoted filename keeps
        # paths with whitespace working
        awk '/health:/ { print $2 }' "$tmpfile"
}
# Get the total number of objects from the buffered "ceph status" output.
# It reads the line matching "pgs:.*objects degraded", takes its 2nd field
# (a ratio written as <degraded>/<total>) and prints the part behind the "/".
# global  string  tmpfile  file with the output of "ceph status"
function getTotalObjects(){
        # awk selects the line and the field; the quoted filename keeps
        # paths with whitespace working
        awk '/pgs:.*objects degraded/ { print $2 }' "$tmpfile" | cut -f 2 -d "/"
}
# Get the number of degraded objects from the buffered "ceph status" output.
# It reads the line matching "pgs:.*objects degraded", takes its 2nd field
# (a ratio written as <degraded>/<total>) and prints the part before the "/".
# global  string  tmpfile  file with the output of "ceph status"
function getDegraded(){
        # awk selects the line and the field; the quoted filename keeps
        # paths with whitespace working
        awk '/pgs:.*objects degraded/ { print $2 }' "$tmpfile" | cut -f 1 -d "/"
}
# Get the number of misplaced objects from the buffered "ceph status" output.
# It searches the first line containing "objects misplaced" and prints the
# part before the "/" of the token that directly precedes these words
# (a ratio written as <misplaced>/<total>).
# The position of the ratio is detected because the search pattern also hits
# lines without a leading "pgs:" label; there the ratio is the 1st field and
# a fixed 2nd field would deliver the word "objects".
# Only the first hit is used to return a single value.
# global  string  tmpfile  file with the output of "ceph status"
function getMisplaced(){
        awk '/objects misplaced/ {
                for (i = 1; i < NF; i++) {
                        if ($(i + 1) == "objects" && $(i + 2) ~ /^misplaced/) {
                                split($i, ratio, "/")
                                print ratio[1]
                                exit
                        }
                }
        }' "$tmpfile"
}


# ----------------------------------------------------------------------
# MAIN
# ----------------------------------------------------------------------


# buffer the current output of "ceph status"
readCephStatus

# Without an init file this is the first run of a (possible) problem:
# remember the start time and the current counters of degraded and misplaced
# objects as shell code. Later runs source the same file and get the values
# of the first run.
# NOTE(review): the init file has a fixed name in /tmp and is sourced as
# shell code - confirm that no other local user is able to place it there.
if ! test -f "$initfile"; then
        echo "
                sStart=\"$(date)\"
                iTsStart=$(date +%s)
                typeset -i iDeg=$(getDegraded)
                typeset -i iMis=$(getMisplaced)

        ">"$initfile"
fi
. "$initfile"

iLastDeg=$iDeg
iLastMis=$iMis

sCephStatus=$(getCephStatus)

# map the ceph health state to the plugin status;
# everything except OK and WARN is handled as critical
case "$sCephStatus" in
        HEALTH_OK)
                ph.setStatus "ok"
                ;;
        HEALTH_WARN)
                ph.setStatus "warning"
                ;;
        *)
                ph.setStatus "critical"
                ;;
esac

ph.status "Ceph status is $sCephStatus"
echo

# ----- helper functions for the estimation of the recovery

# Get the number of handled objects per minute.
# The multiplication is done before the division: with integer arithmetic
# "done / seconds * 60" is 0 as long as less objects than seconds were handled.
# Without elapsed time the rate is 0.
# param  integer  number of objects that were handled since start
# param  integer  elapsed time in seconds
function _ratePerMin(){
        local -i iDone=$1
        local -i iSeconds=$2
        if (( iSeconds > 0 )); then
                echo $(( iDone * 60 / iSeconds ))
        else
                echo 0
        fi
}

# Get the remaining time as text: "<N> min", above 120 min with an
# additional value in hours. Without a positive rate the result is "???".
# param  integer  number of objects that are still open
# param  integer  handled objects per minute
function _formatTimeLeft(){
        local -i iOpen=$1
        local -i iPerMin=$2
        local -i iMinutes=0
        if (( iPerMin <= 0 )); then
                echo "???"
                return
        fi
        iMinutes=$(( iOpen / iPerMin ))
        if (( iMinutes > 120 )); then
                echo "$iMinutes min ... about $(( iMinutes / 60 )) h"
        else
                echo "$iMinutes min"
        fi
}

# Get the progress in percent, limited to 0..100.
# A total of 0 results in 0 instead of a division by zero; the limits keep
# the progress bar at its width if the counters grew since the start.
# param  integer  number of objects that were handled since start
# param  integer  number of objects on start
function _progressPercent(){
        local -i iDone=$1
        local -i iTotal=$2
        local -i iPercent=0
        if (( iTotal > 0 )); then
                iPercent=$(( iDone * 100 / iTotal ))
        fi
        if (( iPercent < 0 )); then
                iPercent=0
        fi
        if (( iPercent > 100 )); then
                iPercent=100
        fi
        echo $iPercent
}

# ----- status loop
# It is entered if the cluster is not healthy. With doSingleLoop=1 it is left
# after the first pass; otherwise the status is refreshed every $iSleep
# seconds until the cluster reports HEALTH_OK.
while [[ "$sCephStatus" != "HEALTH_OK" && "$doLoop" = 1 ]]; do

        # counters on start are taken from the sourced init file
        typeset -i iObjCount=$(( iDeg + iMis ))

        iTsNow=$(date +%s)

        typeset -i iDegNow=$(getDegraded)
        typeset -i iMisNow=$(getMisplaced)

        typeset -i iTsDelta=$(( iTsNow - iTsStart ))
        typeset -i iTsDeltaMin=$(( iTsDelta / 60 ))

        typeset -i iDoneDeg=$(( iDeg - iDegNow ))
        typeset -i iDoneMis=$(( iMis - iMisNow ))
        typeset -i iDoneTotal=$(( iDoneDeg + iDoneMis ))
        typeset -i iNowTotal=$(( iDegNow + iMisNow ))

        # ----- speed and estimated remaining time

        typeset -i iDegPerMin=$(_ratePerMin "$iDoneDeg" "$iTsDelta")
        typeset -i iMisPerMin=$(_ratePerMin "$iDoneMis" "$iTsDelta")

        timeByDeg=$(_formatTimeLeft "$iDegNow" "$iDegPerMin")
        timeByMis=$(_formatTimeLeft "$iMisNow" "$iMisPerMin")
        # the total needs a positive sum of both rates; this covers one
        # counter growing while the other one shrinks
        sTimeLeft=$(_formatTimeLeft "$iNowTotal" "$(( iDegPerMin + iMisPerMin ))")

        typeset -i iProgress=$(_progressPercent "$iDoneTotal" "$iObjCount")

        # toggle the character at the head of the progress bar on each pass
        if [ "$eater" = "o" ]; then
                eater="C"
        else
                eater="o"
        fi
        iLastDeg=$iDegNow
        iLastMis=$iMisNow


        # ----- output

        [[ "$doSingleLoop" = 1 ]] || clear
        echo "Problem detected on $sStart ... running for $iTsDeltaMin min"

        # --- progress bar: "#" per finished percent, then the eater, then "."
        printf "["
        for ((j = 0; j < iProgress; j++)); do printf "#"; done
        printf '%s' "$eater"
        for ((j = iProgress; j < 100; j++)); do printf "."; done
        printf "] %s %%\n" "$iProgress"

        # --- table with counters, speed and estimation
        printf "           $tbl" "on start"  "now"       "done"       "= per min"   "time left"
        printf "degraded   $tbl" "$iDeg"     "$iDegNow"  "$iDoneDeg"  "$iDegPerMin" "$timeByDeg"
        printf "misplaced  $tbl" "$iMis"     "$iMisNow"  "$iDoneMis"  "$iMisPerMin" "$timeByMis"
        printf "total      $tbl" "$iObjCount" "$iNowTotal" "$iDoneTotal" " "        "$sTimeLeft"
        echo "$line"
        cat "$tmpfile"
        rm -f "$tmpfile" 2>/dev/null


        if [[ "$doSingleLoop" = 1 ]]; then
                doLoop=0
        else
                sleep "$iSleep"
                readCephStatus
                sCephStatus=$(getCephStatus)
        fi

done

# print the buffered "ceph status" output in loop mode or if the cluster is
# healthy (with HEALTH_OK the status loop above did not print the last buffer)
if [ "$doSingleLoop" = 0 ] || [ "$sCephStatus" = "HEALTH_OK" ]; then
        cat "$tmpfile"
fi
echo
echo --- DONE $(date)

# the init file is dropped if the cluster is healthy again or if the
# remembered start value of degraded objects is 0
if [ "$sCephStatus" = "HEALTH_OK" ] || [ "$iDeg" -eq 0 ]; then
        rm -f "$initfile" 2>/dev/null
fi

# the buffer file is not needed anymore
rm -f "$tmpfile" 2>/dev/null


ph.exit