-
Hahn Axel (hahn) authoredHahn Axel (hahn) authored
check_ceph_status 6.65 KiB
#!/bin/bash
# ======================================================================
#
# Icinga/ Nagios Check
# CEPH STATUS / HEALTH
#
# ----------------------------------------------------------------------
#
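# REQUIREMENTS:
# - ceph is installed and the executing user is allowed to run
#   "ceph status" via sudo
# - the file inc_pluginfunctions is located in the same directory
#   as this script
#
# SYNTAX:
# - check_ceph_status
#   No parameter required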
#
# ----------------------------------------------------------------------
# 2020-03-04 v1.0 <axel.hahn@iml.unibe.ch>
# 2020-03-05 v1.1 <axel.hahn@iml.unibe.ch> switch to ph.* helper functions
# 2021-03-31 v1.2 <axel.hahn@iml.unibe.ch> estimate remaining time of ceph recovery
# 2021-04-12 v1.3 <axel.hahn@iml.unibe.ch> if degraded items are 0 delete init file too
# 2023-04-24 v1.4 <axel.hahn@unibe.ch> update for newer ceph versions
# ======================================================================
# Load the plugin helper functions from the directory of this script.
# Both expansions are quoted so that a path containing whitespace works.
. "$(dirname "$0")/inc_pluginfunctions"

# State file that is written and sourced by the MAIN section; it keeps the
# start time and the object counters across runs of this check.
initfile=/tmp/ceph-status-not-ok-start

# Scratch file receiving the output of "ceph status" for this run.
# mktemp creates it with an unpredictable suffix; if mktemp is not usable
# the former PID based name is taken as fallback.
tmpfile=$(mktemp /tmp/ceph-status.out_XXXXXX 2>/dev/null) || tmpfile=/tmp/ceph-status.out_$$

# seconds to wait between two refreshes when looping
iSleep=3

# doLoop=1 lets the display loop start; doSingleLoop=1 stops it after one pass
doLoop=1
doSingleLoop=1

# printf format of a table row and the horizontal ruler below the table
tbl="|%10s |%10s |%10s |%10s | %s\n"
line="____________________________________________________________________________________________________________"
# ----------------------------------------------------------------------
# FUNCTIONS
# ----------------------------------------------------------------------
# Run "sudo ceph status" and store its standard output in $tmpfile.
#
# Globals:  tmpfile (read) - path of the file that receives the output
# Outputs:  nothing on success
# Failure:  if the command (or the redirection) fails, the possibly
#           incomplete file is removed and ph.abort is called with an
#           UNKNOWN message.
function readCephStatus(){
  # The command is tested directly instead of inspecting $? afterwards;
  # the quoted target keeps the redirection valid for paths with spaces.
  if ! sudo ceph status >"$tmpfile"; then
    rm -f -- "$tmpfile"
    ph.abort "UNKNOWN: ceph is not available or no sudo permissions to execute ceph commands."
  fi
}
# Print the second whitespace separated field of every line in $tmpfile
# that contains "health:" (e.g. HEALTH_OK for "health: HEALTH_OK").
#
# Globals:  tmpfile (read) - file with the stored "ceph status" output
# Outputs:  the matching field(s) on stdout; nothing if no line matches
function getCephStatus(){
  awk '/health:/ { print $2 }' "$tmpfile"
}
# Print the part after the "/" of the second field of the line in $tmpfile
# matching "pgs:.*objects degraded", i.e. M of "pgs: N/M objects degraded".
#
# Globals:  tmpfile (read) - file with the stored "ceph status" output
# Outputs:  the value on stdout; nothing if no line matches
function getTotalObjects(){
  awk '/pgs:.*objects degraded/ { print $2 }' "$tmpfile" | cut -d "/" -f 2
}
# Print the part before the "/" of the second field of the line in $tmpfile
# matching "pgs:.*objects degraded", i.e. N of "pgs: N/M objects degraded".
#
# Globals:  tmpfile (read) - file with the stored "ceph status" output
# Outputs:  the value on stdout; nothing if no line matches
function getDegraded(){
  awk '/pgs:.*objects degraded/ { print $2 }' "$tmpfile" | cut -d "/" -f 1
}
# Print the number of misplaced objects found in $tmpfile, i.e. N of a line
# containing "N/M objects misplaced".
#
# The counter is taken from the token directly in front of the words
# "objects misplaced" instead of a fixed field number. A fixed "$2" only
# works while the line starts with a label such as "pgs:"; on a line that
# starts with the counter itself, "$2" is the word "objects".
# Only the first matching line is used so the result is a single value.
#
# Globals:  tmpfile (read) - file with the stored "ceph status" output
# Outputs:  the value on stdout; nothing if no line matches
function getMisplaced(){
  awk '
    /objects misplaced/ {
      for (i = 2; i < NF; i++) {
        if ($i == "objects" && $(i + 1) == "misplaced") {
          print $(i - 1)
          exit
        }
      }
    }
  ' "$tmpfile" | cut -d "/" -f 1
}
# ----------------------------------------------------------------------
# MAIN
# ----------------------------------------------------------------------
# ----- fetch the current cluster status into $tmpfile
readCephStatus

# ----- remember the starting point
# If no init file exists, the current date, the current time stamp and the
# current degraded/misplaced counters are written to it as shell
# assignments. The file is sourced on every run, so later runs compare
# against these start values.
# NOTE(review): $initfile is a fixed path below /tmp whose content is
# executed as shell code - consider a directory that only the monitoring
# user can write to.
if [[ ! -f "$initfile" ]]; then
  {
    printf 'sStart="%s"\n' "$(date)"
    printf 'iTsStart=%s\n' "$(date +%s)"
    printf 'typeset -i iDeg=%s\n' "$(getDegraded)"
    printf 'typeset -i iMis=%s\n' "$(getMisplaced)"
  } >"$initfile"
fi
. "$initfile"

# ----- map the ceph health to the plugin status
sCephStatus=$(getCephStatus)
case "$sCephStatus" in
  HEALTH_OK)   ph.setStatus "ok" ;;
  HEALTH_WARN) ph.setStatus "warning" ;;
  *)           ph.setStatus "critical" ;;
esac
ph.status "Ceph status is $sCephStatus"
echo

# ----- progress display; executed while the cluster is not healthy
while [[ "$sCephStatus" != "HEALTH_OK" && "$doLoop" = 1 ]]; do

  typeset -i iObjCount=$(( iDeg + iMis ))
  typeset -i iTsNow="$(date +%s)"
  typeset -i iDegNow="$(getDegraded)"
  typeset -i iMisNow="$(getMisplaced)"

  typeset -i iTsDelta=$(( iTsNow - iTsStart ))
  typeset -i iTsDeltaMin=$(( iTsDelta / 60 ))
  typeset -i iDoneDeg=$(( iDeg - iDegNow ))
  typeset -i iDoneMis=$(( iMis - iMisNow ))

  # Defaults for every pass, so nothing is empty or stale when no rate can
  # be calculated (no time elapsed yet or no progress so far).
  typeset -i iDegPerMin=0
  typeset -i iMisPerMin=0
  timeByDeg="???"
  timeByMis="???"
  sTimeLeft="???"

  if (( iTsDelta > 0 )); then
    # Multiply before dividing: with integer arithmetic "done/delta*60"
    # truncates to 0 or a multiple of 60.
    iDegPerMin=$(( iDoneDeg * 60 / iTsDelta ))
    iMisPerMin=$(( iDoneMis * 60 / iTsDelta ))

    if (( iDegPerMin > 0 )); then
      timeByDeg=$(( iDegNow / iDegPerMin ))
    fi
    if (( iMisPerMin > 0 )); then
      timeByMis=$(( iMisNow / iMisPerMin ))
    fi

    # The combined rate must be positive; this also rules out a division
    # by zero when one counter shrinks as fast as the other one grows.
    typeset -i iRateTotal=$(( iDegPerMin + iMisPerMin ))
    if (( iRateTotal > 0 )); then
      typeset -i iMinLeft=$(( (iDegNow + iMisNow) / iRateTotal ))
      if (( iMinLeft > 120 )); then
        sTimeLeft="$iMinLeft min ... about $(( iMinLeft / 60 )) h"
      else
        sTimeLeft="$iMinLeft min"
      fi
    fi
  fi

  typeset -i iDoneTotal=$(( iDoneDeg + iDoneMis ))
  typeset -i iNowTotal=$(( iDegNow + iMisNow ))

  # Percentage of the initially affected objects that are done; stays 0
  # when there were none (avoids a division by zero).
  typeset -i iProgress=0
  if (( iObjCount != 0 )); then
    iProgress=$(( iDoneTotal * 100 / iObjCount ))
  fi

  # toggle the character drawn at the head of the progress bar
  if [[ "$eater" = "o" ]]; then
    eater="C"
  else
    eater="o"
  fi

  # ----- output
  [[ "$doSingleLoop" = 1 ]] || clear
  echo "Problem detected on $sStart ... running for $iTsDeltaMin min"

  # --- progress bar
  echo -n "["
  for (( j = 0; j < iProgress; j++ )); do printf "#"; done
  printf '%s' "$eater"
  for (( j = iProgress; j < 100; j++ )); do printf "."; done
  echo -n "] $iProgress %"
  echo

  # --- table; the rows show the per-counter estimates timeByDeg/timeByMis
  printf " $tbl" "on start" "now" "done" "= per min" "time left"
  printf "degraded $tbl" "$iDeg" "$iDegNow" "$iDoneDeg" "$iDegPerMin" "$timeByDeg"
  printf "misplaced $tbl" "$iMis" "$iMisNow" "$iDoneMis" "$iMisPerMin" "$timeByMis"
  printf "total $tbl" "$iObjCount" "$iNowTotal" "$iDoneTotal" " " "$sTimeLeft"
  echo "$line"

  cat "$tmpfile"
  rm -f -- "$tmpfile" 2>/dev/null

  if [[ "$doSingleLoop" = 1 ]]; then
    doLoop=0
  else
    sleep "$iSleep"
    readCephStatus
    sCephStatus=$(getCephStatus)
  fi
done

# Show the raw status if it was not already printed inside the loop.
if [[ "$doSingleLoop" = 0 || "$sCephStatus" = "HEALTH_OK" ]]; then
  cat "$tmpfile"
fi
echo
echo "--- DONE $(date)"

# Drop the remembered starting point when the cluster is healthy again or
# when the stored start value of degraded objects is 0.
if [[ "$sCephStatus" = "HEALTH_OK" ]] || (( iDeg == 0 )); then
  rm -f -- "$initfile" 2>/dev/null
fi
rm -f -- "$tmpfile" 2>/dev/null
ph.exit