-
Hahn Axel (hahn) authoredHahn Axel (hahn) authored
check_ceph_status 7.51 KiB
#!/bin/bash
# ======================================================================
#
# Icinga/ Nagios Check
# CEPH STATUS / HEALTH
#
# ----------------------------------------------------------------------
#
# REQUIREMENTS:
# - ceph and sudo permissions on it
#
# SYNTAX:
# - check_ceph_status
# No parameter required
#
# ----------------------------------------------------------------------
# 2020-03-04 v1.0 <axel.hahn@iml.unibe.ch>
# 2020-03-05 v1.1 <axel.hahn@iml.unibe.ch> switch to ph.* helper functions
# 2021-03-31 v1.2 <axel.hahn@iml.unibe.ch> estimate remaining time of ceph recovery
# 2021-04-12 v1.3 <axel.hahn@iml.unibe.ch> if degraded items are 0 delete init file too
# 2023-04-24 v1.4 <axel.hahn@unibe.ch> update for newer ceph versions
# 2023-06-19 v1.5 <axel.hahn@unibe.ch> add help and param support; no more tmpfile
# 2023-07-27 v1.6 <axel.hahn@unibe.ch> update help page
# 2023-10-20 v1.7 <axel.hahn@unibe.ch> harden sudo command execution
# 2024-02-29 v1.8 <axel.hahn@unibe.ch> show restore progress only for misplaced/degraded objects
# ======================================================================
# Load the ph.* helper functions shipped next to this script. Both expansions
# are quoted so that an installation path containing spaces still works.
# shellcheck source=/dev/null
. "$(dirname "$0")/inc_pluginfunctions"

# Plugin version; exported, presumably for the ph.* helpers (e.g. the help
# header) - their implementation is not visible in this file.
export self_APPVERSION=1.8

# State file holding the start time and the initial object counters of a
# not-OK period. USER is not guaranteed to be set in the environment of a
# monitoring agent, so fall back to the effective user name.
initfile="/tmp/ceph-status-not-ok-start-${USER:-$(id -un)}"

# Seconds to wait between two refreshes when doSingleLoop is 0.
iSleep=3

# doLoop: loop guard, set to 0 to leave the progress loop.
# doSingleLoop: 1 = run the progress report exactly once (monitoring mode).
doLoop=1
doSingleLoop=1

# printf format of the progress table (4 right aligned columns + free text).
# tbl="|%10s |%18s |%10s |%10s |%10s |%10s\n"
tbl="|%10s |%10s |%10s |%10s | %s\n"
line="____________________________________________________________________________________________________________"
# ----------------------------------------------------------------------
# FUNCTIONS
# ----------------------------------------------------------------------
# Print the help text of this plugin to stdout.
# The header comes from ph.showImlHelpHeader (sourced helper library),
# followed by a description and the syntax using the script's base name.
# Globals:   none
# Arguments: none
function showHelp(){
    local _self
    # quoted: the script path may contain spaces
    _self=$(basename "$0")
    cat <<EOF
$( ph.showImlHelpHeader )
Show ceph health status.
The state of the check switches to warning if HEALTH_WARN was detected
and is error of other HEALTH values than HEALTH_WARN or HEALTH_OK.
In the output is the complete output of the command "ceph status".
If degraded objects are found it shows the progress of repair process.
SYNTAX:
$_self
OPTIONS:
-h or --help show this help.
EOF
}
# Run "ceph status" via non-interactive sudo and keep its combined
# stdout/stderr in the global variable $data, which the get* helpers parse.
# If the command exits non-zero, ph.abort is called with an UNKNOWN message.
# Globals:   data (written)
# Arguments: none
function readCephStatus(){
    data=$( sudo -n /bin/ceph status 2>&1 ) \
        || ph.abort "UNKNOWN: No sudo permissions to execute ceph commands."
}
# Print the second whitespace separated field of every line in $data that
# contains "health:" - for "ceph status" output this is the health state
# (e.g. HEALTH_OK).
# Globals:   data (read)
# Outputs:   health value on stdout; nothing if no line matches
function getCephStatus(){
    awk '/health:/ { print $2 }' <<< "$data"
}
# Print the total object count, i.e. the part after the "/" of the
# "<degraded>/<total>" token on the "pgs: ... objects degraded" line in $data.
# A token without "/" is printed unchanged.
# Globals:   data (read)
# Outputs:   number on stdout; nothing if no such line exists
function getTotalObjects(){
    awk '/pgs:.*objects degraded/ {
        n = split($2, part, "/")
        print (n > 1 ? part[2] : $2)
    }' <<< "$data"
}
# Print the number of degraded objects, i.e. the part before the "/" of the
# "<degraded>/<total>" token on the "pgs: ... objects degraded" line in $data.
# Globals:   data (read)
# Outputs:   number on stdout; nothing if no such line exists
function getDegraded(){
    awk '/pgs:.*objects degraded/ {
        split($2, part, "/")
        print part[1]
    }' <<< "$data"
}
# Print the number of misplaced objects found in $data.
# The counter is the part before "/" of the "<misplaced>/<total>" token that
# directly precedes the words "objects misplaced". Searching for that token
# (instead of always taking field 2) also works when the line has no "pgs:"
# prefix, e.g. when it follows an "objects degraded" line.
# Only the first match is reported so callers always get a single value.
# Globals:   data (read)
# Outputs:   number on stdout; nothing if no misplaced objects are reported
function getMisplaced(){
    awk '
        /objects misplaced/ {
            for (i = 2; i < NF; i++) {
                if ($i == "objects" && $(i + 1) ~ /^misplaced/) {
                    split($(i - 1), ratio, "/")
                    print ratio[1]
                    exit
                }
            }
        }
    ' <<< "$data"
}
# ----------------------------------------------------------------------
# MAIN
# ----------------------------------------------------------------------
# --- check param -h
# Show the help text and leave with exit code 0 if the first argument asks
# for help; any other (or no) argument falls through to the check itself.
if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then
    showHelp
    exit 0
fi
# --- check required tools
ph.require ceph

# Fetch "ceph status" once; the get* helpers parse the global $data.
readCephStatus

# --- init file
# The init file stores the time and the object counters of the first run
# that did not find it; it is kept until the cleanup at the end of the script
# removes it. It is sourced, so its content is executed as shell code.
#
# SECURITY: the file has a predictable name in world-writable /tmp. Never
# source or write through it if it is a symlink or belongs to someone else.
if [[ -L "$initfile" ]] || [[ -e "$initfile" && ! -O "$initfile" ]]; then
    ph.abort "UNKNOWN: init file $initfile is a symlink or not owned by the current user; refusing to source it."
    # ph.abort is defined in the sourced helper library (not visible here);
    # exit explicitly in case it returns.
    exit 3
fi

if [[ ! -f "$initfile" ]]; then
    # Only plain numbers may end up in the sourced file; everything else
    # (no match, unexpected token) is stored as 0.
    iDegInit=$(getDegraded | head -n 1)
    [[ "$iDegInit" =~ ^[0-9]+$ ]] || iDegInit=0
    iMisInit=$(getMisplaced | head -n 1)
    [[ "$iMisInit" =~ ^[0-9]+$ ]] || iMisInit=0
    {
        echo
        echo "sStart=\"$(date)\""
        echo "iTsStart=$(date +%s)"
        echo "typeset -i iDeg=$iDegInit"
        echo "typeset -i iMis=$iMisInit"
        echo
    } > "$initfile"
fi

# Provides sStart, iTsStart, iDeg and iMis.
# shellcheck source=/dev/null
. "$initfile"

# Number of objects that had to be repaired when the problem was detected.
typeset -i iObjCount=$(( iDeg + iMis ))
# Map the ceph health value to the plugin status:
# HEALTH_OK -> ok, HEALTH_WARN -> warning, anything else -> critical.
sCephStatus=$(getCephStatus)
case "$sCephStatus" in
    HEALTH_OK)
        ph.setStatus "ok"
        ;;
    HEALTH_WARN)
        ph.setStatus "warning"
        ;;
    *)
        ph.setStatus "critical"
        ;;
esac
ph.status "Ceph status is $sCephStatus"
echo
# ----- recovery progress
# Runs while the cluster is not HEALTH_OK and the init file recorded degraded
# or misplaced objects. With doSingleLoop=1 the body is executed exactly once;
# otherwise it refreshes the report every $iSleep seconds.
# Reads: iDeg, iMis, iTsStart, sStart (init file), iObjCount, tbl, line, data.
#
# Fixes compared to the former implementation:
# - "[ a+b -gt 0 ]" is no arithmetic in "[" - the rate block never ran
# - rates were "done/delta*60" (integer division first -> mostly 0)
# - timeByDeg/timeByMis were computed but $_timeByDeg/$_timeByMis printed
# - possible division by zero if one rate was negative
# - progress is clamped to 0..100 so the bar always has a fixed width
# - shell arithmetic instead of the (unchecked) external tool bc
while [[ "$sCephStatus" != "HEALTH_OK" ]] && (( iObjCount > 0 )) && [[ "$doLoop" = 1 ]]; do

    iTsNow=$(date +%s)

    # Current counters; anything that is not a plain number counts as 0.
    iDegNow=$(getDegraded | head -n 1)
    [[ "$iDegNow" =~ ^[0-9]+$ ]] || iDegNow=0
    iMisNow=$(getMisplaced | head -n 1)
    [[ "$iMisNow" =~ ^[0-9]+$ ]] || iMisNow=0

    iTsDelta=$(( iTsNow - iTsStart ))
    iTsDeltaMin=$(( iTsDelta / 60 ))
    iDoneDeg=$(( iDeg - iDegNow ))
    iDoneMis=$(( iMis - iMisNow ))
    iDoneTotal=$(( iDoneDeg + iDoneMis ))
    iNowTotal=$(( iDegNow + iMisNow ))

    # Defaults for "no estimation possible (yet)".
    iDegPerMin=0
    iMisPerMin=0
    timeByDeg="???"
    timeByMis="???"
    sTimeLeft="???"

    if (( iNowTotal > 0 && iTsDelta > 0 )); then
        # Multiply before dividing to keep precision with integer math.
        iDegPerMin=$(( iDoneDeg * 60 / iTsDelta ))
        if (( iDegPerMin > 0 )); then
            timeByDeg="$(( iDegNow / iDegPerMin )) min"
        fi

        iMisPerMin=$(( iDoneMis * 60 / iTsDelta ))
        if (( iMisPerMin > 0 )); then
            timeByMis="$(( iMisNow / iMisPerMin )) min"
        fi

        # The sum can be <= 0 if one counter grew since the start.
        iRateTotal=$(( iDegPerMin + iMisPerMin ))
        if (( iRateTotal > 0 )); then
            iMinLeft=$(( iNowTotal / iRateTotal ))
            if (( iMinLeft > 120 )); then
                sTimeLeft="$iMinLeft min ... about $(( iMinLeft / 60 )) h"
            else
                sTimeLeft="$iMinLeft min"
            fi
        fi
    fi

    # iObjCount > 0 is guaranteed by the loop condition.
    iProgress=$(( iDoneTotal * 100 / iObjCount ))
    if (( iProgress < 0 )); then
        iProgress=0
    elif (( iProgress > 100 )); then
        iProgress=100
    fi

    # Toggle the head of the progress bar between "o" and "C".
    if [ "$eater" = "o" ]; then
        eater="C"
    else
        eater="o"
    fi

    # ----- output
    test "$doSingleLoop" = 1 || clear
    echo "Problem detected on $sStart ... running for $iTsDeltaMin min"

    # --- progress bar
    printf '['
    for (( j = 0; j < iProgress; j++ )); do printf '#'; done
    printf '%s' "$eater"
    for (( j = iProgress; j < 100; j++ )); do printf '.'; done
    printf '] %s %%\n' "$iProgress"

    # --- table; the label column is padded to 10 chars to align all rows
    # shellcheck disable=SC2059 # $tbl is a format string by intention
    {
        printf "%-10s$tbl" ""          "on start"   "now"        "done"        "= per min"   "time left"
        printf "%-10s$tbl" "degraded"  "$iDeg"      "$iDegNow"   "$iDoneDeg"   "$iDegPerMin" "$timeByDeg"
        printf "%-10s$tbl" "misplaced" "$iMis"      "$iMisNow"   "$iDoneMis"   "$iMisPerMin" "$timeByMis"
        printf "%-10s$tbl" "total"     "$iObjCount" "$iNowTotal" "$iDoneTotal" " "           "$sTimeLeft"
    }
    printf '%s\n' "$line"
    echo "$data"

    if [[ "$doSingleLoop" = 1 ]]; then
        doLoop=0
    else
        sleep "$iSleep"
        readCephStatus
        sCephStatus=$(getCephStatus)
    fi

done
# ----- raw output, cleanup and exit
# The progress loop above prints $data itself, but it only runs if the status
# is not HEALTH_OK AND degraded/misplaced objects were recorded. Print the
# "ceph status" output here in all cases the loop was skipped - this includes
# a non-OK status without objects to repair, which printed nothing before.
if [[ "$doSingleLoop" = 0 || "$sCephStatus" = "HEALTH_OK" ]] || (( iObjCount <= 0 )); then
    echo "$data"
fi
echo
echo "--- DONE $( date )"

# Drop the init file when the cluster is healthy again or when it recorded
# nothing to repair. Degraded AND misplaced objects are taken into account
# (iObjCount = iDeg + iMis); checking iDeg only reset the baseline on each
# run of a misplaced-only recovery.
if [[ "$sCephStatus" = "HEALTH_OK" ]] || (( iObjCount == 0 )); then
    rm -f -- "$initfile" 2>/dev/null
fi

ph.exit