diff --git a/check_ceph_status b/check_ceph_status index 7eb665b835b12e5f01deeca325f584c2efe7e836..8ea08c28c3331c8e75d0d430e0b1ce6eb351c407 100755 --- a/check_ceph_status +++ b/check_ceph_status @@ -7,7 +7,7 @@ # ---------------------------------------------------------------------- # # REQUIREMENTS: -# - ceph +# - ceph and sudo permissions on it # # SYNTAX: # - check_ceph_status @@ -15,41 +15,190 @@ # # ---------------------------------------------------------------------- # 2020-03-04 v1.0 <axel.hahn@iml.unibe.ch> -# 2020-03-05 v1.1 <axel.hahn@iml.unibe.ch> switch to ph.* helper functions +# 2020-03-05 v1.1 <axel.hahn@iml.unibe.ch> switch to ph.* helper functions +# 2021-03-31 v1.2 <axel.hahn@iml.unibe.ch> estimate remaining time of ceph recovery +# 2021-04-12 v1.3 <axel.hahn@iml.unibe.ch> if degraded items are 0 delete init file too # ====================================================================== . `dirname $0`/inc_pluginfunctions -tmpfile=/tmp/ceph_status_output_$$ +initfile=/tmp/ceph-status-not-ok-start +tmpfile=/tmp/ceph-status.out_$$ + +iSleep=3 +doLoop=1 +doSingleLoop=1 + +# tbl="|%10s |%18s |%10s |%10s |%10s |%10s\n" +tbl="|%10s |%10s |%10s |%10s | %s\n" + +line="____________________________________________________________________________________________________________" + +# ---------------------------------------------------------------------- +# FUNCTIONS +# ---------------------------------------------------------------------- + +function readCephStatus(){ + sudo ceph status >$tmpfile + if [ $? -ne 0 ]; then + rm -f $tmpfile + ph.abort "UNKNOWN: ceph is not available or no sudo permissions to execute ceph commands." + fi +} +function getCephStatus(){ + cat $tmpfile | grep "health" | awk '{ print $2 }' +} +function getTotalObjects(){ + cat $tmpfile | grep "recovery.*objects degraded" | awk '{ print $2 }' | cut -f 2 -d "/" +} +function getDegraded(){ + cat $tmpfile | grep "recovery.*objects degraded" | awk '{ print $2 }' | cut -f 1 -d "/" +} +function getMisplaced(){ + cat $tmpfile | grep "recovery.*objects misplaced" | awk '{ print $2 }' | cut -f 1 -d "/" +} + # ---------------------------------------------------------------------- # MAIN # ---------------------------------------------------------------------- -sudo /bin/ceph health > $tmpfile 2>&1 -grep "HEALTH_" $tmpfile >/dev/null -if [ $? -ne 0 ]; then - rm -f $tmpfile - ph.abort "UNKNOWN: ceph is not available or no sudo permissions to execute ceph commands." + + +readCephStatus +if [ ! -f $initfile ]; then + echo " + sStart=\"`date`\" + iTsStart=`date +%s` + typeset -i iDeg=`getDegraded` + typeset -i iMis=`getMisplaced` + + ">$initfile fi +. $initfile -grep "HEALTH_OK" $tmpfile >/dev/null -if [ $? -eq 0 ]; then - ph.setStatus "ok" -else +iLastDeg=$iDeg +iLastMis=$iMis +sCephStatus=`getCephStatus` - grep "HEALTH_WARN" $tmpfile >/dev/null - if [ $? -eq 0 ]; then - ph.setStatus "warning" - else - ph.setStatus "critical" - fi +if [ "$sCephStatus" = "HEALTH_OK" ]; then + ph.setStatus "ok" +else + if [ "$sCephStatus" = "HEALTH_WARN" ]; then + ph.setStatus "warning" + else + ph.setStatus "critical" + fi fi -ph.status "Ceph status is `cat $tmpfile`" +ph.status "Ceph status is $sCephStatus" echo -sudo /bin/ceph status -rm -f $tmpfile -ph.exit +while [ ! "$sCephStatus" = "HEALTH_OK" -a $doLoop = 1 ]; do -# ---------------------------------------------------------------------- + typeset -i iObjCount=$iDeg+$iMis + + iTsNow=`date +%s` + + typeset -i iDegNow=`getDegraded` + typeset -i iMisNow=`getMisplaced` + + typeset -i iTsDelta=$iTsNow-$iTsStart + typeset -i iTsDeltaMin=$iTsDelta/60 + + typeset -i iDoneDeg=$iDeg-$iDegNow + typeset -i iDoneMis=$iMis-$iMisNow + + # typeset -i iDeltaDeg2=$iLastDeg-$iDegNow + # typeset -i iDeltaMis2=$iLastMis-$iMisNow + + + if [ $iTsDelta -gt 0 ]; then + typeset -i iDegPerMin=$iDoneDeg/$iTsDelta*60 + if [ $iDegPerMin -gt 0 ]; then + # timeByDeg=`echo $iTsDelta*$iObjCount/$iDoneDeg/60 - $iTsDelta/60 | bc` + timeByDeg=`echo $iDegNow/$iDegPerMin | bc` + + else + timeByDeg="???" + fi + + typeset -i iMisPerMin=$iDoneMis/$iTsDelta*60 + if [ $iMisPerMin -gt 0 ]; then + # timeByMis=`echo $iTsDelta*$iObjCount/$iDoneMis/60 - $iTsDelta/60 | bc` + timeByMis=`echo $iMisNow/$iMisPerMin | bc` + else + timeByMis="???" + fi + if [ $iDegPerMin -gt 0 -o $iMisPerMin -gt 0 ]; then + sTimeLeft=`echo "($iDegNow+$iMisNow)/($iDegPerMin+$iMisPerMin)" | bc` + if [ $sTimeLeft -gt 120 ]; then + sTimeLeft="$sTimeLeft min ... about `echo $sTimeLeft/60 | bc` h" + else + sTimeLeft="$sTimeLeft min" + fi + else + sTimeLeft="???" + fi + fi + typeset -i iDoneTotal=$iDoneDeg+$iDoneMis + typeset -i iNowTotal=$iDegNow+$iMisNow + typeset -i iProgress=`echo "$iDoneTotal*100/$iObjCount" | bc` + if [ "$eater" = "o" ]; then + eater="C" + else + eater="o" + fi + iLastDeg=$iDegNow + iLastMis=$iMisNow + + + # ----- output + + test $doSingleLoop = 1 || clear + # echo "Rebuild detected on $sStart ... running for $iTsDelta s [= $iTsDeltaMin min]" + echo "Rebuild detected on $sStart ... running for $iTsDeltaMin min" + + # --- progress bar + echo -n "[" + # printf '.%.0s' {1..100} + # echo -n "] $iProgress %" + # printf "\r[" + for ((j = 0 ; j < $iProgress ; j++)); do printf "#"; done + printf "$eater" + for ((j = $iProgress ; j<100; j++)); do printf "."; done + echo -n "] $iProgress %" + echo + + # echo $line + # printf " $tbl" "on start" "delta last $iSleep s" "now" "done" "= per min" "time left" + # printf "degraded $tbl" "$iDeg" "$iDeltaDeg2" "$iDegNow" "$iDoneDeg" "$iDegPerMin" "$_timeByDeg" + # printf "misplaced $tbl" "$iMis" "$iDeltaMis2" "$iMisNow" "$iDoneMis" "$iMisPerMin" "$_timeByMis" + # printf "total $tbl" $iObjCount " " $iNowTotal $iDoneTotal " " "$sTimeLeft" + printf " $tbl" "on start" "now" "done" "= per min" "time left" + printf "degraded $tbl" "$iDeg" "$iDegNow" "$iDoneDeg" "$iDegPerMin" "$_timeByDeg" + printf "misplaced $tbl" "$iMis" "$iMisNow" "$iDoneMis" "$iMisPerMin" "$_timeByMis" + printf "total $tbl" $iObjCount $iNowTotal $iDoneTotal " " "$sTimeLeft" + echo $line + cat $tmpfile + rm -f $tmpfile 2>/dev/null + + + if [ $doSingleLoop = 1 ]; then + doLoop=0 + else + sleep $iSleep + readCephStatus + sCephStatus=`getCephStatus` + fi + +done + +test $doSingleLoop = 0 -o "$sCephStatus" = "HEALTH_OK" && cat $tmpfile +echo +echo --- DONE `date` +test "$sCephStatus" = "HEALTH_OK" && rm -f $initfile 2>/dev/null +test $iDeg -eq 0 && rm -f $initfile 2>/dev/null +rm -f $tmpfile 2>/dev/null + + +ph.exit diff --git a/check_fs_errors b/check_fs_errors index 7f960889dfc6c9d90ed931fdc1e1b9a1fdbed9ef..3978fd20cd282bcd2499799b2f172b117c91bea6 100755 --- a/check_fs_errors +++ b/check_fs_errors @@ -8,7 +8,8 @@ # - sudo permission on /bin/journalctl # # ---------------------------------------------------------------------- -# 2021-03-23 v0.0 <axel.hahn@iml.unibe.ch> +# 2021-03-23 v1.0 <axel.hahn@iml.unibe.ch> +# 2021-03-30 v1.1 <axel.hahn@iml.unibe.ch> max age of detected errors: since yesterday (commented) # ====================================================================== @@ -66,6 +67,8 @@ esac # ----- MAKE CHECK +# sincedate=$( date +%Y-%m-%d --date 'yesterday' ) +# out=$( sudo /bin/journalctl --since $sincedate | grep 'kernel: ' | grep -v 'check_fs_errors' | grep -E '(error|fail)' | grep 'inconsistent' ) out=$( sudo /bin/journalctl | grep 'kernel: ' | grep -v 'check_fs_errors' | grep -E '(error|fail)' | grep 'inconsistent' ) test ! -z "$out" && ph.setStatus "critical" diff --git a/readme.md b/readme.md index 73fa61af2c92cad9a1f86f2a81c57c950ab6966a..5d5521cf1e9de3fc8b1d9fe581ef51dd8429660a 100644 --- a/readme.md +++ b/readme.md @@ -34,8 +34,6 @@ There is one include script used by all checks: * check_disk-io * check_dns_responsetime * [check_eol](check_eol.md) -* check_fs_errors - check journal for ext4 consistency errors -* check_fs_writable - for virtual nmachines: detect if filesystem is readonly * check_haproxy_health * check_haproxy_status * check_memory