Skip to content
Snippets Groups Projects
Commit 1dfbc60f authored by Hahn Axel (hahn)'s avatar Hahn Axel (hahn)
Browse files

add check_fs_errors; update check_ceph_status

parent 4ab5c935
Branches
No related tags found
No related merge requests found
......@@ -7,7 +7,7 @@
# ----------------------------------------------------------------------
#
# REQUIREMENTS:
# - ceph
# - ceph and sudo permissions on it
#
# SYNTAX:
# - check_ceph_status
......@@ -15,41 +15,190 @@
#
# ----------------------------------------------------------------------
# 2020-03-04 v1.0 <axel.hahn@iml.unibe.ch>
# 2020-03-05 v1.1 <axel.hahn@iml.unibe.ch> switch to ph.* helper functions
# 2020-03-05 v1.1 <axel.hahn@iml.unibe.ch> switch to ph.* helper functions
# 2021-03-31 v1.2 <axel.hahn@iml.unibe.ch> estimate remaining time of ceph recovery
# 2021-04-12 v1.3 <axel.hahn@iml.unibe.ch> if degraded items are 0 delete init file too
# ======================================================================
. `dirname $0`/inc_pluginfunctions
tmpfile=/tmp/ceph_status_output_$$
initfile=/tmp/ceph-status-not-ok-start
tmpfile=/tmp/ceph-status.out_$$
iSleep=3
doLoop=1
doSingleLoop=1
# tbl="|%10s |%18s |%10s |%10s |%10s |%10s\n"
tbl="|%10s |%10s |%10s |%10s | %s\n"
line="____________________________________________________________________________________________________________"
# ----------------------------------------------------------------------
# FUNCTIONS
# ----------------------------------------------------------------------
# Run "sudo ceph status" and store its stdout in the file named by the
# global variable $tmpfile.
# Globals:  tmpfile (read) - path of the status snapshot file
# Outputs:  writes the snapshot file; nothing on stdout by itself
# On a non-zero exit status of the command the snapshot file is removed and
# ph.abort is called with an UNKNOWN message.
function readCephStatus(){
  # $tmpfile is quoted: unquoted, a path containing whitespace makes the
  # redirection fail with "ambiguous redirect" and lets rm act on the
  # split words instead of the snapshot file.
  if ! sudo ceph status >"$tmpfile"; then
    rm -f -- "$tmpfile"
    ph.abort "UNKNOWN: ceph is not available or no sudo permissions to execute ceph commands."
  fi
}
# Print the second whitespace-separated word of the first line in the
# snapshot file ($tmpfile) that contains "health".
# Globals:  tmpfile (read) - path of the status snapshot file
# Outputs:  the word on stdout; nothing if no line matches
function getCephStatus(){
  # Only the first matching line is used, so the result is always a single
  # token that can be compared against a literal like "HEALTH_OK".
  # The file is passed via a quoted redirection so that paths with
  # whitespace work and awk never interprets the name as an operand.
  awk '/health/ { print $2; exit }' <"$tmpfile"
}
# Print the part after the first "/" of the second word of every line in
# the snapshot file ($tmpfile) matching "recovery.*objects degraded",
# e.g. "5678" for a line "recovery 1234/5678 objects degraded (21.8%)".
# Globals:  tmpfile (read) - path of the status snapshot file
# Outputs:  one value per matching line on stdout
function getTotalObjects(){
  # One awk call replaces cat | grep | awk | cut. If the word contains no
  # "/" it is printed unchanged, which is what "cut -f 2 -d /" does too.
  # The quoted redirection keeps paths with whitespace working.
  awk '/recovery.*objects degraded/ {
    n = split($2, part, "/")
    print (n >= 2 ? part[2] : $2)
  }' <"$tmpfile"
}
# Print the part before the first "/" of the second word of every line in
# the snapshot file ($tmpfile) matching "recovery.*objects degraded",
# e.g. "1234" for a line "recovery 1234/5678 objects degraded (21.8%)".
# Globals:  tmpfile (read) - path of the status snapshot file
# Outputs:  one value per matching line on stdout
function getDegraded(){
  # One awk call replaces cat | grep | awk | cut; the quoted redirection
  # keeps paths with whitespace working.
  awk '/recovery.*objects degraded/ {
    split($2, part, "/")
    print part[1]
  }' <"$tmpfile"
}
# Print the part before the first "/" of the second word of every line in
# the snapshot file ($tmpfile) matching "recovery.*objects misplaced",
# e.g. "77" for a line "recovery 77/5678 objects misplaced (1.3%)".
# Globals:  tmpfile (read) - path of the status snapshot file
# Outputs:  one value per matching line on stdout
function getMisplaced(){
  # One awk call replaces cat | grep | awk | cut; the quoted redirection
  # keeps paths with whitespace working.
  awk '/recovery.*objects misplaced/ {
    split($2, part, "/")
    print part[1]
  }' <"$tmpfile"
}
# ----------------------------------------------------------------------
# MAIN
# ----------------------------------------------------------------------
sudo /bin/ceph health > $tmpfile 2>&1
grep "HEALTH_" $tmpfile >/dev/null
if [ $? -ne 0 ]; then
rm -f $tmpfile
ph.abort "UNKNOWN: ceph is not available or no sudo permissions to execute ceph commands."
readCephStatus
if [ ! -f $initfile ]; then
echo "
sStart=\"`date`\"
iTsStart=`date +%s`
typeset -i iDeg=`getDegraded`
typeset -i iMis=`getMisplaced`
">$initfile
fi
. $initfile
grep "HEALTH_OK" $tmpfile >/dev/null
if [ $? -eq 0 ]; then
ph.setStatus "ok"
else
iLastDeg=$iDeg
iLastMis=$iMis
sCephStatus=`getCephStatus`
grep "HEALTH_WARN" $tmpfile >/dev/null
if [ $? -eq 0 ]; then
ph.setStatus "warning"
else
ph.setStatus "critical"
fi
if [ "$sCephStatus" = "HEALTH_OK" ]; then
ph.setStatus "ok"
else
if [ "$sCephStatus" = "HEALTH_WARN" ]; then
ph.setStatus "warning"
else
ph.setStatus "critical"
fi
fi
ph.status "Ceph status is `cat $tmpfile`"
ph.status "Ceph status is $sCephStatus"
echo
sudo /bin/ceph status
rm -f $tmpfile
ph.exit
while [ ! "$sCephStatus" = "HEALTH_OK" -a $doLoop = 1 ]; do
# ----------------------------------------------------------------------
typeset -i iObjCount=$iDeg+$iMis
iTsNow=`date +%s`
typeset -i iDegNow=`getDegraded`
typeset -i iMisNow=`getMisplaced`
typeset -i iTsDelta=$iTsNow-$iTsStart
typeset -i iTsDeltaMin=$iTsDelta/60
typeset -i iDoneDeg=$iDeg-$iDegNow
typeset -i iDoneMis=$iMis-$iMisNow
# typeset -i iDeltaDeg2=$iLastDeg-$iDegNow
# typeset -i iDeltaMis2=$iLastMis-$iMisNow
if [ $iTsDelta -gt 0 ]; then
typeset -i iDegPerMin=$iDoneDeg/$iTsDelta*60
if [ $iDegPerMin -gt 0 ]; then
# timeByDeg=`echo $iTsDelta*$iObjCount/$iDoneDeg/60 - $iTsDelta/60 | bc`
timeByDeg=`echo $iDegNow/$iDegPerMin | bc`
else
timeByDeg="???"
fi
typeset -i iMisPerMin=$iDoneMis/$iTsDelta*60
if [ $iMisPerMin -gt 0 ]; then
# timeByMis=`echo $iTsDelta*$iObjCount/$iDoneMis/60 - $iTsDelta/60 | bc`
timeByMis=`echo $iMisNow/$iMisPerMin | bc`
else
timeByMis="???"
fi
if [ $iDegPerMin -gt 0 -o $iMisPerMin -gt 0 ]; then
sTimeLeft=`echo "($iDegNow+$iMisNow)/($iDegPerMin+$iMisPerMin)" | bc`
if [ $sTimeLeft -gt 120 ]; then
sTimeLeft="$sTimeLeft min ... about `echo $sTimeLeft/60 | bc` h"
else
sTimeLeft="$sTimeLeft min"
fi
else
sTimeLeft="???"
fi
fi
typeset -i iDoneTotal=$iDoneDeg+$iDoneMis
typeset -i iNowTotal=$iDegNow+$iMisNow
typeset -i iProgress=`echo "$iDoneTotal*100/$iObjCount" | bc`
if [ "$eater" = "o" ]; then
eater="C"
else
eater="o"
fi
iLastDeg=$iDegNow
iLastMis=$iMisNow
# ----- output
test $doSingleLoop = 1 || clear
# echo "Rebuild detected on $sStart ... running for $iTsDelta s [= $iTsDeltaMin min]"
echo "Rebuild detected on $sStart ... running for $iTsDeltaMin min"
# --- progress bar
echo -n "["
# printf '.%.0s' {1..100}
# echo -n "] $iProgress %"
# printf "\r["
for ((j = 0 ; j < $iProgress ; j++)); do printf "#"; done
printf "$eater"
for ((j = $iProgress ; j<100; j++)); do printf "."; done
echo -n "] $iProgress %"
echo
# echo $line
# printf " $tbl" "on start" "delta last $iSleep s" "now" "done" "= per min" "time left"
# printf "degraded $tbl" "$iDeg" "$iDeltaDeg2" "$iDegNow" "$iDoneDeg" "$iDegPerMin" "$_timeByDeg"
# printf "misplaced $tbl" "$iMis" "$iDeltaMis2" "$iMisNow" "$iDoneMis" "$iMisPerMin" "$_timeByMis"
# printf "total $tbl" $iObjCount " " $iNowTotal $iDoneTotal " " "$sTimeLeft"
printf " $tbl" "on start" "now" "done" "= per min" "time left"
printf "degraded $tbl" "$iDeg" "$iDegNow" "$iDoneDeg" "$iDegPerMin" "$_timeByDeg"
printf "misplaced $tbl" "$iMis" "$iMisNow" "$iDoneMis" "$iMisPerMin" "$_timeByMis"
printf "total $tbl" $iObjCount $iNowTotal $iDoneTotal " " "$sTimeLeft"
echo $line
cat $tmpfile
rm -f $tmpfile 2>/dev/null
if [ $doSingleLoop = 1 ]; then
doLoop=0
else
sleep $iSleep
readCephStatus
sCephStatus=`getCephStatus`
fi
done
test $doSingleLoop = 0 -o "$sCephStatus" = "HEALTH_OK" && cat $tmpfile
echo
echo --- DONE `date`
test "$sCephStatus" = "HEALTH_OK" && rm -f $initfile 2>/dev/null
test $iDeg -eq 0 && rm -f $initfile 2>/dev/null
rm -f $tmpfile 2>/dev/null
ph.exit
......@@ -8,7 +8,8 @@
# - sudo permission on /bin/journalctl
#
# ----------------------------------------------------------------------
# 2021-03-23 v0.0 <axel.hahn@iml.unibe.ch>
# 2021-03-23 v1.0 <axel.hahn@iml.unibe.ch>
# 2021-03-30 v1.1 <axel.hahn@iml.unibe.ch> max age of detected errors: since yesterday (commented)
# ======================================================================
......@@ -66,6 +67,8 @@ esac
# ----- MAKE CHECK
# sincedate=$( date +%Y-%m-%d --date 'yesterday' )
# out=$( sudo /bin/journalctl --since $sincedate | grep 'kernel: ' | grep -v 'check_fs_errors' | grep -E '(error|fail)' | grep 'inconsistent' )
out=$( sudo /bin/journalctl | grep 'kernel: ' | grep -v 'check_fs_errors' | grep -E '(error|fail)' | grep 'inconsistent' )
test ! -z "$out" && ph.setStatus "critical"
......
......@@ -34,8 +34,6 @@ There is one include script used by all checks:
* check_disk-io
* check_dns_responsetime
* [check_eol](check_eol.md)
* check_fs_errors - check journal for ext4 consistency errors
* check_fs_writable - for virtual machines: detect if filesystem is readonly
* check_haproxy_health
* check_haproxy_status
* check_memory
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment