Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
I
icinga-checks
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
IML Open Source
icinga-checks
Commits
1dfbc60f
Commit
1dfbc60f
authored
4 years ago
by
Hahn Axel (hahn)
Browse files
Options
Downloads
Patches
Plain Diff
add check_fs_errors; update check_ceph_status
parent
4ab5c935
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
check_ceph_status
+172
-23
172 additions, 23 deletions
check_ceph_status
check_fs_errors
+4
-1
4 additions, 1 deletion
check_fs_errors
readme.md
+0
-2
0 additions, 2 deletions
readme.md
with
176 additions
and
26 deletions
check_ceph_status
+
172
−
23
View file @
1dfbc60f
...
...
@@ -7,7 +7,7 @@
# ----------------------------------------------------------------------
#
# REQUIREMENTS:
# - ceph
# - ceph
and sudo permissions on it
#
# SYNTAX:
# - check_ceph_status
...
...
@@ -15,41 +15,190 @@
#
# ----------------------------------------------------------------------
# 2020-03-04 v1.0 <axel.hahn@iml.unibe.ch>
# 2020-03-05 v1.1 <axel.hahn@iml.unibe.ch> switch to ph.* helper functions
# 2020-03-05 v1.1 <axel.hahn@iml.unibe.ch> switch to ph.* helper functions
# 2021-03-31 v1.2 <axel.hahn@iml.unibe.ch> estimate remaining time of ceph recovery
# 2021-04-12 v1.3 <axel.hahn@iml.unibe.ch> if degraded items are 0 delete init file too
# ======================================================================
.
`
dirname
$0
`
/inc_pluginfunctions
tmpfile
=
/tmp/ceph_status_output_
$$
initfile
=
/tmp/ceph-status-not-ok-start
tmpfile
=
/tmp/ceph-status.out_
$$
iSleep
=
3
doLoop
=
1
doSingleLoop
=
1
# tbl="|%10s |%18s |%10s |%10s |%10s |%10s\n"
tbl
=
"|%10s |%10s |%10s |%10s | %s
\n
"
line
=
"____________________________________________________________________________________________________________"
# ----------------------------------------------------------------------
# FUNCTIONS
# ----------------------------------------------------------------------
# ----------------------------------------------------------------------
# Run "ceph status" through sudo and store its stdout in $tmpfile.
#
# Globals:  tmpfile (read) - path of the file that receives the output
# Outputs:  overwrites $tmpfile
#
# If the sudo call returns a non-zero exit status the output file is
# removed again and ph.abort is called with an UNKNOWN message.
# NOTE(review): ph.abort is not defined in this block (presumably it
# comes from inc_pluginfunctions and ends the script) - confirm.
# ----------------------------------------------------------------------
function readCephStatus(){
    # test the command directly instead of inspecting $? afterwards;
    # the quotes keep the redirect valid for paths containing whitespace
    if ! sudo ceph status > "$tmpfile"; then
        rm -f "$tmpfile"
        ph.abort "UNKNOWN: ceph is not available or no sudo permissions to execute ceph commands."
    fi
}
# ----------------------------------------------------------------------
# Print the second whitespace-separated field of every line in $tmpfile
# that contains the text "health".
#
# Globals:  tmpfile (read) - file written by a previous ceph status call
# Outputs:  one value per matching line on stdout; nothing if no line
#           matches
# ----------------------------------------------------------------------
function getCephStatus(){
    # one awk call replaces the former cat | grep | awk chain
    awk '/health/ { print $2 }' "$tmpfile"
}
# ----------------------------------------------------------------------
# Print the part after the "/" in the second whitespace-separated field
# of every line in $tmpfile matching "recovery.*objects degraded".
#
# Globals:  tmpfile (read) - file written by a previous ceph status call
# Outputs:  one value per matching line on stdout; nothing if no line
#           matches
# ----------------------------------------------------------------------
function getTotalObjects(){
    # awk selects the line and the field; cut is kept for the split on
    # "/" so that fields without a slash are handled exactly as before
    awk '/recovery.*objects degraded/ { print $2 }' "$tmpfile" | cut -f 2 -d "/"
}
# ----------------------------------------------------------------------
# Print the part before the "/" in the second whitespace-separated field
# of every line in $tmpfile matching "recovery.*objects degraded".
#
# Globals:  tmpfile (read) - file written by a previous ceph status call
# Outputs:  one value per matching line on stdout; nothing if no line
#           matches
# ----------------------------------------------------------------------
function getDegraded(){
    # awk selects the line and the field; cut is kept for the split on
    # "/" so that fields without a slash are handled exactly as before
    awk '/recovery.*objects degraded/ { print $2 }' "$tmpfile" | cut -f 1 -d "/"
}
# ----------------------------------------------------------------------
# Print the part before the "/" in the second whitespace-separated field
# of every line in $tmpfile matching "recovery.*objects misplaced".
#
# Globals:  tmpfile (read) - file written by a previous ceph status call
# Outputs:  one value per matching line on stdout; nothing if no line
#           matches
# ----------------------------------------------------------------------
function getMisplaced(){
    # awk selects the line and the field; cut is kept for the split on
    # "/" so that fields without a slash are handled exactly as before
    awk '/recovery.*objects misplaced/ { print $2 }' "$tmpfile" | cut -f 1 -d "/"
}
# ----------------------------------------------------------------------
# MAIN
# ----------------------------------------------------------------------
sudo
/bin/ceph health
>
$tmpfile
2>&1
grep
"HEALTH_"
$tmpfile
>
/dev/null
if
[
$?
-ne
0
]
;
then
rm
-f
$tmpfile
ph.abort
"UNKNOWN: ceph is not available or no sudo permissions to execute ceph commands."
readCephStatus
if
[
!
-f
$initfile
]
;
then
echo
"
sStart=
\"
`
date
`
\"
iTsStart=
`
date
+%s
`
typeset -i iDeg=
`
getDegraded
`
typeset -i iMis=
`
getMisplaced
`
"
>
$initfile
fi
.
$initfile
grep
"HEALTH_OK"
$tmpfile
>
/dev/null
if
[
$?
-eq
0
]
;
then
ph.setStatus
"ok"
else
iLastDeg
=
$iDeg
iLastMis
=
$iMis
sCephStatus
=
`
getCephStatus
`
grep
"HEALTH_WARN"
$tmpfile
>
/dev/null
if
[
$?
-eq
0
]
;
then
ph.setStatus
"warning"
else
ph.setStatus
"critical"
fi
if
[
"
$sCephStatus
"
=
"HEALTH_OK"
]
;
then
ph.setStatus
"ok"
else
if
[
"
$sCephStatus
"
=
"HEALTH_WARN"
]
;
then
ph.setStatus
"warning"
else
ph.setStatus
"critical"
fi
fi
ph.status
"Ceph status is
`
cat
$tmpfile
`
"
ph.status
"Ceph status is
$sCephStatus
"
echo
sudo
/bin/ceph status
rm
-f
$tmpfile
ph.exit
while
[
!
"
$sCephStatus
"
=
"HEALTH_OK"
-a
$doLoop
=
1
]
;
do
# ----------------------------------------------------------------------
typeset
-i
iObjCount
=
$iDeg
+
$iMis
iTsNow
=
`
date
+%s
`
typeset
-i
iDegNow
=
`
getDegraded
`
typeset
-i
iMisNow
=
`
getMisplaced
`
typeset
-i
iTsDelta
=
$iTsNow
-
$iTsStart
typeset
-i
iTsDeltaMin
=
$iTsDelta
/60
typeset
-i
iDoneDeg
=
$iDeg
-
$iDegNow
typeset
-i
iDoneMis
=
$iMis
-
$iMisNow
# typeset -i iDeltaDeg2=$iLastDeg-$iDegNow
# typeset -i iDeltaMis2=$iLastMis-$iMisNow
if
[
$iTsDelta
-gt
0
]
;
then
typeset
-i
iDegPerMin
=
$iDoneDeg
/
$iTsDelta
*
60
if
[
$iDegPerMin
-gt
0
]
;
then
# timeByDeg=`echo $iTsDelta*$iObjCount/$iDoneDeg/60 - $iTsDelta/60 | bc`
timeByDeg
=
`
echo
$iDegNow
/
$iDegPerMin
| bc
`
else
timeByDeg
=
"???"
fi
typeset
-i
iMisPerMin
=
$iDoneMis
/
$iTsDelta
*
60
if
[
$iMisPerMin
-gt
0
]
;
then
# timeByMis=`echo $iTsDelta*$iObjCount/$iDoneMis/60 - $iTsDelta/60 | bc`
timeByMis
=
`
echo
$iMisNow
/
$iMisPerMin
| bc
`
else
timeByMis
=
"???"
fi
if
[
$iDegPerMin
-gt
0
-o
$iMisPerMin
-gt
0
]
;
then
sTimeLeft
=
`
echo
"(
$iDegNow
+
$iMisNow
)/(
$iDegPerMin
+
$iMisPerMin
)"
| bc
`
if
[
$sTimeLeft
-gt
120
]
;
then
sTimeLeft
=
"
$sTimeLeft
min ... about
`
echo
$sTimeLeft
/60 | bc
`
h"
else
sTimeLeft
=
"
$sTimeLeft
min"
fi
else
sTimeLeft
=
"???"
fi
fi
typeset
-i
iDoneTotal
=
$iDoneDeg
+
$iDoneMis
typeset
-i
iNowTotal
=
$iDegNow
+
$iMisNow
typeset
-i
iProgress
=
`
echo
"
$iDoneTotal
*100/
$iObjCount
"
| bc
`
if
[
"
$eater
"
=
"o"
]
;
then
eater
=
"C"
else
eater
=
"o"
fi
iLastDeg
=
$iDegNow
iLastMis
=
$iMisNow
# ----- output
test
$doSingleLoop
=
1
||
clear
# echo "Rebuild detected on $sStart ... running for $iTsDelta s [= $iTsDeltaMin min]"
echo
"Rebuild detected on
$sStart
... running for
$iTsDeltaMin
min"
# --- progress bar
echo
-n
"["
# printf '.%.0s' {1..100}
# echo -n "] $iProgress %"
# printf "\r["
for
((
j
=
0
;
j <
$iProgress
;
j++
))
;
do
printf
"#"
;
done
printf
"
$eater
"
for
((
j
=
$iProgress
;
j<100
;
j++
))
;
do
printf
"."
;
done
echo
-n
"]
$iProgress
%"
echo
# echo $line
# printf " $tbl" "on start" "delta last $iSleep s" "now" "done" "= per min" "time left"
# printf "degraded $tbl" "$iDeg" "$iDeltaDeg2" "$iDegNow" "$iDoneDeg" "$iDegPerMin" "$_timeByDeg"
# printf "misplaced $tbl" "$iMis" "$iDeltaMis2" "$iMisNow" "$iDoneMis" "$iMisPerMin" "$_timeByMis"
# printf "total $tbl" $iObjCount " " $iNowTotal $iDoneTotal " " "$sTimeLeft"
printf
"
$tbl
"
"on start"
"now"
"done"
"= per min"
"time left"
printf
"degraded
$tbl
"
"
$iDeg
"
"
$iDegNow
"
"
$iDoneDeg
"
"
$iDegPerMin
"
"
$_timeByDeg
"
printf
"misplaced
$tbl
"
"
$iMis
"
"
$iMisNow
"
"
$iDoneMis
"
"
$iMisPerMin
"
"
$_timeByMis
"
printf
"total
$tbl
"
$iObjCount
$iNowTotal
$iDoneTotal
" "
"
$sTimeLeft
"
echo
$line
cat
$tmpfile
rm
-f
$tmpfile
2>/dev/null
if
[
$doSingleLoop
=
1
]
;
then
doLoop
=
0
else
sleep
$iSleep
readCephStatus
sCephStatus
=
`
getCephStatus
`
fi
done
test
$doSingleLoop
=
0
-o
"
$sCephStatus
"
=
"HEALTH_OK"
&&
cat
$tmpfile
echo
echo
---
DONE
`
date
`
test
"
$sCephStatus
"
=
"HEALTH_OK"
&&
rm
-f
$initfile
2>/dev/null
test
$iDeg
-eq
0
&&
rm
-f
$initfile
2>/dev/null
rm
-f
$tmpfile
2>/dev/null
ph.exit
This diff is collapsed.
Click to expand it.
check_fs_errors
+
4
−
1
View file @
1dfbc60f
...
...
@@ -8,7 +8,8 @@
# - sudo permission on /bin/journalctl
#
# ----------------------------------------------------------------------
# 2021-03-23 v0.0 <axel.hahn@iml.unibe.ch>
# 2021-03-23 v1.0 <axel.hahn@iml.unibe.ch>
# 2021-03-30 v1.1 <axel.hahn@iml.unibe.ch> max age of detected errors: since yesterday (commented)
# ======================================================================
...
...
@@ -66,6 +67,8 @@ esac
# ----- MAKE CHECK
# sincedate=$( date +%Y-%m-%d --date 'yesterday' )
# out=$( sudo /bin/journalctl --since $sincedate | grep 'kernel: ' | grep -v 'check_fs_errors' | grep -E '(error|fail)' | grep 'inconsistent' )
# Collect journal lines that contain 'kernel: ', do not contain
# 'check_fs_errors' (filters out entries mentioning this check itself),
# match 'error' or 'fail', and contain 'inconsistent'.
out=$(
    sudo /bin/journalctl \
        | grep 'kernel: ' \
        | grep -v 'check_fs_errors' \
        | grep -E '(error|fail)' \
        | grep 'inconsistent'
)

# Any remaining line switches the check status to critical.
# NOTE(review): ph.setStatus is defined outside this fragment
# (presumably in the shared include script) - confirm.
[ -n "$out" ] && ph.setStatus "critical"
...
...
This diff is collapsed.
Click to expand it.
readme.md
+
0
−
2
View file @
1dfbc60f
...
...
@@ -34,8 +34,6 @@ There is one include script used by all checks:
*
check_disk-io
*
check_dns_responsetime
*
[
check_eol
](
check_eol.md
)
*
check_fs_errors - check journal for ext4 consistency errors
*
check_fs_writable - for virtual machines: detect if filesystem is readonly
*
check_haproxy_health
*
check_haproxy_status
*
check_memory
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment