diff --git a/check_smartstatus b/check_smartstatus index ecd5584d740ecf57646a6ce3ebc89dbd03f1c550..2e691f0e5b4e66e3305003e5909911b81c641c98 100755 --- a/check_smartstatus +++ b/check_smartstatus @@ -38,11 +38,14 @@ # 2021-10-28 v1.3 <axel.hahn@iml.unibe.ch> detect -d param for smartctl # 2022-07-08 v1.4 <axel.hahn@iml.unibe.ch> remove pipe in status line # 2023-10-20 v1.5 <axel.hahn@unibe.ch> harden sudo command execution +# 2024-06-07 v1.6 <axel.hahn@unibe.ch> add help page; use smartctl --scan to detect devices # ====================================================================== . $(dirname $0)/inc_pluginfunctions +self_APPVERSION=1.6 + typeset -i iFound=0 typeset -i iErrors=0 sOut= @@ -56,6 +59,35 @@ export PATH # functions # ---------------------------------------------------------------------- +# show help +function showHelp(){ + local self=$( basename $0 ) + cat <<EOH +$( ph.showImlHelpHeader ) + +Show status of local S.M.A.R.T. devices. + +SYNTAX: + $self [-h] [-l] [devices] + +OPTIONS: + + -h|--help show this help. + -l|--list list devices only. + +PARAMETERS: + +EXAMPLES + + $self + Scan all local disks + + $self -l + List all local disks without scanning them. + +EOH +} + function detectParam(){ local _mydevice=$1 local _moreparam @@ -74,7 +106,8 @@ function checkDrive(){ sLabel="^SMART.*Health" sOK="(ok|passed)" - sOut="$sOut ; ${device}:" + test -n "$sOut" && sOut="$sOut ; " + sOut="${sOut}${device}:" ls -ld $device >/dev/null 2>&1 @@ -91,7 +124,7 @@ function checkDrive(){ # --- check health moreparam=$( detectParam $device ) echo sudo smartctl -Ha $device $moreparam >>$tmpfile 2>&1 - sudo smartctl -Ha $device $moreparam >>$tmpfile 2>&1 + sudo -n smartctl -Ha $device $moreparam >>$tmpfile 2>&1 rcs=$? echo $device - rc=$rcs >>$tmpfile echo >>$tmpfile @@ -103,7 +136,7 @@ function checkDrive(){ else grep -i "$sLabel" $tmpfile >/dev/null if [ $? 
-eq 0 ]; then - status=`grep -i "$sLabel" $tmpfile | cut -f 2 -d ":"` + status=$( grep -i "$sLabel" $tmpfile | cut -f 2 -d ":") sOut="$sOut ${status}" echo $status | grep -Ei "$sOK" >>$tmpfile if [ $? -ne 0 ]; then @@ -124,10 +157,26 @@ function checkDrive(){ fi } +function listDevices(){ + sudo -n smartctl --scan | grep -v "/dev/bus/" +} + # ---------------------------------------------------------------------- # main # ---------------------------------------------------------------------- +# parse params +customaction= +while [[ "$#" -gt 0 ]]; do case $1 in + -h|--help) showHelp; exit 0;; + -l|--list) customaction="list"; shift 1 ;; + *) if grep "^-" <<< "$1" >/dev/null ; then + echo; echo "ERROR: Unknown parameter: $1"; echo; exit 2 + fi + break; + ;; +esac; done + ph.require smartctl rm -f $tmpDetailsfile 2>/dev/null @@ -137,8 +186,15 @@ if ! sudo -n smartctl -h >/dev/null 2>&1; then ph.abort "UNKNOWN: No sudo permissions to execute smartctl." fi +if [ "$customaction" == "list" ]; then + echo "Devices to scan:" + listDevices | sed "s#^#- #" + exit 0 +fi + # --- loop over sd devices -for mydevice in $(ls -1 /dev/sd* | grep -v "[0-9]") +# for mydevice in $(ls -1 /dev/sd* | grep -v "[0-9]") +for mydevice in $( listDevices | cut -f 1 -d " " ) do iFound=$iFound+1 @@ -154,12 +210,9 @@ do checkDrive $mydevice done -ph.status "SMART check on $iFound HDs - $iErrors errors - $sOut" -cat $tmpDetailsfile -rm -f $tmpDetailsfile - +ph.status "SMART check on $iFound Disks - $iErrors errors - $sOut" +cat $tmpDetailsfile 2>&1 && rm -f $tmpDetailsfile ph.exit - # ---------------------------------------------------------------------- diff --git a/docs/20_Checks/_index.md b/docs/20_Checks/_index.md index 0a06f3aae7377591d60b81c882f2fdc706cfade9..f8a7774e60f4e6ae2b0cc7ded88cdc4ac68d65c6 100644 --- a/docs/20_Checks/_index.md +++ b/docs/20_Checks/_index.md @@ -47,7 +47,7 @@ There is one include script used by all checks: * [check_rearbackup](check_rearbackup.md) * 
[check_reboot_required](check_reboot_required.md) * [check_requirements](check_requirements.md) -* check_smartstatus +* [check_smartstatus](check_smartstatus.md) * [check_snmp_data](check_snmp_data.md) * check_snmp_printer * check_snmp_switch diff --git a/docs/20_Checks/check_smartstatus.md b/docs/20_Checks/check_smartstatus.md new file mode 100644 index 0000000000000000000000000000000000000000..7131c8372b147759f8edba4b103d8d21a9ec0d74 --- /dev/null +++ b/docs/20_Checks/check_smartstatus.md @@ -0,0 +1,151 @@ +# Check_smartstatus + +## Introduction + +**check_smartstatus** is a plugin that runs a smartctl check to verify the disk status of all local hard disks and SSDs. + +It works on physical machines only. + +## Requirements + +* `smartctl` + +The icinga user needs sudo permissions on the smartctl binary. + +```txt +icingaclient ALL=(ALL) NOPASSWD: /sbin/smartctl +``` + +## Syntax + +```txt +______________________________________________________________________ + +CHECK_SMARTSTATUS +v1.6 + +(c) Institute for Medical Education - University of Bern +Licence: GNU GPL 3 + +https://os-docs.iml.unibe.ch/icinga-checks/Checks/check_smartstatus.html +______________________________________________________________________ + +Show status of local S.M.A.R.T. devices. + +SYNTAX: + check_smartstatus [-h] [-l] [devices] + +OPTIONS: + + -h|--help show this help. + -l|--list list devices only. + +PARAMETERS: + +EXAMPLES + + check_smartstatus + Scan all local disks + + check_smartstatus -l + List all local disks without scanning them. + +``` + +### Parameters + +(none) + +## Examples + +For testing purposes: show the devices only, without scanning them: + +```txt +./check_smartstatus -l +Devices to scan: +- /dev/nvme0 -d nvme # /dev/nvme0, NVMe device +``` + +Without parameters, `check_smartstatus` loops over all detected devices and performs a SMART check on each. You get a status line with a summary, followed by the output section for each disk. 
+ +This is the output of a single SSD: + +```txt +OK: SMART check on 1 Disks - 0 errors - /dev/nvme0: PASSED +SMART/Health Information (NVMe Log 0x02) +---------------------------------------------------------------------- + +/dev/nvme0 + +sudo smartctl -Ha /dev/nvme0 +smartctl 7.4 2023-08-01 r5530 [x86_64-linux-6.9.2-1-MANJARO] (local build) +Copyright (C) 2002-23, Bruce Allen, Christian Franke, www.smartmontools.org + +=== START OF INFORMATION SECTION === +Model Number: SKHynix_HFS001TEJ9X162N +Serial Number: AJC9N469110209D22 +Firmware Version: 51730A10 +PCI Vendor/Subsystem ID: 0x1c5c +IEEE OUI Identifier: 0xace42e +Controller ID: 0 +NVMe Version: 1.4 +Number of Namespaces: 1 +Namespace 1 Size/Capacity: 1,024,209,543,168 [1.02 TB] +Namespace 1 Formatted LBA Size: 512 +Namespace 1 IEEE EUI-64: ace42e 0035db84db +Local Time is: Fri Jun 7 12:59:02 2024 CEST +Firmware Updates (0x16): 3 Slots, no Reset required +Optional Admin Commands (0x0017): Security Format Frmw_DL Self_Test +Optional NVM Commands (0x00df): Comp Wr_Unc DS_Mngmt Wr_Zero Sav/Sel_Feat Timestmp Verify +Log Page Attributes (0x1e): Cmd_Eff_Lg Ext_Get_Lg Telmtry_Lg Pers_Ev_Lg +Maximum Data Transfer Size: 64 Pages +Warning Comp. Temp. Threshold: 86 Celsius +Critical Comp. Temp. 
Threshold: 87 Celsius + +Supported Power States +St Op Max Active Idle RL RT WL WT Ent_Lat Ex_Lat + 0 + 7.50W - - 0 0 0 0 5 305 + 1 + 3.9000W - - 1 1 1 1 30 330 + 2 + 1.5000W - - 2 2 2 2 100 400 + 3 - 0.0500W - - 3 3 3 3 500 1500 + 4 - 0.0050W - - 4 4 4 4 1000 9000 + +Supported LBA Sizes (NSID 0x1) +Id Fmt Data Metadt Rel_Perf + 0 + 512 0 0 + +=== START OF SMART DATA SECTION === +SMART overall-health self-assessment test result: PASSED + +SMART/Health Information (NVMe Log 0x02) +Critical Warning: 0x00 +Temperature: 43 Celsius +Available Spare: 100% +Available Spare Threshold: 10% +Percentage Used: 0% +Data Units Read: 6,589,009 [3.37 TB] +Data Units Written: 3,879,914 [1.98 TB] +Host Read Commands: 39,241,205 +Host Write Commands: 72,717,841 +Controller Busy Time: 2,112 +Power Cycles: 176 +Power On Hours: 642 +Unsafe Shutdowns: 21 +Media and Data Integrity Errors: 0 +Error Information Log Entries: 0 +Warning Comp. Temperature Time: 0 +Critical Comp. Temperature Time: 0 +Temperature Sensor 1: 40 Celsius +Temperature Sensor 2: 37 Celsius + +Error Information (NVMe Log 0x01, 16 of 256 entries) +No Errors Logged + +Self-test Log (NVMe Log 0x06) +Self-test status: No self-test in progress +No Self-tests Logged + +/dev/nvme0 - rc=0 + +PASSED SMART/Health Information (NVMe Log 0x02) +```