#!/bin/bash
###
#
# Check hard disk health and temperature using the smartctl utility. This plugin
# requires the following sudoers rule:
# nagios ALL=(ALL) NOPASSWD: /usr/sbin/smartctl *
#
# @AUTHOR: Patrik Dufresne (http://patrikdufresne.com)
# Copyright 2015 Patrik Dufresne
# Last modified 2015-04-10
# Please send all comments, suggestions, bugs and patches to (info AT patrikdufresne DOT com)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
###
# you probably want to uncomment the following lines while developing
#shopt -o -s xtrace
# if you turn this on, then you have to alter the check_value calls to be able to check $?
#shopt -o -s errexit
# Release number of this plugin; shown by --version and in the help texts.
VERSION="0.2"
# Maintainer to contact about this plugin.
AUTHOR='Patrik Dufresne'
# Label printed in front of OK/WARNING/CRITICAL/UNKNOWN in the status line.
SERVICE='SMART'
# Single exit point of the plugin.  Every code path leaves through this
# function so that temporary files or similar resources can be released in
# one place; nothing is created at the moment, so it only terminates.
#   $1 - exit status handed back to the caller
cleanup() {
  # Deliberately unquoted: without an argument this degrades to a bare "exit".
  exit $1
}
# File name of this script without its directory part.
declare -rx PROGNAME=${0##*/}
# Directory containing this script, always terminated by a slash.  When the
# script is started without any slash in $0 (e.g. "bash check_smart.sh"),
# "${0%/*}" would return the file name itself, so fall back to "./".
if [[ $0 == */* ]] ; then
  declare -rx PROGPATH=${0%/*}/
else
  declare -rx PROGPATH=./
fi
# utils.sh is read for the STATE_* exit codes and the print_revision helper
# used further down.
if [ -r "${PROGPATH}utils.sh" ] ; then
  source "${PROGPATH}utils.sh"
else
  echo "Can't find utils.sh. This plugin needs to be run from the same directory as utils.sh which is most likely something like /usr/lib/nagios/plugins or /usr/lib64/nagios/plugins"
  printf "Currently being run from %s\n" "$PROGPATH"
  # STATE_UNKNOWN could not be defined because reading utils.sh failed,
  # so the literal 3 is used here; everywhere else uses the named states.
  cleanup 3
fi
# Until a real result has been computed the plugin state is UNKNOWN.  This
# is done right after utils.sh was read because that is where the STATE_*
# values are defined.
STATE=$STATE_UNKNOWN
# smartctl is the external tool this check is built around; verify that it
# is installed and executable before doing anything else.
SMARTCTL=/usr/sbin/smartctl
[ -x "$SMARTCTL" ] || {
  echo "$SMARTCTL utility is not installed, in your path and executable. Exiting."
  cleanup $STATE_UNKNOWN
}
# Print a one-line synopsis of the command line and leave with the UNKNOWN
# state.  The synopsis names the switches the option parser really accepts
# (-d for devices; the previously advertised -s does not exist).
function usage {
  printf ' %s %s for Nagios - Usage %s [-d <device>]... -w <range> -c <range> [-t timeout] [-v [-v [-v]]]\n' \
    "$PROGNAME" "$VERSION" "$PROGNAME"
  cleanup $STATE_UNKNOWN
}
# Print the detailed description of every command line switch, then leave
# with the UNKNOWN state.
longhelp() {
  # The heredoc expands the three variables in its first line; the rest of
  # the text is emitted as written.
  cat <<EOF
${PROGNAME} plugin version ${VERSION} for Nagios by ${AUTHOR}
-h, --help Display this message.
-d, --device Set the device to monitor (may be define multiple time)
-w, --warning=val Set the temperature warning threshold.
-c, --critical=val Set the temperature critical threshold.
-t, --timeout=sec Set script timeout in seconds.
-v, --verbose Up the verbosity level by one.
--verbosity=val Set the verbosity level to val.
-V, --version Print version information.
--range_help Explain threshold ranges.
EOF
  cleanup $STATE_UNKNOWN
}
# Explain the Nagios threshold range syntax, then leave with the UNKNOWN
# state.  The text is emitted from a quoted heredoc so that the double
# quotes around ":", "~" and "@" reach the output (inside a double-quoted
# printf format they terminated the string and were silently dropped).
# The "In English" column states when an alert is raised and agrees with
# the formal rules listed below the table.
function range_help {
  cat <<'EOF'

The format for ranges in Nagios can be confusing and it isn't always followed.
[@]start[:[end]]
Here are some example ranges:
Range | Generate an alert if value is | In English
--------+-----------------------------------+---------------------------------
10 | outside the range of {0 .. 10} | Less than 0 or greater than 10
@10 | inside the range of {0 .. 10} | Anything from 0 to 10
10: | outside {10 .. ∞} | Less than 10
~:10 | outside the range of {-∞ .. 10} | Greater than 10
10:20 | outside the range of {10 .. 20} | Less than 10 or greater than 20
@10:20 | inside the range of {10 .. 20} | Anything from 10 to 20
Formal Rules:
1. start ≤ end
2. start and ":" is not required if start=0
3. if range is of format "start:" and end is not specified, end is infinity
4. to specify negative infinity, use "~"
5. alert is raised if metric is outside start and end range (inclusive)
6. if range starts with "@", then alert if inside this range (inclusive)
10 < 0 or > 10, (outside the range of {0 .. 10})
10: < 10, (outside {10 .. ∞})
~:10 > 10, (outside the range of {-∞ .. 10})
10:20 < 10 or > 20, (outside the range of {10 .. 20})
@10:20 ≥ 10 and ≤ 20, (inside the range of {10 .. 20})
More help at http://nagiosplug.sourceforge.net/developer-guidelines.html
EOF
  cleanup $STATE_UNKNOWN
}
# The parser below depends on the enhanced getopt; "getopt -T" exits with
# status 4 for that implementation.  Anything else means the traditional
# getopt is installed, which cannot handle long options.
getopt -T >/dev/null 2>&1
if [ $? -ne 4 ] ; then
  printf "%s: getopt is in compatibility mode.\n" "$PROGNAME"
  cleanup $STATE_UNKNOWN
fi
# Short switches are listed as one string, long switches comma separated; a
# trailing colon marks a switch that takes an argument.  The leading "-" in
# the short option string makes getopt report stray non-option words in
# place, so the loop below can complain about them.  getopt itself reports
# unknown switches and missing arguments; in that case show the usage line
# and stop instead of continuing with a half-parsed command line.
if ! RESULT=$(getopt --name "$PROGNAME" \
  --options "-hVvc:w:t:d:" \
  --longoptions "help,version,verbose,verbosity:,warning:,critical:,timeout:,device:,range_help" \
  -- "$@") ; then
  usage
fi
# make the result of getopt the new argument list ($@)
eval set -- "$RESULT"
declare WARNING
declare CRITICAL
# Start from an empty device list so that a DEVICES variable inherited from
# the environment cannot leak into the check.
declare DEVICES=""
# Every plugin needs a way to terminate itself when it runs for too long;
# even innocuous looking commands can wait forever on I/O, especially when
# a disk is failing.
declare -i TIMELIMIT=15
# Nagios defines behavior for VERBOSITY 0 (default) through 3
declare -i VERBOSITY=0
while [ $# -gt 0 ] ; do
  case "$1" in
    -h | --help)
      longhelp ;;
    --range_help)
      range_help ;;
    -V | --version)
      print_revision "$PROGNAME" "$VERSION"
      cleanup $STATE ;;
    -v | --verbose)
      VERBOSITY=$((VERBOSITY + 1)) ;;
    --verbosity)
      shift
      VERBOSITY=$1 ;;
    -w | --warning)
      shift
      WARNING=$1 ;;
    -c | --critical)
      shift
      CRITICAL=$1 ;;
    -t | --timeout)
      shift
      TIMELIMIT=$1 ;;
    -d | --device)
      # May be given several times; devices are collected space separated.
      shift
      DEVICES="$DEVICES $1" ;;
    --)
      shift
      break ;;
    *)
      echo "Option $1 not supported. Ignored." >&2 ;;
  esac
  shift
done
# Nagios verbosity levels and the kind of output expected for each:
#   0  single line, minimal output (summary)
#   1  single line, additional information (e.g. list processes that fail)
#   2  multi line, configuration debug output (e.g. ps command used)
#   3  lots of detail for plugin problem diagnosis
# Any level above 3 additionally switches on shell command tracing.
if (( VERBOSITY > 3 )) ; then
  set -o xtrace
fi
# Return 0 when $1 is a threshold this plugin accepts: an optional "@"
# followed by "N" or "N:[M]" where N and M are non-negative integers.
# The whole string has to match, so multi-line or padded values are refused.
function is_valid_threshold {
  local pattern='^@?([0-9]+:[0-9]*|[0-9]+)$'
  [[ $1 =~ $pattern ]]
}

# Both thresholds are mandatory and each one is validated on its own, so a
# valid value can never make up for an invalid one.
if [[ -z "$WARNING" || -z "$CRITICAL" ]] ; then
  range_help
elif ! is_valid_threshold "$WARNING" || ! is_valid_threshold "$CRITICAL" ; then
  echo "Please check the format of your warning and critical thresholds."
  range_help
fi
# Timeout handler: tell the operator that the time budget of TIMELIMIT
# seconds was exceeded and leave with the UNKNOWN state.
timeout() {
  printf '%s\n' "UNKNOWN - script timed out after $TIMELIMIT seconds."
  cleanup $STATE_UNKNOWN
}
# The options, which may change the time limit, have been processed, so the
# timeout trap can be installed now.
trap timeout USR1
# Watchdog: a background subshell sleeps for TIMELIMIT seconds and then
# sends USR1 to this process, which runs the timeout handler installed
# above.  Before signalling, pgrep is used to make sure that a process with
# our PID, our user and our script name still exists, so that the signal is
# never sent to an unrelated program that inherited the PID in the meantime.
# All output of the subshell is discarded so that it does not keep the
# plugin's stdout/stderr open after the check has finished.
# NOTE: bash only runs a trap once the foreground command it is waiting for
# has returned, so a command that hangs forever still delays the handler.
(
  sleep "$TIMELIMIT"
  if pgrep -U "$(id -u)" -f "$PROGNAME" | grep -q -x "$$" ; then
    kill -USR1 $$
  fi
) >/dev/null 2>&1 &
# When no device was given on the command line, build the list ourselves.
if [ -z "$DEVICES" ] ; then
  # Ask smartctl for the devices it knows about.  The full path in SMARTCTL
  # is used so that the command matches the sudoers rule, and -n makes sudo
  # fail immediately instead of waiting for a password.
  SMARTCTL_DATA="$(sudo -n "$SMARTCTL" --scan)"
  if grep -q 'UNRECOGNIZED OPTION' <<<"$SMARTCTL_DATA" ; then
    # --scan is not supported by this smartctl: fall back to shell globbing
    # and keep only the matches that are real block devices (this also
    # drops the literal pattern when nothing matches).
    DEVICES=""
    for CANDIDATE in /dev/sd? ; do
      if [ -b "$CANDIDATE" ] ; then
        DEVICES="$DEVICES $CANDIDATE"
      fi
    done
  else
    # The device path is the first column of every scan line; lines that do
    # not start with a path (comments, messages) are skipped.
    DEVICES=$(awk '$1 ~ /^\// { print $1 }' <<<"$SMARTCTL_DATA")
  fi
fi
# Make sure we found devices.
if [ -z "$DEVICES" ] ; then
  usage
fi
# Print the raw value (10th column) of the first line of the smartctl
# output in $1 that mentions the attribute name $2; print 0 when the
# attribute is missing or the line has no 10th column.
function smart_raw_value {
  local value
  value=$(awk -v attr="$2" 'index($0, attr) { print $10; exit }' <<<"$1")
  printf '%s\n' "${value:-0}"
}

# Query every device and keep one record per SMART-enabled device in DATA:
#   "<device> <health> <temperature> <id 5> <id 196> <id 198> <id 199>"
# COUNT is the index of the last record (-1 while DATA is empty).
COUNT=-1
DATA=()
# DEVICES is a whitespace separated list, hence the unquoted expansion.
for DEVICE in $DEVICES; do
  # Check if device exists
  if [ ! -b "$DEVICE" ]; then
    echo "UNKNOWN - The device '$DEVICE' is invalid."
    cleanup $STATE_UNKNOWN
  fi
  # Get SMART data.  The output is not stored in LINES because bash itself
  # maintains a variable of that name (terminal height).
  SMART_OUTPUT=$(sudo -n "$SMARTCTL" -a "$DEVICE")
  if [[ -z "$SMART_OUTPUT" ]]; then
    # smartctl printed nothing at all for this device
    printf "UNKNOWN - Given device '%s' doesn't exists.\n" "$DEVICE"
    cleanup $STATE_UNKNOWN
  fi
  # Devices without enabled SMART support are skipped.
  if grep -q 'SMART support is: Enabled' <<<"$SMART_OUTPUT"; then
    # Text after the colon of the overall-health line; "read" strips the
    # surrounding blanks and keeps the first word only, because the record
    # is split on whitespace later on.
    HEALTH=$(awk -F: '/SMART overall-health/ { print $2; exit }' <<<"$SMART_OUTPUT")
    read -r HEALTH _ <<<"$HEALTH"
    # An empty field would shift every following field of the record.
    HEALTH=${HEALTH:-UNKNOWN}
    TEMPERATURE=$(smart_raw_value "$SMART_OUTPUT" "Temperature_Celsius")
    ID_005=$(smart_raw_value "$SMART_OUTPUT" "Reallocated_Sector_Ct")
    ID_196=$(smart_raw_value "$SMART_OUTPUT" "Reallocated_Event_Count")
    ID_198=$(smart_raw_value "$SMART_OUTPUT" "Offline_Uncorrectable")
    ID_199=$(smart_raw_value "$SMART_OUTPUT" "UDMA_CRC_Error_Count")
    DATA+=("$DEVICE $HEALTH $TEMPERATURE $ID_005 $ID_196 $ID_198 $ID_199")
    COUNT=$((COUNT + 1))
  fi
done
# The work that could take any real time is done; from here on it is only
# comparisons and string concatenation, so the timeout trap is removed.
trap - USR1
# Test a value against a Nagios threshold range.
#   $1 - range in the form [@]start[:[end]]
#   $2 - integer value to test
# Returns 1 when the value raises an alert, 0 otherwise.  Without "@" the
# alert is raised when the value lies outside start..end; with "@" when it
# lies inside.  Both endpoints belong to the range.
# Calls range_help when start is greater than end.
function check_value {
  local range=$1 value=$2
  local inside=0 start end

  # A leading "@" inverts the check.
  if [[ $range == @* ]] ; then
    inside=1
    range=${range#@}
  fi
  # Start is whatever is left of the colon, or 0 when there is no colon.
  # End is whatever is right of the colon, or the whole string when there
  # is no colon.
  if [[ $range == *:* ]] ; then
    start=${range%%:*}
    end=${range##*:}
  else
    start=0
    end=$range
  fi
  # An omitted start means 0, "~" means negative infinity and an omitted
  # end means infinity.
  if [[ -z $start ]] ; then
    start=0
  fi
  if [[ $start == "~" ]] ; then
    start=-999999999
  fi
  if [[ -z $end ]] ; then
    end=999999999
  fi
  # "[" is used for the comparisons on purpose: it reads operands such as
  # "08" as decimal numbers.
  if [ "$start" -gt "$end" ] ; then
    echo "In threshold START:END, START must be less than or equal to END"
    range_help
  fi
  if [ "$inside" -gt 0 ] ; then
    if [ "$start" -le "$value" ] && [ "$value" -le "$end" ] ; then
      return 1
    fi
  else
    if [ "$value" -lt "$start" ] || [ "$end" -lt "$value" ] ; then
      return 1
    fi
  fi
  return 0
}
# Evaluate every record in DATA ("<device> <health> <temperature> <id 5>
# <id 196> <id 198> <id 199>") against the health flag, the error counters
# and the temperature thresholds.
# Reads:  DATA, WARNING, CRITICAL, VERBOSITY, STATE_* constants
# Writes: STATE (worst state found), OUT (status text), PERF (perfdata)
function evaluate_devices {
  local index last record rest
  local dev health temp realloc_sectors realloc_events offline_unc crc_errors
  STATE=$STATE_OK
  OUT=""
  PERF="|"
  last=$(( ${#DATA[@]} - 1 ))
  for (( index = 0; index <= last; index++ )); do
    record=${DATA[index]}
    if [ "${VERBOSITY:-0}" -gt 2 ] ; then
      printf '%s\n' "$record"
    fi
    # Split the record on whitespace; "rest" swallows anything unexpected
    # after the seventh field.
    read -r dev health temp realloc_sectors realloc_events offline_unc crc_errors rest <<<"$record"
    # Only the device name, without its directory, is reported.
    dev=${dev##*/}
    # Append performance data
    PERF+=" ${dev}_temp=$temp ${dev}_reallocated_sector_count=$realloc_sectors ${dev}_reallocated_event_count=$realloc_events ${dev}_offline_uncorrectable=$offline_unc ${dev}_udma_crc_error_count=$crc_errors"
    OUT+="$dev"
    # Anything but PASSED in the overall health is critical
    if [ "$health" != "PASSED" ]; then
      STATE=$STATE_CRITICAL
      OUT+=" $health"
    fi
    # More than 5 on any error counter is critical.  Each counter is tested
    # on its own so that one malformed value cannot hide the others.
    if [ "$realloc_sectors" -gt 5 ] || [ "$realloc_events" -gt 5 ] \
      || [ "$offline_unc" -gt 5 ] || [ "$crc_errors" -gt 5 ]; then
      STATE=$STATE_CRITICAL
      OUT+=" bad sectors"
    fi
    # Check temperature against the critical threshold
    if [[ -n $CRITICAL ]]; then
      if ! check_value "$CRITICAL" "$temp"; then
        STATE=$STATE_CRITICAL
        OUT+=" critical temperature"
      fi
    fi
    # The warning threshold only matters while nothing critical was found
    if [[ -n $WARNING && $STATE != "$STATE_CRITICAL" ]]; then
      if ! check_value "$WARNING" "$temp"; then
        STATE=$STATE_WARNING
        OUT+=" warning temperature"
      fi
    fi
    # Separate the devices in the status text
    if [[ -n $OUT ]] && (( index < last )); then
      OUT+=", "
    fi
  done
}

# CHECK DATA
# Without a single record nothing was checked at all; report that as
# UNKNOWN instead of an empty "OK".
if [ "${#DATA[@]}" -eq 0 ]; then
  STATE=$STATE_UNKNOWN
  OUT="no SMART-enabled device found"
  PERF=""
else
  evaluate_devices
fi
SUMMARY="$OUT${PERF:+ $PERF}"
case "$STATE" in
  "$STATE_OK")
    printf "%s OK: %s\n" "$SERVICE" "$SUMMARY" ;;
  "$STATE_WARNING")
    printf "%s WARNING: %s\n" "$SERVICE" "$SUMMARY" ;;
  "$STATE_CRITICAL")
    printf "%s CRITICAL: %s\n" "$SERVICE" "$SUMMARY" ;;
  "$STATE_UNKNOWN")
    printf "%s UNKNOWN: %s\n" "$SERVICE" "$SUMMARY" ;;
esac
cleanup $STATE