#!/bin/bash
# Check hardware sensor using `ipmitool` or `freeipmi`.
# Requires the following sudoers rules:
#
#     nagios ALL=(ALL) NOPASSWD: /usr/bin/ipmitool sensor
# OR
#     nagios ALL=(ALL) NOPASSWD: /usr/sbin/ipmi-sensors
#     nagios ALL=(ALL) NOPASSWD: /usr/sbin/ipmi-dcmi
#
# @AUTHOR: Patrik Dufresne (http://patrikdufresne.com)
# Copyright 2015 Patrik Dufresne
# Last modified 2015-02-22
# Please send all comments, suggestions, bugs and patches to (info AT patrikdufresne DOT com)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
###

# you probably want to uncomment the following lines while developing
#shopt -o -s xtrace
# if you turn this on, then you have to alter the check_value calls to be able to check $?
#shopt -o -s errexit

# Plugin release number; shown by --version and in the help text.
VERSION="0.1"
# Person to contact about this plugin.
AUTHOR='Patrik Dufresne'
# Label printed in front of OK/WARNING/CRITICAL/UNKNOWN in the plugin output.
SERVICE='SENSOR'

# Replacement for the exit function: the place to remove tempfiles or such
# (there is currently nothing to remove) before terminating the plugin.
# Arguments: $1 - exit status to terminate with; when omitted or empty the
#                 plugin exits with 3, the value used for the UNKNOWN state.
function cleanup {
    exit "${1:-3}"
}

# Script name and the directory it was started from (with a trailing slash),
# both derived from $0.
declare -rx PROGNAME=${0##*/}
declare -rx PROGPATH=${0%/*}/

# Load the Nagios helper library expected next to this plugin. The error
# branch must only run when the file is NOT readable, and it has to stop the
# plugin because nothing below works without the library.
if [ -r "${PROGPATH}utils.sh" ] ; then
    source "${PROGPATH}utils.sh"
else
    echo "Can't find utils.sh. This plugin needs to be run from the same directory as utils.sh which is most likely something like /usr/lib/nagios/plugins or /usr/lib64/nagios/plugins"
    printf "Currently being run from %s\n" "$PROGPATH"
    # Since we couldn't define STATE_UNKNOWN since reading utils.sh
    # failed, we use 3 here but everywhere else after this use cleanup $STATE
    cleanup 3
fi

# Start out as UNKNOWN as early as possible, i.e. right after utils.sh (where
# the STATE_* values are defined) has been read.
STATE=$STATE_UNKNOWN

# At least one of the supported sensor commands has to be installed and
# executable: ipmitool, FreeIPMI's ipmi-sensors or the sensors command.
IPMITOOL=/usr/bin/ipmitool
FREEIPMI=/usr/sbin/ipmi-sensors
SENSORS=/usr/bin/sensors
if ! [[ -x "$IPMITOOL" || -x "$FREEIPMI" || -x "$SENSORS" ]] ; then
    echo "neither $IPMITOOL, $FREEIPMI or $SENSORS is installed, in your path and executable. Exiting."
    cleanup $STATE_UNKNOWN
fi

# Print the one-line usage summary followed by the description of every
# command line switch, then terminate through cleanup with the UNKNOWN state.
function longhelp {
    printf " %s %s for Nagios - Usage %s [-s <sensor name> \
-w <warning threshold> -c <critical threshold>] \
[-t timeout] [-v [-v [-v]]]\n" "$PROGNAME" "$VERSION" "$PROGNAME"
    # No cleanup call at this point: leaving right after the usage line would
    # make the option descriptions below unreachable.
    printf "%s plugin version %s for Nagios by %s
  -h, --help          Display this message.
  -s, --sensor        Set the sensor to monitor
  -w, --warning=val   Set the warning percentage threshold.
  -c, --critical=val  Set the critical percentage threshold.
  -t, --timeout=sec   Set script timeout in seconds.
  -v, --verbose       Up the verbosity level by one.
  --verbosity=val     Set the verbosity level to val.
  -V, --version       Print version information.
  --range_help        Explain threshold ranges.
" "$PROGNAME" "$VERSION" "$AUTHOR"
    cleanup $STATE_UNKNOWN
}

# Print an explanation of the Nagios threshold range syntax, then terminate
# through cleanup with the UNKNOWN state.
function range_help {
    # Quoted here-document: the text is emitted literally, so the double
    # quotes, "~" and "@" inside it need no escaping.
    cat <<'EOF'
The format for ranges in Nagios can be confusing and it isn't always followed.

[@]start[:[end]]

Here are some example ranges:

Range   |  Generate an alert if value is    |  In English
--------+-----------------------------------+---------------------------------
10      |  outside the range of {0 .. 10}   |  Greater than 10
@10     |  inside the range of {0 .. 10}    |  Less than or equal to 10
10:     |  outside {10 .. ∞}                |  Less than 10
~:10    |  outside the range of {-∞ .. 10}  |  Greater than 10
10:20   |  outside the range of {10 .. 20}  |  Less than 10 or greater than 20
@10:20  |  inside the range of {10 .. 20}   |  Anything from 10 to 20
10      |  outside the range of {0 .. 10}   |  Greater than 10 or less than 0

Formal Rules:
1. start ≤ end
2. start and ":" is not required if start=0
3. if range is of format "start:" and end is not specified, end is infinity
4. to specify negative infinity, use "~"
5. alert is raised if metric is outside start and end range (inclusive)
6. if range starts with "@", then alert if inside this range (inclusive)
    10      < 0 or > 10, (outside the range of {0 .. 10})
    10:     < 10, (outside {10 .. ∞})
    ~:10    > 10, (outside the range of {-∞ .. 10})
    10:20   < 10 or > 20, (outside the range of {10 .. 20})
    @10:20  ≥ 10 and ≤ 20, (inside the range of {10 .. 20})
    10      < 0 or > 10, (outside the range of {0 .. 10})

More help at http://nagiosplug.sourceforge.net/developer-guidelines.html
EOF
    cleanup $STATE_UNKNOWN
}

# use getopt, trust me on this one. It's the easiest way
# `getopt -T` exits with status 4 only for the enhanced implementation, which
# the long options below depend on. Its output is discarded so that a legacy
# getopt cannot leak text into the plugin output.
getopt -T >/dev/null 2>&1
if [ $? -ne 4 ] ; then
    printf "%s: getopt is in compatibility mode.\n" "$PROGNAME"
    cleanup $STATE_UNKNOWN
fi

# Number of sensors requested with -s so far; also the next free slot in SENSOR.
declare -i IDX=0
# Per requested sensor (same index as SENSOR): thresholds given on the command line.
declare -a WARNING
declare -a CRITICAL
declare -a SENSOR
# Every sensor reported by the backend: name, reading and unit at the same index.
declare -a SENSOR_NAME
declare -a SENSOR_VALUE
declare -a SENSOR_UNIT
# all scripts should have a mechanism to terminate themselves if they are
# running for too long. Scripts you might think of as innocuous could end
# up waiting forever on I/O, especially if a disk is failing
declare -i TIMELIMIT=15
# Nagios defines behavior for VERBOSITY 0 (default) through 3
declare -i VERBOSITY=0

# Parse the command line into the globals declared above.
# Arguments: the plugin's command line arguments
# Globals written: RESULT, IDX, SENSOR, WARNING, CRITICAL, TIMELIMIT, VERBOSITY
# A -w/-c threshold applies to the most recent -s sensor and therefore has to
# follow it. An invalid command line ends the plugin with the UNKNOWN state.
function parse_args {
    # Tell getopt which switches and longswitches you'll take and place a
    # trailing colon (:) on the ones that take arguments. The leading "-" in
    # the short option list makes getopt keep non-option words where they
    # were found, so that the catch-all branch below can report them. Getopt
    # takes care of errors for missing expected arguments so we can shift
    # later without checking.
    RESULT=$(getopt --name "$PROGNAME" --options "-hVvc:w:t:s:" --longoptions "help,version,verbose,verbosity:,warning:,critical:,timeout:,sensor:,range_help" -- "$@")
    if [ $? -ne 0 ] ; then
        # getopt has already reported the offending option on stderr.
        cleanup $STATE_UNKNOWN
    fi

    # make the result of getopt the argument list ($@) of this function
    eval set -- "$RESULT"

    while [ $# -gt 0 ] ; do
        case "$1" in
            -h | --help)
                longhelp;;
            -V | --version)
                print_revision "$PROGNAME" "$VERSION"
                cleanup $STATE;;
            -v | --verbose)
                VERBOSITY=$(($VERBOSITY + 1));;
            --verbosity)
                shift
                VERBOSITY=$1;;
            --range_help)
                range_help;;
            -w | --warning)
                shift
                # Without a preceding -s there is no sensor to attach the
                # threshold to (the array index would be -1).
                if [ $IDX -lt 1 ] ; then
                    echo "Option --warning must follow the --sensor it applies to."
                    cleanup $STATE_UNKNOWN
                fi
                WARNING[IDX-1]=$1;;
            -c | --critical)
                shift
                if [ $IDX -lt 1 ] ; then
                    echo "Option --critical must follow the --sensor it applies to."
                    cleanup $STATE_UNKNOWN
                fi
                CRITICAL[IDX-1]=$1;;
            -t | --timeout)
                shift
                TIMELIMIT=$1;;
            -s | --sensor)
                shift
                SENSOR[IDX]=$1
                IDX=$(( $IDX + 1 ))
                ;;
            --)
                shift
                break;;
            *)
                echo "Option $1 not supported. Ignored." >&2;;
        esac
        shift
    done
}

parse_args "$@"
#Verbosity level    Type of output
#0            Single line, minimal output. Summary
#1            Single line, additional information (eg list processes that fail)
#2            Multi line, configuration debug output (eg ps command used)
#3            Lots of detail for plugin problem diagnosis
# At level 3 and above, trace every command the plugin executes.
if [ "${VERBOSITY:-0}" -gt 2 ] ; then
    shopt -o -s xtrace
fi
# Threshold syntax accepted on the command line: an optional leading "@",
# then either "start:" with an optional end (start being digits or "~" for
# negative infinity) or a bare end value.
THRESHOLD_REGEX='^@?((~|[0-9]+):[0-9]*|[0-9]+)$'

# Succeeds when $1 is a well-formed threshold range.
function valid_threshold {
    [[ "$1" =~ $THRESHOLD_REGEX ]]
}

# Check that the thresholds provided are valid strings
for W in "${WARNING[@]}"; do
    if [ -n "$W" ] && ! valid_threshold "$W" ; then
        echo "Please check the format of your warning thresholds: $W"
        range_help
    fi
done
for C in "${CRITICAL[@]}"; do
    if [ -n "$C" ] && ! valid_threshold "$C" ; then
        echo "Please check the format of your critical thresholds: $C"
        range_help
    fi
done

# Translate the unit name reported by a sensor tool into the suffix used in
# the plugin output. Units that are not listed are passed through unchanged.
# Arguments: $1 - unit string as reported by the tool
# Outputs:   the translated unit on stdout
function normalize_unit {
    case "$1" in
        degreesC)
            echo "C" ;;
        Volts)
            echo "V" ;;
        discrete)
            echo "" ;;
        Watts)
            echo "watts" ;;
        *)
            echo "$1" ;;
    esac
}

# Succeeds when one of the IPMI device nodes checked below exists.
function ipmi_enabled {
    local node
    for node in /dev/ipmi0 /dev/ipmi/0 /dev/ipmidev/0 ; do
        if [[ -e "$node" ]] ; then
            return 0
        fi
    done
    return 1
}

# Handler for a timeout: report it and leave with the UNKNOWN state.
function timeout {
    printf '%s\n' "UNKNOWN - script timed out after $TIMELIMIT seconds."
    cleanup $STATE_UNKNOWN
}
# since we've processed the options which potentially set the timeout limit,
# we can setup a timeout trap now
trap timeout USR1
    # Watchdog: a background subshell sleeps for TIMELIMIT seconds and then
    # sends USR1 to this process, which the trap above turns into a call to
    # timeout. Before signalling, pgrep confirms that a process with our PID,
    # our name and our user id still exists, so that the signal is not sent to
    # just any program that was given the same PID after we finished. The
    # only remaining risk is another instance of this script that happens to
    # get the same process ID; it should be reasonable to assume that the
    # Nagios check interval is greater than the specified timeout.
    # The user comes from `id` and the name from PROGNAME so that the lookup
    # does not depend on variables that may be unset when the plugin runs.
    ( sleep "$TIMELIMIT"; if pgrep -U "$(id -ru)" -f -- "$PROGNAME" | grep -q "^$$\$" ; then kill -USR1 $$ ; fi; ) </dev/null &>/dev/null &

    # Fill SENSOR_NAME / SENSOR_VALUE / SENSOR_UNIT (same index = same sensor)
    # from the first usable backend. Every field has all blanks removed by
    # sed, so a name such as "FAN 1" is stored as "FAN1". sudo is always run
    # with -n so that a missing sudoers rule fails at once instead of waiting
    # for a password.

    # If IPMITOOL is available, use it.
    if ipmi_enabled && [ -x "$IPMITOOL" ]; then
        # Parse raw data
        # FAN 1            | 1510.000   | RPM        | ok    | 400.000   | 585.000   | 770.000   | 29260.000 | 29815.000 | 30370.000
        DATA=$(sudo -n "$IPMITOOL" sensor)
        if [ $? -gt 0 ]; then
            printf "UNKNOWN - Fail to execute %s. Check if sudo rule exists.\n" "$IPMITOOL"
            cleanup $STATE_UNKNOWN
        fi
        # Capture data for each line.
        IDX=0
        while read -r LINE; do
            SENSOR_NAME[$IDX]=$(echo "$LINE" | cut -d '|' -f 1 | sed -e 's/[ \t]*//g')
            SENSOR_VALUE[$IDX]=$(echo "$LINE" | cut -d '|' -f 2 | sed -e 's/[ \t]*//g')
            SENSOR_UNIT[$IDX]=$(echo "$LINE" | cut -d '|' -f 3 | sed -e 's/[ \t]*//g')
            SENSOR_UNIT[$IDX]=$(normalize_unit "${SENSOR_UNIT[$IDX]}")
            IDX=$(( $IDX + 1 ))
        done <<< "$DATA"
    elif ipmi_enabled && [ -x "$FREEIPMI" ]; then
        # Parse raw data
        # ID   | Name            | Reading    | Units | Event
        # 4    | CPU Temp        | 25.00      | C     | 'OK'
        DATA=$(sudo -n "$FREEIPMI" --no-header-output --no-sensor-type-output --ignore-not-available-sensors)
        if [ $? -gt 0 ]; then
            printf "UNKNOWN - Fail to execute %s. Check if sudo rule exists.\n" "$FREEIPMI"
            cleanup $STATE_UNKNOWN
        fi
        # Capture data for each line.
        IDX=0
        while read -r LINE; do
            SENSOR_NAME[$IDX]=$(echo "$LINE" | cut -d '|' -f 2 | sed -e 's/[ \t]*//g')
            SENSOR_VALUE[$IDX]=$(echo "$LINE" | cut -d '|' -f 3 | sed -e 's/[ \t]*//g')
            SENSOR_UNIT[$IDX]=$(echo "$LINE" | cut -d '|' -f 4 | sed -e 's/[ \t]*//g')
            SENSOR_UNIT[$IDX]=$(normalize_unit "${SENSOR_UNIT[$IDX]}")
            IDX=$(( $IDX + 1 ))
        done <<< "$DATA"
        # Try to capture power stats
        # Current Power : 62 Watts
        # Best effort: $? is the status of the last grep, i.e. whether any
        # power line was found, so a failing ipmi-dcmi just adds no entries.
        DATA=$(sudo -n /usr/sbin/ipmi-dcmi --get-system-power-statistics | grep 'Power' | grep -E -o "^[^:]*:\s*[+\-]?[0-9,\.]+\s?[^ ]*")
        if [ $? -eq 0 ]; then
            while read -r LINE; do
                SENSOR_NAME[$IDX]=$(echo "$LINE" | cut -d ':' -f 1 | cut -d ' ' -f 1-2 | sed -e 's/[ \t]*//g')
                SENSOR_VALUE[$IDX]=$(echo "$LINE" | cut -d ':' -f 2 | sed -e 's/[ \t]*//g' -e 's/[^0-9,\.\-]//g')
                SENSOR_UNIT[$IDX]=$(echo "$LINE" | cut -d ':' -f 2 | sed -e 's/[ \t]*//g' -e 's/[0-9,\.\-+°]*//g')
                SENSOR_UNIT[$IDX]=$(normalize_unit "${SENSOR_UNIT[$IDX]}")
                IDX=$(( $IDX + 1 ))
            done <<< "$DATA"
        fi
    elif [ -x "$SENSORS" ]; then
        # Parse raw data
        # Vcore:         +0.78 V
        DATA=$("$SENSORS" | grep -E -o "^[^:]*:\s*[+\-]?[0-9,\.]+\s?[^ ]*")
        if [ $? -gt 0 ]; then
            printf "UNKNOWN - Fail to execute %s.\n" "$SENSORS"
            cleanup $STATE_UNKNOWN
        fi
        # Capture data for each line.
        IDX=0
        while read -r LINE; do
            SENSOR_NAME[$IDX]=$(echo "$LINE" | cut -d ':' -f 1 | sed -e 's/[ \t]*//g')
            SENSOR_VALUE[$IDX]=$(echo "$LINE" | cut -d ':' -f 2 | sed -e 's/[ \t]*//g' -e 's/[^0-9,\.\-]//g')
            SENSOR_UNIT[$IDX]=$(echo "$LINE" | cut -d ':' -f 2 | sed -e 's/[ \t]*//g' -e 's/[0-9,\.\-+°]*//g')
            SENSOR_UNIT[$IDX]=$(normalize_unit "${SENSOR_UNIT[$IDX]}")
            IDX=$(( $IDX + 1 ))
        done <<< "$DATA"
    else
        # None of the backends can be used on this host; reporting OK without
        # a single reading would hide the problem.
        printf "UNKNOWN - No usable sensor source: no IPMI device for %s or %s, and %s is not executable.\n" "$IPMITOOL" "$FREEIPMI" "$SENSORS"
        cleanup $STATE_UNKNOWN
    fi
    # Once we're done doing work that could take any real time, we can end the
    # trap because from here on out it's just comparisons and string
    # concatenation
trap - USR1

# Test a reading against a Nagios threshold range.
# Arguments: $1 - threshold range in the form [@]start[:[end]]; "~" as start
#                 means negative infinity, a missing end means infinity and a
#                 missing "start:" means start is 0. An empty range never
#                 alerts.
#            $2 - reading to test (decimals allowed; anything non-numeric is
#                 compared as 0)
# Returns:   1 when the reading must raise an alert, 0 otherwise.
# When start is greater than end, range_help is called after an error message.
function check_value {
    local INSIDE=0 RANGE START END IN_RANGE=0

    if [ -z "$1" ]; then
        return 0
    fi

    # If the range starts with an @, alert if value is inside the range,
    # otherwise alert if value is outside of range.
    RANGE=$1
    if [[ "$RANGE" == @* ]] ; then
        INSIDE=1
        RANGE=${RANGE#@}
    fi

    # Start is anything left of the colon or 0.
    # End is anything right of the colon or the whole string if there's no
    # colon or infinity if there is a colon and nothing to the right of it
    if [[ "$RANGE" == *:* ]] ; then
        START=${RANGE%%:*}
        END=${RANGE##*:}
    else
        START=0
        END=$RANGE
    fi

    # 4. to specify negative infinity, use "~"
    if [ "$START" == "~" ] ; then
        START=-999999999
    fi

    if [ -z "$END" ] ; then
        END=999999999
    fi

    if [ "$START" -gt "$END" ] ; then
        echo "In threshold START:END, START must be less than or equal to END"
        range_help
    fi

    # All ranges are inclusive of their endpoints. awk does the comparison
    # because the reading may be a decimal number, which the shell's own
    # integer tests cannot handle.
    if awk -v lo="$START" -v hi="$END" -v v="$2" \
        'BEGIN { exit !((lo + 0) <= (v + 0) && (v + 0) <= (hi + 0)) }' ; then
        IN_RANGE=1
    fi

    if [ "$INSIDE" -eq 1 ] ; then
        # "@" range: alert when the reading is inside the range.
        if [ "$IN_RANGE" -eq 1 ] ; then
            return 1
        fi
    elif [ "$IN_RANGE" -eq 0 ] ; then
        # Plain range: alert when the reading is outside the range.
        return 1
    fi

    return 0
}
# check conditions - yes this is ugly, blame BASH.
# If you want to blame me, please provide a cleaner way that is as fast or faster
#
# Compare every sensor requested with -s against its thresholds.
# Globals read:    SENSOR, WARNING, CRITICAL, SENSOR_NAME, SENSOR_VALUE,
#                  SENSOR_UNIT and the STATE_* values
# Globals written: STATE (worst state found), DESC (comma separated list of
#                  the sensors in alert), VALUE, IDX, IDX2
# A requested sensor that was not reported by the backend ends the plugin with
# the UNKNOWN state.
function evaluate_sensors {
    local ALERT
    IDX=0
    DESC=""
    STATE=$STATE_OK
    while [ "x${SENSOR[IDX]}" != "x" ]; do
        # Look up the requested sensor among the collected readings.
        IDX2=0
        VALUE=""
        while [ "x${SENSOR_NAME[IDX2]}" != "x" ] && [ "${SENSOR_NAME[IDX2]}" != "${SENSOR[IDX]}" ]; do
            IDX2=$(( IDX2 + 1 ))
        done
        if [ "${SENSOR_NAME[IDX2]}" != "${SENSOR[IDX]}" ]; then
            printf "UNKNOWN - Given sensors %s doesn't exists.\n" "${SENSOR[IDX]}"
            cleanup $STATE_UNKNOWN
        fi
        VALUE="${SENSOR_VALUE[IDX2]}"

        # The critical threshold wins over the warning one for the same
        # sensor; each sensor is judged on its own so that a warning is still
        # listed after another sensor went critical.
        ALERT=0
        if ! check_value "${CRITICAL[IDX]}" "$VALUE" ; then
            STATE=$STATE_CRITICAL
            ALERT=1
        elif ! check_value "${WARNING[IDX]}" "$VALUE" ; then
            # Never downgrade a critical state raised by an earlier sensor.
            if [ "$STATE" != "$STATE_CRITICAL" ] ; then
                STATE=$STATE_WARNING
            fi
            ALERT=1
        fi

        if [ "$ALERT" -eq 1 ] ; then
            # Separate the entries of the description itself.
            if [ -n "$DESC" ]; then DESC="$DESC, "; fi
            DESC="$DESC${SENSOR_NAME[IDX2]}: ${SENSOR_VALUE[IDX2]}${SENSOR_UNIT[IDX2]}"
        fi
        IDX=$(( IDX + 1 ))
    done
}

evaluate_sensors

# STATE - Message | 'label'=value[unit of measure];[warn];[crit];[min];[max]

# Build the performance data: one name=value[unit] entry per collected sensor,
# separated by single spaces. Sensors without a reading are left out.
PERF=""
IDX=0
while [[ -n "${SENSOR_NAME[IDX]}" ]]; do
    case "${SENSOR_VALUE[IDX]}" in
        "N/A" | "na")
            # Skip N/A values.
            ;;
        *)
            PERF="${PERF:+$PERF }${SENSOR_NAME[IDX]}=${SENSOR_VALUE[IDX]}${SENSOR_UNIT[IDX]}"
            ;;
    esac
    IDX=$(( IDX + 1 ))
done

# Single-sensor summary string; nothing in the visible code reads it.
OUT="$SENSOR $VALUE $VALUE_UNIT | $SENSOR=${VALUE};$WARNING;$CRITICAL"

# Print the plugin's status line for the current STATE:
#   <SERVICE> <state name> - <DESC> | <PERF>
# Globals read: STATE, STATE_*, SERVICE, DESC, PERF
function print_status {
    case "$STATE" in
        "$STATE_OK")
            printf "%s OK - %s\n" "$SERVICE" "$DESC | $PERF";;
        "$STATE_WARNING")
            printf "%s WARNING - %s\n" "$SERVICE" "$DESC | $PERF";;
        "$STATE_CRITICAL")
            printf "%s CRITICAL - %s\n" "$SERVICE" "$DESC | $PERF";;
        "$STATE_UNKNOWN")
            printf "%s UNKNOWN - %s\n" "$SERVICE" "$DESC | $PERF";;
    esac
}

print_status
# Leave with the computed state; otherwise the exit status would be that of
# the last printf and not the state that was just reported.
cleanup $STATE