Skip to content
check_memory_ikus 12.1 KiB
Newer Older
#! /bin/bash

###
# This plugin monitors memory usage on Linux. The computation also takes the
# ZFS ARC size (from arcstats) into account.
#
# @AUTHOR: Patrik Dufresne (http://patrikdufresne.com)
# Copyright 2015 Patrik Dufresne
# Last modified 2015-03-06
#
# Please send all comments, suggestions, bugs and patches to (info AT patrikdufresne DOT com)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
###

# Plugin release number, shown by the usage/help text and by --version.
VERSION="0.1"
# Maintainer name shown in the long help text.
AUTHOR='Patrik Dufresne'
# Label printed in front of OK/WARNING/CRITICAL/UNKNOWN in the status line.
SERVICE='MEMORY'

# Single exit point of the plugin: terminate the script with the status given
# as first argument. Nothing else needs tidying up at present; this is the
# place to remove temporary files should the plugin ever create some.
function cleanup {
  local status=$1
  exit $status
}

# PROGNAME is the basename of the script; PROGPATH is the directory it was
# started from, always with a trailing slash. When $0 carries no directory
# part (e.g. "bash check_memory"), the script lives in the current directory,
# so "./" is used instead of mistaking the script name for a directory.
declare -rx PROGNAME=${0##*/}
if [[ $0 == */* ]] ; then
  declare -rx PROGPATH=${0%/*}/
else
  declare -rx PROGPATH=./
fi

# utils.sh ships with the Nagios plugins; it is expected to provide the
# STATE_* exit codes and print_revision used below.
if [ -r "${PROGPATH}utils.sh" ] ; then
  source "${PROGPATH}utils.sh"
else
  echo "Can't find utils.sh. This plugin needs to be run from the same directory as utils.sh which is most likely something like /usr/lib/nagios/plugins or /usr/lib64/nagios/plugins"
  printf "Currently being run from %s\n" "$PROGPATH"
  # STATE_UNKNOWN could not be defined because reading utils.sh failed, so
  # the literal 3 is used here; everywhere else uses cleanup $STATE_...
  cleanup 3
fi

# Set STATE to UNKNOWN as soon as we can (right after reading in utils.sh
# where the STATES are defined)
STATE=$STATE_UNKNOWN

# Print a one-line usage summary, then exit with the UNKNOWN status.
# The literal percent signs are written as %% because a bare "% w" inside a
# printf format is parsed as an (invalid) conversion and truncates the output.
function usage {
  printf ' %s %s for Nagios - Usage %s [-w <%% warning memory usage>] [-c <%% critical memory usage>] [-t timeout] [-v [-v [-v]]]\n' \
    "$PROGNAME" "$VERSION" "$PROGNAME"
  cleanup $STATE_UNKNOWN
}

# Print the detailed command-line help (one line per option), then exit with
# the UNKNOWN status.
function longhelp {
  # Banner line: program name, version and author.
  printf '%s plugin version %s for Nagios by %s\n' "$PROGNAME" "$VERSION" "$AUTHOR"
  # Option table, one argument per output line.
  printf '%s\n' \
    '  -h, --help                   Display this message.' \
    '  -w, --warning=val            Set the warning memory usage threshold.' \
    '  -c, --critical=val           Set the critical memory usage threshold.' \
    '  -t, --timeout=sec            Set script timeout in seconds.' \
    '  -v, --verbose                Up the verbosity level by one.' \
    '  --verbosity=val              Set the verbosity level to val.' \
    '  -V, --version                Print version information.' \
    '  --range_help                 Explain threshold ranges.'
  cleanup $STATE_UNKNOWN
}

# Print an explanation of the Nagios threshold range syntax, then exit with
# the UNKNOWN status.
# The text is emitted from a quoted here-document so that the double quotes
# around ":", "~" and "@" reach the output (inside a double-quoted printf
# format they closed the shell string and were silently dropped) and so that
# the help text is never interpreted as a printf format.
function range_help {
  cat <<'EOF'

The format for ranges in Nagios can be confusing and it isn't always followed.

[@]start[:[end]]

Here are some example ranges:

Range   |  Generate an alert if value is    |  In English
--------+-----------------------------------+---------------------------------
10      |  outside the range of {0 .. 10}   |  Less than 0 or greater than 10
@10     |  inside the range of {0 .. 10}    |  From 0 to 10 (inclusive)
10:     |  outside {10 .. ∞}                |  Less than 10
~:10    |  outside the range of {-∞ .. 10}  |  Greater than 10
10:20   |  outside the range of {10 .. 20}  |  Less than 10 or greater than 20
@10:20  |  inside the range of {10 .. 20}   |  From 10 to 20 (inclusive)

Formal Rules:
1. start ≤ end
2. start and ":" is not required if start=0
3. if range is of format "start:" and end is not specified, end is infinity
4. to specify negative infinity, use "~"
5. alert is raised if metric is outside start and end range (inclusive)
6. if range starts with "@", then alert if inside this range (inclusive)
    10      < 0 or > 10, (outside the range of {0 .. 10})
    10:     < 10, (outside {10 .. ∞})
    ~:10    > 10, (outside the range of {-∞ .. 10})
    10:20   < 10 or > 20, (outside the range of {10 .. 20})
    @10:20  ≥ 10 and ≤ 20, (inside the range of {10 .. 20})

More help at http://nagiosplug.sourceforge.net/developer-guidelines.html
EOF
  cleanup $STATE_UNKNOWN
}

#if [ $# -eq 0 ] ; then
#  usage
#fi

# The option parsing below relies on the enhanced (util-linux) getopt for long
# options and safe quoting. "getopt -T" exits with status 4 only for that
# implementation; legacy versions print "--" and exit 0, hence the redirect.
getopt -T >/dev/null 2>&1
if [ $? -ne 4 ] ; then
  # PROGNAME is the script's basename (the former $SCRIPT was never defined).
  printf "%s: getopt is in compatibility mode.\n" "$PROGNAME"
  cleanup $STATE_UNKNOWN
fi

# Command-line parsing.
# Short options are listed in the getopt optstring, with a trailing colon on
# those that take an argument; the leading "-" makes getopt output non-option
# words in place, so the loop below can report them as unsupported.
# --verbosity is an addition to the options required by the Nagios guidelines,
# to avoid the awkward -v -v -v syntax. getopt reports unknown options and
# missing arguments itself, so the loop can shift without further checks.
RESULT=$(getopt --name "$PROGNAME" --options "-hVvt:w:c:" \
  --longoptions "help,version,verbose,verbosity:,warning:,critical:,timeout:,range_help" \
  -- "$@")
if [ $? -ne 0 ] ; then
  # getopt already described the problem on stderr.
  printf "%s UNKNOWN - invalid command line, see --help\n" "$SERVICE"
  cleanup $STATE_UNKNOWN
fi

# make the result of getopt your new argument list ($@)
eval set -- "$RESULT"

declare WARNING
declare CRITICAL
# all scripts should have a mechanism to terminate themselves if they are
# running for too long. Scripts you might think of as innocuous could end
# up waiting forever on I/O, especially if a disk is failing
declare -i TIMELIMIT=15
# Nagios defines behavior for VERBOSITY 0 (default) through 3
declare -i VERBOSITY=0

# Exit with UNKNOWN unless $2 (the value given to option $1) is a plain
# non-negative integer. Integer-declared variables evaluate whatever is
# assigned to them as an arithmetic expression, so the value is checked first.
function require_uint {
  if ! [[ $2 =~ ^[0-9]+$ ]] ; then
    printf "%s UNKNOWN - %s expects a non-negative integer, got '%s'\n" "$SERVICE" "$1" "$2"
    cleanup $STATE_UNKNOWN
  fi
}

while [ $# -gt 0 ] ; do
  case "$1" in
    -h | --help)
      longhelp;;
    -V | --version)
      print_revision "$PROGNAME" "$VERSION"
      cleanup $STATE;;
    -v | --verbose)
      VERBOSITY=$((VERBOSITY + 1));;
    --verbosity)
      shift
      require_uint "--verbosity" "$1"
      # 10# forces base 10 so that a value such as "08" is not read as octal
      VERBOSITY=$((10#$1));;
    -w | --warning)
      shift
      WARNING=$1;;
    -c | --critical)
      shift
      CRITICAL=$1;;
    -t | --timeout)
      shift
      require_uint "--timeout" "$1"
      TIMELIMIT=$((10#$1));;
    --range_help)
      range_help;;
    --)
      shift
      break;;
    *)
      echo "Option $1 not supported. Ignored." >&2;;
  esac
  shift
done

# Nagios verbosity levels and the kind of output expected for each:
#   0  Single line, minimal output. Summary
#   1  Single line, additional information (eg list processes that fail)
#   2  Multi line, configuration debug output (eg ps command used)
#   3  Lots of detail for plugin problem diagnosis
# Above level 2, make the shell trace every command it executes.
if (( VERBOSITY > 2 )) ; then
  set -o xtrace
fi

# Handler for the USR1 watchdog signal: report that the time budget given by
# TIMELIMIT was exhausted, then exit with the UNKNOWN status.
function timeout {
  printf 'UNKNOWN - script timed out after %s seconds.\n' "$TIMELIMIT"
  cleanup $STATE_UNKNOWN
}

# Validate the syntax of the warning ($1) and critical ($2) thresholds.
# Accepted forms are [@]N and [@]N:[M] where N and M are non-negative
# integers (positive values only). An empty threshold is accepted as is.
# For a malformed threshold an error line is printed and range_help is
# called.
# The whole string is matched with the shell's regex operator, so a value with
# an embedded newline cannot pass just because one of its lines is valid.
function check_range {
  local threshold
  local -r pattern='^@?([0-9]+:[0-9]*|[0-9]+)$'

  for threshold in "$1" "$2" ; do
    if [ -z "$threshold" ] ; then
      continue
    fi
    if ! [[ $threshold =~ $pattern ]] ; then
      echo "Please check the format of your warning and critical thresholds."
      range_help
    fi
  done
  return 0
}

check_range "$WARNING" "$CRITICAL"

# The options (and thus the timeout) are known now, so arm a watchdog: a
# background subshell sleeps for TIMELIMIT seconds and then sends USR1 to this
# script, whose trap runs the timeout function. Before signalling, pgrep is
# used to confirm that our PID still belongs to a process of this user whose
# command line mentions this script, so that a recycled PID is not signalled.
# Bash's own $UID is used because $USER is not guaranteed to be set in the
# monitoring daemon's environment. The redirections detach the watchdog from
# the plugin's output so that it never keeps the caller's pipe open.
trap timeout USR1
(
  sleep "$TIMELIMIT" || exit
  if pgrep -U "$UID" -f "$PROGNAME" | grep -qx "$$" ; then
    kill -USR1 $$
  fi
) </dev/null &>/dev/null &
WATCHDOG_PID=$!

# Take a single snapshot of /proc/meminfo so that all figures are consistent.
DATA=$(</proc/meminfo)
if (( VERBOSITY > 2 )) ; then
  echo "$DATA"
fi

# Extract the counters (in KiB) from the snapshot in one pass. Fields that
# are absent keep the value 0. "Buffers" and "SReclaimable" are added up.
TOTAL_KB=0
FREE_KB=0
BUFFERS_KB=0
CACHE_KB=0
SWTOTAL_KB=0
SWFREE_KB=0
SWCACHE_KB=0
function parse_meminfo {
  local key value
  while read -r key value _ ; do
    # skip anything that is not "Name: <integer> ..."
    [[ $value =~ ^[0-9]+$ ]] || continue
    case "$key" in
      MemTotal:)               TOTAL_KB=$value;;
      MemFree:)                FREE_KB=$value;;
      Buffers: | SReclaimable:) BUFFERS_KB=$((BUFFERS_KB + value));;
      Cached:)                 CACHE_KB=$value;;
      SwapTotal:)              SWTOTAL_KB=$value;;
      SwapFree:)               SWFREE_KB=$value;;
      SwapCached:)             SWCACHE_KB=$value;;
    esac
  done <<<"$DATA"
}
parse_meminfo

if [ "$TOTAL_KB" -le 0 ] ; then
  # Without MemTotal the percentage below would divide by zero.
  printf "%s UNKNOWN - unable to read MemTotal from /proc/meminfo\n" "$SERVICE"
  cleanup $STATE_UNKNOWN
fi

# Get arcstats memory usage: third column of the "size" row, in bytes.
ARC_KB=0
if [ -r "/proc/spl/kstat/zfs/arcstats" ] ; then
  ARC=$(awk '$1 == "size" { print $3; exit }' /proc/spl/kstat/zfs/arcstats)
  if [[ $ARC =~ ^[0-9]+$ ]] ; then
    ARC_KB=$((ARC / 1024))
  fi
fi

# Compute usage: buffers, page cache and the ZFS ARC are not counted as used.
USED_KB=$((TOTAL_KB - FREE_KB - BUFFERS_KB - CACHE_KB - ARC_KB))
SWUSED_KB=$((SWTOTAL_KB - SWFREE_KB - SWCACHE_KB))

# compute value in MiB
TOTAL_MB=$((TOTAL_KB / 1024))
FREE_MB=$((FREE_KB / 1024))
BUFFERS_MB=$((BUFFERS_KB / 1024))
CACHE_MB=$((CACHE_KB / 1024))
SWTOTAL_MB=$((SWTOTAL_KB / 1024))
SWFREE_MB=$((SWFREE_KB / 1024))
USED_MB=$((USED_KB / 1024))
SWCACHE_MB=$((SWCACHE_KB / 1024))
SWUSED_MB=$((SWUSED_KB / 1024))
ARC_MB=$((ARC_KB / 1024))

# Compute percentage value (swap stays at 0 when no swap is configured)
USED=$((100 * USED_KB / TOTAL_KB))
SWUSED=0
if (( SWTOTAL_KB > 0 )) ; then
  SWUSED=$((100 * SWUSED_KB / SWTOTAL_KB))
fi

# Once we're done doing work that could take any real time, we can end the
# trap because from here on out it's just comparisons and string
# concatenation. The watchdog is stopped as well so that it cannot fire later;
# waiting on it reaps the job quietly.
trap - USR1
kill "$WATCHDOG_PID" 2>/dev/null
wait "$WATCHDOG_PID" 2>/dev/null

# Test a value against a Nagios threshold range.
#   $1  range in the form [@]start[:[end]]; an empty range never alerts
#   $2  integer value to test
# Returns 1 when the value triggers an alert, 0 otherwise.
# Without "@" the alert is raised when the value is outside {start .. end};
# with "@" it is raised when the value is inside. Both ends are inclusive.
# If start is greater than end, an error is printed and range_help is called.
# All working variables are local, so nothing leaks into the caller's scope.
function check_value {
  local range=$1
  local value=$2
  local inside=0
  local start end

  if [ -z "$range" ] ; then
    return 0
  fi

  # A leading @ inverts the test (alert when inside the range).
  if [[ $range == @* ]] ; then
    inside=1
    range=${range#@}
  fi

  # Start is anything left of the colon, or 0 when there is no colon.
  # End is anything right of the colon, or the whole string when there is no
  # colon, or infinity when nothing follows the colon.
  if [[ $range == *:* ]] ; then
    start=${range%%:*}
    end=${range##*:}
  else
    start=0
    end=$range
  fi

  # start may be omitted when it is 0
  if [ -z "$start" ] ; then
    start=0
  fi
  # "~" stands for negative infinity
  if [ "$start" = "~" ] ; then
    start=-999999999
  fi
  if [ -z "$end" ] ; then
    end=999999999
  fi

  # The test builtin compares its operands as decimal integers, so values
  # with leading zeros such as "08" are handled as well.
  if [ "$start" -gt "$end" ] ; then
    echo "In threshold START:END, START must be less than or equal to END"
    range_help
  fi

  if [ "$inside" -gt 0 ] ; then
    if [ "$start" -le "$value" ] && [ "$value" -le "$end" ] ; then
      return 1
    fi
  else
    if [ "$value" -lt "$start" ] || [ "$end" -lt "$value" ] ; then
      return 1
    fi
  fi

  return 0
}

# Evaluate the thresholds against the used-memory percentage. Critical takes
# precedence: the warning threshold is only consulted when the critical one
# did not trigger. check_value returns non-zero when an alert must be raised.
STATE=$STATE_OK
if ! check_value "$CRITICAL" "$USED" ; then
  STATE=$STATE_CRITICAL
elif ! check_value "$WARNING" "$USED" ; then
  STATE=$STATE_WARNING
fi

# Status text, then "|", then the performance data. The swap figure is a
# percentage like the real-memory one, so it carries a % sign as well.
OUT="Usage: real $USED% ($USED_MB/$TOTAL_MB MB), buffer: $BUFFERS_MB MB, cache: $CACHE_MB MB, arc-cache: $ARC_MB MB, swap: $SWUSED% ($SWUSED_MB/$SWTOTAL_MB MB)| utilisation=${USED}% buffer=${BUFFERS_MB}MB cached=${CACHE_MB}MB arc-cache=${ARC_MB}MB swap=${SWUSED_MB}MB"

# Print the single status line expected by Nagios, prefixed with the service
# label and the textual state.
case $STATE in
  $STATE_OK)
    printf "%s OK - %s\n" "$SERVICE" "$OUT";;
  $STATE_WARNING)
    printf "%s WARNING - %s\n" "$SERVICE" "$OUT";;
  $STATE_CRITICAL)
    printf "%s CRITICAL - %s\n" "$SERVICE" "$OUT";;
  $STATE_UNKNOWN)
    printf "%s UNKNOWN - %s\n" "$SERVICE" "$OUT";;
esac

# Exit with the state as the plugin's return code.
cleanup $STATE