#!/usr/bin/env bash
#
# check_351x plugin
#
# This nagios plugin does basic health checks on Sun 3510 and 3511 disk arrays:
# hard-drive status, battery status and cache mode, all read over SNMP.
#
# check_351x [options]
#   -h hostname
#   -c snmp community string (default: public)
#   -d devicetype (should be either '3510' or '3511')
#   -e expected hdd status values (please read "HDD STATUS" at the bottom)
#
# Exit status follows the Nagios plugin convention:
#   0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN.
#
# NOTE: 'set -e' is deliberately NOT used. An unhandled failure would make the
# shell exit with status 1, which Nagios interprets as WARNING. Every external
# result is validated explicitly instead.
#
# The script sticks to Bash 3 compatible constructs (no mapfile, no associative
# arrays) so it keeps working with older system shells.

# Nagios plugin exit codes.
readonly STATE_OK=0
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# This plugin uses snmpwalk. It expects to find it in the constant specified here:
readonly snmpwalk_bin=/usr/local/bin/snmpwalk

# Battery status bits that mean trouble: bit 0 (malfunction), bit 3 (charge
# critically low / drained), bit 6 (backup disabled), bit 7 (battery absent).
# 1 + 8 + 64 + 128 = 201.
readonly BATTERY_FAULT_MASK=201

#######################################
# Print the command-line synopsis.
# Outputs: two lines on stdout (Nagios only shows stdout).
#######################################
usage() {
  echo "Usage: $0 -h hostname -c community -d (3511|3510) -e x,x,x,x,x,x,x,x,x,x,x,y"
  echo " IMPORTANT: You MUST read the comments of this plugin script."
}

#######################################
# Report an UNKNOWN state and terminate the plugin.
# Arguments: the message to print after "UNKNOWN: ".
# Outputs:   one line on stdout.
# Returns:   never; exits with status 3.
#######################################
unknown() {
  printf 'UNKNOWN: %s\n' "$*"
  exit "$STATE_UNKNOWN"
}

#######################################
# Test whether a string is a non-empty run of decimal digits.
# Arguments: $1 - string to test
# Returns:   0 if $1 is an unsigned integer, 1 otherwise.
#######################################
is_uint() {
  case $1 in
    '' | *[!0-9]*) return 1 ;;
    *) return 0 ;;
  esac
}

#######################################
# Run snmpwalk for one OID and print the bare values (-O vq).
# Globals:   snmpwalk_bin, snmpstring, host (read)
# Arguments: $1 - OID to walk
# Outputs:   snmpwalk's stdout, unmodified.
# Returns:   snmpwalk's exit status.
#######################################
snmp_values() {
  # Every argument is passed as its own word, so a community string or host
  # name containing spaces or glob characters cannot be split or expanded.
  "$snmpwalk_bin" -m ALL -v 2c -O vq -c "$snmpstring" "$host" "$1"
}

#######################################
# Describe an hddStatus value that differs from the expected one.
# Arguments: $1 - hddStatus value reported by the array
# Outputs:   a human readable description on stdout.
# Returns:   0 for a SCSI device (128-143),
#            2 for error states (value >= 63, or a non-numeric value),
#            1 for any other value.
#######################################
check_hdd_result() {
  local code text
  local -a val

  if ! is_uint "$1"; then
    echo "Error $1: Unknown status."
    return 2
  fi
  # Force base 10 so that a value with a leading zero is not read as octal.
  code=$((10#$1))

  if ((code >= 128 && code <= 143)); then
    echo "SCSI Device"
    return 0
  fi

  val[0]="New Drive"
  val[1]="On-Line Drive"
  val[2]="Used Drive"
  val[3]="Spare Drive"
  val[4]="Drive Initializing"
  val[5]="Drive rebuilding"
  val[6]="Adding drive to logical drive"
  val[9]="Global Spare Drive"
  val[17]="Drive cloning"
  val[18]="Drive is valid clone"
  val[19]="Drive copying"
  val[63]="Drive absent"
  val[252]="Missing Global Spare Drive"
  val[253]="Missing Spare Drive"
  val[254]="Missing Drive"
  val[255]="Failed Drive"

  # Codes the MIB does not define still get a description instead of an
  # empty string.
  text=${val[$code]:-Unknown status}

  if ((code >= 63)); then
    echo "Error $1: ${text}."
    return 2
  else
    echo "Result $1: ${text}."
    return 1
  fi
}

#######################################
# Parse the options, run the three checks and report the overall state.
# Globals:   host, snmpstring (written; read by snmp_values)
# Arguments: the plugin's command line ("$@")
# Outputs:   exactly one status line on stdout (plus usage on bad arguments).
# Returns:   never; always exits with a Nagios status code.
#######################################
main() {
  local option device="" expect=""
  host=""
  snmpstring=""

  while getopts h:c:d:e: option; do
    case $option in
      h) host=$OPTARG ;;
      c) snmpstring=$OPTARG ;;
      d) device=$OPTARG ;;
      e) expect=$OPTARG ;;
      *)
        usage
        exit "$STATE_UNKNOWN"
        ;;
    esac
  done

  if [[ -z $host || -z $device || -z $expect ]]; then
    usage
    exit "$STATE_UNKNOWN"
  fi

  if [[ $device != "3511" && $device != "3510" ]]; then
    echo "This plugin only supports Sun 3510 and 3511 disk arrays."
    echo "Sure you know what you're doin?"
    usage
    exit "$STATE_UNKNOWN"
  fi

  # Default snmp community string
  if [[ -z $snmpstring ]]; then
    snmpstring=public
  fi

  # Split the expected values on commas (blanks around them are tolerated) and
  # insist on plain integers: anything else would make the numeric comparison
  # below fail and the disk would silently be treated as healthy.
  local -a expected_values
  local want
  IFS=', ' read -r -a expected_values <<<"$expect"
  if ((${#expected_values[@]} == 0)); then
    usage
    exit "$STATE_UNKNOWN"
  fi
  for want in "${expected_values[@]}"; do
    if ! is_uint "$want"; then
      echo "UNKNOWN: -e expects a comma separated list of integers, got '$expect'"
      usage
      exit "$STATE_UNKNOWN"
    fi
  done

  if [[ ! -x $snmpwalk_bin ]]; then
    unknown "snmpwalk not found or not executable at $snmpwalk_bin"
  fi

  # OID to find hddStatus
  local hddoid=".1.3.6.1.4.1.42.2.180.$device.1.1.6.1.11"
  # OIDs to find battery info
  local -a battoids
  battoids=(
    ".1.3.6.1.4.1.42.2.180.$device.1.1.9.1.9.7"
    ".1.3.6.1.4.1.42.2.180.$device.1.1.9.1.9.14"
  )
  # OID to find cachemodeflags
  local cacheoid=".1.3.6.1.4.1.42.2.180.$device.1.1.1.2.1.0"

  local status=$STATE_OK
  local output=""
  # Checks whose value could not be read; leads to UNKNOWN unless a CRITICAL
  # condition was positively detected elsewhere.
  local probe_errors=""

  # --- hdd status check -----------------------------------------------------
  local hdd_raw hdd_response walk_rc
  hdd_raw=$(snmp_values "$hddoid")
  walk_rc=$?
  # One value per line -> one blank separated line.
  hdd_response=$(printf '%s\n' "$hdd_raw" | tr -s '\n' ' ')

  if printf '%s\n' "$hdd_response" | grep -i "No Such Object" >/dev/null 2>&1; then
    echo "UNKNOWN: This device [$host] is not a Sun $device"
    exit "$STATE_UNKNOWN"
  elif printf '%s\n' "$hdd_response" | grep -i "No more variables" >/dev/null 2>&1; then
    echo "UNKNOWN: This device [$host] is not a Sun $device"
    exit "$STATE_UNKNOWN"
  fi

  local -a hdd_values
  read -r -a hdd_values <<<"$hdd_response"
  if ((walk_rc != 0)) || ((${#hdd_values[@]} == 0)); then
    # A failed or empty walk says nothing about the disks, so it is UNKNOWN.
    unknown "No hdd status received from [$host] (snmpwalk exit status $walk_rc)"
  fi

  local idx=0 disk_no got detail rc
  for want in "${expected_values[@]}"; do
    disk_no=$((idx + 1))
    if ((idx >= ${#hdd_values[@]})); then
      echo "CRITICAL: Ran out of disks. Either you specified too many, or there is a serious problem."
      exit "$STATE_CRITICAL"
    fi

    got=${hdd_values[$idx]}
    if ! is_uint "$got"; then
      unknown "Unexpected hdd status '$got' for disk $disk_no on [$host]"
    fi

    if ((10#$got != 10#$want)); then
      detail=$(check_hdd_result "$got")
      rc=$?
      output="$output Disk No:${disk_no} ${detail}"
      if ((rc == 2)); then
        status=$STATE_CRITICAL
      elif ((status == STATE_OK)); then
        status=$STATE_WARNING
      fi
    fi
    idx=$((idx + 1))
  done

  # --- battery status check -------------------------------------------------
  local oid batt_response
  for oid in "${battoids[@]}"; do
    batt_response=$(snmp_values "$oid")
    if ! is_uint "$batt_response"; then
      probe_errors="$probe_errors Could not read battery status (OID $oid)."
      continue
    fi
    if ((10#$batt_response & BATTERY_FAULT_MASK)); then
      status=$STATE_CRITICAL
      output="$output Battery error."
    fi
  done

  # --- cache status check ---------------------------------------------------
  local cache_response
  cache_response=$(snmp_values "$cacheoid")
  if ! is_uint "$cache_response"; then
    probe_errors="$probe_errors Could not read cache mode (OID $cacheoid)."
  elif ((10#$cache_response % 2 == 0)); then
    output="Cache disabled! $output"
    status=$STATE_CRITICAL
  fi

  # --- Final result processing ----------------------------------------------
  if ((status == STATE_CRITICAL)); then
    echo "CRITICAL: $output$probe_errors"
    exit "$STATE_CRITICAL"
  elif [[ -n $probe_errors ]]; then
    echo "UNKNOWN:$probe_errors$output"
    exit "$STATE_UNKNOWN"
  elif ((status == STATE_WARNING)); then
    echo "WARNING: $output"
    exit "$STATE_WARNING"
  elif ((status == STATE_OK)); then
    echo "OK: Nothing out of the ordinary detected."
    exit "$STATE_OK"
  else
    echo "UNKNOWN: An unknown error has occurred"
    exit "$STATE_UNKNOWN"
  fi
}

main "$@"

## HDD STATUS
# A bit about hdd status values:
# If you take a look at the MIB supplied by Sun, and do a bit of poking around,
# you'll see that the OID we're interested in for hdd status values is this:
#   .1.3.6.1.4.1.42.2.180.3511.1.1.6.1.11
# If you snmpwalk this OID, you'll see something like this:
# (Note: you'll need to copy the mib to /usr/local/share/snmp/mibs to see a
# proper description)
#
#   snmpwalk -m ALL -v 2c -c public se3511 .1.3.6.1.4.1.42.2.180.3511.1.1.6.1.11
#   SUN-STOREDGE-3511-MIB::hddStatus.1 = INTEGER: 1
#   SUN-STOREDGE-3511-MIB::hddStatus.2 = INTEGER: 1
#   SUN-STOREDGE-3511-MIB::hddStatus.3 = INTEGER: 1
#   SUN-STOREDGE-3511-MIB::hddStatus.4 = INTEGER: 1
#   SUN-STOREDGE-3511-MIB::hddStatus.5 = INTEGER: 1
#   SUN-STOREDGE-3511-MIB::hddStatus.6 = INTEGER: 1
#   SUN-STOREDGE-3511-MIB::hddStatus.7 = INTEGER: 1
#   SUN-STOREDGE-3511-MIB::hddStatus.8 = INTEGER: 1
#   SUN-STOREDGE-3511-MIB::hddStatus.9 = INTEGER: 1
#   SUN-STOREDGE-3511-MIB::hddStatus.10 = INTEGER: 1
#   SUN-STOREDGE-3511-MIB::hddStatus.11 = INTEGER: 1
#   SUN-STOREDGE-3511-MIB::hddStatus.12 = INTEGER: 9
#   SUN-STOREDGE-3511-MIB::hddStatus.13 = INTEGER: 141
#
# The hard drives are indexed 1 to 13, with 1 to 12 being actual hard drives
# and 13 being the controller.
# The integer values have the following meanings:
#   0          : New Drive
#   1          : On-Line Drive
#   2          : Used Drive
#   3          : Spare Drive
#   4          : Drive Initialization in Progress
#   5          : Drive Rebuild in Progress
#   6          : Add Drive to Logical Drive in Progress
#   9          : Global Spare Drive
#   17         : Drive is in process of Cloning another Drive
#   18         : Drive is a valid Clone of another Drive
#   19         : Drive is in process of Copying from another Drive
#                (for Copy/Replace LD Expansion function)
#   63         : Drive Absent
#   128 to 143 : SCSI Device
#   252        : Missing Global Spare Drive
#   253        : Missing Spare Drive
#   254        : Missing Drive
#   255        : Failed Drive
#
# OK, so if we go back to our snmpwalk output, we can see that hdd's from 1 to 11
# are 'Online', hdd 12 is 'Global Spare Drive', and hdd 13 is in fact a SCSI
# device of some sort.
# Depending on your config, you will expect different status values, hence the
# -e parameter.
# All you need to do is create a CSV list of the values you expect, and in what
# order. For example:
#   -e '1,1,1,1,1,1,1,1,1,1,1,9'
# -- will do fine. This plugin will ignore the values of any unspecified hdd's
# (in this case, hdd 13 was unspecified, and will be ignored).
# So, an example invocation:
#   check_351x -h arrayhost -c public -d 3511 -e '1,1,1,1,1,1,1,1,1,1,1,9'
#
# A disk whose status differs from the expected value yields CRITICAL when the
# reported value is 63 or above (and not a SCSI device), WARNING otherwise.

## BATTERY STATUS
# You also want to be informed of any failure with the battery.
# In the 3510 and 3511 that I played with, the two battery oid's were:
#   .1.3.6.1.4.1.42.2.180.3511.1.1.9.1.9.7 and
#   .1.3.6.1.4.1.42.2.180.3511.1.1.9.1.9.14
# (Yes, there are two batteries).
# If this is different, then change the battoids variable.
# According to the mib, the battery status is represented by converting the
# integer into a binary number then examining which bits are set to determine
# the state of the battery:
#   BIT 0      - 0: Battery functioning normally.
#                1: Battery malfunctioning.
#   BIT 1      - 0: Battery charging OFF (or trickle).
#                1: Battery charging ON.
#   BIT 2 to 3   If == 0, battery fully charged.
#                If == 1, battery not fully charged.
#                If == 2, battery charge critically low.
#                If == 3, battery completely drained.
#   BIT 4 to 5   Reserved (Set to 0).
#   BIT 6      - 0: Battery-backup is enabled.
#                1: Battery-backup is disabled.
#   BIT 7      - 0: Battery IS present.
#                1: Battery is NOT present.
#   Value of 255 (ie, 1111 1111 or 0xff) means "Status unknown".
#
# This plugin will return a CRITICAL if bit 0, 3, 6 or 7 is set.
# Otherwise it will return an OK.
# If a battery value cannot be read at all, the plugin returns UNKNOWN (unless
# another check already found a CRITICAL condition).

## CACHE STATUS
# MIB is .1.3.6.1.4.1.42.2.180.3511.1.1.1.2.1.0
# mod 2 should be 1, else return CRIT "Caching disabled."