#! /usr/bin/ksh
# check_weblogic_heapfree
# nagios plugin to check if a weblogic jvm heap is falling below nominal thresholds (current and delta free)
# author: Sergei Haramundanis 03-Nov-2006
# update: [2.0] 12-Apr-2007 by S. Haramundanis to include performing monitor within a specific timeframe
# update: [2.1] 20-Jun-2007 by S. Haramundanis to include port # in .dat filename to support monitoring multiple weblogic instances
#
# usage: check_weblogic_heapfree runtime_directory url userid password heapfree_percent_minimum_threshold heapfree_delta_percent_maximum_threshold check_from_time check_to_time
#
# Description:
#
# This plugin will check the heapfree size in a weblogic jvm at the url specified via the weblogic.Admin client,
# at a specific timeframe specified by check_from_time and check_to_time (in HHMM format)
#
# This plugin requires access to weblogic.jar to access the weblogic.Admin client. The directory holding
# weblogic.jar is taken from the WEBLOGIC_LIB environment variable when it is set, otherwise the default
# defined further on in this script is used (you may need to change it)
#
# This plugin also creates a check_weblogic_heapfree_port.dat file in the runtime_directory specified which
# contains the previous reading of the jvm heapfree to determine the delta change between runs
#
# Output:
#
# During any run of the plugin, the value returned will be as follows:
#
# if the current time is outside the specified timeframe it will return an OK state with the message:
#
# [OK] current time outside of monitoring timeframe check_from_time and check_to_time
#
# if the current time is within the specified timeframe:
#
# it will poll the JVMRuntime MBean of the weblogic application server specified in the url and make the
# following two checks:
#
# 1. compare the HeapFreeCurrent and HeapSizeCurrent to determine if the current heapfree is below the
#    minimum threshold (in percent) as specified in the heapfree_percent_minimum_threshold parameter
#
# 2. compare the previous run's HeapFreeCurrent to the current run's HeapFreeCurrent to determine if the
#    delta (previous_heapfree - current_heapfree) is above the maximum threshold (in percent) as specified
#    in the heapfree_delta_percent_maximum_threshold parameter
#
# if the current heapfree is above the minimum threshold and the delta heapfree is below the maximum
# threshold it will return an OK state with a message similar to the following example:
#
# [OK] 84% heapfree above minimum threshold of 30%: 902816904 of 1063256064 free; [OK] -1% heapfree delta below maximum threshold of 30%: current 902816904 previous 885992368
#
# if either the current heapfree is below the minimum threshold or the delta heapfree is above the maximum
# threshold it will return a CRITICAL state with a message similar to the following example:
#
# [CRITICAL] 29% heapfree below minimum threshold of 30%: 308344240 of 1063256064 free; [CRITICAL] 31% heapfree delta above maximum threshold of 30%: current 611334755 previous 885992368
#
# Implementation notes:
#
# - The port is the text after the LAST ':' of the url and must be numeric, so both host:port and
#   t3://host:port produce a usable .dat filename.
# - Thresholds must be whole numbers and the check times must be exactly four digits (HHMM); anything
#   else is reported as CRITICAL instead of silently falling through to an OK result.
# - Elapsed time is measured in real seconds (seconds since midnight, with wrap-around), not by
#   subtracting two HHMMSS clock readings.
# - Percentages are computed by awk so that heaps larger than 2GB do not overflow a 32-bit shell and
#   tiny values cannot cause a division by zero.
# - An unreadable or non-numeric .dat file is reported as CRITICAL for the delta check; the file is then
#   rewritten with the current reading so the next run recovers.
#
# NOTE(review): the password is handed to weblogic.Admin on the java command line and is therefore
# visible in the process list while the check runs.
#

# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------

# Print the number of seconds elapsed since local midnight.
# awk does the arithmetic so that values such as "08" are read as decimal.
seconds_of_day() {
  date '+%H %M %S' | awk '{ print $1 * 3600 + $2 * 60 + $3 }'
}

# Print the whole seconds elapsed since START_SECS was recorded.
# A negative difference means the run crossed midnight, so one day is added.
elapsed_secs() {
  now_secs=$(seconds_of_day)
  diff_secs=$(( now_secs - START_SECS ))
  if [ "${diff_secs}" -lt 0 ]; then
    diff_secs=$(( diff_secs + 86400 ))
  fi
  echo "${diff_secs}"
}

# Return 0 when $1 is a non-empty string made only of decimal digits.
is_uint() {
  case "${1}" in
    '' | *[!0-9]*) return 1 ;;
    *) return 0 ;;
  esac
}

# Return 0 when $1 is an unsigned integer greater than zero.
is_positive_uint() {
  is_uint "${1}" || return 1
  case "${1}" in
    *[1-9]*) return 0 ;;
    *) return 1 ;;
  esac
}

# Return 0 when $1 is exactly four decimal digits (HHMM).
is_hhmm() {
  case "${1}" in
    [0-9][0-9][0-9][0-9]) return 0 ;;
    *) return 1 ;;
  esac
}

# Print "[CRITICAL] <message>" and exit with the CRITICAL state.
die_critical() {
  echo "[CRITICAL] $*"
  exit "${STATE_CRITICAL}"
}

# Print the value that follows the label "<$1>:" in the text $2
# (first occurrence only); prints nothing when the label is absent.
extract_attr() {
  printf '%s\n' "${2}" |
    awk '{ for (i = 1; i < NF; i++) if ($i == key) { print $(i + 1); exit } }' key="${1}:"
}

# Print $1 expressed as a percentage of $2, truncated toward zero.
# $2 must be non-zero; callers check this before calling.
percent_of() {
  echo "${1} ${2}" | awk '{ printf "%d\n", ($1 * 100) / $2 }'
}

# ---------------------------------------------------------------------------
# nagios return codes
# ---------------------------------------------------------------------------

SCRIPTPATH=$(dirname "${0}")
if [ -r "${SCRIPTPATH}/utils.sh" ]; then
  . "${SCRIPTPATH}/utils.sh" # sets correct STATE_* return values
fi
# fall back to the standard nagios codes so a missing utils.sh can never
# turn a CRITICAL result into exit status 0
STATE_OK=${STATE_OK:-0}
STATE_CRITICAL=${STATE_CRITICAL:-2}

# ---------------------------------------------------------------------------
# command line
# ---------------------------------------------------------------------------

if [ "${1}" = "" ] || [ "${1}" = "--help" ]; then
  echo "check_weblogic_heapfree 2.1"
  echo ""
  echo "nagios plugin to check if a weblogic jvm heap is falling below nominal thresholds (current and delta free)"
  echo ""
  echo "This nagios plugin comes with ABSOLUTELY NO WARRANTY."
  echo "You may redistribute copies of this plugin under the terms of the GNU General Public License"
  echo "as long as the original author, edit history and description information remain in place."
  echo ""
  echo "usage: check_weblogic_heapfree runtime_directory url userid password heapfree_percent_minimum_threshold heapfree_delta_percent_maximum_threshold check_from_time check_to_time"
  echo "usage: check_weblogic_heapfree --help"
  echo "usage: check_weblogic_heapfree --version"
  exit "${STATE_OK}"
fi

if [ "${1}" = "--version" ]; then
  echo "check_weblogic_heapfree 2.1"
  echo "This nagios plugin comes with ABSOLUTELY NO WARRANTY."
  echo "You may redistribute copies of this plugin under the terms of the GNU General Public License"
  echo "as long as the original author, edit history and description information remain in place."
  exit "${STATE_OK}"
fi

if [ $# -lt 8 ]; then
  die_critical "insufficient arguments"
fi

RUNDIR=${1}
URL=${2}
USERID=${3}
PASSWORD=${4}
HEAPFREE_PERCENT_MINIMUM_THRESHOLD=${5}
HEAPFREE_DELTA_PERCENT_MAXIMUM_THRESHOLD=${6}
CHECK_FROM_TIME=${7}
CHECK_TO_TIME=${8}

START_SECS=$(seconds_of_day)

# ---------------------------------------------------------------------------
# argument validation
# ---------------------------------------------------------------------------

if [ ! -d "${RUNDIR}" ]; then
  die_critical "unable to locate runtime_directory ${RUNDIR}"
fi

if [ ! -w "${RUNDIR}" ]; then
  die_critical "no write access to runtime_directory ${RUNDIR}"
fi

# the port is whatever follows the last ':' so that a protocol prefix
# such as t3:// does not end up in the .dat filename
case "${URL}" in
  *:*) URL_PORT=${URL##*:} ;;
  *) URL_PORT="" ;;
esac
is_uint "${URL_PORT}" || die_critical "URL does not include port (e.g. host:port)"

DATFILE="${RUNDIR}/check_weblogic_heapfree_${URL_PORT}.dat"

# if .dat file exists, check to make sure file can be written to
if [ -f "${DATFILE}" ] && [ ! -w "${DATFILE}" ]; then
  die_critical "\"${DATFILE}\" is not writeable by this process"
fi

is_uint "${HEAPFREE_PERCENT_MINIMUM_THRESHOLD}" ||
  die_critical "heapfree_percent_minimum_threshold must be a whole number: ${HEAPFREE_PERCENT_MINIMUM_THRESHOLD}"
is_uint "${HEAPFREE_DELTA_PERCENT_MAXIMUM_THRESHOLD}" ||
  die_critical "heapfree_delta_percent_maximum_threshold must be a whole number: ${HEAPFREE_DELTA_PERCENT_MAXIMUM_THRESHOLD}"
is_hhmm "${CHECK_FROM_TIME}" ||
  die_critical "check_from_time must be in HHMM format: ${CHECK_FROM_TIME}"
is_hhmm "${CHECK_TO_TIME}" ||
  die_critical "check_to_time must be in HHMM format: ${CHECK_TO_TIME}"

# location of weblogic.jar; an exported WEBLOGIC_LIB overrides the default
WEBLOGIC_LIB=${WEBLOGIC_LIB:-/usr/local/weblogic813/server/lib}
export WEBLOGIC_LIB

# ---------------------------------------------------------------------------
# monitoring timeframe
# ---------------------------------------------------------------------------

CURRENT_TIME=$(date +%H%M)

# a leading "1" keeps the comparison decimal (no octal reading of 0830)
# and well inside 32-bit integer range
IN_WINDOW=0
if [ "1${CURRENT_TIME}" -ge "1${CHECK_FROM_TIME}" ] && [ "1${CURRENT_TIME}" -le "1${CHECK_TO_TIME}" ]; then
  IN_WINDOW=1
fi

if [ "${IN_WINDOW}" -eq 0 ]; then
  echo "[OK] current time outside of monitoring timeframe ${CHECK_FROM_TIME} and ${CHECK_TO_TIME} | elapsedTimeSecs=$(elapsed_secs)"
  exit "${STATE_OK}"
fi

# ---------------------------------------------------------------------------
# poll the JVMRuntime MBean
# ---------------------------------------------------------------------------

heapfree_percent_minimum_threshold=${HEAPFREE_PERCENT_MINIMUM_THRESHOLD}
heapfree_delta_percent_maximum_threshold=${HEAPFREE_DELTA_PERCENT_MAXIMUM_THRESHOLD}

HEAPDATA=$(java -cp "${WEBLOGIC_LIB}/weblogic.jar" weblogic.Admin -url "${URL}" -username "${USERID}" -password "${PASSWORD}" GET -pretty -type JVMRuntime 2>&1)

HEAPFREEDATA=$(extract_attr HeapFreeCurrent "${HEAPDATA}")
HEAPSIZEDATA=$(extract_attr HeapSizeCurrent "${HEAPDATA}")

# both readings must be numeric and the heap size must be non-zero,
# otherwise the reply is treated as a failed weblogic.Admin call
HEAPDATA_OK=1
is_uint "${HEAPFREEDATA}" || HEAPDATA_OK=0
is_positive_uint "${HEAPSIZEDATA}" || HEAPDATA_OK=0

if [ "${HEAPDATA_OK}" -eq 0 ]; then
  # flatten the reply to a single line for the nagios status text
  HEAPDATA=$(printf '%s\n' "${HEAPDATA}" | tr -s ' \t\n' ' ')
  HEAPDATA=${HEAPDATA% }
  die_critical "weblogic.Admin failed HEAPDATA=\"${HEAPDATA}\""
fi

heapsize=${HEAPSIZEDATA}
heapfree=${HEAPFREEDATA}

# ---------------------------------------------------------------------------
# check 1: current heapfree percentage
# ---------------------------------------------------------------------------

heapfree_percent=$(percent_of "${heapfree}" "${heapsize}")

if [ "${heapfree_percent}" -lt "${heapfree_percent_minimum_threshold}" ]; then
  HEAPFREE_ALERT=1
  HEAPFREE_MESSAGE="${heapfree_percent}% heapfree below minimum threshold of ${heapfree_percent_minimum_threshold}%: ${heapfree} of ${heapsize} free"
else
  HEAPFREE_ALERT=0
  HEAPFREE_MESSAGE="${heapfree_percent}% heapfree above minimum threshold of ${heapfree_percent_minimum_threshold}%: ${heapfree} of ${heapsize} free"
fi

# ---------------------------------------------------------------------------
# check 2: heapfree delta against the previous run
# ---------------------------------------------------------------------------

HEAPFREE_DELTA_ALERT=0
HEAPFREE_DELTA_MESSAGE=""
HAVE_PREVIOUS=0

if [ -f "${DATFILE}" ]; then
  HAVE_PREVIOUS=1
  PREVIOUSHEAPFREEDATA=$(cat "${DATFILE}" 2>&1)
  READ_STATUS=$?

  if [ "${READ_STATUS}" -ne 0 ]; then
    HEAPFREE_DELTA_ALERT=1
    HEAPFREE_DELTA_MESSAGE="unable to access ${DATFILE}: ${PREVIOUSHEAPFREEDATA}"
  elif is_uint "${PREVIOUSHEAPFREEDATA}"; then
    previous_heapfree=${PREVIOUSHEAPFREEDATA}
    if is_positive_uint "${previous_heapfree}"; then
      heapfree_delta_percent=$(echo "${previous_heapfree} ${heapfree}" |
        awk '{ printf "%d\n", (($1 - $2) * 100) / $1 }')
    else
      # nothing was free last time, so free heap cannot have dropped
      heapfree_delta_percent=0
    fi

    if [ "${heapfree_delta_percent}" -gt "${heapfree_delta_percent_maximum_threshold}" ]; then
      HEAPFREE_DELTA_ALERT=1
      HEAPFREE_DELTA_MESSAGE="${heapfree_delta_percent}% heapfree delta above maximum threshold of ${heapfree_delta_percent_maximum_threshold}%: current ${heapfree} previous ${previous_heapfree}"
    else
      HEAPFREE_DELTA_ALERT=0
      HEAPFREE_DELTA_MESSAGE="${heapfree_delta_percent}% heapfree delta below maximum threshold of ${heapfree_delta_percent_maximum_threshold}%: current ${heapfree} previous ${previous_heapfree}"
    fi
  else
    HEAPFREE_DELTA_ALERT=1
    HEAPFREE_DELTA_MESSAGE="invalid previous heapfree value in ${DATFILE}: \"${PREVIOUSHEAPFREEDATA}\""
  fi
fi

# store the current reading for the next run; the braces let the shell's
# own redirection error be captured in RETVAL
RETVAL=$( { echo "${heapfree}" > "${DATFILE}"; } 2>&1 )
WRITE_STATUS=$?

if [ "${WRITE_STATUS}" -ne 0 ]; then
  # set alert status only if alert status not already set
  if [ "${HEAPFREE_DELTA_ALERT}" -eq 0 ]; then
    HEAPFREE_DELTA_ALERT=1
    HEAPFREE_DELTA_MESSAGE="unable to initialize ${DATFILE}: ${RETVAL}"
  fi
elif [ "${HAVE_PREVIOUS}" -eq 0 ]; then
  # first run for this port: there was nothing to compare against
  HEAPFREE_DELTA_MESSAGE="${DATFILE} initialized"
fi

# ---------------------------------------------------------------------------
# report
# ---------------------------------------------------------------------------

if [ "${HEAPFREE_ALERT}" -eq 0 ]; then
  HEAPFREE_LABEL="OK"
else
  HEAPFREE_LABEL="CRITICAL"
fi

if [ "${HEAPFREE_DELTA_ALERT}" -eq 0 ]; then
  HEAPFREE_DELTA_LABEL="OK"
else
  HEAPFREE_DELTA_LABEL="CRITICAL"
fi

echo "[${HEAPFREE_LABEL}] ${HEAPFREE_MESSAGE}; [${HEAPFREE_DELTA_LABEL}] ${HEAPFREE_DELTA_MESSAGE} | elapsedTime=$(elapsed_secs)secs"

if [ "${HEAPFREE_ALERT}" -eq 0 ] && [ "${HEAPFREE_DELTA_ALERT}" -eq 0 ]; then
  exit "${STATE_OK}"
fi
exit "${STATE_CRITICAL}"