#!/usr/bin/env python
#
# Plugin: check_java_threads.py
#
# LONG DESCRIPTION:
#
# This plugin analyzes every java process running locally and collects
# metrics about the state of each thread of those processes. It can optionally
# emit Nagios-style performance data for chart creation, and it can also
# automatically save the thread dump to a file when a given type of thread
# is detected.
#
#  - usage: check_java_threads.py [options]
#
# IMPORTANT: As a java process thread dump must be taken by the process
#            owner, this plugin relies on the "su - login" shell feature.
#            Therefore you must run it either as root or with sudo
#            privileges, otherwise you will probably not get the expected
#            behavior.
#
# You must at least set where your JAVA_HOME is and which login is the JAVA_OWNER
# of the java process to monitor. You should also define the thread state
# (stat_alert, default: STUCK) that should raise an alert and the minimum number
# of thread occurrences for that alert (min_stat_alert, default: 0), which acts
# as a threshold.
#
# You can suppress the Nagios output format (the vars after '|') with the -q or
# --quiet option. To save a thread dump every time an interesting state is
# detected, use the -c or --capture option.
#
# If your JDK is not supported by this plugin, feel free to edit it and
# get it to work. Any upgrade/improvement/suggestion is very welcome.
#
# SHORT DESCRIPTION:
#
# Run this plugin for Nagios metrics and alerts when you have one or more stuck
# threads on a running java process. Use -c/--capture to save a thread dump
# when a threshold is reached and -q if you don't want Nagios chart creation.
#
# ---------------------------------------------------------------------------------
#
# Compatibilities:
#       * GNU/Linux
#       * JRockit (any up to 1.6.x)
#       * Hotspot (any up to 1.8.x)
#       * Python  (2.4 or bigger up to 2.x)
#
# ---------------------------------------------------------------------------------
#
# Author: Marcelo Varge
#         (marcelo.varge@gmail.com)
#
# ---------------------------------------------------------------------------------

import sys
try:
    from commands import getstatusoutput
except ImportError:
    # Interpreters without the `commands` module ship the same helper in
    # `subprocess`. Only "status != 0" is ever tested below, so the different
    # status encodings of the two helpers do not matter.
    from subprocess import getstatusoutput
from optparse import OptionParser
from datetime import datetime

# ---------------------------------------------------------------------------------
# PLUGIN CONFIGURATION
# ---------------------------------------------------------------------------------
# Environment Variables Dicts. Set your variables here
# for identifying the correct JDK and Java PID owner
# ---------------------------------------------------------------------------------

env_vars = {'JAVA_HOME'  : "",
            'JAVA_OWNER' : "",
           }

sup_jdks = {'jrockit' : "jrcmd %s print_threads",
            '1.7'     : "jcmd %s Thread.print",
            '1.8'     : "jcmd %s Thread.print",
            'hotspot' : "jstack -l %s",
           }

# Order in which the sup_jdks keys are matched against "java -version".
# A HotSpot 1.7/1.8 banner contains both its version number and "hotspot", so
# the specific entries must win over the generic 'hotspot' fallback. Keys that
# are added to sup_jdks but not listed here are tried first.
jdk_priority = ('jrockit', '1.8', '1.7', 'hotspot')

stat_alert = "STUCK"
min_stat_alert = 0

# --------------------------------------------------------------------------------
# END OF PLUGIN CONFIGURATION
# --------------------------------------------------------------------------------

# --------------------------------------------------------------------------------
# Option Parsing (the command line itself is only parsed in main())
# --------------------------------------------------------------------------------

parser = OptionParser(usage="usage: %prog [options]", version="%prog 1.0")
parser.add_option("-q", "--quiet",
                  action="store_true", dest="not_nagios", default=False,
                  help="supress the Nagios output")
parser.add_option("-z", "--no-zero",
                  action="store_true", dest="no_zero", default=False,
                  help="supress zeroed values from the Nagios output")
parser.add_option("-c", "--capture",
                  action="store_true", dest="capture_dump", default=False,
                  help="takes a thread dump if the process has stuck threads and save it to a file",)

# --------------------------------------------------------------------------------
# Default dicts. Dont change here unless you know exactly what you want
# --------------------------------------------------------------------------------

stats = {'ACTIVE'        : 0,
         'STANDBY'       : 0,
         'STUCK'         : 0,
         'WAITING'       : 0,
         'RUNNABLE'      : 0,
         'TIMED_WAITING' : 0,
         'http'          : 0,
        }

signals = {"OK"       : 0,
           "WARNING"  : 1,
           "CRITICAL" : 2,
           "UNKNOWN"  : 3,
           "OTHER"    : -1,
          }


# --------------------------------------------------------------------------------
# Validating JDK and alert configurations
# --------------------------------------------------------------------------------

def validate_config():
    """Exit with the "OTHER" code when the plugin configuration is unusable.

    Every entry of env_vars must be non-empty and stat_alert must be one of
    the states tracked in stats (otherwise it could never be counted).
    """
    for opt in env_vars:
        if not env_vars[opt]:
            print("ERROR: You must set %s in this plugin." % opt)
            sys.exit(signals["OTHER"])
    if stat_alert not in stats:
        print("ERROR: stat_alert '%s' is not a state tracked by this plugin." % stat_alert)
        sys.exit(signals["OTHER"])


# --------------------------------------------------------------------------------
# Java command method
# --------------------------------------------------------------------------------

def java_tool_path(cmd, java_home=None):
    """Return cmd prefixed with the JDK "bin" directory.

    java_home defaults to env_vars['JAVA_HOME'] and may point either at the
    JDK root or directly at its bin directory. Only the LAST path component
    is inspected: a plain substring test would be fooled by any path that
    merely contains the letters "bin" (e.g. /opt/combined/jdk).
    """
    if java_home is None:
        java_home = env_vars['JAVA_HOME']
    home = java_home.rstrip('/')
    if home.split('/')[-1] != 'bin':
        home = home + '/bin'
    return home + '/' + cmd


def java_cmd(cmd):
    """Run a JDK tool as JAVA_OWNER through "su -" and return its output.

    cmd is the tool name plus arguments, relative to the JDK bin directory.
    On a non-zero exit status the tool output is printed as an error and the
    plugin exits with the "OTHER" code.
    """
    final_cmd = 'su - %s -c "%s"' % (env_vars['JAVA_OWNER'], java_tool_path(cmd))
    status, cmd_out = getstatusoutput(final_cmd)
    if status != 0:
        print("ERROR: %s, exiting." % cmd_out)
        sys.exit(signals["OTHER"])
    return cmd_out


# --------------------------------------------------------------------------------
# Check JDK version and form correct thread commands. If necessary change the
# sup_jdks dict, not here.
#    -> Use java_cmd(get_threads_cmd % PID) for get threads
# --------------------------------------------------------------------------------

def select_threads_cmd(java_version, jdks=None, priority=None):
    """Return the thread-dump command template matching a "java -version" text.

    jdks defaults to sup_jdks and priority to jdk_priority. Keys are matched
    case-insensitively as substrings of java_version, in a fixed order, so the
    result no longer depends on dict iteration order. Returns None when no
    key matches.
    """
    if jdks is None:
        jdks = sup_jdks
    if priority is None:
        priority = jdk_priority
    version = java_version.lower()
    unlisted = [jdk for jdk in jdks if jdk not in priority]
    unlisted.sort()
    for jdk in unlisted + [jdk for jdk in priority if jdk in jdks]:
        if jdk.lower() in version:
            return jdks[jdk]
    return None


# --------------------------------------------------------------------------------
# Get a list of all java processes
# --------------------------------------------------------------------------------

def list_java_pids():
    """Return the PIDs (as strings) printed by "pgrep java".

    A failing pgrep that printed nothing is treated as "no java process
    found" and yields an empty list. A failure that did print something is
    reported as an error, because that text must not be mistaken for PIDs.
    """
    status, pgrep_out = getstatusoutput("pgrep java")
    if status != 0 and pgrep_out.strip():
        print("ERROR: %s, exiting." % pgrep_out)
        sys.exit(signals["OTHER"])
    return pgrep_out.split()


# --------------------------------------------------------------------------------
# Thread Checking
# --------------------------------------------------------------------------------

def count_thread_states(thread_dump, states=None):
    """Count, per state, the lines of thread_dump that mention that state.

    states defaults to the keys of stats. A state whose name is embedded in
    a longer state name (WAITING inside TIMED_WAITING) is not counted for the
    occurrences of the longer name. Returns a new dict {state: line count}.
    """
    if states is None:
        states = stats
    counts = dict([(state, 0) for state in states])
    # For each state, the longer state names that contain it.
    embedding = {}
    for state in states:
        embedding[state] = [other for other in states
                            if other != state and state in other]
    for line in thread_dump.split('\n'):
        for state in states:
            visible = line
            for other in embedding[state]:
                # Mask with "\n": it cannot occur inside a split line, so the
                # masking can never create a new accidental match.
                visible = visible.replace(other, '\n')
            if state in visible:
                counts[state] += 1
    return counts


def over_threshold(counts):
    """Tell whether one process has more stat_alert threads than min_stat_alert."""
    return counts[stat_alert] > min_stat_alert


# --------------------------------------------------------------------------------
# Thread dump capture
# --------------------------------------------------------------------------------

def save_thread_dump(pid, thread_dump, timestamp):
    """Write the dump of one process to a file in the current directory.

    The file is named thread_dump_<timestamp>_pid<pid>.txt so that several
    alerting processes do not overwrite each other. Returns the note to show
    in the plugin output: an INFO note with the file name, or a WARN note
    with the error when writing failed (best effort: never raises).
    """
    filename = "thread_dump_%s_pid%s.txt" % (timestamp, pid)
    try:
        dump_file = open(filename, 'w')
        try:
            dump_file.write(thread_dump)
        finally:
            dump_file.close()
    except Exception:
        # A failed capture must not hide the check result itself.
        return "[WARN: Error %s while writing dump to file]" % sys.exc_info()[1]
    return "[INFO: Thread Dump saved as %s]" % filename


# --------------------------------------------------------------------------------
# Format the plugin output
# --------------------------------------------------------------------------------

def build_status_line(java_procs, sig_exit, notes=None, quiet=False, no_zero=False):
    """Build the single line printed by the plugin.

    java_procs -- dict {pid: {state: count}}
    sig_exit   -- state name printed first ("OK", "CRITICAL", ...)
    notes      -- extra texts appended after the per-PID messages
    quiet      -- leave out the Nagios performance data (the part after '|')
    no_zero    -- leave out performance data entries whose value is 0
    """
    final_out = [sig_exit]
    nagios_out = ['|']
    for pid in java_procs:
        if over_threshold(java_procs[pid]):
            final_out.append("- PID %s has %s %s threads" % (pid,
                                                             java_procs[pid][stat_alert],
                                                             stat_alert.lower()))
        for value in stats:
            if no_zero and (java_procs[pid][value] == 0):
                continue
            nagios_out.append("pid_%s_%s=%s;" % (pid, value.upper(), java_procs[pid][value]))
    if notes:
        final_out.extend(notes)
    if quiet:
        return ' '.join(final_out)
    return ' '.join(final_out) + ' '.join(nagios_out)


# --------------------------------------------------------------------------------
# PLUGIN ROUTINE
# --------------------------------------------------------------------------------

def main():
    """Check every local java process, print the result and exit accordingly.

    The exit state is CRITICAL as soon as one process has more stat_alert
    threads than min_stat_alert, OK otherwise. With -c/--capture the dump of
    every such process is saved to its own file.
    """
    (options, args) = parser.parse_args()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    validate_config()

    get_threads_cmd = select_threads_cmd(java_cmd("java -version"))
    if get_threads_cmd is None:
        print("ERROR: unsupported JDK, add it to sup_jdks in this plugin, exiting.")
        sys.exit(signals["OTHER"])

    sig_exit = "OK"
    notes = []
    java_procs = {}
    for pid in list_java_pids():
        thread_dump = java_cmd(get_threads_cmd % pid)
        java_procs[pid] = count_thread_states(thread_dump)
        if over_threshold(java_procs[pid]):
            sig_exit = "CRITICAL"
            if options.capture_dump:
                # Save the dump of the process that actually alerts, while
                # it is still at hand.
                notes.append(save_thread_dump(pid, thread_dump, timestamp))

    # ----------------------------------------------------------------------------
    # Finally output with nagios stats and correct signal exit code.
    # ----------------------------------------------------------------------------
    print(build_status_line(java_procs, sig_exit, notes,
                            options.not_nagios, options.no_zero))
    sys.exit(signals[sig_exit])


if __name__ == "__main__":
    main()