#!/usr/bin/perl -w
# nagios: -epn
#
# POD
#

=head1 NAME

check_pbsnodes - a Nagios plugin that calls the C<showq> command to test for
the presence of crashed nodes in a high performance computing cluster that
uses Moab/Maui & Torque for job scheduling and queuing.

=head1 SYNOPSIS

    ./check_pbsnodes -w 1 -c 2

This would warn Nagios if one node was unresponsive. If two nodes were down,
it would send Nagios a critical message.

=head1 OPTIONS

=over 4

=item B<-w>, B<--warning> I<count>

Number of distinct unresponsive nodes at which WARNING is reported.
Must be a positive integer.

=item B<-c>, B<--critical> I<count>

Number of distinct unresponsive nodes at which CRITICAL is reported.
Must be a positive integer.

=item B<-h>, B<--help>

Print a usage line and exit.

=back

Missing or invalid thresholds, or a failure to run C<showq>, are reported
to Nagios as UNKNOWN.

=head1 DESCRIPTION

This plugin is for testing the presence of crashed nodes in a high
performance computing cluster. In such clusters, it is not uncommon for load
to reach very, very high levels on compute nodes. Under such load, many parts
of the system may bog down and become unresponsive. For example, SSH logins
may no longer work. Polling via Ganglia or Cacti may cease. And yet, this does
not mean that the compute node has crashed or isn't still doing the work
assigned to it by the cluster scheduler.

Under such circumstances, the only way to know if a node is really down is if
a job goes negative. Torque has a higher I<priority> than the jobs it runs, so
it is always guaranteed a processor time slice. If B<walltime> is exceeded and
Torque is able to get a slice, it will kill the job. If it can't, then it's
because the node has crashed and we'll see C<showq> show negative in the
B<REMAINING> column.

Therefore, this plugin is designed to be run on the Cluster Service Node,
calling the C<showq> command, parsing the output, and searching for values in
the REMAINING column that are negative numbers. When it finds them, it reports
the problem using correct Nagios syntax, and provides the crashed node names
in the output string.

It needs to be called from a remote plugin executor such as NRPE, or MRPE if
using Mathias Kettner's Check_MK.

=head1 AUTHOR

Jonathan Mills jonmills@renci.org

=cut

#
# end POD
#

use strict;
use warnings;
use Getopt::Long;

#
# ADJUST THESE VALUES TO YOUR OWN ENVIRONMENT !!!
#
#my $showq = "/opt/moab/bin/showq";
my $showq = "/opt/maui/bin/showq";

# A showq line is treated as a job row only if it matches this pattern.
# YOUR jobids may look different...
my $jobid_pattern = qr/[0-9]{7}/;

# Predefined exit codes for Nagios
my %exit_codes = (
    'OK'       => 0,
    'WARNING'  => 1,
    'CRITICAL' => 2,
    'INVALID'  => 3,
);

my $progname = "$0";

# Print the one-line usage message. Does not exit; the caller picks the code.
sub print_help {
    print "Usage: $progname -w <warning> -c <critical>\n";
    return;
}

# Return the arguments with duplicates removed, keeping first-seen order.
sub unique {
    my %seen;
    return grep { !$seen{$_}++ } @_;
}

# True if the argument is a usable threshold: a positive integer.
sub valid_threshold {
    my ($value) = @_;
    return 0 unless defined $value && $value =~ /\A[0-9]+\z/;
    return $value > 0 ? 1 : 0;
}

# Leading integer of a string, or 0 if there is none (mirrors `sort -n`).
sub _leading_number {
    my ($text) = @_;
    return $text =~ /\A(-?[0-9]+)/ ? $1 : 0;
}

# Turn raw `showq -r` output lines into job records.
#   col 1  = jobid
#   col 7  = user
#   col 9  = node
#   col 11 = REMAINING (time)
# Lines that do not match $jobid_pattern, or that have fewer than 11
# whitespace-separated columns, are ignored. Returns a list of hashrefs
# (id, user, node, remaining) ordered numerically by job id.
sub parse_showq {
    my @lines = @_;
    my @jobs;
    for my $line (@lines) {
        next unless $line =~ $jobid_pattern;
        my @cols = split ' ', $line;
        next if @cols < 11;
        push @jobs, {
            id        => $cols[0],
            user      => $cols[6],
            node      => $cols[8],
            remaining => $cols[10],
        };
    }
    return sort {
        _leading_number($a->{id}) <=> _leading_number($b->{id})
            or $a->{id} cmp $b->{id}
    } @jobs;
}

# This is where the magic happens. If the REMAINING value contains a minus
# sign, then the job has gone negative and the node isn't responding...
sub overdue_jobs {
    my @jobs = @_;
    return grep { $_->{remaining} =~ m/-/ } @jobs;
}

# Map a count of unresponsive nodes onto a Nagios status name.
# Critical is tested first so it wins whenever both thresholds are met.
sub status_for {
    my ($down, $warn, $crit) = @_;
    return 'CRITICAL' if $down >= $crit;
    return 'WARNING'  if $down >= $warn;
    return 'OK';
}

# Run `showq -r` (the -r flag limits output to active jobs) without a shell.
# Returns an array reference of output lines, or nothing if the command is
# not executable, cannot be started, or exits with a non-zero status.
sub run_showq {
    return unless -x $showq;
    open(my $pipe, '-|', $showq, '-r') or return;
    my @lines = <$pipe>;
    close($pipe) or return;
    return \@lines;
}

# Entry point: parse options, query showq, print the Nagios status text and
# return the plugin exit code.
sub main {
    my ($opt_h, $opt_w, $opt_c);

    Getopt::Long::Configure('bundling');
    my $parsed = GetOptions(
        "h"          => \$opt_h, "help"       => \$opt_h,
        "w=s"        => \$opt_w, "warning=s"  => \$opt_w,
        "c=s"        => \$opt_c, "critical=s" => \$opt_c,
    );

    if ($opt_h) {
        print_help();
        return $exit_codes{OK};
    }

    # A misconfigured check must never look healthy to Nagios.
    unless ($parsed && valid_threshold($opt_w) && valid_threshold($opt_c)) {
        print "UNKNOWN -- missing or invalid -w/-c thresholds\n";
        print_help();
        return $exit_codes{INVALID};
    }

    my $lines = run_showq();
    unless ($lines) {
        print "UNKNOWN -- unable to run $showq\n";
        return $exit_codes{INVALID};
    }

    my @all_jobs   = parse_showq(@$lines);
    my $total_jobs = @all_jobs;
    my @overdue    = overdue_jobs(@all_jobs);

    if (!@overdue) {
        print "OK -- $total_jobs jobs running, all nodes responding.\n";
        return $exit_codes{OK};
    }

    my @nodes = unique(map { $_->{node} } @overdue);
    my @jobs  = unique(map { $_->{id} } @overdue);
    my @users = unique(map { $_->{user} } @overdue);
    my $count = @nodes;

    # Thresholds are compared against distinct nodes, not jobs, so several
    # stuck jobs on one host count once.
    my $status = status_for($count, $opt_w, $opt_c);

    if ($status eq 'OK') {
        print "OK -- $count node(s) not responding, below warning threshold: \n";
    }
    else {
        print "$status -- $count node(s) not responding: \n";
    }
    print "@nodes\n";
    print "Job ID(s): @jobs\n";
    print "User(s): @users\n";
    return $exit_codes{$status};
}

# Run only when executed as a script, so the helpers can be loaded on their own.
exit main() unless caller;

1;

#
# DONE!
#