#!/bin/env perl
################################################################################
#
# $HeadURL: https://bss-srv4.bioinformatics.ic.ac.uk/svn/bss_admin/nagios/trunk/plugins/check_rsnapshot.pl $
# $Author: jamesa $
# $Revision: 370 $
# $Date: 2014-09-11 14:18:10 +0100 (Thu, 11 Sep 2014) $
#
# Nagios plugin for checking rsnapshot backups
# Version 1.0
#
# (c) James Abbott (j.abbott@imperial.ac.uk)
#
#*******************************************************************************#
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#*******************************************************************************#
#
# This plugin is designed for scenarios where multiple backups are arranged
# in different rsnapshot configuration files.
#
# For example, for an rsnapshot configuration file 'backup-1.conf', where
# nagios and rsnapshot run on different machines:
#
# 1) Install this plugin on the rsnapshot server.
# 2) Edit the plugin to set '$conf_dir' to the location of your rsnapshot config files
# 3) Enable a mechanism for checking this remotely, e.g. via NRPE, by adding a line like:
#
#    command[check_rsnapshot_backup-1]=/usr/local/bin/check_rsnapshot.pl --conf backup-1.conf
#
#    to your nrpe.cfg.
#
# 4) Define appropriate nagios servicegroups/hostgroups, e.g.:
#
# define hostgroup {
#     hostgroup_name      rsnapshot-servers
#     alias               rsnapshot servers
#     members             backupserver1.my.domain.com
#}
#
# define servicegroup {
#     servicegroup_name   rsnapshot-service
#     alias               rsnapshot backup service
#}
#
# 5) Add a nagios service definition for each rsnapshot configuration you wish to monitor:
#
# define service {
#     use                 service-template
#     hostgroup_name      rsnapshot-servers
#     service_description rsnapshot backup: backup-1.conf
#     service_groups      rsnapshot-service
#     check_command       check-nrpe!check_rsnapshot_backup-1
#}
#
# Reload NRPE/nagios configurations and hopefully all will be working.
#
# Rather than analysing on-disk data for file numbers, as some plugins found on
# the web do, this plugin checks for errors reported in the log file for the
# last instance of each backup interval run, along with warnings for backups which
# have been running for >24 hours or backups which have not completed cleanly
# (i.e. a lockfile remains but no backup is running).
#
# The configuration file to be used should be passed on the command line using
# the '--conf' argument. If the specified config file does not exist, the
# plugin will prepend the value of $conf_dir to this path; consequently it is
# not necessary to fully qualify the path to the configuration file if
# $conf_dir is correctly set.
#
# The number of snapshots found for each interval can also optionally be checked by
# adding the '--missing' argument. This checks that the number of on-disk
# snapshots for each interval matches the number defined in the configuration.
# This is optional since it can take some months to build up a full set of
# snapshots depending on rotation policies.
#
# Since this plugin is parsing the configuration and log files, it is
# potentially fragile should these formats change. It has been developed and
# tested against rsnapshot 1.3.1 running on Solaris 10, but should work ok on
# any unixy platform.
# ################################################################################ use warnings; use strict; use Nagios::Plugin; use File::stat; use File::Basename; # Path to directory containing rsnapshot configurations. my $conf_dir = "/etc/rsnapshot"; { my $plugin = Nagios::Plugin->new( usage => "Usage: %s --conf [--missing]", blurb => "This plugin checks the state of an rsnapshot job based upon the last reported messages in the log files for each interval" ); $plugin->add_arg( spec => 'conf=s', help => '--conf: rsnapshot configuration file', required => 1 ); $plugin->add_arg( spec => 'missing', help => '--missing: also report missing snapshots', ); $plugin->getopts(); my $conf_file; ( -e $plugin->opts->conf ) ? ( $conf_file = $plugin->opts->conf ) : ( $conf_file = $conf_dir . '/' . $plugin->opts->conf ); my $backup_name = basename( $conf_file, '.conf' ); my $conf = parse_config( $plugin, $conf_file ); # first, a couple of checks if the backup is currently in progress.... if ( -e $conf->{lockfile} ) { open PID, $conf->{lockfile} or $plugin->nagios_die("Could not open $conf->{lockfile}"); my $pid = ; close PID; chomp $pid; my $res = `ps -p $pid|grep $pid`; if ($res) { my $inode = stat( $conf->{lockfile} ); if ( ( time() - $inode->ctime ) > 86400 ) { $plugin->nagios_exit( 'WARNING', "$backup_name backup has been running for >24 hours" ); } else { $plugin->nagios_exit( 'OK', "$backup_name backup in progress" ); } } else { $plugin->nagios_exit( 'CRITICAL', "lockfile present but $backup_name backup process not running" ); } } else { # there is no lockfile, so the backup is not running.... my $last_logs = parse_log( $plugin, $conf->{logfile} ); #my ( $error, $warning ); my ( @errors, @warnings ); # check the logfiles for the last run from each interval... foreach my $interval ( keys %$last_logs ) { if ( $last_logs->{$interval}->{'message'} =~ /^ERROR:/ ) { push @errors, uc($interval) . 
": $last_logs->{$interval}->{'message'}"; } elsif ( $last_logs->{$interval}->{'message'} eq "$backup_name completed, but with some warnings" ) { push @warnings, uc($interval) . ": $last_logs->{$interval}->{'message'}"; } } # now check each interval in the configuration contains the correct number # of snapshots... my $intervals = $conf->{'intervals'}; opendir SNAPSHOTS, $conf->{'snapshot_root'} or $plugin->nagios_die( "Could not open " . $conf->{'snapshot_root'} . ": $!" ); my @snapshots = grep !/\.\.?\z/, readdir SNAPSHOTS; close SNAPSHOTS; foreach my $interval ( keys(%$intervals) ) { my @ints = grep /^$interval\.[0-9]+/, @snapshots; my $int_count = scalar(@ints); if ( $plugin->opts->missing && $intervals->{$interval} != $int_count ) { push @warnings, uc($interval) . ": Missing $backup_name snapshots - $int_count found but $intervals->{$interval} defined"; } # Now check each snapshot contains the right backups, and that these are not # empty Trying to get backup size/file numbers seems futile since these are # always going to change, and possibly not always increasing, so just make sure # we are not backup up empty mountpoints foreach my $int (@ints) { my $backups = $conf->{'backups'}; foreach my $backup ( keys(%$backups) ) { my $snapshot_dir = $conf->{'snapshot_root'} . "/" . $int . "/"; ( $backups->{$backup} ) ? ( $snapshot_dir .= $backups->{$backup} ) : ( $snapshot_dir .= '/' ); # Account for entries which have an empty target in the conf if ( !-d $snapshot_dir ) { push @errors, uc($interval) . ": $backup_name backup $backup not found"; } else { my $backup_dir = $conf->{'snapshot_root'} . "/" . $int . "/"; ( $backups->{$backup} ) ? ( $backup_dir .= $backups->{$backup} ) : ( $backup_dir .= '/' ); opendir BACKUP, $backup_dir or $plugin->nagios_die("Could not open $backup_dir: $!"); my @files = grep !/\.\.?\z/, readdir BACKUP; close BACKUP; if ( scalar(@files) == 0 ) { push @errors, uc($interval) . 
": $backup_name $backup contains no files..."; } } } } } if ( scalar(@errors) ) { my $message = join( "; ", @errors ); $plugin->nagios_exit( 'CRITICAL', $message ); } elsif ( scalar(@warnings) ) { my $message = join( "; ", @warnings ); $plugin->nagios_exit( 'WARNING', $message ); } else { $plugin->nagios_exit( 'OK', '' ); } } } ###################################################################### # # parses_config # # parses an rsnapshot configuration file, returning a hashref # containing desired values # # required params: $ (Nagios::Plugin object) # $ (path to config file) # # returns : $ (hashref of configuration data) # ###################################################################### sub parse_config { my $plugin = shift; my $conf_file = shift; my ( %conf, %intervals, %backups ); $plugin->nagios_die("Configuration file $conf_file does not exist...") unless ( -e $conf_file ); open CONF, $conf_file or $plugin->nagios_die("Could not open $conf_file: $!"); while () { next if /^#/; next if /^\n/; chomp; my @fields = split( /\t/, $_ ); $conf{'snapshot_root'} = $fields[1] if $fields[0] eq "snapshot_root"; $conf{'logfile'} = $fields[1] if $fields[0] eq "logfile"; $conf{'lockfile'} = $fields[1] if $fields[0] eq "lockfile"; $intervals{ $fields[1] } = $fields[2] if $fields[0] eq "interval"; $backups{ $fields[1] } = $fields[2] if $fields[0] eq "backup"; } close CONF; $conf{intervals} = \%intervals; $conf{backups} = \%backups; return \%conf; } ###################################################################### # # parse_log # # parses an rsnapshot logfile to check if any warnings were generated # for the last backup, or if it completed succesfully # # required params: $ (Nagios::Plugin object) # $ (logfile to parse) # # returns $ (hashref of details of status of backup jobs in log) # ###################################################################### sub parse_log { my $plugin = shift; my $logfile = shift; my %results; open LOG, $logfile or 
$plugin->nagios_die("Could not open $logfile"); while ( my $line = ) { if ( $line =~ /^(\[[0-9]+\/[A-Za-z]+\/[0-9]{4}:[0-9]{2}:[0-9]{2}:[0-9]{2}\])( WARNING:)?(\s+\S+){3}\s+(daily|weekly|monthly): (.+)/ ) { my $date = $1; my $interval = $4; my $message = $5; if ( $results{$interval}->{'date'} && $results{$interval}->{'date'} eq $date && ( $results{$interval}->{'message'} !~ /^started/ ) ) { $message =~ s/ERROR://; $message =~ s/\-+//; $results{$interval}->{message} .= $message; } else { $results{$interval} = { 'date' => $date, 'interval' => $interval, 'message' => $message }; } } } close LOG; return \%results; }