#!/bin/env perl
################################################################################
#
# $HeadURL: https://bss-srv4.bioinformatics.ic.ac.uk/svn/bss_admin/nagios/trunk/plugins/check_rsnapshot.pl $
# $Author: jamesa $
# $Revision: 370 $
# $Date: 2014-09-11 14:18:10 +0100 (Thu, 11 Sep 2014) $
#
# Nagios plugin for checking rsnapshot backups
# Version 1.0
#
# (c) James Abbott (j.abbott@imperial.ac.uk)
#
#*******************************************************************************#
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#*******************************************************************************#
#
# This plugin is designed for scenarios where multiple backups are arranged
# in different rsnapshot configuration files.
#
# For example, for an rsnapshot configuration file 'backup-1.conf', where
# nagios and rsnapshot run on different machines:
#
# 1) Install this plugin on the rsnapshot server.
# 2) Edit the plugin to set '$conf_dir' to the location of your rsnapshot config files
# 3) Enable a mechanism for checking this remotely, e.g. via NRPE, add a line like:
#
# command[check_rsnapshot_backup-1]=/usr/local/bin/check_rsnapshot.pl --conf backup-1.conf
#
# to your nrpe.cfg.
#
# 4) Define appropriate nagios servicegroups/hostgroups i.e.:
#
# define hostgroup {
# hostgroup_name rsnapshot-servers
# alias rsnapshot servers
# members backupserver1.my.domain.com
#}
#
# define servicegroup {
# servicegroup_name rsnapshot-service
# alias rsnapshot backup service
#}
#
# 5) Add a nagios service definition for each rsnapshot configuration you wish to monitor:
#
# define service {
# use service-template
# hostgroup_name rsnapshot-servers
# service_description rsnapshot backup: backup-1.conf
# service_groups rsnapshot-service
# check_command check-nrpe!check_rsnapshot_backup-1
#}
#
# Reload NRPE/nagios configurations and hopefully all will be working.
#
# Rather than analysing on-disk data for file numbers as some plugins found on
# the web do, in this case we are checking for errors reported in the log file for the
# last instance of each backup interval run, along with warnings for backups which
# have been running for >24 hours or backups which have not completed cleanly
# (i.e. a lockfile remains but no backup is running)
#
# The configuration file to be used should be passed on the command line using
# the '--conf' argument. If the specified config file does not exist, the
# plugin will prepend the value of $conf_dir to this path, consequently it is
# not necessary to fully qualify the path to the configuration file if
# $conf_dir is correctly set.
#
# The number of snapshots found for each interval can also optionally be checked by
# adding the '--missing' argument. This checks that the number of on-disk
# snapshots for each interval matches those defined in the configuration.
# This is optional since it can take some months to build up a full set of
# snapshots depending on rotation policies.
#
# Since this plugin is parsing the configuration and log files, it is
# potentially fragile should these formats change. It has been developed and
# tested against rsnapshot 1.3.1 running on Solaris 10, but should work ok on
# any unixy platform.
#
################################################################################
use warnings;
use strict;
use Nagios::Plugin;
use File::stat;
use File::Basename;
# Path to directory containing rsnapshot configurations.
my $conf_dir = "/etc/rsnapshot";
{
    # Main program.
    #
    # 1) Parse the command line and the rsnapshot configuration file.
    # 2) If the lockfile exists, report on the state of the running (or
    #    crashed) backup and exit.
    # 3) Otherwise inspect the last log entry for each interval and the
    #    on-disk snapshots, collecting errors and warnings, and exit with
    #    CRITICAL, WARNING or OK accordingly.
    my $plugin = Nagios::Plugin->new(
        usage => "Usage: %s --conf [--missing]",
        blurb =>
            "This plugin checks the state of an rsnapshot job based upon the last reported messages in the log files for each interval"
    );
    $plugin->add_arg(
        spec     => 'conf=s',
        help     => '--conf: rsnapshot configuration file',
        required => 1
    );
    $plugin->add_arg(
        spec => 'missing',
        help => '--missing: also report missing snapshots',
    );
    $plugin->getopts();

    # Use the path exactly as given if it exists, otherwise look for it
    # beneath $conf_dir.
    my $conf_file =
        ( -e $plugin->opts->conf )
        ? $plugin->opts->conf
        : $conf_dir . '/' . $plugin->opts->conf;
    my $backup_name = basename( $conf_file, '.conf' );
    my $conf        = parse_config( $plugin, $conf_file );

    # first, a couple of checks if the backup is currently in progress....
    my $lockfile = $conf->{lockfile};
    if ( defined $lockfile && -e $lockfile ) {
        open my $pid_fh, '<', $lockfile
            or $plugin->nagios_die("Could not open $lockfile: $!");
        my $pid = <$pid_fh>;
        close $pid_fh;
        $pid = '' unless defined $pid;    # empty lockfile
        $pid =~ s/^\s+//;
        $pid =~ s/\s+$//;

        # Only hand the PID to the shell once it is known to be purely
        # numeric; anything else is treated as "process not running".
        my $running = 0;
        if ( $pid =~ /\A[0-9]+\z/ ) {
            my $res = `ps -p $pid|grep $pid`;
            $running = 1 if $res;
        }

        if ($running) {
            # stat() returns undef if the lockfile vanished in the meantime
            # (i.e. the backup has just finished); treat that as in progress.
            my $inode = stat($lockfile);
            if ( $inode && ( time() - $inode->ctime ) > 86400 ) {
                $plugin->nagios_exit( 'WARNING', "$backup_name backup has been running for >24 hours" );
            }
            else {
                $plugin->nagios_exit( 'OK', "$backup_name backup in progress" );
            }
        }
        else {
            $plugin->nagios_exit( 'CRITICAL', "lockfile present but $backup_name backup process not running" );
        }
    }
    else {
        # there is no lockfile, so the backup is not running....
        $plugin->nagios_die("No logfile defined in $conf_file")
            unless defined $conf->{logfile};
        $plugin->nagios_die("No snapshot_root defined in $conf_file")
            unless defined $conf->{'snapshot_root'};

        my $intervals = $conf->{'intervals'};
        my $backups   = $conf->{'backups'};

        # The third argument (interval names from the configuration) is
        # optional from parse_log's point of view.
        my $last_logs = parse_log( $plugin, $conf->{logfile}, [ keys %$intervals ] );
        my ( @errors, @warnings );

        # check the logfiles for the last run from each interval...
        # (sorted so that the reported message order is stable)
        foreach my $interval ( sort keys %$last_logs ) {
            my $message = $last_logs->{$interval}->{'message'};
            if ( $message =~ /^ERROR:/ ) {
                push @errors, uc($interval) . ": $message";
            }
            # NOTE(review): parse_log stores only the text following
            # "<interval>: ", so this only matches if the logged message
            # itself begins with the backup name - confirm against real logs.
            elsif ( $message eq "$backup_name completed, but with some warnings" ) {
                push @warnings, uc($interval) . ": $message";
            }
        }

        # now check each interval in the configuration contains the correct number
        # of snapshots...
        my $snapshot_root = $conf->{'snapshot_root'};
        opendir my $root_dh, $snapshot_root
            or $plugin->nagios_die( "Could not open " . $snapshot_root . ": $!" );

        # Drop only the '.' and '..' entries themselves.
        my @snapshots = grep { !/\A\.\.?\z/ } readdir $root_dh;
        closedir $root_dh;

        foreach my $interval ( sort keys %$intervals ) {
            # \Q..\E: the interval name comes from the config file and must be
            # matched literally.
            my @ints      = grep { /^\Q$interval\E\.[0-9]+/ } @snapshots;
            my $int_count = scalar(@ints);
            if ( $plugin->opts->missing && $intervals->{$interval} != $int_count ) {
                push @warnings,
                    uc($interval)
                    . ": Missing $backup_name snapshots - $int_count found but $intervals->{$interval} defined";
            }

            # Now check each snapshot contains the right backups, and that these are not
            # empty. Trying to get backup size/file numbers seems futile since these are
            # always going to change, and possibly not always increasing, so just make sure
            # we are not backing up empty mountpoints
            foreach my $int (@ints) {
                foreach my $backup ( sort keys %$backups ) {
                    my $snapshot_dir = $snapshot_root . "/" . $int . "/";

                    # Account for entries which have an empty target in the conf
                    $snapshot_dir .= $backups->{$backup} ? $backups->{$backup} : '/';

                    if ( !-d $snapshot_dir ) {
                        push @errors, uc($interval) . ": $backup_name backup $backup not found";
                        next;
                    }

                    opendir my $backup_dh, $snapshot_dir
                        or $plugin->nagios_die("Could not open $snapshot_dir: $!");
                    my @files = grep { !/\A\.\.?\z/ } readdir $backup_dh;
                    closedir $backup_dh;

                    if ( scalar(@files) == 0 ) {
                        push @errors, uc($interval) . ": $backup_name $backup contains no files...";
                    }
                }
            }
        }

        if ( scalar(@errors) ) {
            my $message = join( "; ", @errors );
            $plugin->nagios_exit( 'CRITICAL', $message );
        }
        elsif ( scalar(@warnings) ) {
            my $message = join( "; ", @warnings );
            $plugin->nagios_exit( 'WARNING', $message );
        }
        else {
            $plugin->nagios_exit( 'OK', '' );
        }
    }
}
######################################################################
#
# parse_config
#
# parses an rsnapshot configuration file, returning a hashref
# containing desired values
#
# required params: $ (Nagios::Plugin object)
# $ (path to config file)
#
# returns : $ (hashref of configuration data)
#
######################################################################
sub parse_config {
    my $plugin    = shift;
    my $conf_file = shift;
    my ( %conf, %intervals, %backups );

    # Keys of the returned hashref:
    #   snapshot_root, logfile, lockfile - values taken from the config
    #   intervals - hashref of interval name => number of snapshots to keep
    #   backups   - hashref of backup source => target directory
    $plugin->nagios_die("Configuration file $conf_file does not exist...") unless ( -e $conf_file );
    open my $conf_fh, '<', $conf_file
        or $plugin->nagios_die("Could not open $conf_file: $!");

    while ( my $line = <$conf_fh> ) {
        $line =~ s/\r?\n\z//;
        next if $line =~ /^#/;       # comment
        next if $line =~ /^\s*$/;    # blank or whitespace-only line

        # Fields may be separated by several tabs (commonly used to align
        # columns), so split on runs of tabs rather than on each single tab.
        my @fields  = split( /\t+/, $line );
        my $keyword = $fields[0];
        next unless defined $keyword && defined $fields[1];

        if (   $keyword eq 'snapshot_root'
            || $keyword eq 'logfile'
            || $keyword eq 'lockfile' )
        {
            $conf{$keyword} = $fields[1];
        }
        elsif ( $keyword eq 'interval' || $keyword eq 'retain' ) {
            # 'retain' is accepted as an alias of 'interval'.
            $intervals{ $fields[1] } = $fields[2];
        }
        elsif ( $keyword eq 'backup' ) {
            $backups{ $fields[1] } = $fields[2];
        }
    }
    close $conf_fh;

    $conf{intervals} = \%intervals;
    $conf{backups}   = \%backups;
    return \%conf;
}
######################################################################
#
# parse_log
#
# parses an rsnapshot logfile to check if any warnings were generated
# for the last backup, or if it completed successfully
#
# required params: $ (Nagios::Plugin object)
# $ (logfile to parse)
#
# returns $ (hashref of details of status of backup jobs in log)
#
######################################################################
sub parse_log {
    my $plugin    = shift;
    my $logfile   = shift;
    my $intervals = shift;    # optional: arrayref of interval names to look for

    # Without a (non-empty) list of interval names, fall back to the
    # traditional daily/weekly/monthly set.
    my @names;
    if ( ref($intervals) eq 'ARRAY' ) {
        @names = grep { defined $_ && length $_ } @$intervals;
    }
    @names = qw(daily weekly monthly) unless @names;
    my $interval_alt = join( '|', map { quotemeta($_) } @names );

    # [dd/Mon/yyyy:hh:mm:ss] [WARNING:] <three tokens> <interval>: <message>
    # Captures: $1 = timestamp, $2 = interval, $3 = message.
    my $line_re = qr{
        ^ ( \[ [0-9]+ / [A-Za-z]+ / [0-9]{4} : [0-9]{2} : [0-9]{2} : [0-9]{2} \] )
        (?: [ ]WARNING: )?
        (?: \s+ \S+ ){3}
        \s+ ( $interval_alt ) : [ ] ( .+ )
    }x;

    $plugin->nagios_die("No logfile specified")
        unless defined $logfile && length $logfile;

    my %results;
    open my $log_fh, '<', $logfile
        or $plugin->nagios_die("Could not open $logfile: $!");

    while ( my $line = <$log_fh> ) {
        next unless $line =~ $line_re;
        my ( $date, $interval, $message ) = ( $1, $2, $3 );

        my $previous = $results{$interval};
        if (   $previous
            && $previous->{'date'} eq $date
            && $previous->{'message'} !~ /^started/ )
        {
            # A further line with the same timestamp continues the previous
            # report: append it, minus the repeated "ERROR:" tag and the
            # first run of dashes.
            $message =~ s/ERROR://;
            $message =~ s/\-+//;
            $previous->{'message'} .= $message;
        }
        else {
            # A new timestamp (or the line following "started") replaces
            # whatever was recorded for this interval, so only the last
            # run's outcome is kept.
            $results{$interval} = { 'date' => $date, 'interval' => $interval, 'message' => $message };
        }
    }
    close $log_fh;

    return \%results;
}