#!/usr/bin/python
# Daniel Helgenberger, m box bewegtbild GmbH, 2013
# <daniel.helgenberger@m-box.de>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
################################################################################
# Plugin metadata: __author__ is appended to the --help epilog, __version__ is
# printed by --version and __plugin_name__ is used as the argparse program name.
__author__ = 'Daniel Helgenberger <daniel.helgenberger@m-box.de>'
__version__ = '0.6'
__plugin_name__ = 'check_iferror.py'

import os
import sys
import time


def parse_args():
	"""Build the command line parser and return the parsed arguments.

	argparse is imported here rather than at module level so that a missing
	python-argparse package produces a plugin message and the UNKNOWN exit
	code instead of an import traceback.

	Returns:
		The argparse namespace produced by parsing sys.argv.
	"""
	try:
		import argparse
	except ImportError:
		print('Error importing library python-argparse')
		gen_exitcode(exit_unkn)

	# Adjacent string literals are concatenated verbatim, so every fragment
	# that is followed by another one has to end with a space.
	parser = argparse.ArgumentParser(
		prog=__plugin_name__,
		description='Nagios plugin, intended to check error count OIDs via SNMP. '
			'A non-zero exit code is generated if the counter delta exceeds the warning / critical values in '
			'the evaluation time period. Additionally the plugin checks for the interface state if arguments --ifoid '
			'and --ifupstate are supplied. The plugin will exit critical/warning if the interface is down. '
			'Recommendation: Use longer check_intervals with a low recheck count.',
		epilog='Currently only SNMPv1 and plain OIDs are supported. Needs python > 2.6, pysnmp > 4.2.5, python-argparse. '
			'This program is free software: you can redistribute it and/or modify '
			'it under the terms of the GNU General Public License as published by '
			'the Free Software Foundation, either version 3 of the License, or '
			'(at your option) any later version. Author: ' + __author__)
	# NOTE(review): --host is required, so its 'localhost' default is never used.
	parser.add_argument('-H', '--host', help='SNMP Agent IP / hostname', default='localhost', required=True)
	parser.add_argument('-C', '--community', help='Read community name (default: "public")', default='public')
	parser.add_argument('-o', '--oid', help='Error count base OID', type=str, default='')
	parser.add_argument('-O', '--ifoid', help='Interface base OID', type=str, default='')
	parser.add_argument('-S', '--ifupstate', help='Expected interface operational state (integer)', type=int)
	parser.add_argument('--warnifdown', help='Exit with warning instead of critical if interface is down',
		action='store_true')
	parser.add_argument('--okifdown', help='Exit with ok instead of critical if interface is down', action='store_true')
	parser.add_argument('-i', '--interface', help='Interface number. Appended to the base OID', type=int, default=1)
	parser.add_argument('--preset', help='Use OID preset', type=str)
	parser.add_argument('-P', '--port', help='SNMP port', type=int, default=161)
	parser.add_argument('-w', '--warning', help='Warning threshold (Count)', type=int, default=1)
	parser.add_argument('-c', '--critical', help='Critical threshold (Count)', type=int, default=2)
	parser.add_argument('-t', '--time', help='Evaluation time period. (in hours, default 24h)', type=int, default=24)
	parser.add_argument('-s', '--scratch', help='Scratch / temp base directory. Must exist. (default: /tmp)', type=str, default='/tmp')
	parser.add_argument('-p', '--perfdata', help='Print performance data, (default: off)', action='store_true')
	parser.add_argument('-d', '--debug', help='Verbose mode', action='store_true')
	parser.add_argument('-T', '--test', help='Run test case; needs WAN connection', action='store_true')
	parser.add_argument('-V', '--version', action='version', version='%(prog)s '+__version__)
	return parser.parse_args()


def get_time_threshold(offset_time):
	"""Return the current epoch time shifted by offset_time, as an integer.

	offset_time is multiplied by the module-level epoch_multipl (set in the
	main section) before it is added to time.time().
	"""
	shift_seconds = offset_time * epoch_multipl
	threshold = time.time() + shift_seconds
	return int(threshold)


def join_oid(*oids):
	"""Join OID fragments into a single dotted OID string.

	The result always starts with a dot and never ends with one. Exactly one
	dot is placed between two fragments, regardless of whether the fragments
	themselves carry leading or trailing dots. Empty fragments are skipped.

	Args:
		*oids: OID fragments as strings, e.g. '.1.3.6', '1', '.0.0'.

	Returns:
		The joined OID, or '' if no non-empty fragment was given.

	A non-string fragment prints an error and exits with the UNKNOWN code.
	"""
	joined = '.'
	for part in oids:
		if not isinstance(part, str):
			print('join_oid: only string arguments are supported!')
			gen_exitcode(exit_unkn)
		if not part:
			# An empty fragment (e.g. --oid left at its default) contributes
			# nothing; indexing it would raise IndexError.
			continue
		starts_with_dot = part.startswith('.')
		ends_with_dot = joined.endswith('.')
		if starts_with_dot and ends_with_dot:
			# Both sides bring a dot: drop one of them.
			joined = joined[:-1] + part
		elif starts_with_dot or ends_with_dot:
			# Exactly one separator is already there.
			joined += part
		else:
			joined += '.' + part
	if joined.endswith('.'):
		return joined[:-1]
	return joined


def read_snmp(agent, community, port, oid):
	"""Run a single SNMP GET and return the value as an integer.

	Args:
		agent: SNMP agent host name or IP address.
		community: read community string.
		port: UDP port of the agent.
		oid: full OID to query.

	Returns:
		The value converted with int(). Values whose representation contains
		'hexValue' are converted from their octets, read as a hex number.

	Every failure (pysnmp missing, transport error, SNMP error status,
	empty/unsupported value) prints a message and exits with the UNKNOWN code.
	"""
	snmperrs = ["Null('')", "NoSuchObject('')", "NoSuchInstance('')", "OctetString('')"]
	integer_str = ["Integer", "Counter32"]
	hex_str = ["hexValue"]
	# Request parameters appended to every error message for troubleshooting.
	request = '%s %s %s %s' % (agent, community, port, oid)

	# Import lazily and fail with a plugin message if pysnmp is missing.
	try:
		from pysnmp.entity.rfc3413.oneliner import cmdgen
	except ImportError:
		print('Library import error. Install pysnmp.')
		sys.exit(exit_unkn)

	try:
		errorind, errorstatus, errorindex, varbinds = cmdgen.CommandGenerator().getCmd(
			cmdgen.CommunityData(community),
			cmdgen.UdpTransportTarget((agent, port)), oid)
	except Exception as exc:
		# varbinds is not bound when getCmd itself raised, so report the exception.
		print('SNMP exit error. Check agent and community. %s %s' % (exc, request))
		sys.exit(exit_unkn)

	if errorind:
		# Engine-level problem such as a request timeout; no usable varbinds.
		print('SNMP read error. Check agent and community. %s %s' % (errorind, request))
		sys.exit(exit_unkn)
	if errorstatus:
		# The agent answered with an SNMP error status.
		print('SNMP read error. Check agent and community. %s at %s %s' % (errorstatus, errorindex, request))
		sys.exit(exit_unkn)

	answer = str(varbinds)
	if any(word in answer for word in snmperrs):
		print('SNMP read error. Check agent and community. %s %s' % (answer, request))
		sys.exit(exit_unkn)

	# Only Integer, Counter32 and hexValue are supported at the moment
	for name, val in varbinds:
		if any(word in answer for word in integer_str):
			return int(val)
		if any(word in answer for word in hex_str):
			return int(val.asOctets().encode('hex'), 16)
		print('SNMP value error, expected  %s, %s; got: %s %s' % (integer_str, hex_str, answer, request))
		sys.exit(exit_unkn)

	# Empty varbinds: never fall through and hand None to the caller.
	print('SNMP read error. Agent returned no data. %s' % request)
	sys.exit(exit_unkn)

def get_agent_file(_path_base, _agent, _interface):
	"""Return the cache file path for an agent / interface pair.

	The file name is built from the agent string and the stringified
	interface and is placed inside _path_base.
	"""
	name_parts = ['nagios-ifheath-', _agent, '_if_', str(_interface), '.cache']
	cache_name = ''.join(name_parts)
	return os.path.join(_path_base, cache_name)


def read_offset(_file):
	"""Read the stored counter baseline and its timestamp from the cache file.

	Args:
		_file: path of the cache file.

	Returns:
		(offset, mtime): the integer from the first line of the file and the
		file's modification time as integer epoch seconds. (0, -1) is returned
		when the file cannot be read or does not start with an integer; the
		main section treats -1 as "no baseline yet" and rewrites the cache.
	"""
	try:
		# 'with' closes the handle even if parsing fails.
		with open(_file, 'r') as cache:
			offset = int(cache.readline())
			# Use the open handle so value and mtime come from the same file.
			mtime = int(os.fstat(cache.fileno()).st_mtime)
	except (IOError, OSError, ValueError):
		# Missing / unreadable file, or empty / corrupt content.
		return 0, -1
	return offset, mtime


def write_offset(_file, _offset):
	"""Store the counter baseline in the cache file, replacing its content.

	Args:
		_file: path of the cache file.
		_offset: value to store; written as str(_offset).

	Returns:
		True on success. If the file cannot be written, a message is printed
		and the plugin exits with the UNKNOWN code.
	"""
	try:
		# 'with' closes the handle even if the write itself fails.
		with open(_file, 'w') as cache:
			cache.write(str(_offset))
	except (IOError, OSError):
		print('Error writing stat file')
		sys.exit(exit_unkn)
	return True


def print_nagios(_level, _errorcount, _warn, _crit, _deltatime, _time, _ifstate, _perfdata):
	"""Print the single Nagios status line, optionally followed by perfdata.

	Args:
		_level: index into the message table below (0 OK, 1 WARNING,
			2 CRITICAL, 3 UNKNOWN, 4 CRITICAL interface down,
			5 WARNING interface down).
		_errorcount: error delta to report.
		_warn, _crit: thresholds, only used in the perfdata section.
		_deltatime: elapsed hours, printed with one decimal.
		_time: evaluation period in hours.
		_ifstate: interface state value, or False / None if it was not queried.
		_perfdata: when true, append '|errors=...' performance data.

	Returns:
		True, always.
	"""
	nagdict = ('OK: Interface error rate normal', 'WARNING: Interface errors exceeding warning threshold',
		'CRITICAL: Interface errors exceeding critical threshold', 'UNKNOWN: Something went wrong',
		'CRITICAL: Interface down', 'WARNING: Interface down')
	line = '%s: %s errors in the last %2.1f/%02d hour(s)' % (nagdict[_level], _errorcount, _deltatime, _time)
	if _perfdata:
		line += '|errors=%dc;%d;%d;' % (_errorcount, _warn, _crit)
		# Compare against the "not queried" markers explicitly: a plain
		# truthiness test would also drop a legitimate state value of 0.
		if _ifstate is not None and _ifstate is not False:
			line += ' state=%d' % _ifstate
	print(line)
	return True


def gen_exitcode(_exitcode):
	"""Terminate the plugin, handing _exitcode to Nagios as the process exit status."""
	# Raising SystemExit directly is what sys.exit() does internally.
	raise SystemExit(_exitcode)


# main
if __name__ == '__main__':
	# Nagios exit codes and shared constants. They are module globals on
	# purpose: the helper functions above read exit_unkn and epoch_multipl.
	masterstopwatch = time.time()
	exit_ok = 0
	exit_warn = 1
	exit_crit = 2
	exit_unkn = 3
	epoch_multipl = 3600  # seconds per hour; --time is given in hours
	ifstate = False       # False means "interface state was not queried"
	EXCODE = False        # False means "no exit code decided yet"
	presets = {
		'qlogicfc': {
			'eprefix' : '.1.3.6.1.3.94.4.5.1.3.16.0.0.192.221',
			'esuffix' : '.0.0.0.0.0.0.0.0',
			'ifprefix': '.1.3.6.1.3.94.1.10.1.23.16.0.0.192.221',
			'ifsuffix': '.0.0.0.0.0.0.0.0'
		}}

	# prepare debug / test case
	args = parse_args()

	if args.test:
		# setup test case
		print('')
		print('Running test case on demo.snmplabs.com; using interface RX packet counter:')
		args.host = 'demo.snmplabs.com'
		args.oid = '.1.3.6.1.2.1.2.2.1.16'
		args.interface = 1
		args.community = 'public'
		args.port = 161
		args.debug = True
		args.time = 1

	if args.preset:
		# Set up preset OIDs ...
		# Preset names are matched case-insensitively; the lowered key is used
		# for every lookup.
		preset_key = args.preset.lower()
		if preset_key not in presets:
			print('Invalid preset. Valid presets are:')
			for key in presets:
				print('   ' + key)
			# Exit only after all valid presets have been listed.
			gen_exitcode(exit_unkn)
		preset = presets[preset_key]
		# --oid is optional together with a preset; an empty value must not be
		# handed to join_oid.
		middle = [args.oid] if args.oid else []
		tail = [str(args.interface)]
		args.ifoid = join_oid(*([preset['ifprefix']] + middle + [preset['ifsuffix']] + tail))
		args.oid = join_oid(*([preset['eprefix']] + middle + [preset['esuffix']] + tail))
	else:
		# ... or argument OIDs
		if not args.oid:
			# Without an error counter OID there is nothing to check.
			print('No error count OID given. Use --oid or --preset.')
			gen_exitcode(exit_unkn)
		if args.ifoid:
			args.ifoid = join_oid(args.ifoid, str(args.interface))
		args.oid = join_oid(args.oid, str(args.interface))

	cache_file = get_agent_file(args.scratch, args.host, args.interface)

	if args.debug:
		print('')
		print('preset host:     %s' % args.host)
		print('preset OID:      %s' % args.oid)
		print('interface:       %s' % args.interface)
		print('ifoid:           %s' % args.ifoid)
		print('ifstate:         %s' % args.ifupstate)
		print('using filename:  %s' % cache_file)
		print('community        %s' % args.community)

	# Rewrite threshold for cache file (only reported in the debug output)
	ts_threshold = get_time_threshold(args.time)

	# read data from cache file, use filesystem mtime for unix epoch timestamp
	snmp_offset, file_timestamp = read_offset(cache_file)

	# run error count snmp check
	if args.debug:
		print('Checking Interface error count...')
		stopwatch = time.time()
	snmp_errs = read_snmp(args.host, args.community, args.port, args.oid)
	if args.debug:
		stopwatch = float(time.time()) - float(stopwatch)
		print('Got Interface error count:  %s' % snmp_errs)
		print('    Time:  %s s.' % stopwatch)

	# Calculate deltas, counter and time. delta_hours is only used for print outs.
	# NOTE(review): delta_errs is negative after a counter reset on the device;
	# that currently reports OK until the cache file is rotated.
	delta_errs = snmp_errs - snmp_offset
	delta_time = int(time.time() - file_timestamp)
	delta_hours = float(delta_time) / float(epoch_multipl)

	if args.debug:
		print('ts_threshold:    %s' % ts_threshold)
		print('file_timestamp:  %s' % file_timestamp)
		print('snmp_offset:     %s' % snmp_offset)
		print('delta_errs:      %s' % delta_errs)
		print('delta_time:      %s' % delta_time)
		print('delta_hours:     %s' % delta_hours)
		print('warning arg:     %s' % args.warning)
		print('critical arg:    %s' % args.critical)

	#start main program logic
	# 'is not None' so that an expected state of 0 still enables the check.
	if args.ifoid and args.ifupstate is not None:
		# do the interface status check only if necessary
		if args.debug:
			print('Running ifstate test...')
			stopwatch = time.time()
		ifstate = read_snmp(args.host, args.community, args.port, args.ifoid)
		if args.debug:
			stopwatch = float(time.time()) - float(stopwatch)
			print('Got Interface state:  %s' % ifstate)
			print('    Time:  %s s.' % stopwatch)
		if ifstate != args.ifupstate:
			if args.debug:
				print('Interface down, got ifstate:  %s' % ifstate)
			if args.warnifdown:
				# argument: --warnifdown
				print_nagios(5, delta_errs, args.warning, args.critical, delta_hours, args.time, ifstate, args.perfdata)
				EXCODE = exit_warn
			elif args.okifdown:
				# argument: --okifdown; nothing is printed here, the error
				# count evaluation below produces the status line.
				EXCODE = exit_ok
			else:
				# normal exit behavior
				print_nagios(4, delta_errs, args.warning, args.critical, delta_hours, args.time, ifstate, args.perfdata)
				EXCODE = exit_crit
	if args.community and args.host and args.oid and (not EXCODE or EXCODE == exit_ok):
		# Error count / evaluation. Should not be run when interface state error.
		# NOTE(review): a delta equal to --warning is OK while a delta equal to
		# --critical is CRITICAL; confirm which boundary behaviour is intended.
		if (delta_errs <= args.warning) or (file_timestamp == -1):
			# Nagios OK
			if file_timestamp == -1:
				# First run printout
				print('Initial baseline, %i Errors.' % delta_errs)
			else:
				print_nagios(0, delta_errs, args.warning, args.critical, delta_hours, args.time, ifstate, args.perfdata)
			EXCODE = exit_ok
		elif delta_errs >= args.critical:
			# Nagios CRITICAL
			print_nagios(2, delta_errs, args.warning, args.critical, delta_hours, args.time, ifstate, args.perfdata)
			EXCODE = exit_crit
		elif delta_errs >= args.warning and delta_errs < args.critical:
			# Nagios WARNING (logical 'and'; bitwise '&' binds tighter than
			# the comparisons and does not express this range test)
			print_nagios(1, delta_errs, args.warning, args.critical, delta_hours, args.time, ifstate, args.perfdata)
			EXCODE = exit_warn
		else:
			# Nagios UNKNOWN. This should never be the case
			print_nagios(3, delta_errs, args.warning, args.critical, delta_hours, args.time, ifstate, args.perfdata)
			EXCODE = exit_unkn
	if (delta_time > args.time * epoch_multipl) or file_timestamp == -1:
		# Always (re)write the cache when threshold is reached
		if args.debug:
			print('Initialing / rotating tmp file')
		write_offset(cache_file, snmp_errs)
	# Finally exit with the desired exit code for Nagios.
	if args.debug:
		print('Finished. Total execution time:  %s s' % (float(time.time()) - float(masterstopwatch)))
	gen_exitcode(EXCODE)