/******************************************************************************
* Nagios check_lpar_cpu plugin
*
* License: GPL
* Author: Konstantin Reichert
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Please visit also http://www.ibm.com/developerworks/wikis/display/WikiPtype/ryo
* Parts of the code within this plugin come from there.
*
******************************************************************************/
#include <libperfstat.h> // C programming language subroutines that execute in user space and extract data from the perfstat kernel extension (kex) to obtain statistics. This API is available in AIX 5L.
#include <libgen.h> // Header to get the basename of the file.
#include <stdio.h> // C library to perform Input/Output operations
#include <stdlib.h> // C programming language which includes functions involving memory allocation, process control, conversions and others.
#include <string.h>
#include <iostream>
#include <iomanip>
#include <string>
#include <time.h> // C Standard Library to declare time and date functions that provide standardized access to time/date manipulation and formatting.
#include <unistd.h> // Defines miscellaneous symbolic constants and types, and declares miscellaneous functions.
#include <sys/systemcfg.h>
using namespace std;
/******************************************************************************
* Variables
******************************************************************************/
/* Plugin metadata. */
std::string Description = "This plugin checks lpar cpu and outputs same values as lparstat command.\nThresholds can be set for Idle, App and Backward values, but feel free to implement other thresholds.\n";
std::string Author = "Konstantin Reichert ";
std::string Date = "2010/08/04 ";
std::string Version = "0.6 ";

/* Ratio built from the Xint/Xfrac fields of _system_configuration. */
#define XINTFRAC ((double)(_system_configuration.Xint)/(double)(_system_configuration.Xfrac))
/* Scale x by XINTFRAC and divide by 1e9. The argument and the whole expansion
 * are parenthesized so the macro composes safely inside larger expressions. */
#define HTIC2SEC(x) (((double)(x) * XINTFRAC) / (double)1000000000.0)

/* Snapshot kept between two calls of check_lpar_util(); the first call only
 * fills it, the second one computes deltas against it. */
static int firstiteration = 1;
static u_longlong_t last_time_base;
static u_longlong_t last_pcpu_user, last_pcpu_sys, last_pcpu_idle, last_pcpu_wait;
static u_longlong_t last_lcpu_user, last_lcpu_sys, last_lcpu_idle, last_lcpu_wait;
static u_longlong_t last_phint = 0, last_vcsw = 0, last_pit = 0;
static u_longlong_t last_runque, last_swpque;

/* Result bookkeeping: `state` holds the most recent verdict (0..3), the
 * counters record how many threshold checks ended in each non-OK verdict. */
int state;
int warn_cnt = 0;
int crit_cnt = 0;
int unkn_cnt = 0;

/* argv[0] as received by main(); used when printing usage. */
char *ch;

/* Default thresholds. App and Idle use "less than" checks, Backwards
 * (back_*) uses a "greater than" check. */
double app_warn = 20;
double app_crit = 10;
double back_warn = 4;
double back_crit = 5;
double idle_warn = 20;
double idle_crit = 10;

/* Non-zero when -p was given: append performance data to the output. */
static int perfdata = 0;
/******************************************************************************
* Print Usage Info
******************************************************************************/
/*
 * Print the two-line usage synopsis to stdout.
 *
 * basename: program name to show in front of the option list.
 *
 * The -a, -i and -b options each take a "<warn>,<crit>" pair (that is how
 * main() parses them), so the synopsis spells the placeholders out instead of
 * showing bare commas.
 */
void print_usage(char *basename) {
    std::cout << "Usage:\n\t" << basename
              << " [-p] [-a] <warn>,<crit> [-i] <warn>,<crit> [-b] <warn>,<crit>" << std::endl;
    std::cout << "\t" << basename << " -h" << std::endl;
}
/******************************************************************************
* Print Help
******************************************************************************/
// NOTE(review): this span looks damaged and will not compile as written.
// The statement that opens print_help runs straight into what reads like the
// tail of a threshold comparison: `value`, `warning` and `critical` are used
// below but never declared in this function. main() calls
// check_thresh_LT(double, double, int) and check_lpar_util() calls
// check_thresh_LT(double, double, double); neither definition is visible
// anywhere else in this file, so they were presumably lost from this region
// together with the rest of the help text. Restore from the upstream source
// before building -- TODO confirm.
void print_help(char *basename) {
cout << setfill('-') << setw(50) << "-" < warning) {
// value is above the warning threshold: record state 0, counters untouched
return state = 0;
}
// value lies between critical and warning (both bounds inclusive): warning
else if((value <= warning) && (value >= critical)) {
warn_cnt++;
return state = 1;
}
// only reached when the previous test failed, i.e. value is below critical
else if(value <= critical) {
crit_cnt++;
return state = 2;
}
// none of the comparisons held (e.g. a NaN operand): count as unknown
else {
unkn_cnt++;
return state = 3;
}
}
/******************************************************************************
* Check Thresholds for Greater Than WARN/CRIT-Value
******************************************************************************/
/*
 * Validate a warning/critical pair for a "greater than" check.
 *
 * warning, critical: thresholds supplied for option `opt`.
 * opt: option character, only used in the error message.
 *
 * Returns normally unless warning is greater than critical; in that case an
 * error and the usage text are printed and the process exits with status 3.
 */
void check_thresh_GT(double warning, double critical, int opt) {
    const bool misordered = (warning > critical);
    if (!misordered) {
        return;
    }
    printf("Warning-Value for -%c Options must be LESS than Critical-Value!!!\n", opt);
    print_usage(basename(ch));
    exit(3);
}
/*
 * Classify `value` against "greater than" thresholds.
 *
 * Returns (and stores in the global `state`):
 *   0 - value is below warning
 *   1 - value is within [warning, critical]; warn_cnt is incremented
 *   2 - value is at or beyond critical and not caught above; crit_cnt is incremented
 *   3 - no comparison held (e.g. NaN); unkn_cnt is incremented
 */
int check_thresh_GT(double value, double warning, double critical) {
    int verdict;
    if (value < warning) {
        verdict = 0;
    }
    else if (warning <= value && value <= critical) {
        ++warn_cnt;
        verdict = 1;
    }
    else if (critical <= value) {
        ++crit_cnt;
        verdict = 2;
    }
    else {
        ++unkn_cnt;
        verdict = 3;
    }
    state = verdict;
    return verdict;
}
/******************************************************************************
* Get CPU Total and LPAR Values from AIX-Kernel
******************************************************************************/
/*
 * Remember the counters of the current sample in the file-level last_*
 * variables so that the next call of check_lpar_util() can build deltas.
 *
 * cpustats:  CPU totals of the current sample.
 * lparstats: partition totals of the current sample.
 */
void save_last_values(perfstat_cpu_total_t *cpustats, perfstat_partition_total_t *lparstats) {
    const perfstat_partition_total_t &lpar = *lparstats;
    const perfstat_cpu_total_t &cpu = *cpustats;

    /* values taken from the partition totals */
    last_time_base = lpar.timebase_last;
    last_pit       = lpar.pool_idle_time;
    last_phint     = lpar.phantintrs;
    last_vcsw      = lpar.vol_virt_cswitch + lpar.invol_virt_cswitch;
    last_pcpu_user = lpar.puser;
    last_pcpu_sys  = lpar.psys;
    last_pcpu_idle = lpar.pidle;
    last_pcpu_wait = lpar.pwait;

    /* values taken from the CPU totals */
    last_lcpu_user = cpu.user;
    last_lcpu_sys  = cpu.sys;
    last_lcpu_idle = cpu.idle;
    last_lcpu_wait = cpu.wait;
    last_runque    = cpu.runque;
    last_swpque    = cpu.swpque;
}
/*
 * Gather and display lpar utilization metrics.
 *
 * The first call only records a baseline via save_last_values() and returns.
 * Every later call computes the deltas against the stored baseline, prints
 * the headline line, the detail section and (when -p was given) the perfdata
 * block, updates warn_cnt/crit_cnt/unkn_cnt through the check_thresh_*
 * helpers, and finally stores the current sample as the new baseline.
 *
 * Exits with status 3 when the perfstat calls fail, when no time elapsed
 * between the samples, or when a shared LPAR lacks pool utilization
 * authority.
 */
void check_lpar_util() {
    u_longlong_t dlt_pcpu_user, dlt_pcpu_sys, dlt_pcpu_idle, dlt_pcpu_wait; /* physical CPU deltas */
    u_longlong_t dlt_lcpu_user, dlt_lcpu_sys, dlt_lcpu_idle, dlt_lcpu_wait; /* logical CPU deltas */
    u_longlong_t dlt_runque, dlt_swpque;
    u_longlong_t vcsw, lcputime, pcputime;
    u_longlong_t entitled_purr, unused_purr;
    u_longlong_t delta_purr, delta_time_base;
    double phys_proc_consumed, entitlement, percent_ent;
    /* Zero-initialized: the perfdata line prints these even on paths
     * (dedicated LPAR) that never compute them. */
    double app = 0.0, app_in_pct = 0.0;
    double lbusy = 0.0, vcsw_now = 0.0, phint = 0.0, delta_sec = 0.0;
    perfstat_partition_total_t lparstats;
    perfstat_cpu_total_t cpustats;

    /* Retrieve the metrics. perfstat reports failure with -1, which a plain
     * "!ret" test would miss, so anything that is not a positive structure
     * count is treated as an error. */
    if (perfstat_partition_total((perfstat_id_t*)NULL, &lparstats, sizeof(perfstat_partition_total_t), 1) <= 0) {
        perror("perfstat_partition_total");
        exit(3);
    }
    if (perfstat_cpu_total((perfstat_id_t*)NULL, &cpustats, sizeof(perfstat_cpu_total_t), 1) <= 0) {
        perror("perfstat_cpu_total");
        exit(3);
    }

    /* first iteration, we only read the data and save the data */
    if (firstiteration) {
        firstiteration = 0;
        save_last_values(&cpustats, &lparstats);
        return;
    }

    dlt_pcpu_user = lparstats.puser - last_pcpu_user;
    dlt_pcpu_sys = lparstats.psys - last_pcpu_sys;
    dlt_pcpu_idle = lparstats.pidle - last_pcpu_idle;
    dlt_pcpu_wait = lparstats.pwait - last_pcpu_wait;
    delta_purr = pcputime = dlt_pcpu_user + dlt_pcpu_sys + dlt_pcpu_idle + dlt_pcpu_wait;

    dlt_lcpu_user = cpustats.user - last_lcpu_user;
    dlt_lcpu_sys = cpustats.sys - last_lcpu_sys;
    dlt_lcpu_idle = cpustats.idle - last_lcpu_idle;
    dlt_lcpu_wait = cpustats.wait - last_lcpu_wait;
    lcputime = dlt_lcpu_user + dlt_lcpu_sys + dlt_lcpu_idle + dlt_lcpu_wait;

    entitlement = (double)lparstats.entitled_proc_capacity / 100.0 ;
    dlt_runque = cpustats.runque - last_runque;
    dlt_swpque = cpustats.swpque - last_swpque;
    delta_time_base = lparstats.timebase_last - last_time_base;

    if (lparstats.type.b.shared_enabled) {
        entitled_purr = delta_time_base * entitlement;
        if (entitled_purr < delta_purr) {
            /* when above entitlement, use consumption in percentages */
            entitled_purr = delta_purr;
        }
        unused_purr = entitled_purr - delta_purr;
        /* distribute unused purr in wait and idle proportionally to logical wait and idle */
        const u_longlong_t lcpu_not_busy = dlt_lcpu_wait + dlt_lcpu_idle;
        if (lcpu_not_busy != 0) {
            dlt_pcpu_wait += unused_purr * ((double)dlt_lcpu_wait / (double)lcpu_not_busy);
            dlt_pcpu_idle += unused_purr * ((double)dlt_lcpu_idle / (double)lcpu_not_busy);
        }
        else {
            /* No logical idle or wait ticks in the interval: the ratio would
             * be 0/0, so book whatever is unused as idle instead. */
            dlt_pcpu_idle += unused_purr;
        }
        pcputime = entitled_purr;
    }

    /* Every figure below divides by one of these two values. */
    if (delta_time_base == 0 || pcputime == 0) {
        printf("No CPU time elapsed between the two samples.\n");
        exit(3);
    }

    /* Physical Processor Utilization */
    double user = ((double)dlt_pcpu_user * 100.0 / (double)pcputime);
    double sys = ((double)dlt_pcpu_sys * 100.0 / (double)pcputime);
    double wait = ((double)dlt_pcpu_wait * 100.0 / (double)pcputime);
    double idle = ((double)dlt_pcpu_idle * 100.0 / (double)pcputime);
    phys_proc_consumed = (double)delta_purr / (double)delta_time_base;
    percent_ent = (double)((phys_proc_consumed / entitlement) * 100);

    /* Special Check for CPU including APP and Idle */
    if (lparstats.type.b.shared_enabled) {
        if (!lparstats.type.b.pool_util_authority) {
            printf("'Allow performance information collection.' must be set in HMC for the monitored LPAR.");
            exit (3);
        }
        /* Available Pool Processor (app), absolute and relative to the pool size */
        app = ((double)(lparstats.pool_idle_time - last_pit) / (XINTFRAC*(double)delta_time_base));
        app_in_pct = ((double)(app * 100 / lparstats.phys_cpus_pool));
        /* The App check only runs when the Idle check is already non-OK
         * (short-circuit), so counters are bumped for App here only then.
         * NOTE(review): the 950.0 limit on Entc is kept as found -- confirm
         * it is intended. */
        if ((check_thresh_LT(idle, idle_warn, idle_crit) != 0 &&
             check_thresh_LT(app_in_pct, app_warn, app_crit) != 0) ||
            percent_ent >= 950.0) {
            printf("App = %1.2f (%1.2f%%) (WARNING < %1.1f%% CRITICAL < %1.1f%%) AND Idle = %1.1f%% (WARNING < %1.1f%% CRITICAL < %1.1f%%) AND Entc = %1.1f%%\n",
                   app,
                   app_in_pct,
                   app_warn,
                   app_crit,
                   idle,
                   idle_warn,
                   idle_crit,
                   percent_ent);
        }
        else {
            printf("User = %1.1f%%, Sys = %1.1f%%, Wait = %1.1f%%, Idle = %1.1f%%\n", user, sys, wait, idle);
            /* Idle alone (or nothing) tripped: forget what the checks counted. */
            warn_cnt = 0;
            crit_cnt = 0;
            unkn_cnt = 0;
        }
    }
    else {
        if (check_thresh_LT(idle, idle_warn, idle_crit) != 0) {
            printf("User = %1.1f%%, Sys = %1.1f%%, Wait = %1.1f%%, Idle = %1.1f%% (WARNING < %1.1f%% CRITICAL < %1.1f%%)\n", user, sys, wait, idle, idle_warn, idle_crit);
        }
        else {
            printf("User = %1.1f%%, Sys = %1.1f%%, Wait = %1.1f%%, Idle = %1.1f%%\n", user, sys, wait, idle);
        }
    }

    printf("\n");
    printf("### Detail Information ###\n");
    /* Backwards is a "greater than" check, so the hint shows '>' */
    if (check_thresh_GT(dlt_swpque, back_warn, back_crit) != 0) {
        printf("- Backwards = %llu (WARNING > %1.1f CRITICAL > %1.1f)\n", dlt_swpque, back_warn, back_crit);
    }
    else {
        printf("- Backwards = %llu\n", dlt_swpque);
    }
    printf("- RunQueue = %llu\n", dlt_runque);

    if (lparstats.type.b.shared_enabled) {
        /* Pool utilization authority is guaranteed here (checked above) and
         * app/app_in_pct are already computed.
         * NOTE(review): this second App check bumps the counters again, so
         * App below threshold raises the exit state even when Idle is fine --
         * kept as found, confirm it is intended. */
        if (check_thresh_LT(app_in_pct, app_warn, app_crit) != 0) {
            printf("- App = %1.2f (%1.2f%%) (WARNING < %1.1f%% CRITICAL < %1.1f%%)\n", app, app_in_pct, app_warn, app_crit);
        }
        else {
            printf("- App = %1.2f\n", app);
        }
        printf("\n");
        /* Print of Physical Processor Consumed */
        printf("- Physc = %1.2f\n", (double)phys_proc_consumed);
        /* Print of Percentage of Entitlement Consumed */
        printf("- Entc = %1.1f%%\n", percent_ent);
        /* Logical Processor Utilization */
        if (lcputime != 0) {
            lbusy = ((double)(dlt_lcpu_user + dlt_lcpu_sys) * 100.0 / (double)lcputime);
        }
        printf("- Lbusy = %1.1f\n", lbusy);
        /* Virtual CPU Context Switches per second */
        vcsw = lparstats.vol_virt_cswitch + lparstats.invol_virt_cswitch;
        delta_sec = HTIC2SEC(delta_time_base);
        vcsw_now = ((double)(vcsw - last_vcsw) / delta_sec);
        printf("- Vcsw = %1.0f\n", vcsw_now);
        /* Phantom Interrupts per second */
        phint = ((double)(lparstats.phantintrs - last_phint) / delta_sec);
        printf("- Phint = %1.0f\n", phint);
    }

    printf("\n");
    printf("### %s running at %llu MHz ###\n", cpustats.description, cpustats.processorHZ / 1000000);
    printf("- CPUs Configured: %d\n", cpustats.ncpus_cfg);
    printf("- CPUs Active: %d", cpustats.ncpus);
    if(perfdata) {
        printf(" | Idle=%1.1f%%;%1.1f;%1.1f Backwards=%llu;%1.f;%1.f App=%1.2f;%1.2f;%1.2f RunQueue=%llu User=%1.1f%% Sys=%1.1f%% Wait=%1.1f%% Physc=%1.2f Entc=%1.1f%% Lbusy=%1.1f Vcsw=%1.0f Phint=%1.0f",
               idle,
               idle_warn,
               idle_crit,
               dlt_swpque,
               back_warn,
               back_crit,
               app,
               app_warn * lparstats.phys_cpus_pool / 100,
               app_crit * lparstats.phys_cpus_pool / 100,
               dlt_runque,
               user,
               sys,
               wait,
               phys_proc_consumed,
               percent_ent,
               lbusy,
               vcsw_now,
               phint);
    }
    printf("\n");
    save_last_values(&cpustats, &lparstats);
}
/******************************************************************************
* Main
******************************************************************************/
int main(int argc, char *argv[]) {
ch = argv[0]; // get the name/path of/to the check
int arguments;
state = 0;
if (argc <= 1) {
print_usage(basename(ch));
return 2;
}
while ((arguments = getopt (argc, argv, ":a:b:hi:p")) != -1) {
switch (arguments) {
case 'a':
app_warn = atoi (strtok(optarg, ","));
app_crit = atoi (strtok(NULL, ","));
check_thresh_LT(app_warn, app_crit, optopt);
break;
case 'b':
back_warn = atoi (strtok(optarg, ","));
back_crit = atoi (strtok(NULL, ","));
check_thresh_GT(back_warn, back_crit, optopt);
break;
case 'h':
print_help(basename(ch));
break;
case 'i':
idle_warn = atoi (strtok(optarg, ","));
idle_crit = atoi (strtok(NULL, ","));
check_thresh_LT(idle_warn, idle_crit, optopt);
break;
case 'p':
perfdata = 1;
break;
case ':':
fprintf (stderr, "Option -%c requires an argument.\n", optopt);
break;
case '?':
fprintf (stderr, "Unknown -%c argument.\n", optopt);
break;
default:
abort();
}
}
for (int i = 0 ; i < 2 ; i++) {
check_lpar_util();
sleep(1);
}
if (warn_cnt == 0 && crit_cnt == 0) {
/* printf ("OK\n"); */
state = 0;
}
else if (warn_cnt >= 0 && crit_cnt == 0) {
/* printf ("WARN\n"); */
state = 1;
}
else if (crit_cnt > 0 || crit_cnt >= warn_cnt) {
/* printf ("CRIT\n"); */
state = 2;
}
else {
/* printf ("UNKNOWN\n"); */
state = 3;
}
/*
* printf ("WARN: %i\n", warn_cnt);
* printf ("CRIT: %i\n", crit_cnt);
* printf ("UNKN: %i\n", unkn_cnt);
* printf ("\n");
*/
return state;
}