#!/usr/bin/perl -w
# cluster_percent   Timothy Denike 8/14/2004   Friendster.com
#
# This script acts as a wrapper to the check_cluster binary included in the
# Nagios distribution.  The problems with check_cluster are that it requires
# you to pipe to stdin a list of hostnames and services, as well as specify
# fixed values for the warning/critical thresholds.
#
# This script addresses those issues by taking a Nagios hostgroup name and
# a Nagios service name as parameters.  The hostgroup members are looked up
# from the Nagios config file and fed into check_cluster.  Optionally,
# you can specify a percentage for the trigger thresholds, rather than a
# fixed value.
#
# Exit status used throughout: 0 = OK, 1 = WARNING, 2 = CRITICAL,
# -1 = UNKNOWN.
# NOTE(review): "exit -1" reaches the parent process as status 255; the
# current Nagios plugin guidelines use 3 for UNKNOWN.  Left unchanged here
# so existing installations behave as before - confirm before changing.

# Edit the three variables below as needed for your system
use strict;
use IPC::Open2;

my $config     = "/etc/nagios/hostgroups.cfg";
my $statusfile = "/var/log/nagios/status.log";
my $cluster    = "/usr/lib/nagios/plugins/check_cluster";
my $DEBUG      = 0;

# alog($text)
#   Append one timestamped line, tagged with our pid, to the debug log.
#   Dies if the log file cannot be opened.
sub alog {
    my $text = shift;
    my $log  = "/var/log/nagios/cluster_percent.log";
    my $time = localtime();

    # "or" (not "||") so the die applies to open() rather than to the
    # filename argument, which is always true.
    open(my $fh, '>>', $log) or die "Unable to open $log\n";
    print {$fh} "[$$ $time] $text\n";
    close $fh;
}

# barf($errorcode, $description)
#   Print the one-line plugin output and exit with $errorcode.
#   Never returns.
sub barf {
    my ($errorcode, $description) = @_;

    print "$description\n";
    if ($DEBUG) { alog("barf - $errorcode: $description"); }
    exit $errorcode;
}

# roundup($n)
#   Return $n unchanged when it is already a whole number, otherwise
#   int($n + 1).
sub roundup {
    my $n = shift;
    return (($n == int($n)) ? $n : int($n + 1));
}

# syntax()
#   Print the usage message and exit with the UNKNOWN status.
#   Never returns.
sub syntax {
    print "$0 hostgroup service [warn] [crit]\n";
    print "\thostgroup = Hostgroup name as it appears in hostgroups.cfg\n";
    print "\tservice = Service name as is appears in Nagios\n";
    print "\twarn = % down to trigger Warning - def 25\n";
    print "\tcrit = % down to trigger Critical - def 33\n";
    if ($DEBUG) { alog("Called with no parms - syntax output"); }
    exit -1;    # Just in case we're called improperly, exit Unknown state
}

# get_members($hostgroup, $config)
#   Look up the members of $hostgroup in the object config file $config.
#   Returns the list of host names, or an empty list when the hostgroup is
#   not found or has no members.  Exits via barf() with UNKNOWN if the
#   file cannot be opened.
sub get_members {
    my ($hostgroup, $config) = @_;

    open(my $cfg, '<', $config) or barf(-1, "Unable to open $config");

    # Directives seen so far in the stanza currently being read.  Tracking
    # both lets "members" appear before or after "hostgroup_name".
    my ($name, $members);

    while (my $line = <$cfg>) {
        next if $line =~ /^\s*#/;    # whole-line comment
        $line =~ s/\s*;.*//;         # strip trailing ";" comment

        # A new "define" or a closing brace ends the previous stanza, so
        # forget whatever we collected from it.
        if ($line =~ /^\s*(?:define\b|\})/) {
            ($name, $members) = (undef, undef);
            next;
        }

        if ($line =~ /^\s*hostgroup_name\s+(.*?)\s*$/) {
            $name = $1;
        }
        elsif ($line =~ /^\s*members\s+(.*?)\s*$/) {
            $members = $1;
        }

        # Compare the name exactly (no regex interpolation), so that
        # "web" does not match "webservers" and metacharacters in the
        # requested name are harmless.
        if (defined $name && defined $members && $name eq $hostgroup) {
            close $cfg;
            if ($DEBUG) { alog("Returning member list"); }
            # Tolerate "a, b ,c" as well as "a,b,c"; drop empty entries.
            return grep { length } split /\s*,\s*/, $members;
        }
    }
    close $cfg;

    # Empty list, so the caller's host count is 0 for an unknown group.
    return;
}

MAIN: {
    my $hostgroup = shift(@ARGV) || syntax();
    my $service   = shift(@ARGV) || syntax();
    my $warn      = shift(@ARGV) || "25";    # default to 25% for warnings
    my $crit      = shift(@ARGV) || "33";    # 33% for critical
    my $output    = "";
    my $status    = -1;
    my ($RCLU, $WCLU);

    if ($DEBUG) { alog("Starting up.."); }

    # Get the members of the specified hostgroup
    my @members   = get_members($hostgroup, $config);
    my $hostcount = scalar @members;
    if ($hostcount == 0) { barf(-1, "No hosts found in group $hostgroup"); }

    # Set the number of failed hosts to trigger a warning/critical
    my $critcount = roundup($hostcount * $crit / 100);
    my $warncount = roundup($hostcount * $warn / 100);

    # Open the check_cluster binary for input/output.
    # We'll pipe in a list of hosts, and parse the output.
    if ($DEBUG) { alog("Dumping $hostcount to check_cluster"); }

    # If check_cluster exits before reading all of its input, the writes
    # below must not kill this script with SIGPIPE; its exit status is
    # examined afterwards instead.
    local $SIG{PIPE} = 'IGNORE';

    # open2 raises an exception on failure rather than returning false.
    my $pid = eval {
        open2($RCLU, $WCLU,
              "$cluster --service $statusfile $warncount $critcount");
    };
    if (!$pid) {
        my $why = defined $@ ? $@ : "";
        chomp $why;
        barf(-1, "Unable to run $cluster: $why");
    }

    # check_cluster wants hostname;servicename\n as input
    foreach my $member (@members) {
        print {$WCLU} "$member;$service\n";
    }
    close $WCLU;
    if ($DEBUG) { alog("Finished writing to stdin"); }

    # Now fetch the output.  It should only be one line; if there are
    # more, the last one wins.
    while (my $line = <$RCLU>) {
        $output = $line;
    }
    close $RCLU;
    if ($DEBUG) { alog("Got output - waiting for pid to exit"); }

    # Stick around for the child process to exit.
    waitpid($pid, 0);
    my $wait_status = $?;
    if ($DEBUG) { alog("Child pid $pid done, going on.."); }

    # The wait status carries the exit code in its high byte and the
    # terminating signal (if any) in the low 7 bits.  A child that was
    # killed by a signal, or could not be waited for, is UNKNOWN rather
    # than being mistaken for exit code 0.
    if ($wait_status == -1 || ($wait_status & 127)) {
        $status = -1;
    }
    else {
        $status = $wait_status >> 8;
    }

    # Get rid of any newline on the end of the output
    chomp $output;

    # I don't want the "service cluster ok: " part, just the
    # good stuff at the end of the line.  If the output does not have
    # that shape, keep it as-is instead of discarding it.
    if ($output =~ /.*: (.*)$/) {
        $output = $1;
    }

    # I'd rather have a total count first for reference.
    $output = "$hostcount total - $output";

    # Let's just make sure any weird codes exit with -1 (Unknown)
    if ($status < 0 || $status > 2) { $status = -1; }

    # Call barf to exit with a description and proper exit status.
    barf($status, $output);
}