http://cpansearch.perl.org/src/TROCKIJ/mon-0.99.2/mon.d/
#!/usr/bin/perl
#
# mon - schedules service tests and triggers alerts upon failures
#
# Jim Trocki, trockij@transmeta.com
#
# $Id: mon 1.27 Sat, 08 Sep 2001 09:42:05 -0400 trockij $
#
# Copyright (C) 1998 Jim Trocki
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#
use strict;

my $RCSID='$Id: mon 1.27 Sat, 08 Sep 2001 09:42:05 -0400 trockij $';
my $AUTHOR='trockij@transmeta.com';
my $RELEASE='$ProjectVersion: mon-0-99-2.6 $';

#
# modules in the perl distribution
#
use Getopt::Std;
use Text::ParseWords;
use POSIX;
use Fcntl;
use Socket;
use Sys::Hostname;
use Sys::Syslog qw(:DEFAULT setlogsock);
use FileHandle;
use Data::Dumper;

#
# CPAN modules
#
use Time::HiRes qw(gettimeofday tv_interval usleep);
use Time::Period;
use Mon::SNMP;

#use SNMP in read_cf()

#
# forward declarations, so the subs below can be called without parens
#
sub auth;
sub call_alert;
sub check_auth;
sub clear_timers;
sub client_accept;
sub client_close;
sub client_command;
sub client_dopending;
sub client_write_opstatus;
sub collect_output;
sub daemon;
sub debug;
sub debug_dir;
sub dep_ok;
sub depend;
sub dhmstos;
sub die_die;
sub disen_host;
sub disen_service;
sub disen_watch;
sub do_alert;
sub do_startup_alerts;
sub err_startup;
sub esc_str;
sub gen_scriptdir_hash;
sub handle_io;
sub handle_snmp_trap;
sub handle_trap;
sub handle_trap_timeout;
sub host_exists;
sub inRange;
sub init_cf_globals;
sub init_globals;
sub load_auth;
sub load_oncall;
sub load_state;
sub normalize_paths;
sub init_dtlog;
sub pam_conv_func;
sub proc_cleanup;
sub randomize_startdelay;
sub read_cf;
sub readhistoricfile;
sub reload;
sub remove_proc;
sub reset_server;
sub run_monitor;
sub save_state;
sub set_last_test;
sub set_op_status;
sub reset_timer;
sub setup_server;
sub sock_write;
sub syslog_die;
sub un_esc_str;
sub usage;
sub write_dtlog;

#
# globals
#
my %opt;			# cmdline arguments
my %CF;				# configuration directives
my $PWD;			# current working directory
my $HOSTNAME;			# system hostname
my $STOPPED;			# 1 = scheduler stopped, 0 = not stopped
my $STOPPED_TIME;		# time(2) scheduler was stopped, if stopped
my $SLEEPINT;			# don't touch
my %oncall;			# currently unused
my %watch_disabled;		# watches disabled, indexed by watch
my %watch;			# main configuration file data structure
my %alias;			# aliases
my %groups;			# hostgroups, indexed by group

#
# I/O routine globals
#
my %clients;			# fds of connected clients
my $numclients;			# count of connected clients
my %running;			# procs which are forked and running,
				# indexed by group/service
my $iovec;			# used for select loop
my %runningpid;			# procs which are forked and running,
				# indexed by PID
my $procs;			# number of outstanding procs
my %fhandles;			# input file handles of children
my %ibufs;			# buffer structure to hold data from children
my ($fdset_rbits, $fdset_ebits);

#
# history globals
#
my @last_alerts;		# alert history, in memory
my @last_failures;		# failure history, in memory

#
# misc. globals
#
my $i;				# loop iteration counter, used for debugging only
my $lasttm;			# the last time(2) the mon loop started
my $pid_file_owner;		# set when creating pid file
my $tm;				# used in main loop

#
# authentication structure globals
#
my %AUTHCMDS;
my %NOAUTHCMDS;
my %AUTHTRAPS;
my %AUTHSNMPTRAPS;

#
# PAM authentication globals (must not be lexically scoped)
#
use vars qw (
    $PAM_username
    $PAM_password
) ;

#
# opstatus globals
#
my (%OPSTAT, %FAILURE, %SUCCESS, %WARNING);	# operational statuses

my ($TRAP_COLDSTART, $TRAP_WARMSTART,		# trap types
    $TRAP_LINKDOWN, $TRAP_LINKUP, $TRAP_AUTHFAIL,
    $TRAP_EGPNEIGHBORLOSS, $TRAP_ENTERPRISE, $TRAP_HEARTBEAT);

my ($STAT_FAIL, $STAT_OK, $STAT_COLDSTART,	# _op_status values
    $STAT_WARMSTART, $STAT_LINKDOWN, $STAT_UNKNOWN,
    $STAT_TIMEOUT, $STAT_UNTESTED, $STAT_DEPEND, $STAT_WARN);

my ($FL_MONITOR, $FL_UPALERT,			# alert type flags
    $FL_TRAP, $FL_TRAPTIMEOUT, $FL_STARTUPALERT, $FL_TEST);

my $TRAP_PDU;

my (%ALERTHASH, %MONITORHASH);			# hash of pathnames for
						# alerts/monitors
my $PROT_VERSION;
my $START_TIME;					# time(2) server started
my $TRAP_PRO_VERSION;				# trap protocol version
my $DEP_EVAL_SANDBOX;				# perl environment for
						# dep evals

#
# argument parsing
#
# "k:" is part of the spec because $opt{"k"} is consulted below to
# override MAX_KEEP; without it getopts never fills that slot.
#
getopts ("fhlMSvda:A:b:B:c:D:i:k:L:m:O:o:p:P:r:s:t:", \%opt);

#
# these two things can be taken care of without
# initializing things further
#
if ($opt{"v"}) {
    print "$RCSID\n$RELEASE\n";
    exit;
}

if ($opt{"h"}) {
    usage();
    exit;
}

if ($opt{"d"}) {
    eval 'require Data::Dumper;';
    if ($@ ne "") {
        die "error: $@\n";
    }
}

($^O eq "linux" || $^O eq "openbsd") && setlogsock ('unix');

# NOTE(review): nothing above assigns to %CF, so SYSLOG_FACILITY is
# presumably still undef for this first openlog; syslog is re-opened
# with the configured facility once the config file has been read.
openlog ("mon", "cons,pid", $CF{"SYSLOG_FACILITY"});

#
# definitions
#
die "basedir $opt{b} does not exist\n" if ($opt{"b"} && ! -d $opt{"b"});

init_globals();
init_cf_globals();

syslog_die ("config file $CF{CF} does not exist") if (! -f $CF{"CF"});

#
# read config file
#
if ((my $err = read_cf ($CF{"CF"}, 1)) ne "") {
    syslog_die ("$err");
}

closelog;
openlog ("mon", "cons,pid", $CF{"SYSLOG_FACILITY"});

#
# cmdline args override config file
#
$CF{"ALERTDIR"}  = $opt{"a"} if ($opt{"a"});
$CF{"BASEDIR"}   = $opt{"b"} if ($opt{"b"});
$CF{"AUTHFILE"}  = $opt{"A"} if ($opt{"A"});
$CF{"LOGDIR"}    = $opt{"L"} if ($opt{"L"});
$CF{"STATEDIR"}  = $opt{"D"} if ($opt{"D"});
$CF{"SCRIPTDIR"} = $opt{"s"} if ($opt{"s"});
$CF{"OCFILE"}    = $opt{"o"} if ($opt{"o"});
$CF{"PIDFILE"}   = $opt{"P"} if defined($opt{"P"});	# allow empty pidfile

$CF{"MAX_KEEP"}  = $opt{"k"} if ($opt{"k"});
$CF{"MAXPROCS"}  = $opt{"m"} if ($opt{"m"});
$CF{"SERVPORT"}  = $opt{"p"} if ($opt{"p"});
$CF{"TRAPPORT"}  = $opt{"t"} if ($opt{"t"});
$SLEEPINT        = $opt{"i"} if ($opt{"i"});

if ($opt{"r"}) {
    syslog_die ("bad randstart value") if (!defined (dhmstos ($opt{"r"})));
    $CF{"RANDSTART"} = dhmstos($opt{"r"});
}

if ($opt{"S"}) {
    $STOPPED = 1;
    $STOPPED_TIME = time;
}

#
# do some path cleanups and
# build lookup tables for alerts and monitors
#
normalize_paths();
gen_scriptdir_hash();

if ($opt{"d"}) {
    debug_dir();
}

#
# load the auth control, oncall, bind, and listen
#
load_auth (1);
%oncall = ();
#load_oncall (1);

#
# init client interface
# %clients is an I/O structure, indexed by the fd of the client
# $numclients is the number of clients currently connected
# $iovec is fd_set for clients and traps
#
%clients = ();
$numclients = 0;
$iovec = '';
setup_server();

#
# fork and become a daemon
#
init_dtlog() if ($CF{"DTLOGGING"});

daemon() if ($opt{"f"});

#
# record our pid; an empty PIDFILE setting disables the pid file.
# A pid file that cannot be written is logged rather than ignored,
# and startup continues as before.
#
if ($CF{"PIDFILE"} ne '') {
    if (open PID, ">$CF{PIDFILE}") {
        $pid_file_owner = $$;
        print PID "$pid_file_owner\n";
        close PID;
    } else {
        syslog ('err', "could not write pid file $CF{PIDFILE}: $!");
    }
}

set_last_test ();

#
# randomize startup checks if asked to
#
randomize_startdelay() if ($CF{"RANDSTART"});

@last_alerts = ();
@last_failures = ();
readhistoricfile ();

$procs = 0;
$i=0;
$lasttm=time;
$fdset_rbits = $fdset_ebits = '';
%watch_disabled = ();

$SIG{HUP} = \&reset_server;
$SIG{INT} = \&handle_sigterm;		# for interactive debugging
$SIG{TERM} = \&handle_sigterm;
$SIG{PIPE} = 'IGNORE';

#
# load previously saved state
#
load_state ("disabled") if ($opt{"l"});

syslog ('info', "mon server started");

#
# startup alerts
#
do_startup_alerts();

#
# main monitoring loop
#
# Each pass ages the per-service timers by the wall-clock time that
# elapsed since the previous pass (at least one second), starts any
# monitor whose timer has expired, then collects child output, reaps
# children, and services client/trap I/O.
#
for (;;) {
    debug (1, "$i" . ($STOPPED ? " (stopped)" : "") . "\n");
    $i++;
    $tm = time;

    #
    # step through the watch groups, decrementing and
    # handling expired timers
    #
    if (!$STOPPED) {
        foreach my $group (keys %watch) {
            #
            # skip over disabled watch
            #
            next if ($watch_disabled{$group} == 1);

            foreach my $service (keys %{$watch{$group}}) {
                my $sref = \%{$watch{$group}->{$service}};

                # seconds since the previous pass, never less than 1
                my $t = $tm - $lasttm;
                $t = 1 if ($t <= 0);

                #
                # trap timer
                #
                if ($sref->{"traptimeout"}) {
                    $sref->{"_trap_timer"} -= $t;

                    if ($sref->{"_trap_timer"} <= 0 &&
                            $tm - $sref->{"_last_uptrap"} > $sref->{"traptimeout"}) {
                        $sref->{"_trap_timer"} = $sref->{"traptimeout"};
                        handle_trap_timeout ($group, $service);
                    }
                }

                #
                # trap duration timer
                #
                if (defined ($sref->{"_trap_duration_timer"})) {
                    $sref->{"_trap_duration_timer"} -= $t;

                    if ($sref->{"_trap_duration_timer"} <= 0) {
                        set_op_status ($group, $service, $STAT_OK);
                        undef $sref->{"_trap_duration_timer"};
                    }
                }

                #
                # polling monitor timer
                #
                if ($sref->{"interval"} && $sref->{"_timer"} <= 0 &&
                        !$running{"$group/$service"}) {

                    if (!$CF{"MAXPROCS"} || $procs < $CF{"MAXPROCS"}) {
                        if ($sref->{"exclude_period"} ne "" &&
                                inPeriod (time, $sref->{"exclude_period"})) {
                            debug (1, "not running $group,$service because of exclude_period\n");
                        }

                        elsif ($sref->{"dep_behavior"} eq "m" &&
                                $sref->{"depend"} ne "") {
                            if (dep_ok ($sref)) {
                                run_monitor ($group, $service);
                            } else {
                                debug (1, "not running $group,$service because of depend\n");
                            }
                        }

                        else {
                            run_monitor ($group, $service);
                        }
                    } else {
                        syslog ('info', "throttled at $procs processes");
                    }

                } else {
                    $sref->{"_timer"} -= $t;
                    if ($sref->{"_timer"} < 0) {
                        $sref->{"_timer"} = 0;
                    }
                }
            }
        }
    }

    $lasttm = time;

    #
    # collect any output from subprocs
    #
    collect_output;

    #
    # clean up after exited processes, and trigger alerts
    #
    proc_cleanup;

    #
    # handle client, server, and trap I/O
    # this routine sleeps for $SLEEPINT if no I/O is ready
    #
    handle_io;
}

die "not reached";

#
# remove the pid file on exit, but only from the process that wrote it
#
END {
    unlink $CF{"PIDFILE"} if $$ == $pid_file_owner && $CF{"PIDFILE"} ne '';
}

##############################################################################

#
# startup alerts
#
# Calls do_alert with the $FL_STARTUPALERT flag for every service of
# every watch.
#
sub do_startup_alerts {
    foreach my $group (keys %watch) {
        foreach my $service (keys %{$watch{$group}}) {
            do_alert ($group, $service, "", 0, $FL_STARTUPALERT);
        }
    }
}


#
# handle alert event, throttling the alert call if necessary
#
# Arguments:
#   $group, $service   keys into %watch
#   $output            monitor output; its first line is the summary
#   $retval            monitor exit value, matched against "exit=" ranges
#   $flags             bitmask of $FL_* alert type flags
#
# For every period of the service that is currently active, the
# numalerts / alertevery / alertafter throttles are applied (failure
# alerts only) and then each configured alert for the event type is
# handed to call_alert.  Returns nothing useful.
#
sub do_alert {
    my ($group, $service, $output, $retval, $flags) = @_;
    my ($sref, @alerts);

    $sref = \%{$watch{$group}->{$service}};

    my $tmnow = time;

    #
    # if the alarm is disabled, ignore it
    #
    if ($sref->{"disable"} == 1) {
        syslog ("notice", "ignoring alert for $group,$service");
        return;
    }

    #
    # dependency check
    #
    if (!($flags & $FL_STARTUPALERT) && !($flags & $FL_UPALERT) &&
            defined $sref->{"depend"} && $sref->{"dep_behavior"} eq "a") {
        if (!$sref->{"_depend_status"}) {
            debug (1, "alert for $group,$service supressed because of dep fail\n");
            return;
        }
    }

    #
    # no alerts for ack'd failures, except for upalerts
    #
    if ($sref->{"_ack"} == 1 && !($flags & $FL_UPALERT)) {
        syslog ("notice", "no alert for $group.$service" .
            " because of ack'd failure");
        return;
    }

    my ($summary) = split("\n", $output);
    $summary = "(NO SUMMARY)" if ($summary =~ /^\s*$/m);

    #
    # check each time period for pending alerts
    #
    foreach my $periodlabel (keys %{$sref->{"periods"}}) {
        #
        # only send alerts that are in the proper period
        #
        next if (!inPeriod ($tmnow, $sref->{"periods"}->{$periodlabel}->{"period"}));

        my $pref = \%{$sref->{"periods"}->{$periodlabel}};

        #
        # skip upalerts not paired with down alerts
        # disable by setting "no_comp_alerts" in period section
        #
        if (!$pref->{"no_comp_alerts"} && ($flags & $FL_UPALERT) && !$pref->{"_alert_sent"}) {
            next;
        }

        #
        # do this if we're not handling an upalert or startupalert
        #
        if (!($flags & $FL_UPALERT) && !($flags & $FL_STARTUPALERT)) {
            #
            # alert only numalerts
            #
            if ($pref->{"numalerts"} && $pref->{"_alert_sent"} >= $pref->{"numalerts"}) {
                next;
            }

            #
            # only alert once every "alertevery" seconds, unless
            # output from monitor is different
            #
            my ($prevsumm) = split("\n", $sref->{"_failure_output"});

            if (
                $pref->{"alertevery"} != 0 &&
                (
                    ($tmnow - $pref->{"_last_alert"} < $pref->{"alertevery"}) &&
                    (
                        ($pref->{"_observe_detail"} && $sref->{"_failure_output"} eq $output)
                        ||
                        (!$pref->{"_observe_detail"} && $prevsumm eq $summary)
                    )
                )
            ) {
                syslog ("info", "not alerting for failure of $group/$service");
                next;
            }

            #
            # alertafter NUM
            #
            if (defined $pref->{"alertafter_consec"}) {
                next if ($sref->{"_consec_failures"} < $pref->{"alertafter_consec"});
            }

            #
            # alertafter timeval
            #
            elsif (
                (!defined ($pref->{"alertafter"})) &&
                (defined ($pref->{"alertafterival"}))
            ) {
                $pref->{'_1stfailtime'} = $tmnow if $pref->{'_1stfailtime'} == 0;
                if ($tmnow - $pref->{'_1stfailtime'} <= $pref->{'alertafterival'}) {
                    next;
                }
            }

            #
            # alertafter NUM timeval
            #
            elsif (defined ($pref->{"alertafter"})) {
                $pref->{"_failcount"}++;

                if ($tmnow - $pref->{'_1stfailtime'} <= $pref->{'alertafterival'} &&
                        $pref->{"_failcount"} < $pref->{"alertafter"}) {
                    next;
                }

                #
                # start a new time interval
                #
                if ($tmnow - $pref->{'_1stfailtime'} > $pref->{'alertafterival'}) {
                    $pref->{"_failcount"} = 1;
                }

                if ($pref->{"_failcount"} == 1) {
                    $pref->{"_1stfailtime"} = $tmnow;
                }

                if ($pref->{"_failcount"} < $pref->{"alertafter"}) {
                    next;
                }
            }
        }

        #
        # at this point, no alerts are blocked,
        # so send the alerts
        #

        #
        # trigger multiple alerts in this period
        #
        if ($flags & $FL_UPALERT) {
            @alerts = @{$pref->{"upalerts"}};
        } elsif ($flags & $FL_STARTUPALERT) {
            @alerts = @{$pref->{"startupalerts"}};
        } else {
            @alerts = @{$pref->{"alerts"}};
        }

        my $called = 0;

        foreach my $alert_spec (@alerts) {
            my ($range, $fac, $args);

            # an optional leading "exit=N" or "exit=N-M" restricts the
            # alert to monitor exit values inside that range
            if ($alert_spec =~ /^exit\s*=\s*((\d+|\d+-\d+))\s/i) {
                $range = $1;
                next if (!inRange($retval, $range));
                ($fac, $args) = (split (/\s+/, $alert_spec, 3))[1,2];
            } else {
                ($fac, $args) = split (/\s+/, $alert_spec, 2);
            }

            $called++ if (call_alert (
                group   => $group,
                service => $service,
                output  => $output,
                retval  => $retval,
                flags   => $flags,
                pref    => $pref,
                alert   => $fac,
                args    => $args,
            ));
        }

        #
        # reset _alert_sent if up alert was sent from a trap
        #
        # NOTE(review): "$FL_TRAP | $flags" is true whenever either
        # operand is non-zero, so this condition reduces to the
        # $FL_UPALERT test alone; "&" may have been intended -- confirm
        # before changing, since monitor upalerts currently rely on it.
        #
        if ($called) {
            if( (($FL_TRAP | $flags) && ($FL_UPALERT & $flags)) ) {
                $pref->{"_alert_sent"} = 0;
            } else {
                $pref->{"_alert_sent"}++;
            }
        }
    }
}


#
# walk through the watch list and reset the time
# the service was last called
#
# Sets every service's "_timer" to its configured "interval".
#
sub set_last_test {
    foreach my $group (keys %watch) {
        foreach my $service (keys %{$watch{$group}}) {
            $watch{$group}->{$service}->{"_timer"} =
                $watch{$group}->{$service}->{"interval"};
        }
    }
}


#
# parse configuration file
#
# build the following data structures:
#
# %group
#       each element of %group is an array of hostnames
#       group records are terminated by a blank line in the
#       configuration file
# %watch{"group"}->{"service"}->{"variable"} = value
# %alias
#
# Arguments:
#   $CF      path of the configuration file (run through m4 when -M
#            was given or the name ends in ".m4")
#   $commit  when true, a successfully parsed config replaces the live
#            %alias, %groups, %watch and %CF
#
# Returns "" on success, or an error message string on failure.  The
# live configuration is only touched after the whole file has parsed.
#
sub read_cf {
    my ($CF, $commit) = @_;
    my ($var, $watchgroup, $ingroup, $curgroup, $inwatch,
        $args, $hosts, %disabled, $h, $i,
        $inalias, $curalias);
    my ($sref, $pref);
    my ($service, $period);
    my ($authtype, @authtypes);
    my $line_num = 0;

    #
    # parse configuration file
    #
    if ($opt{"M"} || $CF =~ /\.m4$/) {
        return "could not open m4 pipe of cf file: $CF: $!"
            if (!open (CFG, "m4 $CF |"));
    } else {
        return "could not open cf file: $CF: $!"
            if (!open (CFG, $CF));
    }

    #
    # buffers to hold the new un-committed config
    #
    my %new_alias = ();
    my %new_CF = %CF;
    my %new_groups;
    my %new_watch;
    my %is_watch;

    my $servnum = 0;

    my $DEP_BEHAVIOR = "a";

    my $incomplete_line = 0;
    my $linepart = "";
    my $l = "";
    my $acc_line = "";

    for (;;) {
        #
        # read in a logical "line", which may span actual lines
        #
        # This is a do-BLOCK, not a loop, so the "last" and "next"
        # inside it act on the enclosing for(;;).
        #
        do {
            $line_num++;
            last if (!defined ($linepart = <CFG>));
            next if $linepart =~ /^\s*#/;

            #
            # accumulate multi-line lines (ones which are \-escaped)
            #
            if ($incomplete_line) { $linepart =~ s/^\s*//; }

            if ($linepart =~ /^(.*)\\\s*$/) {
                $incomplete_line = 1;
                $acc_line .= $1;
                chomp $acc_line;
                next;
            } else {
                $acc_line .= $linepart;
            }

            $l = $acc_line;
            $acc_line = "";

            chomp $l;
            $l =~ s/^\s*//;
            $l =~ s/\s*$//;

            $incomplete_line = 0;
            $linepart = "";
        };

        #
        # global variables which can be overriden by the command line
        #
        if (!$inwatch && $l =~ /^(\w+) \s* = \s* (.*) \s*$/ix) {
            # copy the captures: later matches in this branch would
            # otherwise clobber $1/$2
            my ($cfvar, $cfval) = ($1, $2);

            if ($cfvar eq "alertdir") {
                $new_CF{"ALERTDIR"} = $cfval;

            } elsif ($cfvar eq "basedir") {
                $new_CF{"BASEDIR"} = $cfval;
                $new_CF{"BASEDIR"} = "$PWD/$new_CF{BASEDIR}" if ($new_CF{"BASEDIR"} !~ m{^/});
                $new_CF{"BASEDIR"} =~ s{/$}{};

            } elsif ($cfvar eq "cfbasedir") {
                $new_CF{"CFBASEDIR"} = $cfval;
                $new_CF{"CFBASEDIR"} = "$PWD/$new_CF{CFBASEDIR}" if ($new_CF{"CFBASEDIR"} !~ m{^/});
                $new_CF{"CFBASEDIR"} =~ s{/$}{};

            } elsif ($cfvar eq "mondir") {
                $new_CF{"SCRIPTDIR"} = $cfval;

            } elsif ($cfvar eq "logdir") {
                $new_CF{"LOGDIR"} = $cfval;

            } elsif ($cfvar eq "histlength") {
                $new_CF{"MAX_KEEP"} = $cfval;

            } elsif ($cfvar eq "serverport") {
                $new_CF{"SERVPORT"} = $cfval;

            } elsif ($cfvar eq "trapport") {
                $new_CF{"TRAPPORT"} = $cfval;

            } elsif ($cfvar eq "serverbind") {
                $new_CF{"SERVERBIND"} = $cfval;

            } elsif ($cfvar eq "trapbind") {
                $new_CF{"TRAPBIND"} = $cfval;

            } elsif ($cfvar eq "pidfile") {
                $new_CF{"PIDFILE"} = $cfval;

            } elsif ($cfvar eq "randstart") {
                $new_CF{"RANDSTART"} = dhmstos($cfval);
                if (!defined ($new_CF{"RANDSTART"})) {
                    close (CFG);
                    return "cf error: bad value '$cfval' for randstart option (syntax: randstart = timeval), line $line_num";
                }

            } elsif ($cfvar eq "maxprocs") {
                $new_CF{"MAXPROCS"} = $cfval;

            } elsif ($cfvar eq "statedir") {
                $new_CF{"STATEDIR"} = $cfval;

            } elsif ($cfvar eq "authfile") {
                $new_CF{"AUTHFILE"} = $cfval;
                if (! -r $new_CF{"AUTHFILE"}) {
                    close (CFG);
                    return "cf error: authfile '$cfval' does not exist or is not readable, line $line_num";
                }

            } elsif ($cfvar eq "authtype") {
                $new_CF{"AUTHTYPE"} = $cfval;
                @authtypes = split(' ' , $new_CF{"AUTHTYPE"}) ;
                foreach $authtype (@authtypes) {
                    if ($authtype eq "pam") {
                        # PAM support is optional; load it only on demand
                        eval 'use Authen::PAM qw(:constants);' ;
                        if ($@ ne "") {
                            close (CFG);
                            return "cf error: could not use PAM authentication: $@";
                        }
                    }
                }

            } elsif ($cfvar eq "pamservice") {
                $new_CF{"PAMSERVICE"} = $cfval;

            } elsif ($cfvar eq "userfile") {
                $new_CF{"USERFILE"} = $cfval;
                if (! -r $new_CF{"USERFILE"}) {
                    close (CFG);
                    return "cf error: userfile '$cfval' does not exist or is not readable, line $line_num";
                }

            } elsif ($cfvar eq "ocfile") {
                $new_CF{"OCFILE"} = $cfval;

            } elsif ($cfvar eq "historicfile") {
                $new_CF{"HISTORICFILE"} = $cfval;

            } elsif ($cfvar eq "historictime") {
                $new_CF{"HISTORICTIME"} = dhmstos($cfval);
                if (!defined $new_CF{"HISTORICTIME"}) {
                    close (CFG);
                    return "cf error: bad value '$cfval' for historictime command (syntax: historictime = timeval), line $line_num";
                }

            } elsif ($cfvar eq "cltimeout") {
                $new_CF{"CLIENT_TIMEOUT"} = dhmstos($cfval);
                if (!defined ($new_CF{"CLIENT_TIMEOUT"})) {
                    close (CFG);
                    return "cf error: bad value '$cfval' for cltimeout command (syntax: cltimeout = secs), line $line_num";
                }

            } elsif ($cfvar eq "snmp") {
                # the alternation is grouped so that both anchors
                # apply to every alternative
                if ($cfval =~ /^(?:1|yes|on|true)$/i) {
                    $new_CF{"SNMP"} = 1;
                    eval "use SNMP";
                    if ($@ ne "") {
                        close (CFG);
                        return "cf error: could not use SNMP: $@";
                    }
                } else {
                    $new_CF{"SNMP"} = 0;
                }

            } elsif ($cfvar eq "monerrfile") {
                $new_CF{"MONERRFILE"} = $cfval;

            } elsif ($cfvar eq "dtlogfile") {
                $new_CF{"DTLOGFILE"} = $cfval;

            } elsif ($cfvar eq "dtlogging") {
                $new_CF{"DTLOGGING"} = 0;
                if ($cfval == 1 || $cfval eq "yes" || $cfval eq "true") {
                    $new_CF{"DTLOGGING"} = 1;
                }

            } elsif ($cfvar eq "snmpport") {
                $new_CF{"SNMPPORT"} = $cfval;

            } elsif ($cfvar eq "dep_recur_limit") {
                $new_CF{"DEP_RECUR_LIMIT"} = $cfval;

            } elsif ($cfvar eq "dep_behavior") {
                if ($cfval ne "m" && $cfval ne "a") {
                    close (CFG);
                    return "cf error: unknown dependency behavior '$cfval', line $line_num";
                }
                $DEP_BEHAVIOR = $cfval;

            } elsif ($cfvar eq "syslog_facility") {
                $new_CF{"SYSLOG_FACILITY"} = $cfval;

            } elsif ($cfvar eq "startupalerts_on_reset") {
                if ($cfval =~ /^(?:1|yes|true|on)$/i) {
                    $new_CF{"STARTUPALERTS_ON_RESET"} = 1;
                } else {
                    $new_CF{"STARTUPALERTS_ON_RESET"} = 0;
                }

            } else {
                close (CFG);
                return "cf error: unknown variable '$cfvar', line $line_num";
            }

            next;
        }

        #
        # end of record
        #
        if ($l eq "") {
            $ingroup    = 0;
            $inalias    = 0;
            $inwatch    = 0;
            $period     = 0;

            $curgroup   = "";
            $curalias   = "";
            $watchgroup = "";

            $servnum    = 0;
            next;
        }

        #
        # hostgroup record
        #
        if ($l =~ /^hostgroup\s+([a-zA-Z0-9_.-]+)\s*(.*)/) {
            $curgroup = $1;

            $ingroup = 1;
            $inalias = 0;
            $inwatch = 0;
            $period  = 0;

            $hosts = $2;
            %disabled = ();

            # hosts disabled in the live config carry a leading "*"
            foreach $h (grep (/^\*/, @{$groups{$curgroup}})) {
                # We have to make $i = $h because $h is actually
                # a pointer to %groups and will modify it.
                $i = $h;
                $i =~ s/^\*//;
                $disabled{$i} = 1;
            }

            @{$new_groups{$curgroup}} = split(/\s+/, $hosts);

            #
            # keep hosts which were previously disabled
            #
            for ($i=0;$i<@{$new_groups{$curgroup}};$i++) {
                $new_groups{$curgroup}[$i] = "*$new_groups{$curgroup}[$i]"
                    if ($disabled{$new_groups{$curgroup}[$i]});
            }

            next;
        }

        # continuation lines of a hostgroup record
        if ($ingroup) {
            push (@{$new_groups{$curgroup}}, split(/\s+/, $l));

            for ($i=0;$i<@{$new_groups{$curgroup}};$i++) {
                $new_groups{$curgroup}[$i] = "*$new_groups{$curgroup}[$i]"
                    if ($disabled{$new_groups{$curgroup}[$i]});
            }

            next;
        }

        #
        # alias record
        #
        if ($l =~ /^alias\s+([a-zA-Z0-9_.-]+)\s*$/) {
            $inalias = 1;
            $ingroup = 0;
            $inwatch = 0;
            $period  = 0;

            $curalias = $1;
            next;
        }

        if ($inalias) {
            if ($l =~ /\A(.*)\Z/) {
                push (@{$new_alias{$curalias}}, $1);
                next;
            }
        }

        #
        # watch record
        #
        if ($l =~ /^watch\s+([a-zA-Z0-9_.-]+)\s*/) {
            $watchgroup = $1;
            $inwatch = 1;
            $inalias = 0;
            $ingroup = 0;
            $period  = 0;

            if (!defined ($new_groups{$watchgroup})) {
                #
                # This hostgroup doesn't exist yet, we'll create it and warn
                #
                @{$new_groups{$watchgroup}} = ($watchgroup);
                print STDERR "Warning: watch group $watchgroup defined with no corresponding hostgroup.\n";
            }

            if ($new_watch{$watchgroup}) {
                close (CFG);
                return "cf error: watch '$watchgroup' already defined, line $line_num";
            }

            $curgroup = "";
            $service = "";

            next;
        }

        if ($inwatch) {
            #
            # env variables
            #
            if ($l =~ /^([A-Z_][A-Z0-9_]*)=(.*)/) {
                if ($service eq "") {
                    close (CFG);
                    return "cf error: environment variable defined without a service, line $line_num";
                }
                $new_watch{$watchgroup}->{$service}->{"ENV"}->{$1} = $2;

                next;
            }

            #
            # non-env variables
            #
            # a line that does not start with a word yields empty
            # $var/$args instead of whatever $1/$2 last held
            #
            if ($l =~ /^(\w+)\s*(.*)$/) {
                ($var, $args) = ($1, $2);
            } else {
                ($var, $args) = ("", "");
            }

            #
            # service entry
            #
            if ($var eq "service") {
                $service = $args;

                if ($service !~ /^[a-zA-Z0-9_.-]+$/) {
                    close (CFG);
                    return "cf error: invalid service tag '$args', line $line_num";
                }

                $period = 0;
                $sref = \%{$new_watch{$watchgroup}->{$service}};
                $sref->{"service"} = $args;
                $sref->{"interval"} = undef;
                $sref->{"randskew"} = 0;
                $sref->{"dep_behavior"} = $DEP_BEHAVIOR;
                $sref->{"exclude_period"} = "";
                $sref->{"exclude_hosts"} = {};
                $sref->{"_op_status"} = $STAT_UNTESTED;
                $sref->{"_last_op_status"} = $STAT_UNTESTED;
                $sref->{"_ack"} = 0;
                $sref->{"_ack_comment"} = '';
                $sref->{"_consec_failures"} = 0;
                $sref->{"_failure_count"} = 0 if (!defined($sref->{"_failure_count"}));
                $sref->{"_start_of_monitor"} = time if (!defined($sref->{"_start_of_monitor"}));
                $sref->{"_alert_count"} = 0 if (!defined($sref->{"_alert_count"}));
                $sref->{"_last_failure"} = 0 if (!defined($sref->{"_last_failure"}));
                $sref->{"_last_success"} = 0 if (!defined($sref->{"_last_success"}));
                $sref->{"_last_trap"} = 0 if (!defined($sref->{"_last_trap"}));
                $sref->{"_exitval"} = "undef" if (!defined($sref->{"_exitval"}));
                $sref->{"_last_check"} = undef;
                $sref->{"_depend_status"} = undef;
                $sref->{"failure_interval"} = undef;
                $sref->{"_old_interval"} = undef;
                next;
            }

            if ($service eq "") {
                close (CFG);
                return "cf error: need to specify service in watch record, line $line_num";
            }

            #
            # period definition
            #
            # for each service there can be one or more alert periods
            # this is stored as an array of hashes named
            #     %{$watch{$watchgroup}->{$service}->{"periods"}}
            # each index for this hash is a unique tag for the period as
            # defined by the user or named after the period (such as
            # "wd {Mon-Fri} hr {7am-11pm}")
            #
            # the value of the hash is an array containing the list of alert commands
            # and arguments, so
            #
            # @alerts = @{$watch{$watchgroup}->{$service}->{"periods"}->{"TAG"}}
            #
            if ($var eq "period") {
                $period = 1;

                my $periodstr;

                # optional "TAG: period-spec" form
                if ($args =~ /^([a-z_]\w*) \s* : \s* (.*)$/ix) {
                    $periodstr = $1;
                    $args = $2;
                } else {
                    $periodstr = $args;
                }

                $pref = \%{$sref->{"periods"}->{$periodstr}};

                if (inPeriod (time, $args) == -1) {
                    close (CFG);
                    return "cf error: malformed period '$args' (the specified time period is not valid as per Time::Period::inPeriod), line $line_num";
                }

                $pref->{"period"} = $args;
                $pref->{"alertevery"} = 0;
                $pref->{"numalerts"} = 0;
                $pref->{"_alert_sent"} = 0;
                $pref->{"no_comp_alerts"} = 0;
                @{$pref->{"alerts"}} = ();
                @{$pref->{"upalerts"}} = ();
                @{$pref->{"startupalerts"}} = ();
                next;
            }

            #
            # period variables
            #
            if ($period) {
                if ($var eq "alert") {
                    push @{$pref->{"alerts"}}, $args;

                } elsif ($var eq "upalert") {
                    $sref->{"_upalert"} = 1;
                    push @{$pref->{"upalerts"}}, $args;

                } elsif ($var eq "startupalert") {
                    push @{$pref->{"startupalerts"}}, $args;

                } elsif ($var eq "alertevery") {
                    my $observe_detail = 0;

                    if ($args =~ /(\S+) \s+ observe_detail \s*$/ix) {
                        $observe_detail = 1;
                        $args = $1;
                    }

                    #
                    # for backwards-compatibility with <= 0.38.21
                    #
                    elsif ($args =~ /(\S+) \s+ summary/ix) {
                        $args = $1;
                    }

                    # convert into a temporary so the error message
                    # can still show the offending value
                    my $secs = dhmstos ($args);
                    if (!$secs) {
                        close (CFG);
                        return "cf error: invalid time interval '$args' (syntax: alertevery {positive number}{smhd}), line $line_num";
                    }

                    $pref->{"alertevery"} = $secs;
                    $pref->{"_observe_detail"} = $observe_detail;

                } elsif ($var eq "alertafter") {
                    my ($p1, $p2);

                    #
                    # alertafter NUM
                    #
                    if ($args =~ /^(\d+)$/) {
                        $p1 = $1;
                        $pref->{"alertafter_consec"} = $p1;
                    }

                    #
                    # alertafter timeval
                    #
                    elsif ($args =~ /^(\d+[hms])$/) {
                        $p1 = $1;
                        if (!($p1 = dhmstos ($p1))) {
                            close (CFG);
                            return "cf error: invalid time interval '$args' (syntax: alertafter = [{positive integer}] [{positive number}{smhd}]), line $line_num";
                        }
                        $pref->{"alertafterival"} = $p1;
                        $pref->{"_1stfailtime"} = 0;
                    }

                    #
                    # alertafter NUM timeval
                    #
                    elsif ($args =~ /(\d+)\s+(\d+[hms])$/) {
                        ($p1, $p2) = ($1, $2);
                        if (($p1 - 1) * $sref->{"interval"} >= dhmstos($p2)) {
                            close (CFG);
                            return "cf error: interval & alertafter not sensible. No alerts can be generated with those parameters, line $line_num";
                        }
                        $pref->{"alertafter"} = $p1;
                        $pref->{"alertafterival"} = dhmstos ($p2);

                        $pref->{"_1stfailtime"} = 0;
                        $pref->{"_failcount"} = 0;
                    }

                    else {
                        close (CFG);
                        return "cf error: invalid interval specification '$args', line $line_num";
                    }

                } elsif ($var eq "upalertafter") {
                    # the value is only validated here, as before
                    if (!dhmstos ($args)) {
                        close (CFG);
                        return "cf error: invalid upalertafter specification '$args' (syntax: upalertafter = {positive number}{smhd}), line $line_num";
                    }

                } elsif ($var eq "numalerts") {
                    if ($args !~ /^\d+$/) {
                        close (CFG);
                        return "cf error: non-numeric arg '$args' (syntax: numalerts = {positive integer}), line $line_num";
                    }
                    $pref->{"numalerts"} = $args;

                } elsif ($var eq "no_comp_alerts") {
                    $pref->{"no_comp_alerts"} = 1;
                }
            }

            #
            # non-period variables
            #
            else {
                if ($var eq "interval") {
                    my $secs = dhmstos ($args);
                    if (!$secs) {
                        close (CFG);
                        return "cf error: invalid time interval '$args' (syntax: interval = {positive number}{smhd}), line $line_num";
                    }
                    $args = $secs;

                } elsif ($var eq "failure_interval") {
                    my $secs = dhmstos ($args);
                    if (!$secs) {
                        close (CFG);
                        return "cf error: invalid interval '$args' (syntax: failure_interval = {positive number}{smhd}), line $line_num";
                    }
                    $args = $secs;

                } elsif ($var eq "monitor") {
                    # valid

                } elsif ($var eq "allow_empty_group") {
                    # valid

                } elsif ($var eq "description") {
                    # valid

                } elsif ($var eq "traptimeout") {
                    my $secs = dhmstos ($args);
                    if (!$secs) {
                        close (CFG);
                        return "cf error: invalid traptimeout interval '$args' (syntax: traptimeout = {positive number}{smhd}), line $line_num";
                    }
                    $args = $secs;
                    $sref->{"_trap_timer"} = $args;

                } elsif ($var eq "trapduration") {
                    my $secs = dhmstos ($args);
                    if (!$secs) {
                        close (CFG);
                        return "cf error: invalid trapduration interval '$args' (syntax: trapduration = {positive number}{smhd}), line $line_num";
                    }
                    $args = $secs;

                } elsif ($var eq "randskew") {
                    my $secs = dhmstos ($args);
                    if (!$secs) {
                        close (CFG);
                        return "cf error: invalid randskew time interval '$args' (syntax: randskew = {positive number}{smhd}), line $line_num";
                    }
                    $args = $secs;

                } elsif ($var eq "dep_behavior") {
                    if ($args ne "m" && $args ne "a") {
                        close (CFG);
                        return "cf error: unknown dependency behavior '$args' (syntax: dep_behavior = {m|a}), line $line_num";
                    }

                } elsif ($var eq "depend") {
                    $args =~ s/SELF:/$watchgroup:/g;

                } elsif ($var eq "exclude_hosts") {
                    my $ex = {};
                    foreach my $h (split (/\s+/, $args)) {
                        $ex->{$h} = 1;
                    }
                    $args = $ex;

                } elsif ($var eq "exclude_period") {
                    # the validity test lives inside the branch so a
                    # well-formed period is stored rather than falling
                    # through to "unknown syntax"
                    if (inPeriod (time, $args) == -1) {
                        close (CFG);
                        return "cf error: malformed exclude_period '$args' (the specified time period is not valid as per Time::Period::inPeriod), line $line_num";
                    }

                } else {
                    close (CFG);
                    return "cf error: unknown syntax [$l], line $line_num";
                }

                $sref->{$var} = $args;
            }
        }

        next;
    }

    close (CFG) || return "Could not open pipe to m4 (check that m4 is properly installed and in your PATH): $!";

    #
    # Go through each defined hostgroup and check that there is a
    # watch associated with that hostgroup record.
    #
    # hostgroups without associated watches are not a violation of
    # mon config syntax, but it's usually not what you want.
    #
    for (keys(%new_watch)) { $is_watch{$_} = 1 };
    foreach $watchgroup ( keys (%new_groups) ) {
        print STDERR "Warning: hostgroup $watchgroup has no watch assigned to it!\n"
            unless $is_watch{$watchgroup};
    }

    #
    # no errors, commit new config if $commit was specified
    #
    return "" unless $commit;

    %alias  = %new_alias;
    %groups = %new_groups;
    %watch  = %new_watch;
    %CF     = %new_CF;

    "";
}


#
# convert a string like "20m" into seconds
#
# Accepts a non-negative decimal number followed by one unit letter
# (d, h, m or s, case-insensitive), with optional surrounding
# whitespace.  Returns the number of seconds, or undef when the
# string does not have that form (a bare number is not accepted).
#
sub dhmstos {
    my ($str) = @_;
    my ($s);

    $str = lc ($str);

    if ($str =~ /^\s*(\d+(?:\.\d+)?)([dhms])\s*$/i) {
        if ($2 eq "m") {
            $s = $1 * 60;
        } elsif ($2 eq "h") {
            $s = $1 * 60 * 60;
        } elsif ($2 eq "d") {
            $s = $1 * 60 * 60 * 24;
        } else {
            $s = $1;
        }
    } else {
        return undef;
    }
    $s;
}


#
# reset the state of the server on SIGHUP, and reread config
# file.
#
# $keepstate, when true, causes the saved "disabled" state to be
# reloaded after the new configuration is in place.  Because this sub
# is also installed as the SIGHUP handler, and Perl passes the signal
# name to a handler as its first argument, $keepstate is true when the
# reset is triggered by the signal.
#
# Returns 1 on success, undef if the configuration could not be read
# (in which case the previous configuration remains in effect, but
# running monitors have already been killed).
#
sub reset_server {
    my ($keepstate) = @_;

    #
    # reap children that may be running
    #
    foreach my $pid (keys %runningpid) {
        kill 15, $pid;
        waitpid ($pid, 0);
        syslog ('info', "reset killed child $pid, exit status $?");
        remove_proc ($pid);
    }

    $procs = 0;
    syslog ('info', "resetting, and re-reading configuration $CF{CF}");

    if ((my $err = read_cf ($CF{"CF"}, 1)) ne "") {
        syslog ('err', "error reading config file: $err");
        return undef;
    }

    normalize_paths;
    gen_scriptdir_hash;
    $lasttm=time;		# the last time(2) the loop started
    $fdset_rbits = $fdset_ebits = '';
    set_last_test ();
    randomize_startdelay() if ($CF{"RANDSTART"});
    load_state ("disabled") if ($keepstate);
    if ($CF{"DTLOGGING"}) {
        init_dtlog();
    }
    readhistoricfile;
    if ($CF{"STARTUPALERTS_ON_RESET"}) {
        do_startup_alerts;
    }
    return 1;
}


#
# write a start-of-log header to the downtime log
#
# Does nothing unless DTLOGGING is set.  If the log file cannot be
# opened for appending, the error is logged and DTLOGGING is switched
# off.
#
sub init_dtlog {
    my $t = time;

    return if (!$CF{"DTLOGGING"});

    if (!open (DTLOG, ">>$CF{DTLOGFILE}")) {
        syslog ('err', "could not append to $CF{DTLOGFILE}: $!");
        $CF{"DTLOGGING"} = 0;
    } else {
        $CF{"DTLOGGING"} = 1;
        print DTLOG <<EOF;
#
# downtime log start $t
# time back up, group, service, first failure, downtime, interval, summary
#
EOF
        close (DTLOG);
    }
}


#
# remove a process from our state
#
# $pid is looked up in %runningpid to find its "group/service" key;
# the child's read handle is removed from the select set and closed,
# the bookkeeping entries are deleted, and $procs is decremented.
# Unknown pids are ignored.
#
sub remove_proc {
    my ($pid) = @_;

    return if (!defined $runningpid{$pid});

    my $key = $runningpid{$pid};

    vec ($fdset_rbits, fileno($fhandles{$key}), 1) = 0;
    close ($fhandles{$key});
    delete $fhandles{$key};
    delete $running{$key};
    delete $runningpid{$pid};
    $procs--;
}


#
# exit on SIGTERM
#
# (also installed for SIGINT)
#
sub handle_sigterm {
    syslog ("info", "caught TERM signal, exiting");
    exit (1);
}


#
# set O_NONBLOCK and FD_CLOEXEC on the given filehandle
#
# The current status flags are fetched first so that O_NONBLOCK is
# added to them; fcntl returns the flags as its value rather than
# through its third argument.
#
# Returns 1 on success, or an empty/undef value if any fcntl fails.
#
sub configure_filehandle {
    my ($fh) = @_;
    my ($fl);

    $fl = fcntl ($fh, F_GETFL, 0) || return;
    $fl |= O_NONBLOCK;
    fcntl ($fh, F_SETFL, $fl) || return;

    $fl = fcntl ($fh, F_GETFD, 0) || return;
    $fl |= FD_CLOEXEC;
    fcntl ($fh, F_SETFD, $fl) || return;

    return 1;
}


#
# setup server
#
# Creates and binds the listening sockets:
#   SERVER      TCP, port SERVPORT, address SERVERBIND or any
#   TRAPSERVER  UDP, port TRAPPORT, address TRAPBIND or any
#   SNMPSERVER  UDP, port SNMPPORT, any address (only if SNMP is set)
# Every failure is reported through die_die.
#
sub setup_server {
    my ($tcpproto, $udpproto);

    if (!defined ($tcpproto = getprotobyname ('tcp'))) {
        die_die ("err", "could not get protocol for tcp");
    }

    if (!defined ($udpproto = getprotobyname ('udp'))) {
        die_die ("err", "could not get protocol for udp");
    }

    #
    # client server, such as moncmd
    #
    my $bindaddr;

    if (defined $CF{"SERVERBIND"}) {
        # gethostbyname reports its failure code through $?
        if (!($bindaddr = gethostbyname ($CF{"SERVERBIND"}))) {
            die_die ("err", "error returned by gethostbyname for serverbind: $?");
        }
    } else {
        $bindaddr = INADDR_ANY;
    }

    socket (SERVER, PF_INET, SOCK_STREAM, $tcpproto) ||
        die_die ("err", "could not create TCP socket: $!");
    setsockopt (SERVER, SOL_SOCKET, SO_REUSEADDR, pack ("l", 1)) ||
        die_die ("err", "could not setsockopt: $!");
    bind (SERVER, sockaddr_in ($CF{"SERVPORT"}, $bindaddr)) ||
        die_die ("err", "could not bind TCP server port $CF{'SERVPORT'}: $!");
    listen (SERVER, SOMAXCONN) ||
        die_die ("err", "could not listen on TCP server port $CF{'SERVPORT'}: $!");
    configure_filehandle (*SERVER) ||
        die_die ("err", "could not configure TCP server port: $!");

    #
    # remote monitor traps
    #
    if (defined $CF{"TRAPBIND"}) {
        if (!($bindaddr = gethostbyname ($CF{"TRAPBIND"}))) {
            die_die ("err", "error returned by gethostbyname for trapbind: $?");
        }
    } else {
        $bindaddr = INADDR_ANY;
    }

    socket (TRAPSERVER, PF_INET, SOCK_DGRAM, $udpproto) ||
        die_die ("err", "could not create UDP socket: $!");
    bind (TRAPSERVER, sockaddr_in ($CF{"TRAPPORT"}, $bindaddr)) ||
        die_die ("err", "could not bind UDP server port: $!");
    configure_filehandle (*TRAPSERVER) ||
        die_die ("err", "could not configure UDP trap port: $!");

    return if (!$CF{"SNMP"});

    #
    # SNMP traps
    #
    socket (SNMPSERVER, PF_INET, SOCK_DGRAM, $udpproto) ||
        die_die ("err", "could not create UDP socket: $!");
    bind (SNMPSERVER, sockaddr_in ($CF{"SNMPPORT"}, INADDR_ANY)) ||
        die_die ("err", "could not bind UDP server port: $!");
    configure_filehandle (*SNMPSERVER) ||
        die_die ("err", "could not configure UDP SNMP port: $!");
}


#
# set up a client connection if necessary
#
# Accepts one pending connection on SERVER, makes it non-blocking and
# unbuffered, and records it in %clients keyed by its file number.
# Accept or configuration failures are logged and the connection is
# dropped.
#
sub client_accept {
    my ($rin, $rout, $n, $sock, $port, $addr, $fl);

    my $CLIENT = FileHandle->new;

    if (!defined ($sock = accept ($CLIENT, SERVER))) {
        syslog ('err', "accept returned error: $!");
        return;
    }

    debug(1, "accepted client $CLIENT\n");

    my $fno = fileno ($CLIENT);

    #
    # set socket to nonblocking
    #
    if (!configure_filehandle ($CLIENT)) {
        syslog ("err", "could not configure for client: $!");
        close ($CLIENT);
        return;
    }

    ($port, $addr) = unpack_sockaddr_in ($sock);
    syslog ('info', "client connection from " .
        inet_ntoa ($addr) . ":" . $port);

    # turn on autoflush for the client handle
    select ($CLIENT);
    $|=1;
    select (STDOUT);

    $clients{$fno}->{"fhandle"} = $CLIENT;
    $clients{$fno}->{"user"} = undef;		# username if authenticated
    $clients{$fno}->{"timeout"} = $CF{"CLIENT_TIMEOUT"};
    $clients{$fno}->{"last_read"} = time;	# last time data was read
    $clients{$fno}->{"buf"} = '';
    $numclients++;
}


#
# do all pending client commands
#
# For each client whose input buffer holds at least one complete line,
# removes the first line (and its line terminators) from the buffer
# and passes it to client_command.  At most one command per client is
# handled per call.
#
sub client_dopending {
    my ($cl, $cmd);

    foreach $cl (keys %clients) {
        if ($clients{$cl}->{"buf"} =~ /^([^\r\n]*)[\r\n]+/s) {
            $cmd = $1;
            $clients{$cl}->{"buf"} =~ s/^[^\r\n]*[\r\n]+//s;

            client_command ($cl, $cmd);
        }
    }
}


#
# close a client connection
#
# $cl is the %clients key (file number); $reason, if defined, is
# logged.  Dies if the client has no recorded filehandle.
#
sub client_close {
    my ($cl, $reason) = @_;

    syslog ('info', "closing client $cl: $reason") if (defined $reason);

    die if !defined ($clients{$cl}->{"fhandle"});
    close ($clients{$cl}->{"fhandle"});
    delete $clients{$cl};
    vec ($iovec, $cl, 1) = 0;
    $numclients--;
}


#
# Handle a connection from a client
#
# $cl is the %clients key of the connection and $l one command line
# received from it.  Replies are written with sock_write.
#
sub client_command {
    my ($cl, $l) = @_;
    my ($cmd, $args, $group, $service, $s, $sname, $stchanged);
    my ($var, $value, $msg, @l, $sock, $port, $addr, $sref, $auth, $fh);
    my ($user, $pass, @argsList, $comment);
    my ($authtype, @authtypes);
    my $is_auth = 0;		#flag for multiple auth types

    # login lines carry a password, so they are not logged
    syslog ('info', "client command \"$l\"")
        if ($l !~ /^\s*login/i);

    $fh = $clients{$cl}->{"fhandle"};

    if ($l !~ /^(dump|login|disable|enable|quit|list|set|get|
        stop|start|loadstate|savestate|reset|clear|checkauth|
        reload|term|test|servertime|ack|version|protid)\s*(.*)?$/ix) {
        sock_write ($fh, "520 invalid command\n");
        return;
    }

    ($cmd, $args) = ("\L$1", $2);

    $stchanged = 0;

    #
    # quit command
    #
    if ($cmd eq "quit") {
        sock_write ($fh, "220 quitting\n");
        client_close ($cl);

    } elsif ($opt{"d"} && $cmd eq "dump") {
        print STDERR Dumper (\%watch), "\n\n";

    #
    # protocol identification
    #
    } elsif ($cmd eq "protid") {
        if ($args != int ($PROT_VERSION)) {
            sock_write ($fh, "520 protocol mismatch\n");
        } else {
            sock_write ($fh, "220 protocol match\n");
        }

    #
    # login
    #
    } elsif ($cmd eq "login") {
        ($user, $pass) = split (/\s+/, $args, 2);

        @authtypes = split(' ' , $CF{"AUTHTYPE"}) ;
        # Check each form of authentication in order, and stop checking
        # as soon as we get a positive authentication result.
        foreach $authtype (@authtypes) {
            if (defined auth ($authtype, $user, $pass)) {
                $is_auth = 1;
                last;
            }
        }

        if ($is_auth != 1) {
            sock_write ($fh, "530 login unsuccessful\n");
        } else {
            $clients{$cl}->{"user"} = $user;
            syslog ("info", "authenticated $user");
            sock_write ($fh, "220 login accepted\n");
        }

    #
    # reset
    #
    } elsif ($cmd eq "reset" && check_auth ($clients{$cl}->{"user"}, $cmd)) {
        my ($keepstate);

        if ($args =~ /stopped/i) {
            $STOPPED = 1;
            $STOPPED_TIME = time;
        }

        if ($args =~ /keepstate/) {
            $keepstate = 1;
        }

        if (reset_server ($keepstate)) {
            sock_write ($fh, "220 reset PID $$\@$HOSTNAME\n");
        } else {
            sock_write ($fh, "520 reset PID $$\@$HOSTNAME failed, error in config file\n");
        }

    #
    # reload
    #
    } elsif ($cmd eq "reload" && check_auth ($clients{$cl}->{"user"}, $cmd)) {
        if (!defined reload (split (/\s+/, $args))) {
            sock_write ($fh, "520 unknown reload command\n");
        } else {
            sock_write ($fh, "220 reload completed\n");
        }

    #
    # clear
    #
    } elsif ($cmd eq "clear" && check_auth ($clients{$cl}->{"user"}, $cmd)) {
        if ($args =~ /^timers \s+ ([a-zA-Z0-9_.-]+) \s+ ([a-zA-Z0-9_.-]+)/ix) {
            if (!defined $watch{$1}->{$2}) {
                sock_write ($fh, "520 unknown group\n");
            } else {
                clear_timers ($1, $2);
                sock_write ($fh, "220 clear timers completed\n");
            }
        } else {
            sock_write ($fh, "520 unknown clear command\n");
            # NOTE(review): no loop encloses this "next" in the part of
            # the sub visible here, so it would leave the sub via the
            # caller's loop -- confirm whether "return" was intended.
            next;
        }

    #
    # test
    #
    } elsif ($cmd eq "test" &&
check_auth ($clients{$cl}->{"user"}, $cmd)) { my ($cmd, $args) = split (/\s+/, $args, 2); # # test monitor # if ($cmd eq "monitor") { my ($group, $service) = split (/\s+/, $args); if (!defined $watch{$group}->{$service}) { sock_write ($fh, "$group $service not defined\n"); } else { $watch{$group}->{$service}->{"_timer"} = 0; } sock_write ($fh, "220 test monitor completed\n"); # # test alert # } elsif ($cmd =~ /^alert|startupalert|upalert$/) { my ($group, $service, $retval, $period) = split (/\s+/, $args, 4); if (!defined $watch{$group}->{$service}) { sock_write ($fh, "520 $group $service not defined\n"); } elsif (!defined $watch{$group}->{$service}->{"periods"}->{$period}) { sock_write ($fh, "520 period not defined\n"); } else { my $f = 0; my $a; if ($cmd eq "alert") { $a = $watch{$group}->{$service}->{"periods"}->{$period}->{"alerts"}; } elsif ($cmd eq "startupalert") { $f = $FL_STARTUPALERT; $a = $watch{$group}->{$service}->{"periods"}->{$period}->{"startupalerts"}; } elsif ($cmd eq "upalert") { $f = $FL_UPALERT; $a = $watch{$group}->{$service}->{"periods"}->{$period}->{"upalerts"}; } for (@{$a}) { my ($alert, $args) = split (/\s+/, $_, 2); if ($args =~ /^exit=/) { $args =~ s/^exit=\S+ \s+//x; } call_alert ( group => $group, service => $service, output => "test\ntest detail\n", retval => $retval, flags => $f | $FL_TEST, alert => $alert, args => $args, ); } sock_write ($fh, "220 test alert completed\n"); } # # test config file # } elsif ($cmd =~ /^config$/) { if ((my $err = read_cf ($CF{"CF"}, 0)) ne "") { sock_write ($fh, $err); sock_write ($fh, "\n520 test config completed, errors found in config file\n"); } else { sock_write ($fh, "220 test config completed OK, no errors found\n"); } } else { sock_write ($fh, "520 test error\n"); } # # version # } elsif ($cmd eq "version") { sock_write ($fh, "version " . int ($PROT_VERSION) . 
"\n"); sock_write ($fh, "220 version completed\n"); # # load state # } elsif ($cmd eq "loadstate" && check_auth ($clients{$cl}->{"user"}, $cmd)) { foreach (split (/\s+/, $args)) { load_state ($_); } sock_write ($fh, "220 loadstate completed\n"); # # save state # } elsif ($cmd eq "savestate" && check_auth ($clients{$cl}->{"user"}, $cmd)) { if ($args =~ /\S/) { foreach (split (/\s+/, $args)) { save_state ($_); } sock_write ($fh, "220 savestate completed\n"); } else { sock_write ($fh, "520 savestate error, arguments required\n"); } # # term # } elsif ($cmd eq "term" && check_auth ($clients{$cl}->{"user"}, $cmd)) { sock_write ($fh, "220 terminating server\n"); client_close ($cl, "terminated by user command"); syslog ("info", "terminating by user command"); exit; # # stop testing # } elsif ($cmd eq "stop"&& check_auth ($clients{$cl}->{"user"}, $cmd)) { $STOPPED = 1; $STOPPED_TIME = time; sock_write ($fh, "220 stop completed\n"); # # start testing # } elsif ($cmd eq "start" && check_auth ($clients{$cl}->{"user"}, $cmd)) { $STOPPED = 0; $STOPPED_TIME = 0; sock_write ($fh, "220 start completed\n"); # # set # } elsif ($cmd eq "set" && check_auth ($clients{$cl}->{"user"}, $cmd)) { if ($args =~ /^maxkeep\s+(\d+)/) { $CF{"MAX_KEEP"} = $1; sock_write ($fh, "220 set completed\n"); } else { ($group, $service, $var, $value) = split (/\s+/, $args, 4); if (!defined $watch{$group}->{$service}) { sock_write ($fh, "520 $group,$service not defined\n"); } elsif ($var eq "opstatus") { if (!defined ($OPSTAT{$value})) { sock_write ($fh, "520 undefined opstatus\n"); } else { set_op_status ($group, $service, un_esc_str ((parse_line ('\s+', 0, $value))[0])); sock_write ($fh, "220 set completed\n"); } } else { $value = un_esc_str ((parse_line ('\s+', 0, $value))[0]); $watch{$group}->{$service}->{$var} = $value; sock_write ($fh, "$group $service $var='$value'\n"); sock_write ($fh, "220 set completed\n"); } } # # get # } elsif ($cmd eq "get" && check_auth ($clients{$cl}->{"user"}, $cmd)) { if 
($args =~ /^maxkeep\s*$/) { sock_write ($fh, "maxkeep = $CF{MAX_KEEP}\n"); sock_write ($fh, "220 set completed\n"); } else { ($group, $service, $var) = split (/\s+/, $args, 3); if (!defined $watch{$group}->{$service}) { sock_write ($fh, "520 $group,$service not defined\n"); } else { sock_write ($fh, "$group $service $var='" . esc_str ($watch{$group}->{$service}->{$var}, 1) . "'\n"); sock_write ($fh, "220 get completed\n"); } } # # list # } elsif ($cmd eq "list" && check_auth ($clients{$cl}->{"user"}, $cmd)) { @argsList = split(/\s+/, $args); ($cmd, $args) = split (/\s+/, $args, 2); # # list service descriptions # if ($cmd eq "descriptions") { foreach $group (keys %watch) { foreach $service (keys %{$watch{$group}}) { sock_write ($fh, "$group $service '" . esc_str ($watch{$group}->{$service}->{"description"}, 1) . "'\n"); } } sock_write ($fh, "220 list descriptions completed\n"); # # list group members # } elsif ($cmd eq "group") { if ($groups{$args}) { sock_write ($fh, "hostgroup $args @{$groups{$args}}\n"); sock_write ($fh, "220 list group completed\n"); } else { sock_write ($fh, "520 list group error, undefined group\n"); } # # list status of all services # } elsif ($cmd eq "opstatus") { if ($args eq "") { foreach $group (keys %watch) { foreach $service (keys %{$watch{$group}}) { client_write_opstatus ($fh, $group, $service); } } sock_write ($fh, "220 list opstatus completed\n"); } else { my $err = 0; my @g = (); my ($group, $service); foreach my $gs (split (/\s+/, $args)) { ($group, $service) = split (/,/, $gs); $err++ && last if (!defined $watch{$group}->{$service}); push (@g, [$group, $service]); } if (!$err) { foreach my $gs (@g) { client_write_opstatus ($fh, $gs->[0], $gs->[1]); } sock_write ($fh, "220 list opstatus completed\n"); } else { sock_write ($fh, "520 $group,$service does not exist\n"); } } # # list disabled hosts and services # } elsif ($cmd eq "disabled") { foreach $group (keys %groups) { @l = grep (/^\*/, @{$groups{$group}}); if (@l) { grep 
(s/^\*//, @l); sock_write ($fh, "group $group: @l\n"); } } foreach $group (keys %watch) { if ($watch_disabled{$group} == 1) { sock_write ($fh, "watch $group\n"); } foreach $service (keys %{$watch{$group}}) { if ($watch{$group}->{$service}->{'disable'} == 1) { sock_write ($fh, "watch $group service " . "$service\n"); } } } sock_write ($fh, "220 list disabled completed\n"); # # list last alert history # } elsif ($cmd eq "alerthist") { foreach my $l (@last_alerts) { sock_write ($fh, esc_str ($l) . "\n"); } sock_write ($fh, "220 list alerthist completed\n"); # # list time of last failures for each service # } elsif ($cmd eq "failures") { foreach $group (keys %watch) { foreach $service (keys %{$watch{$group}}) { my $sref = \%{$watch{$group}->{$service}}; client_write_opstatus ($fh, $group, $service) if ($FAILURE{$sref->{"_op_status"}}); } } sock_write ($fh, "220 list failures completed\n"); # # list the failure history # } elsif ($cmd eq "failurehist") { foreach my $l (@last_failures) { sock_write ($fh, esc_str ($l) . 
"\n"); } sock_write ($fh, "220 list failurehist completed\n"); # # list the time of last successes for each service # } elsif ($cmd eq "successes") { foreach $group (keys %watch) { foreach $service (keys %{$watch{$group}}) { my $sref = \%{$watch{$group}->{$service}}; client_write_opstatus ($fh, $group, $service) if ($SUCCESS{$sref->{"_op_status"}}); } } sock_write ($fh, "220 list successes completed\n"); # # list warnings # } elsif ($cmd eq "warnings") { foreach $group (keys %watch) { foreach $service (keys %{$watch{$group}}) { my $sref = \%{$watch{$group}->{$service}}; client_write_opstatus ($fh, $group, $service) if ($WARNING{$sref->{"_op_status"}}); } } sock_write ($fh, "220 list successes completed\n"); # # list process IDs # } elsif ($cmd eq "pids") { sock_write ($fh, "server $$\n"); foreach $value (keys %runningpid) { ($group, $service) = split (/\//, $runningpid{$value}); sock_write ($fh, "$group $service $value\n"); } sock_write ($fh, "220 list pids completed\n"); # # list watch groups and services # } elsif ($cmd eq "watch") { foreach $group (keys %watch) { foreach $service (keys %{$watch{$group}}) { if (!defined $watch{$group}->{$service}) { sock_write ($fh, "$group (undefined service)\n"); } else { sock_write ($fh, "$group $service\n"); } } } sock_write ($fh, "220 list watch completed\n"); # # list server state # } elsif ($cmd eq "state") { if ($STOPPED) { sock_write ($fh, "scheduler stopped since $STOPPED_TIME\n"); } else { sock_write ($fh, "scheduler running\n"); } sock_write ($fh, "220 list state completed\n"); # # list aliases # } elsif ($cmd eq "aliases") { my (@listAliasesRequest) = @argsList; shift (@listAliasesRequest); # if no alias request, all alias are responded unless (@listAliasesRequest) { @listAliasesRequest = keys (%alias); } foreach my $alias (@listAliasesRequest){ sock_write ($fh, "alias $alias\n"); foreach $value (@{$alias{$alias}}) { sock_write ($fh, "$value\n"); } sock_write ($fh, "\n"); } sock_write ($fh, "220 list aliases 
completed\n"); # # list aliasgroups # } elsif ($cmd eq "aliasgroups") { my (@listAliasesRequest); @listAliasesRequest = keys (%alias); sock_write ($fh, "@listAliasesRequest\n") unless (@listAliasesRequest == 0); sock_write ($fh, "220 list aliasgroups completed\n"); # # list deps # } elsif ($cmd eq "deps") { foreach my $g (keys %watch) { foreach my $s (keys %{$watch{$g}}) { my $sref = \%{$watch{$g}->{$s}}; if ($sref->{"depend"} ne "") { sock_write ($fh, "exp $g $s '" . esc_str ($sref->{"depend"}, 1) . "'\n"); } else { sock_write ($fh, "exp $g $s 'NONE'\n"); } my @u = ($sref->{"depend"} =~ /[a-zA-Z0-9_.-]+:[a-zA-Z0-9_.-]+/g); if (@u) { sock_write ($fh, "cmp $g $s @u\n"); } else { sock_write ($fh, "cmp $g $s NONE\n"); } } } sock_write ($fh, "220 list deps completed\n"); # # downtime log # } elsif ($cmd eq "dtlog") { if ($CF{"DTLOGGING"}) { if (!open (DTLOGTMP, "< $CF{DTLOGFILE}")) { sock_write ($fh, "520 list dtlog error, cannot open dtlog\n"); } else { while (<DTLOGTMP>) { sock_write ($fh, $_ ) if (!/^#/); } close (DTLOGTMP); sock_write ($fh, "220 list dtlog completed\n"); } } else { sock_write ($fh, "520 list dtlog error, dtlogging is not turned on\n"); } } else { sock_write ($fh, "520 unknown list command\n"); } # # acknowledge a failure # } elsif ($cmd eq "ack" && check_auth ($clients{$cl}->{"user"}, $cmd)) { my ($group, $service, $comment) = split (/\s+/, $args, 3); if (!defined ($watch{$group})) { sock_write ($fh, "520 unknown group\n"); } elsif (!defined $watch{$group}->{$service}) { sock_write ($fh, "520 unknown service\n"); } my $sref = \%{$watch{$group}->{$service}}; if ($sref->{"_op_status"} == $STAT_OK || $sref->{"_op_status"} == $STAT_UNTESTED) { sock_write ($fh, "520 service is in a non-failure state\n"); } else { $sref->{"_ack"} = 1; $sref->{"_ack_comment"} = un_esc_str ((parse_line ('\s+', 0, $comment))[0]); sock_write ($fh, "220 ack completed\n"); } # # disable watch, service or host # } elsif ($cmd eq "disable" && check_auth ($clients{$cl}->{"user"}, 
$cmd)) { ($cmd, $args) = split (/\s+/, $args, 2); # # disable watch # if ($cmd eq "watch") { if (!defined (disen_watch($args, 0))) { sock_write ($fh, "520 disable error, unknown watch \"$args\"\n"); } else { $stchanged++; sock_write ($fh, "220 disable watch completed\n"); } # # disable service # } elsif ($cmd eq "service") { ($group, $service) = split (/\s+/, $args, 2); if (!defined (disen_service ($group, $service, 0))) { sock_write ($fh, "520 disable error, unknown service\n"); } else { $stchanged++; sock_write ($fh, "220 disable service completed\n"); } # # disable host # } elsif ($cmd eq "host") { my @notfound = (); my @hosts = split (/\s+/, $args); foreach my $h (@hosts) { if (!host_exists ($h)) { push @notfound, $h; } } if (@notfound == 0) { foreach my $h (@hosts) { disen_host ($h, 0); $stchanged++; sock_write ($fh, "220 disable host completed\n"); } } else { sock_write ($fh, "520 disable host failed, @notfound does not exist\n"); } } else { sock_write ($fh, "520 command could not be executed\n"); } # # enable watch, service or host # } elsif ($cmd eq "enable" && check_auth ($clients{$cl}->{"user"}, $cmd)) { ($cmd, $args) = split (/\s+/, $args, 2); # # enable watch # if ($cmd eq "watch") { if (!defined (disen_watch ($args, 1))) { sock_write ($fh, "520 enable error, unknown watch\n"); } else { $stchanged++; sock_write ($fh, "220 enable watch completed\n"); } # # enable service # } elsif ($cmd eq "service") { ($group, $service) = split (/\s+/, $args, 2); if (!defined (disen_service ($group, $service, 1))) { sock_write ($fh, "520 enable error, unknown group\n"); } else { $stchanged++; sock_write ($fh, "220 enable completed\n"); } # # enable host # } elsif ($cmd eq "host") { foreach $var (split (/\s+/, $args)) { disen_host ($var, 1); $stchanged++; } sock_write ($fh, "220 enable completed\n"); } else { sock_write ($fh, "520 command could not be executed\n"); } # # server time # } elsif ($cmd eq "servertime" && check_auth ($clients{$cl}->{"user"}, $cmd)) { 
sock_write ($fh, join ("", time, " ", scalar (localtime), "\n")); sock_write ($fh, "220 servertime completed\n"); # # check auth # } elsif ($cmd eq "checkauth") { split(' ',$args); $cmd = $_[0]; $user = $clients{$cl}->{"user"}; # Note that we call check_auth without syslogging here. if (check_auth($clients{$cl}->{"user"}, $cmd, 1)) { sock_write ($fh, "220 command authorized\n"); } else { sock_write ($fh, "520 command could not be executed\n"); } } else { sock_write ($fh, "520 command could not be executed\n"); } save_state ("disabled") if ($stchanged); } sub client_write_opstatus { my $fh = shift; my ($group, $service) = @_; my $sref = \%{$watch{$group}->{$service}}; my $summary = esc_str ($sref->{"_last_summary"}, 1); my $detail = esc_str ($sref->{"_last_detail"}, 1); my $depend = esc_str ($sref->{"depend"}, 1); my $monitor = esc_str ($sref->{"monitor"}, 1); my $comment; if ($sref->{"_ack"} == 1) { $comment = esc_str ($sref->{"_ack_comment"}, 1); } else { $comment = ''; } my $alerts_sent = 0; foreach my $period (keys %{$sref->{"periods"}}) { $alerts_sent += $sref->{"periods"}->{$period}->{"_alert_sent"}; } my $buf = "group=$group" . " service=$service" . " opstatus=$sref->{_op_status}" . " last_opstatus=$sref->{_last_op_status}" . " exitval=$sref->{_exitval}" . " timer=$sref->{_timer}" . " last_success=$sref->{_last_success}" . " last_trap=$sref->{_last_trap}" . " last_check=$sref->{_last_check}" . " ack=$sref->{_ack}" . " ackcomment='$comment'" . " alerts_sent=$alerts_sent" . " depstatus=" . int ($sref->{"_depend_status"}) . " depend='$depend'" . " monitor='$monitor'" . " last_summary='$summary'" . " last_detail='$detail'"; $buf .= " last_failure=$sref->{_last_failure}" if ($sref->{"_last_failure"}); $buf .= " interval=$sref->{interval}" if ($sref->{"interval"}); $buf .= " exclude_period='$sref->{exclude_period}'" if ($sref->{"exclude_period"} ne ""); $buf .= " exclude_hosts='" . join (" ", keys %{$sref->{exclude_hosts}}) . 
"'" if (keys %{$sref->{"exclude_hosts"}}); $buf .= " randskew=$sref->{randskew}" if ($sref->{"randskew"}); my $l = 0; foreach my $p (keys %{$sref->{"periods"}}) { $l = $sref->{"periods"}->{$p}->{"_last_alert"} if ($sref->{"periods"}->{$p}->{"_last_alert"} > $l); } $buf .= " last_alert=$l" if ($l); if ($sref->{"_first_failure"}) { my $t = time - $sref->{"_first_failure"}; $buf .= " first_failure=$sref->{_first_failure}" . " failure_duration=$t"; } $buf .= "\n"; sock_write ($fh, $buf); } # # show usage # sub usage { print <<"EOF"; usage: mon [-a dir] [-c config] [-d] [-f] [-i secs] [-k num] [-m num] [-p num] [-P file] [-r num] [-s dir] mon -v -a dir alert script dir -A file authorization file -b dir base directory for alerts and monitors (basedir) -B dir base directory for configuration files (cfbasedir) -c config config file, defaults to "mon.cf" -d debug -D dir state directory (statedir) -f fork and become a daemon -h this help -i secs sleep interval (seconds), defaults to 1 -k num keep history of last num events -l load old state from statedir -L dir log directory (logdir) -M pre-process config file with m4 -m num throttle at maximum number of monitor processes -O facility syslog facility to use -o file on-call schedule -p num server listens on port num -P file PID file -r num randomize startup schedule -s dir monitor script dir -S start with scheduler stopped -t port trap port -v print version Report bugs to $AUTHOR $RCSID EOF } # # become a daemon # sub daemon { my $pid; if ($pid = fork()) { # the parent goes away all happy and stuff exit (0); } elsif (!defined $pid) { die "could not fork: $!\n"; } setsid(); # # make it so that we cannot regain a controlling terminal # if ($pid = fork()) { # the parent goes away all happy and stuff exit (0); } elsif (!defined $pid) { syslog ('err', "could not fork: $!"); exit 1; } chdir ('/'); umask (022); if (!open (N, "+>>" . 
$CF{"MONERRFILE"})) { syslog ("err", "could not open error output file $CF{'MONERRFILE'}: %m"); exit (1); } if (!open(STDOUT, ">&N") || !open (STDIN, "<&N") || !open (STDERR, ">&N")) { syslog ("err", "could not redirect: %m"); exit(1); } syslog ('info', "running as daemon"); } # # debug # sub debug { my ($level, @l) = @_; return if ($level > $opt{"d"}); if ($opt{"d"} && !$opt{"f"}) { print STDERR @l; } else { syslog ('debug', join ('', @l)); } } # # die_die # sub die_die { my ($level, $msg) = @_; die "[$level] $msg\n" if ($opt{"d"}); syslog ($level, "fatal, $msg"); closelog(); exit (1); } # # handle cleanup of exited processes # trigger alerts on failures (or send no alert if disabled) # do some accounting # sub proc_cleanup { my ($summary, $tmnow, $buf); $tmnow = time; return if (keys %running == 0); while ((my $p = waitpid (-1, &WNOHANG)) >0) { my ($group, $service) = split (/\//, $runningpid{$p}); my $sref = \%{$watch{$group}->{$service}}; # # suck in any extra data # my $fh = $fhandles{$runningpid{$p}}; while (my $z = sysread ($fh, $buf, 8192)) { $ibufs{$runningpid{$p}} .= $buf; } $sref->{"_exitval"} = int($?>>8); debug (1, "PID $p ($runningpid{$p}) exited with [$sref->{'_exitval'}]\n"); $sref->{"_last_checked"} = $tmnow; if ($sref->{"depend"} ne "" && $sref->{"dep_behavior"} eq "a") { dep_ok ($sref); } # # error exit value # if ($?) { # # accounting # $sref->{"_failure_count"}++; $sref->{"_consec_failures"}++; $sref->{"_last_failure"} = $tmnow; if ($sref->{"_op_status"} == $STAT_OK || $sref->{"_op_status"} == $STAT_UNKNOWN || $sref->{"_op_status"} == $STAT_UNTESTED) { $sref->{"_first_failure"} = $tmnow; } set_op_status ($group, $service, $STAT_FAIL); my ($summary, $detail) = split("\n", $ibufs{$runningpid{$p}}, 2); $summary = "(NO SUMMARY)" if ($summary =~ /^\s*$/m); $sref->{"_last_summary"} = $summary; $sref->{"_last_detail"} = $detail; shift @last_failures if (@last_failures > $CF{"MAX_KEEP"}); push @last_failures, "$group $service" . 
" $tm $summary"; syslog ('crit', "failure for $last_failures[-1]"); # # send an alert if necessary # do_alert ($group, $service, $ibufs{$runningpid{$p}}, $?>>8, $FL_MONITOR); # # change interval if needed # if (defined ($sref->{"failure_interval"}) && $sref->{"_old_interval"} == undef) { $sref->{"_old_interval"} = $sref->{"interval"}; $sref->{"interval"} = $sref->{"failure_interval"}; $sref->{"_next_check"} = 0; } $sref->{"_failure_output"} = $ibufs{$runningpid{$p}}; } # # success exit value # else { if ($CF{"DTLOGGING"} && defined ($sref->{"_op_status"}) && $sref->{"_op_status"} == $STAT_FAIL) { write_dtlog ($sref, $group, $service); } # # if this service has just come back up and # we are paying attention to this event, # let someone know # if (defined ($sref->{"_op_status"}) && $sref->{"_op_status"} == $STAT_FAIL) { if (defined($sref->{"_upalert"}) && $tmnow - $sref->{"_first_failure"} >= $sref->{"upalertafter"}) { do_alert ($group, $service, $sref->{"_last_output"}, 0, $FL_UPALERT); } } $sref->{"_ack"} = 0; $sref->{"_ack_comment"} = ''; $sref->{"_first_failure"} = 0; $sref->{"_last_failure"} = 0; $sref->{"_consec_failures"} = 0; my ($summary, $detail) = split("\n", $ibufs{$runningpid{$p}}, 2); $sref->{"_last_summary"} = $summary; $sref->{"_last_detail"} = $detail; # # reset the alertevery timer # foreach my $period (keys %{$sref->{"periods"}}) { $sref->{"periods"}->{$period}->{"_last_alert"} = 0; $sref->{"periods"}->{$period}->{"_alert_sent"} = 0; } # # change interval back to original # if (defined ($sref->{"failure_interval"}) && $sref->{"_old_interval"} != undef) { $sref->{"interval"} = $sref->{"_old_interval"}; $sref->{"_old_interval"} = undef; $sref->{"_next_check"} = 0; } $sref->{"_last_success"} = $tmnow; set_op_status ($group, $service, $STAT_OK); } # # save the output # $sref->{"_last_output"} = $ibufs{$runningpid{$p}}; reset_timer ($group, $service); remove_proc ($p); } } # # collect output from running processes # sub collect_output { my ($buf, 
$rout); return if (!keys %running); my $nfound = select ($rout=$fdset_rbits, undef, undef, 0); debug (1, "select returned $nfound file handles\n"); return if ($! == &EINTR); if ($nfound) { # # look for the file descriptors that are readable, # and try to read as much as possible from them # foreach my $k (keys %fhandles) { my $fh = $fhandles{$k}; if (vec ($rout, fileno($fh), 1) == 1) { my $z = 0; while ($z = sysread ($fh, $buf, 8192)) { $ibufs{$k} .= $buf; debug (1, "[$buf] from $fh\n"); } # # ignore if EAGAIN, since we're nonblocking # if (!defined($z) && $! == &EAGAIN) { # # error on this descriptor # } elsif (!defined($z)) { debug (1, "error on $fh: $!\n"); syslog ('err', "error on $fh: $!"); vec($fdset_rbits, fileno($fh), 1) = 0; } elsif ($z == 0 && $! == &EAGAIN) { debug (1, "EAGAIN on $fh\n"); # # if EOF encountered, stop trying to # get input from this file descriptor # } elsif ($z == 0) { debug (1, "EOF on $fh\n"); vec($fdset_rbits, fileno($fh), 1) = 0; } } } } } # # handle forking a monitor process, and set up variables # sub run_monitor { my ($group, $service) = @_; my (@args, @groupargs, $pid, @ghosts, $monitor, $monitorargs); my $sref = \%{$watch{$group}->{$service}}; ($monitor, $monitorargs) = ($sref->{"monitor"} =~ /^(\S+)(\s+(.*))?$/); if (!defined $MONITORHASH{$monitor} || ! -f $MONITORHASH{$monitor}) { syslog ('err', "no monitor found while trying to run [$monitor]"); return undef; } else { $monitor = $MONITORHASH{$monitor}; } $monitor .= " " . 
$monitorargs if ($monitorargs); @ghosts = (); # # if monitor ends with ";;", do not append groups # to command line # if ($monitor =~ /;;\s*$/) { $monitor =~ s/\s*;;\s*$//; @args = quotewords ('\s+', 0, $monitor); @ghosts = (1); # # exclude disabled hosts # } else { @ghosts = grep (!/^\*/, @{$groups{$group}}); # # per-service excludes # if (keys %{$sref->{"exclude_hosts"}}) { my @g = (); for (my $i=0; $i<@ghosts; $i++) { push (@g, $ghosts[$i]) if !$sref->{"exclude_hosts"}->{$ghosts[$i]}; } @ghosts = @g; } @args = (quotewords ('\s+', 0, $monitor), @ghosts); } if (@ghosts == 0 && !defined ($sref->{"allow_empty_group"})) { syslog ('err', "monitor for $group/$service" . " not called because of no host arguments\n"); } else { $fhandles{"$group/$service"} = new FileHandle; $pid = open($fhandles{"$group/$service"}, '-|'); if (!defined $pid) { syslog ('err', "Could not fork: $!"); delete $fhandles{"$group/$service"}; return 0; } elsif ($pid == 0) { open(STDERR, '>&STDOUT') or syslog ('err', "Could not dup stderr: $!"); open(STDIN, "</dev/null") or syslog ('err', "Could not connect stdin to /dev/null: $!"); my $v; foreach $v (keys %{$sref->{"ENV"}}) { $ENV{$v} = $sref->{"ENV"}->{$v}; } $ENV{"MON_LAST_SUMMARY"} = $sref->{"_last_summary"}; $ENV{"MON_LAST_OUTPUT"} = $sref->{"_last_output"}; $ENV{"MON_LAST_FAILURE"} = $sref->{"_last_failure"}; $ENV{"MON_FIRST_FAILURE"} = $sref->{"_first_failure"}; $ENV{"MON_DEPEND_STATUS"} = $sref->{"_depend_status"}; $ENV{"MON_LAST_SUCCESS"} = $sref->{"_last_success"}; $ENV{"MON_STATEDIR"} = $CF{"STATEDIR"}; $ENV{"MON_LOGDIR"} = $CF{"LOGDIR"}; exec @args or syslog ('err', "could not exec '@args': $!") && exit(1); } $sref->{"_last_check"} = scalar (time); unless ($sref->{"_next_check"}) { $sref->{"_next_check"} = $sref->{"_last_check"} + $sref->{"interval"}; } else { $sref->{"_next_check"} += $sref->{"interval"}; } debug (1, "watching file handle ", fileno ($fhandles{"$group/$service"}), " for $group/$service\n"); # # set nonblocking I/O and 
setup bit vector for select(2) # configure_filehandle ($fhandles{"$group/$service"}) || syslog ("err", "could not configure filehandle for $group/$service: $!"); vec ($fdset_rbits, fileno($fhandles{"$group/$service"}), 1) = 1; $fdset_ebits |= $fdset_rbits; # # note that this is running # $running{"$group/$service"} = 1; $runningpid{$pid} = "$group/$service"; $ibufs{"$group/$service"} = ""; $procs++; } } # # set the countdown timer for this service # sub reset_timer { my ($group, $service) = @_; my $sref = \%{$watch{$group}->{$service}}; if ($sref->{"randskew"} != 0) { $sref->{"_timer"} = $sref->{"interval"} + (int (rand (2)) == 0 ? -int(rand($sref->{"randskew"}) + 1) : int(rand($sref->{"randskew"})+1)); } elsif ($sref->{"_next_check"}) { $sref->{"_timer"} = $sref->{"_next_check"} - time(); } else { $sref->{"_timer"} = $sref->{"interval"}; } } # # randomize the delay before each test # $opt{"randstart"} is seconds # sub randomize_startdelay { my ($group, $service); foreach $group (keys %watch) { foreach $service (keys %{$watch{$group}}) { $watch{$group}->{$service}->{"_timer"} = int (rand ($CF{"RANDSTART"})); } } } # # return 1 if $val is within $range, # where $range = "number" or "number-number" # sub inRange { my ($val, $range) = @_; my ($retval); $retval = 0; if ($range =~ /^(\d+)$/ && $val == $1) { $retval = 1 } elsif ($range =~ /^(\d+)\s*-\s*(\d+)$/ && ($val >= $1 && $val <= $2)) { $retval = 1 } $retval; } # # disable ($cmd==0) or enable a watch # sub disen_watch { my ($w, $cmd) = @_; return undef if (!defined ($watch{$w})); if ($cmd == 0) { $watch_disabled{$w} = 1; } else { $watch_disabled{$w} = 0; } } # # disable ($cmd==0) or enable a service # sub disen_service { my ($g, $s, $cmd) = @_; my ($snum); return undef if (!defined $watch{$g}); return undef if (!defined $watch{$g}->{$s}); if ($cmd == 0) { $watch{$g}->{$s}->{"disable"} = 1; } else { $watch{$g}->{$s}->{"disable"} = 0; } } # # disable ($cmd==0) or enable a host # sub disen_host { my ($h, $cmd) = @_; 
my $found = undef;

    # (tail of the preceding sub; its head lies above this section and is
    # reproduced here unchanged)
    foreach my $g (keys %groups) {
        if ($cmd == 0) {
            if (grep (s/^$h$/*$h/, @{$groups{$g}})) {
                $found = 1;
            }
        } else {
            if (grep (s/^\*$h$/$h/, @{$groups{$g}})) {
                $found = 1;
            }
        }
    }

    $found;
}


#
# return 1 if $host is a member of any hostgroup, 0 otherwise
#
# Members are compared as plain strings. The host name used to be
# interpolated into a regex, so "a.b" also matched "axb".
#
sub host_exists {
    my $host = shift;

    foreach my $g (keys %groups) {
        return 1 if (grep { $_ eq $host } @{$groups{$g}});
    }

    return 0;
}


#
# save state
#
# Each element of @states names a state file to write under
# $CF{STATEDIR}:
#   "disabled"  hosts, watches and services which are disabled
#   "opstatus"  one line per service with its operational counters
#
sub save_state {
    my (@states) = @_;

    foreach my $state (@states) {
        if ($state eq "disabled") {
            if (!open (STATE, ">$CF{STATEDIR}/disabled")) {
                syslog ("err", "could not write to state file: $!");
                next;
            }

            #
            # disabled hosts are the group members prefixed with "*"
            #
            foreach my $group (keys %groups) {
                foreach my $member (@{$groups{$group}}) {
                    next if ($member !~ /^\*/);
                    my $host = $member;         # copy, leave %groups alone
                    $host =~ s/^\*//;
                    print STATE "disable host $host\n";
                }
            }

            foreach my $group (keys %watch) {
                if ($watch_disabled{$group} == 1) {
                    print STATE "disable watch $group\n";
                }
                foreach my $service (keys %{$watch{$group}}) {
                    if ($watch{$group}->{$service}->{'disable'} == 1) {
                        print STATE "disable service $group $service\n";
                    }
                }
            }
            close (STATE);

        } elsif ($state eq "opstatus") {
            if (!open (STATE, ">$CF{STATEDIR}/opstatus")) {
                syslog ("err", "could not write to opstatus state file: $!");
                next;
            }

            foreach my $group (keys %watch) {
                foreach my $service (keys %{$watch{$group}}) {
                    my $sref = $watch{$group}->{$service};
                    #
                    # alert_count used to be written with no value at all
                    #
                    print STATE "group=$group service=$service" .
                        " op_status=$sref->{_op_status}" .
                        " failure_count=$sref->{_failure_count}" .
                        " alert_count=$sref->{_alert_count}\n";
                }
            }
            close (STATE);
        }
    }
}


#
# load state
#
# Only the "disabled" state is read back. Each "disable host|watch|service"
# line of the state file is replayed through disen_host, disen_watch or
# disen_service.
#
sub load_state {
    my (@states) = @_;
    my ($l, $cmd, $args, $group, $service, $what);

    foreach my $state (@states) {
        next if ($state ne "disabled");

        if (!open (STATE, "$CF{STATEDIR}/disabled")) {
            syslog ("err", "could not read state file: $!");
            next;
        }

        while (defined ($l = <STATE>)) {
            chomp $l;
            ($cmd, $what, $args) = split (/\s+/, $l, 3);

            next if ($cmd ne "disable");

            if ($what eq "host") {
                disen_host ($args);
            } elsif ($what eq "watch") {
                syslog ("err", "undefined watch reading state file: $l")
                    if (!defined disen_watch ($args));
            } elsif ($what eq "service") {
                ($group, $service) = split (/\s+/, $args, 2);
                syslog ("err", "undefined group or service reading state file: $l")
                    if (!defined disen_service ($group, $service));
            }
        }

        syslog ("info", "state '$state' loaded");
        close (STATE);
    }
}


#
# authenticate a login
#
# $type is one of "getpwnam", "shadow", "userfile" or "pam".
# Returns 1 if $plaintext is the password of $user, undef otherwise.
#
sub auth {
    my ($type, $user, $plaintext) = @_;
    my ($pass, %u, $u, $p);

    if ($user eq "" || $plaintext eq "") {
        syslog ('err', "an undef username or password supplied");
        return undef;
    }

    #
    # standard UNIX passwd
    #
    if ($type eq "getpwnam") {
        (undef, $pass) = getpwnam($user);
        return undef if (!defined $pass);
        if ((crypt ($plaintext, $pass)) ne $pass) {
            return undef;
        }
        return 1;

    #
    # shadow password
    # (no implementation here: falls through to the final "return undef")
    #
    } elsif ($type eq "shadow") {

    #
    # "mon" authentication
    #
    } elsif ($type eq "userfile") {
        if (!open (U, $CF{"USERFILE"})) {
            syslog ('err', "could not open user file '$CF{USERFILE}': $!");
            return undef;
        }
        while (<U>) {
            next if (/^\s*#/ || /^\s*$/);
            chomp;
            ($u,$p) = split (/\s*:\s*/, $_, 2);
            $u{$u} = $p;
        }
        close (U);
        return undef if (!defined($u{$user}));  #user was not found in userfile
        return undef if ((crypt ($plaintext, $u{$user})) ne $u{$user}); #user gave wrong password
        return 1;

    #
    # PAM authentication
    #
    } elsif ($type eq "pam") {
        # pam_conv_func reads the credentials from these two globals
        local $PAM_username = $user;
        local $PAM_password = $plaintext;
        my $pamh;
        if (!ref($pamh = Authen::PAM->new ($CF{'PAMSERVICE'}, $PAM_username, \&pam_conv_func))) {
            syslog ('err', "Error code $pamh during PAM init!: $!");
            return undef;
        }
        my $res = $pamh->pam_authenticate ;
        return undef if ($res != Authen::PAM::PAM_SUCCESS ()) ;
        return 1;

    } else {
        syslog ('err', "authentication type '$type' not known");
    }

    return undef;
}


#
# load the table of who can do which commands
#
# Parses $CF{AUTHFILE} into %AUTHCMDS, %NOAUTHCMDS, %AUTHTRAPS and
# %AUTHSNMPTRAPS. Lines belong to the "command" section until a
# "trap section" or "snmp trap section" header is seen.
# Returns undef if the file cannot be opened. If $startup is true,
# errors handed to err_startup are fatal.
#
sub load_auth {
    my ($startup) = @_;
    my ($l, $cmd, $users, $u, $host, $user, $password, $sect);

    %AUTHCMDS = ();
    %NOAUTHCMDS = ();
    %AUTHTRAPS = ();
    %AUTHSNMPTRAPS = ();
    $sect = "command";

    if (!open (C, $CF{"AUTHFILE"})) {
        err_startup ($startup, "could not open $CF{AUTHFILE}: $!");
        return undef;
    }

    while (defined ($l = <C>)) {
        next if ($l =~ /^\s*#/ || $l =~ /^\s*$/);
        chomp $l;
        $l =~ s/^\s*//;
        $l =~ s/\s*$//;

        if ($l =~ /^command\s+section/) {
            $sect = "command";
            next;
        } elsif ($l =~ /^trap\s+section/) {
            $sect = "trap";
            next;
        } elsif ($l =~ /^snmp trap section/) {
            $sect = "snmptrap";
            next;
        }

        if ($sect eq "command") {
            ($cmd, $users) = split (/\s*:\s*/, $l, 2);
            if (!defined $users) {
                err_startup ($startup, "could not parse line $. of auth file\n");
                next;
            }
            foreach $u (split (/\s*,\s*/, $users)) {
                if ( $u =~ /^!(.*)/ ) {
                    # Directive is to "deny-user"
                    $NOAUTHCMDS{"\L$cmd"}{$1} = 1;
                } else {
                    # "allow-user"; AUTH_ANY (all authenticated users)
                    # is stored the same way
                    $AUTHCMDS{"\L$cmd"}{$u} = 1;
                }
            }

        } elsif ($sect eq "trap") {
            if ($l !~ /^(\S+)\s+(\S+)\s+(\S+)$/) {
                syslog ('err', "invalid entry in trap sect of $CF{AUTHFILE}, line $.");
                next;
            }
            ($host, $user, $password) = ($1, $2, $3);

            #
            # "*" allows traps from all hosts; anything else must
            # resolve to an address, which is stored as a dotted quad.
            #
            # The old if/elsif/else chain dropped every successfully
            # resolved host into its final "invalid host" branch, so
            # only "*" was ever accepted.
            #
            if ($host ne "*") {
                my $packed;
                if ($host =~ /^[a-z]/i) {
                    $packed = gethostbyname ($host);
                } elsif ($host =~ /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/) {
                    $packed = inet_aton ($host);
                }
                if (!defined ($packed) || $packed eq "") {
                    syslog ('err', "invalid host in $CF{AUTHFILE}, line $.");
                    next;
                }
                $host = inet_ntoa ($packed);
            }

            $AUTHTRAPS{$host}{$user} = $password;

        } elsif ($sect eq "snmptrap") {
            if ($l !~ /^(\S+)\s+(\S+)$/) {
                syslog ('err', "invalid line in $CF{AUTHFILE}, line $.");
                next;
            }
            ($host, $password) = ($1, $2);
            $AUTHSNMPTRAPS{$host}{$password} = 1;

        } else {
            syslog ('err', "unknown section in $CF{AUTHFILE}: $l");
        }
    }

    close (C);
}


#
# return undef if $user isn't permitted to perform $cmd
# Optional third argument controls logging to syslog.
# e.g.,
#   check_auth("joe", "disable")
#       will check to see if user joe is authorized to disable, and
#       complain to syslog if joe is not authorized
#   check_auth("joe", "disable", 1)
#       will check to see if user joe is authorized to disable but
#       NOT complain to syslog if joe is not authorized
#
# An explicit denial (a "!user" entry) is always logged.
#
sub check_auth {
    my ($user, $cmd, $no_syslog) = @_;

    #
    # Check to see if the authenticated user is specifically
    # denied the ability to run this command.
    #
    if (defined ($user)
        && ($NOAUTHCMDS{$cmd}{$user} || $NOAUTHCMDS{$cmd}{"AUTH_ANY"})) {
        syslog ("err", "user '$user' tried '$cmd', denied");
        return undef;
    }

    #
    # Check for "all". This allows any client, authenticated or
    # not, to execute the requested command.
    #
    return 1 if ($AUTHCMDS{$cmd}{"all"});

    #
    # Check for AUTH_ANY. This allows any authenticated user to
    # execute the requested command.
    #
    return 1 if (defined ($user) && $AUTHCMDS{$cmd}{"AUTH_ANY"});

    #
    # Check to see if the authenticated user is specifically
    # allowed the ability to run this command.
    #
    return 1 if (defined ($user) && $AUTHCMDS{$cmd}{$user});

    syslog ("err", "user '$user' tried '$cmd', not authenticated")
        unless defined($no_syslog);

    return undef;
}


#
# reload things
#
# @what may contain "auth" and "oncall". Returns undef as soon as an
# unknown item is met, 1 otherwise.
#
sub reload {
    my (@what) = @_;

    for (@what) {
        if ($_ eq "auth") {
            load_auth ();
        } elsif ($_ eq "oncall") {
            load_oncall ();
        } else {
            return undef;
        }
    }

    return 1;
}


#
# (re)load the oncall schedule
#
# Validates every line of $CF{OCFILE} against the known groups and
# services. Returns undef on the first error, 1 otherwise.
#
# NOTE(review): nothing is stored into %newoncall, so a successful load
# leaves %oncall empty, and each parsed line is printed to STDOUT. This
# looks unfinished; both are kept as they were.
#
sub load_oncall {
    my ($startup) = @_;
    my ($group, $service, $time, $who, %newoncall);

    if (!open (ONCALL, $CF{"OCFILE"})) {
        err_startup ($startup, "could not open $CF{OCFILE}: $!");
        return undef;
    }

    %newoncall = ();

    while (<ONCALL>) {
        next if (/^\s*$/ || /^\s*#/);
        chomp;

        if (!/^\s*
                ([a-zA-Z0-9_.-]+) \s+
                ([a-zA-Z0-9_.-]+) \s+
                (\w{3} \s+ \d{1,2}:\d\d|default|none) \s+
                (.*) \s*$/xi) {
            #
            # this used to hand the return value of syslog() to
            # err_startup as the message
            #
            err_startup ($startup, "error in oncall configuration, line $.");
            close (ONCALL);
            return undef;
        }

        ($group, $service, $time, $who) = ($1, $2, $3, $4);
        $group =~ tr/A-Z/a-z/;
        $service =~ tr/A-Z/a-z/;
        $time =~ tr/A-Z/a-z/;

        if (!defined($groups{$group})) {
            err_startup ($startup, "group $group in oncall line $. not defined in $CF{OCFILE}");
            close (ONCALL);
            return undef;
        } elsif (!defined $watch{$group}->{$service}) {
            err_startup ($startup, "service $service in oncall line $. not defined in $CF{OCFILE}");
            close (ONCALL);
            return undef;
        }

        print "[$group] [$service] [$time] [$who]\n";
    }

    close (ONCALL);

    %oncall = %newoncall;
    1;
}


#
# report an error: fatal (die) if $startup is true, otherwise
# logged to syslog
#
sub err_startup {
    my ($startup, $msg) = @_;

    if ($startup) {
        die "$msg\n";
    } else {
        syslog ('err', $msg);
    }
}


#
# handle SNMP trap
#
# NOTE(review): load_auth stores $AUTHSNMPTRAPS{host}{password} = 1, so
# $AUTHSNMPTRAPS{$fromip} is a hash ref and the community comparison
# below can never succeed. The trap is not acted upon after the checks
# either, so this sub looks unfinished and is left as it was.
#
sub handle_snmp_trap {
    my ($buf, $from) = @_;
    my ($port, $addr, $fromip);
    my (%traphash);

    ($port, $addr) = sockaddr_in ($from);
    $fromip = inet_ntoa ($addr);

    if (!defined ($AUTHSNMPTRAPS{$fromip})) {
        syslog ('err', "got SNMP trap from unauthorized agent: $fromip");
        return undef;
    }

    $TRAP_PDU->buffer ($buf);
    %traphash = $TRAP_PDU->decode;

    if (! keys %traphash) {
        syslog ('err', "error decoding SNMP trap: " . $TRAP_PDU->error);
        return undef;
    }

    if ($AUTHSNMPTRAPS{$fromip} ne crypt ($traphash{"community"}, $traphash{"community"})) {
        syslog ('err', "unauthorized community from agent: $fromip");
        return undef;
    }

    #
    # here's the real meat
    #
}


#
# handle a trap
#
# $buf is the raw trap datagram ("tag=value" lines), $from the packed
# sender address as returned by recv. The trap is authenticated
# against %AUTHTRAPS, mapped onto a group/service (falling back to
# default/default), recorded in the service state, and an alert or
# upalert is sent unless suppressed.
#
sub handle_trap {
    my ($buf, $from) = @_;

    my $time = time;
    my $noalert = 0;
    my %trap = ();
    my $flags = 0;
    my $tmnow = time;

    #
    # MON-specific tags
    # pro       protocol
    # aut       auth
    # usr       username
    # pas       password
    # typ       type ("failure", "up", "startup", "trap", "traptimeout")
    # spc       specific type (TRAP_*)
    # seq       sequence
    # grp       group
    # svc       service
    # hst       host
    # sta       status (opstatus)
    # tsp       timestamp as time(2) value
    # sum       summary output
    # dtl       detail
    #
    foreach my $l (split (/\n/, $buf)) {
        if ($l =~ /^(\w+)=(.*)/) {
            my ($tag, $trap_val) = ($1, $2);
            chomp $trap_val;
            $trap{$tag} = un_esc_str ((parse_line ('\s+', 0, $trap_val))[0]);
        } else {
            syslog ('err', "unspecified tag in trap: $l");
        }
    }

    $trap{"sum"} = "$trap{sum}\n" if ($trap{"sum"} !~ /\n$/);

    my ($port, $addr) = sockaddr_in ($from);
    my $fromip = inet_ntoa ($addr);

    #
    # trap authentication
    #
    # %AUTHTRAPS is keyed by "*" or by dotted-quad address (see
    # load_auth), so the lookup must use $fromip, not the packed $addr.
    #
    my ($traphost, $trapuser, $trappass);

    if (defined ($AUTHTRAPS{"*"})) {
        $traphost = "*";
    } else {
        $traphost = $fromip;
    }

    #
    # reject unknown hosts before looking any deeper: probing
    # $AUTHTRAPS{$traphost}{"*"} first would autovivify the host entry
    # and make this test always pass
    #
    if (!defined ($AUTHTRAPS{$traphost})) {
        syslog ('err', "received trap from unauthorized host: $fromip");
        return undef;
    }

    if (defined ($AUTHTRAPS{$traphost}{"*"})) {
        $trapuser = "*";
        $trappass = "";
    } else {
        $trapuser = $trap{"usr"};
        $trappass = $trap{"pas"};
    }

    if ($trapuser ne "*") {
        my $crypted = $AUTHTRAPS{$traphost}{$trapuser};
        #
        # a user with no entry must never pass, whatever crypt()
        # returns for an undefined salt
        #
        if (!defined ($crypted) || crypt ($trappass, $crypted) ne $crypted) {
            syslog ('err', "received trap from unauthorized user $trapuser, host $traphost");
            return undef;
        }
    }

    #
    # protocol version
    #
    if ($trap{"pro"} < $TRAP_PRO_VERSION) {
        syslog ('err', "cannot handle traps from version less than $TRAP_PRO_VERSION");
        return undef;
    }

    #
    # validate trap type
    #
    if (!defined $trap{"typ"} || !defined ($trap{"spc"})) {
        syslog ('err', "no trap type specified from $fromip");
        return undef;
    }

    #
    # if mon receives a trap for an unknown group/service, then the
    # default/default group/service should catch these if it is defined
    #
    my $intended;
    if ((!defined ($groups{$trap{"grp"}}) && !defined $watch{$trap{"grp"}}->{$trap{"svc"}})
        && (defined($groups{'default'}) && defined($watch{'default'}->{'default'}))) {
        $intended = "$trap{'grp'}:$trap{'svc'}";
        $trap{"grp"} = "default";
        $trap{"svc"} = "default";
    }

    if (!defined ($groups{$trap{"grp"}})) {
        syslog ('err', "trap received for undefined group $trap{grp}");
        return;
    } elsif (!defined $watch{$trap{"grp"}}->{$trap{"svc"}}) {
        syslog ('err', "trap received for undefined service type $trap{grp}/$trap{svc}");
        return;
    }

    my $sref = \%{$watch{$trap{"grp"}}->{$trap{"svc"}}};

    $sref->{"_last_trap"} = $time;
    $sref->{"_last_detail"} = $trap{"dtl"};
    $sref->{"_last_summary"} = $trap{"sum"};

    if ($intended) {
        $sref->{"_intended"} = $intended;
    }

    # status before this trap; decides whether a failure is a new one
    my $old_status = $sref->{"_op_status"};

    syslog ('info', "trap $trap{typ} $trap{spc} from " .
        "$fromip for $trap{grp} $trap{svc}, status $trap{sta}");

    my $group = $trap{"grp"};
    my $service = $trap{"svc"};

    #
    # Not sure what I want to do with this. It's not done, and
    # just because it's here doesn't mean that it is meant to work
    # how it is coded.
    #
    if (1) {
        if ($trap{"spc"} == $STAT_COLDSTART) {
            set_op_status ($group, $service, $STAT_COLDSTART);
            $sref->{"_trap_duration_timer"} = $sref->{"trapduration"}
                if ($sref->{"trapduration"});

        } elsif ($trap{"spc"} == $STAT_WARMSTART) {
            set_op_status ($group, $service, $STAT_WARMSTART);
            $sref->{"_trap_duration_timer"} = $sref->{"trapduration"}
                if ($sref->{"trapduration"});
            $sref->{"_last_uptrap"} = $time;

        } elsif ($trap{"spc"} == $STAT_LINKDOWN) {
            set_op_status ($group, $service, $STAT_LINKDOWN);
            $sref->{"_failure_count"}++;
            #
            # test the status from before set_op_status, as
            # handle_trap_timeout does; $time replaces a variable
            # that was never assigned in this sub
            #
            $sref->{"_first_failure"} = $time
                if ($old_status != $STAT_FAIL);
            $sref->{"_trap_duration_timer"} = $sref->{"trapduration"}
                if ($sref->{"trapduration"});

        } elsif ($trap{"spc"} == $STAT_OK) {
            if ($CF{"DTLOGGING"}
                && defined ($sref->{"_op_status"})
                && $sref->{"_op_status"} == $STAT_FAIL) {
                write_dtlog ($sref, $group, $service);
            }
            set_op_status ($group, $service, $STAT_OK);
            $sref->{"_last_uptrap"} = $time;
            $sref->{"_trap_duration_timer"} = $sref->{"trapduration"}
                if ($sref->{"trapduration"});

        } elsif ($trap{"spc"} == $STAT_FAIL) {
            set_op_status ($group, $service, $STAT_FAIL);
            #
            # the old test ran after the status had been set to
            # FAIL, so _first_failure was never recorded here
            #
            $sref->{"_first_failure"} = $time
                if ($old_status != $STAT_FAIL);
            $sref->{"_trap_duration_timer"} = $sref->{"trapduration"}
                if ($sref->{"trapduration"});

        } elsif ($trap{"spc"} == $STAT_WARN) {
            set_op_status ($group, $service, $STAT_WARN);

#       } elsif ($trap{"spc"} == $STAT_HEARTBEAT) {
#           set_op_status ($group, $service, $STAT_OK);
#           $sref->{"_last_uptrap"} = $time;
#           $noalert++;

        } else {
            syslog ('err', "trap received from $fromip" .
                " for undefined type $trap{typ} $trap{spc} $trap{grp}");
            return;
        }
    }

    shift @last_failures if (@last_failures > $CF{"MAX_KEEP"});
    push @last_failures, "$trap{grp} $trap{svc}" .
        " $time $trap{typ} $trap{spc} $trap{sum}";

    if ($sref->{"depend"} ne "" && $sref->{"dep_behavior"} eq "a") {
        dep_ok ($sref);
    }

    #
    # if trap is FAIL, send an alert
    # if trap is OK send upalert
    #     upalert only gets sent if an upalert for this
    #     trap is actually defined, and if the
    #     upalertafter config is satisfied
    #
    $flags = 0;
    if ( $trap{"spc"} == $STAT_OK ) {
        $flags = $FL_UPALERT;
        if ( defined($sref->{"_upalert"}) ) {
            if ( $tmnow - $sref->{"_first_failure"} < $sref->{"upalertafter"}) {
                $noalert++;
            }
        } else {
            $noalert++;
        }
    }
    #### else just fall through and send alert

    do_alert (
        $trap{"grp"}, $trap{"svc"},
        $trap{"sum"} . $trap{"dtl"},
        $trap{"sta"},
        $FL_TRAP | $flags,
    ) unless ($noalert);

    if( defined($sref->{"_intended"}) ) {
        undef($sref->{"_intended"});
    }
}


#
# trap timeout
#
# Marks $group/$service as failed because no trap arrived in time,
# records the failure and sends a trap-timeout alert.
#
sub handle_trap_timeout {
    my ($group, $service) = @_;
    my ($tmnow);

    $tmnow = time;

    my $sref = \%{$watch{$group}->{$service}};
    $sref->{"_failure_count"}++;
    $sref->{"_last_failure"} = $tmnow;
    # must be tested before set_op_status changes the status
    $sref->{"_first_failure"} = $tmnow
        if ($sref->{"_op_status"} != $STAT_FAIL);
    set_op_status ($group, $service, $STAT_FAIL);
    $sref->{"_last_summary"} = "trap timeout";
    $sref->{"_last_detail"} = "";

    shift @last_failures if (@last_failures > $CF{"MAX_KEEP"});
    # $tmnow replaces a variable that was never assigned in this sub
    push @last_failures, "$group $service $tmnow $sref->{_last_summary}";
    syslog ('crit', "failure for $last_failures[-1]");

    do_alert ($group, $service, undef, undef, $FL_TRAPTIMEOUT);
}


#
# write to a socket
#
# Keeps calling syswrite until all of $buf is written, sleeping 0.1s
# and retrying on EAGAIN. Returns undef on any other write error.
#
sub sock_write {
    my ($sock, $buf) = @_;
    my ($nleft, $nwritten);

    $nleft = length ($buf);
    while ($nleft) {
        $nwritten = syswrite ($sock, $buf, $nleft);
        if (!defined ($nwritten)) {
            return undef if ($! != EAGAIN);
            usleep (100000);
            next;
        }
        $nleft -= $nwritten;
        substr ($buf, 0, $nwritten) = "";
    }
}


#
# do I/O processing for traps and client connections
#
# Waits in select for up to $SLEEPINT seconds in total, dispatching
# mon traps, SNMP traps, new client connections and client data, then
# closes clients which have been idle for $CF{CLIENT_TIMEOUT} seconds.
#
sub handle_io {

    #
    # build iovec for server connections, traps, and clients
    #
    $iovec = '';
    my $niovec = '';
    vec ($iovec, fileno (TRAPSERVER), 1) = 1;
    vec ($iovec, fileno (SERVER), 1) = 1;
    vec ($iovec, fileno (SNMPSERVER), 1) = 1 if ($CF{"SNMP"});
    foreach my $cl (keys %clients) {
        vec ($iovec, $cl, 1) = 1;
    }

    #
    # handle client I/O while there is some to handle
    #
    my $sleep = $SLEEPINT;
    my $tm0 = [gettimeofday];
    my $n;

    while ($n = select ($niovec = $iovec, undef, undef, $sleep)) {
        my $tm1 = [gettimeofday];

        if ($n < 0) {
            #
            # select failed: $niovec is meaningless. An interrupted
            # call is retried below, anything else ends the loop.
            # (select reports errors as -1, not undef, and $! is only
            # meaningful after a failure.)
            #
            if ($! != &EINTR) {
                syslog ('err', "select returned an error for I/O loop: $!");
                last;
            }

        } else {
            #
            # mon trap
            #
            if (vec ($niovec, fileno (TRAPSERVER), 1)) {
                my ($from, $trapbuf);
                if (!defined ($from = recv (TRAPSERVER, $trapbuf, 65536, 0))) {
                    syslog ('err', "error trying to recv a trap: $!");
                } else {
                    handle_trap ($trapbuf, $from);
                }
                next;

            #
            # SNMP trap
            #
            } elsif ($CF{"SNMP"} && vec ($niovec, fileno (SNMPSERVER), 1)) {
                my ($from, $trapbuf);
                if (!defined ($from = recv (SNMPSERVER, $trapbuf, 65536, 0))) {
                    syslog ('err', "error trying to recv an SNMP trap: $!");
                } else {
                    handle_snmp_trap ($trapbuf, $from);
                }
                next;

            #
            # client connections
            #
            } elsif (vec ($niovec, fileno (SERVER), 1)) {
                client_accept ();
            }

            #
            # read data from clients if any exists
            #
            if ($numclients) {
                foreach my $cl (keys %clients) {
                    next if (!vec ($niovec, $cl, 1));

                    my $buf = '';
                    my $nread = sysread ($clients{$cl}->{"fhandle"}, $buf, 8192);

                    if (!defined ($nread)) {
                        #
                        # undef is an error; only "try again"
                        # conditions leave the client open
                        #
                        next if ($! == &EAGAIN || $! == &EINTR);
                        client_close ($cl, "read error: $!");
                    } elsif ($nread == 0) {
                        # end of file: the client went away
                        client_close ($cl);
                    } else {
                        $clients{$cl}->{"buf"} .= $buf;
                        $clients{$cl}->{"timeout"} = $CF{"CLIENT_TIMEOUT"};
                        $clients{$cl}->{"last_read"} = time;
                    }
                }
            }
        }

        #
        # execute client commands which have been read
        #
        client_dopending () if ($numclients);

        last if (tv_interval ($tm0, $tm1) >= $SLEEPINT);

        $sleep = $SLEEPINT - tv_interval ($tm0, $tm1);
    }

    #
    # count down client inactivity timeouts and close expired connections
    #
    if ($numclients) {
        foreach my $cl (keys %clients) {
            my $timenow = time;
            $clients{$cl}->{"timeout"} = $timenow - $clients{$cl}->{"last_read"};
            if ($clients{$cl}->{"timeout"} >= $CF{"CLIENT_TIMEOUT"}) {
                client_close ($cl, "timeout after $CF{CLIENT_TIMEOUT}s");
            }
        }
    }
}


#
# generate alert and monitor path hashes
#
# Fills %MONITORHASH and %ALERTHASH with the full path of every monitor
# and alert named in %watch, using the first usable directory of
# $CF{SCRIPTDIR} / $CF{ALERTDIR} which holds an executable of that name.
#
sub gen_scriptdir_hash {
    my ($d, @scriptdirs, @alertdirs, $found);

    %MONITORHASH = ();
    %ALERTHASH = ();

    foreach $d (split (/\s*:\s*/, $CF{"SCRIPTDIR"})) {
        if (-d "$d" && -x "$d") {
            push (@scriptdirs, $d);
        } else {
            syslog ('err', "scriptdir $d is not usable");
        }
    }

    foreach $d (split (/\s*:\s*/, $CF{"ALERTDIR"})) {
        if (-d $d && -x $d) {
            push (@alertdirs, $d);
        } else {
            syslog ('err', "alertdir $d is not usable");
        }
    }

    #
    # monitors
    #
    foreach my $group (keys %watch) {
        foreach my $service (keys %{$watch{$group}}) {
            next if (!defined $watch{$group}->{$service}->{"monitor"});
            # first word of the monitor line is the program name
            my $monitor = (split (/\s+/, $watch{$group}->{$service}->{"monitor"}))[0];
            $found = 0;
            foreach (@scriptdirs) {
                if (-x "$_/$monitor") {
                    $MONITORHASH{$monitor} = "$_/$monitor"
                        unless (defined $MONITORHASH{$monitor});
                    $found++;
                    last;
                }
            }
            if (!$found) {
                syslog ('err', "$monitor not found in one of (\@scriptdirs[@scriptdirs])");
            }
        }
    }

    #
    # alerts
    #
    foreach my $group (keys %watch) {
        foreach my $service (keys %{$watch{$group}}) {
            foreach my $period (keys %{$watch{$group}->{$service}->{"periods"}}) {
                my $pref = $watch{$group}->{$service}->{"periods"}->{$period};
                foreach my $my_alert (
                    @{$pref->{"alerts"}},
                    @{$pref->{"upalerts"}},
                    @{$pref->{"startupalerts"}},
                ) {
                    # strip leading "var=value " words and trailing args
                    my $alert = $my_alert;
                    $alert =~ s/^(\S+=\S+ )*(\S+).*$/$2/;
                    $found = 0;
                    foreach (@alertdirs) {
                        if (-x "$_/$alert") {
                            $ALERTHASH{$alert} = "$_/$alert"
                                unless (defined $ALERTHASH{$alert});
                            $found++;
                            last;
                        }
                    }
                    if (!$found) {
                        syslog ('err', "$alert not found in one of (\@alertdirs[@alertdirs])");
                    }
                }
            }
        }
    }
}


#
# do some processing on dirs
#
# Makes the paths in %CF absolute (relative to BASEDIR, CFBASEDIR or
# LOGDIR as appropriate) and logs those which do not exist.
#
sub normalize_paths {
    my ($authtype, @authtypes);

    #
    # do some sanity checks on dirs
    #
    $CF{"STATEDIR"} = "$CF{BASEDIR}/$CF{STATEDIR}" if ($CF{"STATEDIR"} !~ m{^/});
    syslog ('err', "$CF{STATEDIR} does not exist") if (! -d $CF{"STATEDIR"});

    $CF{"LOGDIR"} = "$CF{BASEDIR}/$CF{LOGDIR}" if ($CF{"LOGDIR"} !~ m{^/});
    syslog ('err', "$CF{LOGDIR} does not exist") if (! -d $CF{LOGDIR});

    $CF{"AUTHFILE"} = "$CF{CFBASEDIR}/$CF{AUTHFILE}" if ($CF{"AUTHFILE"} !~ m{^/});
    syslog ('err', "$CF{AUTHFILE} does not exist") if (! -f $CF{"AUTHFILE"});

    $CF{"OCFILE"} = "$CF{CFBASEDIR}/$CF{OCFILE}" if ($CF{"OCFILE"} !~ m{^/});

    @authtypes = split(' ' , $CF{"AUTHTYPE"}) ;
    foreach $authtype (@authtypes) {
        if ($authtype eq "userfile") {
            $CF{"USERFILE"} = "$CF{CFBASEDIR}/$CF{USERFILE}"
                if ($CF{"USERFILE"} !~ m{^/});
            syslog ('err', "$CF{USERFILE} does not exist")
                if (! -f $CF{"USERFILE"});
        }
    }

    $CF{"DTLOGFILE"} = "$CF{LOGDIR}/$CF{DTLOGFILE}" if ($CF{"DTLOGFILE"} !~ m{^/});

    if ($CF{"HISTORICFILE"} ne "") {
        $CF{"HISTORICFILE"} = "$CF{LOGDIR}/$CF{HISTORICFILE}"
            if ($CF{"HISTORICFILE"} !~ m{^/});
    }

    #
    # script and alert dirs may have multiple paths
    #
    foreach my $dir (\$CF{"SCRIPTDIR"}, \$CF{"ALERTDIR"}) {
        my @n;
        foreach my $d (split (/\s*:\s*/, $$dir)) {
            $d =~ s{/$}{};
            $d = "$CF{BASEDIR}/$d" if ($d !~ m{^/});
            syslog ('err', "$d does not exist, check your alertdir and mondir paths")
                unless (-d $d);
            push @n, $d;
        }
        $$dir = join (":", @n);
    }
}


#
# set opstatus and save old status
#
sub set_op_status {
    my ($group, $service, $status) = @_;

    $watch{$group}->{$service}->{"_last_op_status"} =
        $watch{$group}->{$service}->{"_op_status"};
    $watch{$group}->{$service}->{"_op_status"} = $status;
}


#
# dump the path configuration and the monitor/alert lookup tables
# to STDERR
#
sub debug_dir {
    print STDERR <<EOF;
basedir     [$CF{BASEDIR}]
cfbasedir   [$CF{CFBASEDIR}]
cf          [$CF{CF}]
statedir    [$CF{STATEDIR}]
logdir      [$CF{LOGDIR}]
authfile    [$CF{AUTHFILE}]
ocfile      [$CF{OCFILE}]
userfile    [$CF{USERFILE}]
dtlogfile   [$CF{DTLOGFILE}]
historicfile[$CF{HISTORICFILE}]
monerrfile  [$CF{MONERRFILE}]
scriptdir   [$CF{SCRIPTDIR}]
alertdir    [$CF{ALERTDIR}]
EOF

    foreach my $m (keys %MONITORHASH) {
        print STDERR "M $m=[$MONITORHASH{$m}]\n";
    }
    foreach my $m (keys %ALERTHASH) {
        print STDERR "A $m=[$ALERTHASH{$m}]\n";
    }
}


#
# globals affected by config file are
# all stored in %CF
#
sub init_cf_globals {
    $CF{"BASEDIR"} = $opt{"b"} || "/usr/lib/mon";
    $CF{"BASEDIR"} =~ s{/$}{};
    $CF{"CFBASEDIR"} = $opt{"B"} || "/etc/mon";
    $CF{"CF"} = $opt{"c"} || "$CF{CFBASEDIR}/mon.cf";
    $CF{"CF"} = "$PWD/$CF{CF}" if ($CF{"CF"} !~ /^\//);
    $CF{"SCRIPTDIR"} = "/usr/local/lib/mon/mon.d:mon.d";
    $CF{"ALERTDIR"} = "/usr/local/lib/mon/alert.d:alert.d";
    $CF{"LOGDIR"} = $opt{"L"} || (-d "/var/log/mon" ? "/var/log/mon" : "log.d");
    $CF{"STATEDIR"} = -d "/var/state/mon" ? "/var/state/mon"
                    : -d "/var/lib/mon" ? "/var/lib/mon"
                    : "state.d";
    $CF{"AUTHFILE"} = "auth.cf";
    $CF{"AUTHTYPE"} = "getpwnam";
    $CF{"PAMSERVICE"} = "passwd";
    $CF{"USERFILE"} = "monusers.cf";
    $CF{"OCFILE"} = "oncall.cf";
    $CF{"PIDFILE"} = (-d "/var/run/mon" ? "/var/run/mon"
                    : -d "/var/run" ? "/var/run"
                    : "/etc") . "/mon.pid";
    $CF{"MONERRFILE"} = "/dev/null";
    $CF{"DTLOGFILE"} = "downtime.log";
    $CF{"DTLOGGING"} = 0;
    $CF{"MAX_KEEP"} = 100;
    $CF{"CLIENT_TIMEOUT"} = 30;
    $CF{"SERVPORT"} = getservbyname ("mon", "tcp") || 2583;
    $CF{"TRAPPORT"} = getservbyname ("mon", "udp") || 2583;
    $CF{"MAXPROCS"} = 0;
    $CF{"SNMP"} = 0;
    $CF{"SNMPPORT"} = 34000;
    $CF{"HISTORICFILE"} = "";
    $CF{"HISTORICTIME"} = 0;
    $CF{"DEP_RECUR_LIMIT"} = 10;
    $CF{"SYSLOG_FACILITY"} = "daemon";
    $CF{"STARTUPALERTS_ON_RESET"} = 0;
}


#
# globals not affected by config file
#
sub init_globals {
    $TRAP_PRO_VERSION = 0.3807;
    $SLEEPINT = 1;
    $STOPPED = 0;
    $STOPPED_TIME = 0;
    $START_TIME = time;
    $PROT_VERSION = 0x2611;
    $HOSTNAME = hostname;
    $PWD = getcwd;

    #
    # flags
    #
    $FL_MONITOR = 1;
    $FL_UPALERT = 2;
    $FL_TRAP = 4;
    $FL_TRAPTIMEOUT = 8;
    $FL_STARTUPALERT = 16;
    $FL_TEST = 32;

    #
    # specific trap types
    #
    ($TRAP_COLDSTART, $TRAP_WARMSTART, $TRAP_LINKDOWN, $TRAP_LINKUP,
        $TRAP_AUTHFAIL, $TRAP_EGPNEIGHBORLOSS, $TRAP_ENTERPRISE,
        $TRAP_HEARTBEAT) = (0..7);

    #
    # operational statuses
    #
    ($STAT_FAIL, $STAT_OK, $STAT_COLDSTART, $STAT_WARMSTART, $STAT_LINKDOWN,
        $STAT_UNKNOWN, $STAT_TIMEOUT, $STAT_UNTESTED, $STAT_DEPEND,
        $STAT_WARN) = (0..9);

    %FAILURE = (
        $STAT_FAIL => 1,
        $STAT_LINKDOWN => 1,
        $STAT_TIMEOUT => 1,
    );

    %SUCCESS = (
        $STAT_OK => 1,
        $STAT_COLDSTART => 1,
        $STAT_WARMSTART => 1,
        $STAT_UNKNOWN => 1,
        $STAT_UNTESTED => 1,
    );

    %WARNING = (
        $STAT_COLDSTART => 1,
        $STAT_WARMSTART => 1,
        $STAT_UNKNOWN => 1,
        $STAT_WARN => 1,
    );

    %OPSTAT = ("fail" => $STAT_FAIL, "ok" => $STAT_OK,
        "coldstart" => $STAT_COLDSTART, "warmstart" => $STAT_WARMSTART,
        "linkdown" => $STAT_LINKDOWN, "unknown" => $STAT_UNKNOWN,
        "timeout" => $STAT_TIMEOUT, "untested" => $STAT_UNTESTED);

    #
    # fast lookup hashes for alerts and monitors
    #
    %MONITORHASH = ();
    %ALERTHASH = ();

    $TRAP_PDU = new Mon::SNMP;
}


#
# clear timers
#
# Resets the trap, trap-duration and interval timers of a service to
# their configured values and zeroes the per-period alert bookkeeping.
# Returns undef if the service is not defined.
#
sub clear_timers {
    my ($group, $service) = @_;

    return undef if (!defined $watch{$group}->{$service});

    my $sref = \%{$watch{$group}->{$service}};

    $sref->{"_trap_timer"} = $sref->{"traptimeout"}
        if ($sref->{"traptimeout"});

    $sref->{"_trap_duration_timer"} = $sref->{"trapduration"}
        if ($sref->{"trapduration"});

    $sref->{"_timer"} = $sref->{"interval"}
        if ($sref->{"interval"});

    foreach my $period (keys %{$sref->{"periods"}}) {
        my $pref = \%{$sref->{"periods"}->{$period}};

        $pref->{"_last_alert"} = 0
            if ($pref->{"alertevery"});

        $pref->{"_consec_failures"} = 0
            if ($pref->{"alertafter_consec"});

        $pref->{'_1stfailtime'} = 0
            if ($pref->{"alertafterival"});
    }
}


#
# load some amount of the alert history into memory
#
# Reads $CF{HISTORICFILE} into @last_alerts, skipping entries older
# than $CF{HISTORICTIME} seconds (0 keeps everything), then keeps only
# the newest $CF{MAX_KEEP} lines.
#
sub readhistoricfile {
    return if ($CF{"HISTORICFILE"} eq "");

    if (!open (HISTFILE, $CF{"HISTORICFILE"})) {
        syslog ('err', "Could not read history from $CF{HISTORICFILE} : $!");
        return;
    }

    my $epochLimit = 0;
    if ($CF{"HISTORICTIME"} != 0) {
        $epochLimit = time - $CF{"HISTORICTIME"};
    }

    @last_alerts = ();

    while (<HISTFILE>) {
        next if (/^\s*$/ || /^\s*#/);
        chomp;
        # the 4th field of a history line is the alert time
        my $epochAlert = (split(/\s+/))[3];
        push (@last_alerts, $_) if ($epochAlert >= $epochLimit);
    }

    close (HISTFILE);

    if (defined $CF{"MAX_KEEP"}) {
        splice(@last_alerts, 0, $#last_alerts + 1 - $CF{"MAX_KEEP"});
    }
}


#
# This routine simply calls an alert.
#
# call with %args = (
#       group           => "name of group",
#       service         => "name of service",
#       pref            => "optional period reference",
#       alert           => "alert script",
#       args            => "args to alert script",
#       flags           => "flags, as in $FL_*",
#       retval          => "return value of monitor",
#       output          => "output of monitor",
# )
#
# Returns undef if a mandatory argument is missing, the alert script
# is unknown or the fork fails, 1 once the alert has been run.
#
sub call_alert {
    my (%args) = @_;

    foreach my $mandatory_arg (qw(group service flags retval alert output)) {
        return (undef) if (!defined $args{$mandatory_arg});
    }

    # disabled hosts (prefixed with "*") are not passed to the alert
    my @groupargs = grep (!/^\*/, @{$groups{$args{"group"}}});

    my $tmnow = time;
    my ($summary) = split("\n", $args{"output"});
    $summary = "(NO SUMMARY)" if ($summary =~ /^\s*$/m);

    my $sref = \%{$watch{$args{"group"}}->{$args{"service"}}};
    my $pref;

    if (defined $args{"pref"}) {
        $pref = $args{"pref"};
    }

    my $alert = "";
    if (!defined $ALERTHASH{$args{"alert"}} ||
            ! -f $ALERTHASH{$args{"alert"}}) {
        syslog ('err', "no alert found while trying to run $args{alert}");
        return undef;
    } else {
        $alert = $ALERTHASH{$args{"alert"}};
    }

    my $alerttype = "";         # sent to syslog and stored in @last_alerts
    my $alert_type = "failure"; # MON_ALERTTYPE set to this

    if ($args{"flags"} & $FL_UPALERT) {
        $alerttype = "upalert";
        $alert_type = "up";
    } elsif ($args{"flags"} & $FL_STARTUPALERT) {
        $alerttype = "startupalert";
        $alert_type = "startup";
    } elsif ($args{"flags"} & $FL_TRAPTIMEOUT) {
        $alerttype = "traptimeoutalert";
        $alert_type = "traptimeout";
    } elsif ($args{"flags"} & $FL_TRAP) {
        $alerttype = "trapalert";
        $alert_type = "trap";
    } elsif ($args{"flags"} & $FL_TEST) {
        $alerttype = "testalert";
        $alert_type = "test";
    } else {
        $alerttype = "alert";
    }

    #
    # log why we are triggering an alert
    #
    my $alert_name = $alert;
    $alert_name =~ s{^.*/([^/]+)$}{$1};
    syslog ("alert", "calling $alerttype $alert_name for" .
        " $args{group}/$args{service} ($alert,$args{args}) $summary");

    # forking open: the child's STDIN is the read end of ALERT
    my $pid = open (ALERT, "|-");
    if (!defined $pid) {
        syslog ('err', "could not fork: $!");
        return undef;
    }

    #
    # child, the actual alert
    #
    if ($pid == 0) {
        #
        # set env variables to pass to the alert
        #
        foreach my $v (keys %{$sref->{"ENV"}}) {
            $ENV{$v} = $sref->{"ENV"}->{$v};
        }

        $ENV{"MON_LAST_SUMMARY"} = $sref->{"_last_summary"};
        $ENV{"MON_LAST_OUTPUT"} = $sref->{"_last_output"};
        $ENV{"MON_LAST_FAILURE"} = $sref->{"_last_failure"};
        $ENV{"MON_FIRST_FAILURE"} = $sref->{"_first_failure"};
        $ENV{"MON_LAST_SUCCESS"} = $sref->{"_last_success"};
        $ENV{"MON_DESCRIPTION"} = $sref->{"description"};
        $ENV{"MON_GROUP"} = $args{"group"};
        $ENV{"MON_SERVICE"} = $args{"service"};
        $ENV{"MON_RETVAL"} = $args{"retval"};
        $ENV{"MON_OPSTATUS"} = $sref->{"_op_status"};
        $ENV{"MON_ALERTTYPE"} = $alert_type;
        $ENV{"MON_STATEDIR"} = $CF{"STATEDIR"};
        $ENV{"MON_LOGDIR"} = $CF{"LOGDIR"};

        if( defined($sref->{"_intended"}) ) {
            $ENV{"MON_TRAP_INTENDED"} = $sref->{"_intended"};
        } else {
            undef ($ENV{"MON_TRAP_INTENDED"}) if (defined($ENV{"MON_TRAP_INTENDED"}));
        }

        # later flags win: -O over -T over -u
        my $t;
        $t = "-u" if ($args{"flags"} & $FL_UPALERT);
        $t = "-T" if ($args{"flags"} & $FL_TRAP);
        $t = "-O" if ($args{"flags"} & $FL_TRAPTIMEOUT);

        my @execargs = (
            $alert,
            "-s", "$args{service}",
            "-g", "$args{group}",
            "-h", "@groupargs",
            "-t", "$tmnow",
        );

        if ($t) {
            push @execargs, $t;
        }

        if ($args{"args"} ne "") {
            push @execargs, quotewords('\s+',0,$args{"args"});
        }

        if (!exec @execargs) {
            syslog ('err', "could not exec alert $alert: $!");
            #
            # this is still the forked child: it must terminate here.
            # Returning would leave a second copy of the daemon
            # running the caller's code.
            #
            exit (1);
        }
        exit;
    }

    #
    # this will block if the alert is sucking gas
    #
    print ALERT $args{"output"};
    close (ALERT);
    waitpid $pid, 0;

    #
    # test alerts don't count
    #
    return (1) if ($args{"flags"} & $FL_TEST);

    #
    # tally this alert
    #
    if (defined $args{"pref"}) {
        $pref->{"_last_alert"} = $tmnow;
    }

    $sref->{"_alert_count"}++;

    #
    # store this in the log
    #
    shift @last_alerts if (@last_alerts > $CF{"MAX_KEEP"});

    my $alertline = "$alerttype $args{group} $args{service}" .
        " $tmnow $alert ($args{args}) $summary";
    push @last_alerts, $alertline;

    #
    # append to alert history file
    #
    if ($CF{"HISTORICFILE"} ne "") {
        if (!open (HISTFILE, ">>$CF{HISTORICFILE}")) {
            syslog ('err', "Could not append alert history to $CF{HISTORICFILE}: $!");
        } else {
            print HISTFILE $alertline, "\n";
            close (HISTFILE);
        }
    }

    return 1;
}


#
# recursively evaluate a dependency expression
# substitutes "GROUP:SERVICE" with "1" or "0" if the service is pass/fail, resp.
#
# returns an anonymous hash reference
#
# {
#     status =>,        # "D" recursion depth exceeded
#                       # "O" everything is OK
#                       # "E" eval error
#     depend =>,        # 1 for success (no deps in a failure state)
#                       # 0 if any deps failed
#     error =>,         # the textual error associated with "D" or "E" status
# }
#
sub depend {
    my ($depend, $depth) = @_;

    debug (1, "checking DEP [$depend]\n");

    if ($depth > $CF{"DEP_RECUR_LIMIT"}) {
        return {
            status => "D",
            depend => undef,
            error => "recursion too deep for ($depend)",
        };
    }

    foreach my $depstr ($depend =~ /[a-zA-Z0-9_.-]+:[a-zA-Z0-9_.-]+/g) {
        my ($group ,$service) = split(':', $depstr);
        my $sref = \%{$watch{$group}->{$service}};
        my $depval = undef;

        #
        # disabled watches and services are counted as "passing"
        #
        if ($watch_disabled{$group} || $sref->{"disable"} == 1) {
            $depval = 1;

        #
        # root dependency found
        #
        } elsif ($sref->{"depend"} eq "") {
            debug (1, " found root dep $group,$service\n");
            $depval = $SUCCESS{$sref->{"_op_status"}};

        #
        # not a root dep, recurse
        #
        } else {
            #
            # do it recursively
            #
            my $dstatus = depend ($sref->{"depend"}, $depth + 1);
            debug (1, "recur depth $depth returned $dstatus->{status},$dstatus->{depend}\n");

            #
            # a bad thing happened, bail out
            #
            if ($dstatus->{"status"} ne "O") {
                debug (1, "recursive dep failure for $group,$service (status=$dstatus->{status})\n");
                return $dstatus;
            }

            #
            # judge this service through %SUCCESS, as the root case
            # does. The raw status number is true for failure states
            # such as linkdown and timeout.
            #
            $depval = $dstatus->{"depend"} && $SUCCESS{$sref->{"_op_status"}};
        }

        my $v = int ($depval);
        debug (1, " ($group,$service) $depth depend=[$v][$depend]");
        # \Q..\E: "." in a group or service name must match literally
        $depend =~ s/\b\Q$depstr\E\b/$v/g;
        debug (1, " depend=[$depend]\n");
    }

    debug (1, " before eval: [$depend]");
    my $e = eval("$DEP_EVAL_SANDBOX $depend");
    debug (1, " after eval: [$e]\n");

    if ($@ eq "") {
        return {
            status => "O",
            depend => $e,
        };
    } else {
        return {
            status => "E",
            depend => $e,
            error => $@,
        };
    }
}


#
# returns undef on error
#         0 if dependency failure, sets _depend_status to 0
#         1 if dependencies are OK, sets _depend_status to 1
#
sub dep_ok {
    my $sref = shift;

    my $s = depend ($sref->{"depend"}, 0);

    if ($s->{"status"} eq "D") {
        debug (1, "dep recursion too deep\n");
        return undef;

    } elsif ($s->{"status"} eq "E") {
        syslog ("notice", "eval error for dependency starting at $sref->{depend}");
        return undef;

    } elsif ($s->{"status"} eq "O" && !$s->{"depend"}) {
        $sref->{"_depend_status"} = 0;
        return 0;
    }

    $sref->{"_depend_status"} = 1;
    return 1;
}


#
# convert a string to a hex-escaped string, returning
# the escaped string.
#
# $str is the string to be escaped
# if $inquotes is true, backslashes are doubled, making
#    the escaped string suitable to be enclosed in
#    single quotes and later passed to Text::quotewords.
#    For example,   var='quoted value'
#
sub esc_str {
    my $str = shift;
    my $inquotes = shift;

    my $escstr = "";

    for (my $i = 0; $i < length ($str); $i++) {
        my $c = substr ($str, $i, 1);

        # non-printable characters and both quote characters
        if (ord ($c) < 32 ||
            ord ($c) > 126 ||
            $c eq "\"" ||
            $c eq "\'") {
            $c = sprintf ("\\%02x", ord($c));
        } elsif ($inquotes && $c eq "\\") {
            $c = "\\\\";
        }

        $escstr .= $c;
    }

    $escstr;
}


#
# convert a hex-escaped string into an unescaped string,
# returning the unescaped string
#
sub un_esc_str {
    my $str = shift;

    $str =~ s{\\([0-9a-f]{2})}{chr(hex($1))}eg;

    $str;
}


#
# log $msg to syslog at "err" priority, then die with it
#
sub syslog_die {
    my $msg = shift;

    syslog ("err", $msg);
    die "$msg\n";
}


#
# Have a "conversation" with a PAM authentication module. This fools the
# PAM module into authenticating us non-interactively.
#
# PAM conversation callback. It receives (code, message) pairs and
# answers each one with a (PAM_SUCCESS, answer) pair: the user name for
# an echo-on prompt, the password for an echo-off prompt and an empty
# string for anything else. A final PAM_SUCCESS ends the reply list.
# The credentials are read from $PAM_username and $PAM_password.
#
sub pam_conv_func {
    my @reply;

    while (@_) {
        my $code = shift;
        my $msg = shift;        # the prompt text itself is not used

        my $answer = "";
        if ($code == Authen::PAM::PAM_PROMPT_ECHO_ON()) {
            $answer = $PAM_username;
        }
        if ($code == Authen::PAM::PAM_PROMPT_ECHO_OFF()) {
            $answer = $PAM_password;
        }

        push @reply, Authen::PAM::PAM_SUCCESS(), $answer;
    }

    push @reply, Authen::PAM::PAM_SUCCESS();
    return @reply;
}


#
# append a downtime record for $group/$service to $CF{DTLOGFILE}
#
# A record holds: time now, group, service, time of first failure,
# seconds since the first failure, the service interval and the last
# summary. A first-failure time of 0 is replaced by $START_TIME.
# $CF{DTLOGGING} is set to 0 if the log cannot be opened, 1 otherwise.
#
sub write_dtlog {
    my ($sref, $group, $service) = @_;

    my $tmnow = time;

    if ($sref->{"_first_failure"} == 0) {
        $sref->{"_first_failure"} = $START_TIME;
    }

    if (open (DTLOG, ">>$CF{DTLOGFILE}")) {
        $CF{"DTLOGGING"} = 1;

        my $first_failure = 0 + $sref->{"_first_failure"};
        my $downtime = 0 + $tmnow - $sref->{"_first_failure"};

        my $record = join ("",
            $tmnow,
            " $group",
            " $service",
            " ", $first_failure,
            " ", $downtime,
            " $sref->{'interval'}",
            " $sref->{'_last_summary'}\n");

        print DTLOG $record
            or syslog ('err', "error writing to $CF{DTLOGFILE}: $!");
        close(DTLOG);
    } else {
        syslog ('err', "could not append to $CF{DTLOGFILE}: $!");
        $CF{"DTLOGGING"} = 0;
    }
}