#!/usr/bin/perl
#
# mon - schedules service tests and triggers alerts upon failures
#
# Jim Trocki, trockij@transmeta.com
#
# $Id: mon,v 1.110 1999/06/17 18:06:45 trockij Exp $
#
# Copyright (C) 1998 Jim Trocki
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#
$RCSID='$Id: mon,v 1.110 1999/06/17 18:06:45 trockij Exp $';
$AUTHOR='trockij@transmeta.com';
use Getopt::Std;
use Text::ParseWords;
use POSIX;
use Fcntl;
use Socket;
use Time::Period;
use Sys::Hostname;
use Sys::Syslog;
use Time::HiRes qw(gettimeofday tv_interval usleep);
use Mon::SNMP;
#use SNMP in read_cf()

sub init_globals;
sub init_cf_globals;
sub normalize_paths;
sub gen_scriptdir_hash;
sub client_write_opstatus;
sub clear_timers;
sub set_op_status;
sub readhistoricfile;
sub call_alert;

#
# parse the command line into the $opt_* globals.  A letter followed by
# ":" takes a value, a bare letter is a boolean switch.  "k:" is listed
# because the startup code copies $opt_k into $CF{"MAX_KEEP"}; without it
# Getopt::Std rejects -k as an unknown option.
#
getopts ("fhlMSva:A:b:B:c:d:D:i:k:L:m:o:p:P:r:s:");

#
# these two things can be taken care of without
# initializing things further
#
if ($opt_v) {
    # -v: print the RCS identification string and stop
    print "$RCSID\n";
    exit;
}

if ($opt_h) {
    # -h: print the usage text and stop
    &usage();
    exit;
}

#
# definitions
#
# refuse to start if -b names a base directory that does not exist
die "basedir $opt_b does not exist\n" if ($opt_b && ! -d $opt_b);

init_globals();
init_cf_globals();

# $CF{"CF"} is the config file path (presumably filled in by
# init_cf_globals, which is not visible here); it must be a plain file
die "config file $CF{CF} does not exist\n" if (! -f $CF{"CF"});

# on Linux, select the 'unix' log socket before opening the log
$OS eq "Linux" && Sys::Syslog::setlogsock ('unix');
openlog ("mon", "cons,pid", "daemon");

#
# read config file
#
# read_cf returns 1 after parsing; it dies by itself when the file
# cannot be opened or contains a syntax error
#
&read_cf ($CF{"CF"}) ||
    &die_die ("err", "could not open cf file: $CF{CF}: $!");

#
# cmdline args override config file
#
# note that these are truth tests, so an option given as "0" or ""
# does not override the value from the config file
#
$CF{"ALERTDIR"}  = $opt_a if ($opt_a);
$CF{"BASEDIR"}   = $opt_b if ($opt_b);
$CF{"AUTHFILE"}  = $opt_A if ($opt_A);
$CF{"LOGDIR"}    = $opt_L if ($opt_L);
$CF{"STATEDIR"}  = $opt_D if ($opt_D);
$CF{"SCRIPTDIR"} = $opt_s if ($opt_s);
$CF{"OCFILE"}    = $opt_o if ($opt_o);

$CF{"PIDFILE"}   = $opt_P if defined($opt_P);	# allow empty pidfile
$CF{"MAX_KEEP"}  = $opt_k if ($opt_k);
$CF{"MAXPROCS"}  = $opt_m if ($opt_m);
$CF{"SERVPORT"}  = $opt_p if ($opt_p);

$SLEEPINT  = $opt_i if ($opt_i);

# -r takes a time string such as "20m"; dhmstos converts it to seconds
# and returns undef for anything it cannot parse
if ($opt_r) {
    die "bad randstart value\n" if (!defined(&dhmstos($opt_r)));
    $CF{"RANDSTART"} = &dhmstos($opt_r);
}

# -S: start with the scheduler stopped (the main loop skips all timer
# handling while $STOPPED is set) and remember when that happened
if ($opt_S) {
    $STOPPED = 1;
    $STOPPED_TIME = time;
}


#
# do some path cleanups and
# build lookup tables for alerts and monitors
#
normalize_paths();
gen_scriptdir_hash();

# -d: call debug_dir (defined elsewhere in this file)
if ($opt_d) {
    &debug_dir();
}

#
# load the auth control, oncall, bind, and listen
#
# (the oncall loader is commented out, so %oncall just starts empty)
#
&load_auth(1);
%oncall = ();
#&load_oncall(1);

#
# init client interface
#   %clients is an I/O structure, indexed by the fd of the client
#   $numclients is the number of clients currently connected
#   $clientcount is used in &client_accept
#   $iovec is fd_set for clients and traps
#
%clients = ();
$numclients = 0;
$clientcount = 0;
$iovec = '';
&setup_server();

#
# fork and become a daemon
#
# &daemon() is only called when -f was given.  The pid file is written
# after that call, so $$ is the pid of whatever process continues from
# here.  If the pid file cannot be opened nothing is written and no
# error is reported.
#
&open_dtlog() if ($CF{"DTLOGGING"});
&daemon() if ($opt_f);
if ($CF{"PIDFILE"} ne '' && open PID, ">$CF{PIDFILE}") {
    print PID "$$\n";
    close PID;
}
# arm every service timer with its configured interval
&set_last_test ();

#
# randomize startup checks if asked to
#
&randomize_startdelay() if ($CF{"RANDSTART"});

@last_alerts = ();
readhistoricfile ();

@last_failures = ();
$procs = 0;				# number of outstanding procs
$i=0;					# loop iteration counter
$lasttm=time;				# the last time(2) the mon loop started
$fdset_rbits = $fdset_ebits = '';
%watch_disabled = ();

# HUP re-reads the configuration, INT and TERM clean up and exit;
# PIPE is ignored
$SIG{HUP} = \&reset;
$SIG{INT} = \&handle_sigterm;		# for interactive debugging
$SIG{TERM} = \&handle_sigterm;
$SIG{PIPE} = 'IGNORE';

#
# load previously saved state
#
# only done when -l was given
#
&load_state ("disabled") if ($opt_l);

syslog ('info', "mon server started");

#
# startup alerts
#
&do_startup_alerts();

#
# main monitoring loop
#
# Each pass: advance the per-service timers (unless the scheduler is
# stopped), collect monitor output, clean up exited monitors, then
# handle client/trap I/O.  The loop contains no "last", so the
# clean_up/exit statements that follow it are not reached from here.
#
for (;;) {
&debug (1, "$i" . ($STOPPED ? " (stopped)" : "") . "\n");
    $i++;
    $tm = time;

    #
    # step through the watch groups, decrementing and
    # handling expired timers
    #
    if (!$STOPPED) {
	foreach my $group (keys %watch) {
	    #
	    # skip over disabled watch
	    #
	    next if ($watch_disabled{$group} == 1);

	    foreach my $service (keys %{$watch{$group}}) {

		my $sref = \%{$watch{$group}->{$service}};

		# seconds since the previous pass; never less than 1 so
		# that every pass moves the timers forward
		my $t = $tm - $lasttm;
		$t = 1 if ($t <= 0);

		#
		# trap timer
		#
		# fires when the countdown has run out and more than
		# "traptimeout" seconds have passed since _last_uptrap;
		# the countdown is re-armed before the handler is called
		#
		if ($sref->{"traptimeout"}) {
		    $sref->{"_trap_timer"} -= $t;

		    if ($sref->{"_trap_timer"} <= 0 && $tm - $sref->{"_last_uptrap"} >
				$sref->{"traptimeout"}) {
			$sref->{"_trap_timer"} = $sref->{"traptimeout"};
			&handle_trap_timeout ($group, $service);
		    }
		}

		#
		# trap duration timer
		#
		# when it runs out the service is set back to $STAT_OK and
		# the timer is undefined, which disables this branch
		#
		if (defined ($sref->{"_trap_duration_timer"})) {
		    $sref->{"_trap_duration_timer"} -= $t;

		    if ($sref->{"_trap_duration_timer"} <= 0) {
		    	set_op_status ($group, $service, $STAT_OK);
			undef $sref->{"_trap_duration_timer"};
		    }
		}

		#
		# polling monitor timer
		#
		# a monitor is due when the service has an interval, its
		# timer has reached zero and no monitor for it is running
		#
		if ($sref->{"interval"} && $sref->{"_timer"} <= 0 &&
			!$running{"$group/$service"}) {

		    # a false MAXPROCS means "no limit"; when throttled
		    # the timer is left untouched here, so the service is
		    # considered again on the next pass
		    if (($CF{"MAXPROCS"} && $procs < $CF{"MAXPROCS"}) || !$CF{"MAXPROCS"}) {
			&run_monitor($group, $service);
		    } else {
			syslog ('info', "throttled at $procs processes");
		    }

		} else {
		    # not due (or still running): count down, clamped at 0
		    $sref->{"_timer"} -= $t;
		    if ($sref->{"_timer"} < 0) {
		    	$sref->{"_timer"} = 0;
		    }
		}
	    }
	}
    }

    # reference point for the elapsed-time calculation of the next pass
    $lasttm = time;

    #
    # collect any output from subprocs
    #
    &collect_output();

    #
    # clean up after exited processes, and trigger alerts
    #
    &proc_cleanup();

    #
    # handle client, server, and trap I/O
    # this routine sleeps for $SLEEPINT if no I/O is ready
    #
    &handle_io();
}

&clean_up();
exit;

##############################################################################

#
# startup alerts
#
sub do_startup_alerts {
    #
    # call do_alert once for every group/service pair in %watch, with
    # empty output, exit value 0 and the startup-alert flag
    #
    for my $grp (keys %watch) {
        my @services = keys %{$watch{$grp}};

        for my $svc (@services) {
            &do_alert ($grp, $svc, "", 0, $FL_STARTUPALERT);
        }
    }
}


#
# handle alert event, throttling the alert call if necessary
#
sub do_alert {
    #
    # arguments:
    #   $group, $service  keys into %watch
    #   $output           monitor output; its first line is the summary
    #   $retval           monitor exit value, matched against "exit=" ranges
    #   $flags            bit mask; $FL_UPALERT and $FL_STARTUPALERT select
    #                     the upalert/startupalert lists and bypass the
    #                     alertevery/alertafter throttling
    #
    # For every period of the service that is currently active, the
    # matching alert list is walked and call_alert is invoked for each
    # entry that is not throttled.  Nothing useful is returned.
    #
    my ($group, $service, $output, $retval, $flags) = @_;
    my ($sref, $pref, @alerts);

    $sref = \%{$watch{$group}->{$service}};

    my $tmnow = time;

    #
    # if the alarm is disabled, ignore it
    #
    if ($sref->{"disable"} == 1) {
        syslog ("notice", "ignoring alert for $group.$service");
        return;
    }

    #
    # no alerts for ack'd failures
    #
    if ($sref->{"_ack"} == 1) {
        syslog ("notice", "no alert for $group.$service" .
                " because of ack'd failure");
        return;
    }

    my ($summary) = split("\n", $output);
    $summary = "(NO SUMMARY)" if ($summary =~ /^\s*$/m);

    #
    # check each time period for pending alerts
    #
    foreach my $periodlabel (keys %{$sref->{"periods"}}) {
        #
        # only send alerts that are in the proper period
        #
        next if (!inPeriod ($tmnow, $sref->{"periods"}->{$periodlabel}->{"period"}));

        $pref = \%{$sref->{"periods"}->{$periodlabel}};

        #
        # do this if we're not handling an upalert or startupalert
        #
        if (!($flags & $FL_UPALERT) && !($flags & $FL_STARTUPALERT)) {
            #
            # only alert once every "alertevery" seconds, unless
            # output from monitor is different
            #
            # The summary is monitor output, not a pattern, so it is
            # quoted with \Q..\E; unquoted, a summary such as
            # "error (timeout" would abort the server with a regex
            # compilation error.
            #
            if ($pref->{"alertevery"} != 0 &&
                    ($tmnow - $pref->{"_last_alert"} < $pref->{"alertevery"}) &&
                        ( $pref->{"_alertsum"} ?
                            ($sref->{"_failure_output"} =~ /^\Q$summary\E/ ) :
                            ($sref->{"_failure_output"} eq $output) ) ) {

                syslog ("info", "not alerting for failure of $group/$service");
                next;
            }

            #
            # alertafter NUM
            #
            if (defined $pref->{"alertafter_consec"}) {
                next if ($sref->{"_consec_failures"} < $pref->{"alertafter_consec"});

            #
            # alertafter NUM timeval
            #
            } elsif (defined ($pref->{"alertafter"})) {
                $pref->{"_failcount"}++;

                if ($tmnow - $pref->{'_1stfailtime'} <= $pref->{'alertafterival'}
                    && $pref->{"_failcount"} < $pref->{"alertafter"}) {
                    next;
                }

                #
                # start a new time interval
                #
                if ($tmnow - $pref->{'_1stfailtime'} > $pref->{'alertafterival'}) {
                    $pref->{"_failcount"} = 1;
                }

                if ($pref->{"_failcount"} == 1) {
                    $pref->{"_1stfailtime"} = $tmnow;
                }

                if ($pref->{"_failcount"} < $pref->{"alertafter"}) {
                    next;
                }
            }
        }

        #
        # at this point, no alerts are blocked,
        # so send the alerts
        #

        #
        # trigger multiple alerts in this period
        #
        if ($flags & $FL_UPALERT) {
            @alerts = @{$pref->{"upalerts"}};
        } elsif ($flags & $FL_STARTUPALERT) {
            @alerts = @{$pref->{"startupalerts"}};
        } else  {
            @alerts = @{$pref->{"alerts"}};
        }

        foreach my $entry (@alerts) {
            my $spec = $entry;		# work on a copy of the config line
            my ($fac, $args);

            #
            # optional "exit=RANGE" prefix: the alert is only sent when
            # the exit value is inside RANGE.  The prefix is stripped
            # with the same pattern that recognizes it, so that
            # "exit = 1-5 alert args" (spaces around "=") yields the
            # same alert and arguments as "exit=1-5 alert args".
            #
            if ($spec =~ s/^exit\s*=\s*(\d+(?:-\d+)?)\s+//i) {
                my $range = $1;
                next if (!&inRange($retval, $range));
            }

            ($fac, $args) = split (/\s+/, $spec, 2);

            call_alert (
                group	=> $group,
                service	=> $service,
                output	=> $output,
                retval	=> $retval,
                flags	=> $flags,

                pref	=> $pref,
                alert	=> $fac,
                args	=> $args,
            );
        }
    }
}



#
# walk through the watch list and reset the time
# the service was last called
#
sub set_last_test {
    #
    # for every service of every watch: load the countdown timer with
    # the configured interval and zero the last-alert time stamp
    #
    for my $grp (keys %watch) {
        for my $svc (keys %{$watch{$grp}}) {
            my $sref = \%{$watch{$grp}->{$svc}};

            $sref->{"_timer"} = $sref->{"interval"};
            $sref->{"_last_alert"} = 0;
        }
    }

}


#
# parse configuration file
#
# build the following data structures:
#
# %group
#       each element of %group is an array of hostnames
#       group records are terminated by a blank line in the
#       configuration file
# %watch{"group"}->{"service"}{"variable"} = value
# %watch is a hash of hashes of hashes
#
sub read_cf {
    #
    # $CF is the path of the configuration file; with -M ($opt_M) the
    # file is piped through m4 first.  Fills %CF, %groups, %alias and
    # %watch.  Dies on an unreadable file or on a syntax error, returns 1
    # otherwise.
    #
    my ($CF) = @_;
    my ($l, $var, $watchgroup, $ingroup, $curgroup, $inwatch,
        $args, $hosts, %disabled, $h, $i,
        $aliasReading, $aliasGroup);
    my ($sref, $pref);

    #
    # global "name = value" settings, mapped to their %CF key
    #
    # copied verbatim
    my %plain_var = (
        alertdir     => "ALERTDIR",
        mondir       => "SCRIPTDIR",
        logdir       => "LOGDIR",
        histlength   => "MAX_KEEP",
        serverport   => "SERVPORT",
        trapport     => "TRAPPORT",
        pidfile      => "PIDFILE",
        maxprocs     => "MAXPROCS",
        statedir     => "STATEDIR",
        authfile     => "AUTHFILE",
        authtype     => "AUTHTYPE",
        userfile     => "USERFILE",
        ocfile       => "OCFILE",
        historicfile => "HISTORICFILE",
        dtlogfile    => "DTLOGFILE",
        snmpport     => "SNMPPORT",
    );
    # directories: made absolute relative to $PWD, trailing "/" removed
    my %path_var = (
        basedir      => "BASEDIR",
        cfbasedir    => "CFBASEDIR",
    );
    # time strings such as "20m", converted to seconds by dhmstos
    my %time_var = (
        randstart    => "RANDSTART",
        historictime => "HISTORICTIME",
        cltimeout    => "CLIENT_TIMEOUT",
    );

    #
    # parse configuration file
    #
    if ($opt_M) {
        open (CFG, "m4 $CF |") ||
            die "could not open m4 pipe of cf file: $CF: $!\n";
    } else {
        open (CFG, $CF) ||
            die "could not open cf file: $CF: $!\n";
    }

    %alias = ();

    while (defined ($l = <CFG>)) {
        next if $l =~ /^#/;

        chomp $l;
        $l =~ s/^\s*//;
        $l =~ s/\s*$//;

        #
        # "NAME=value" in upper case inside a watch record is an
        # environment variable for the current service, not a global
        # setting.  It has to be recognized here: the global matcher
        # below accepts the same text and would otherwise always abort
        # with "unknown variable" before the watch code sees the line.
        #
        my $env_line = ($inwatch && !$ingroup && !$aliasReading
                        && $l =~ /^[A-Z_][A-Z0-9_]*=/);

        #
        # variables that can be overridden by the command line
        #
        if (!$env_line && $l =~ /^(\w+) \s* = \s* (.*) \s*$/ix) {
            my ($name, $value) = ($1, $2);

            if (exists $plain_var{$name}) {
                $CF{$plain_var{$name}} = $value;

            } elsif (exists $path_var{$name}) {
                my $key = $path_var{$name};
                $CF{$key} = $value;
                $CF{$key} = "$PWD/$CF{$key}" if ($CF{$key} !~ m{^/});
                $CF{$key} =~ s{/$}{};

            } elsif (exists $time_var{$name}) {
                my $key = $time_var{$name};
                $CF{$key} = &dhmstos($value);
                die "cf error: bad syntax, line $.\n"
                    if (!defined ($CF{$key}));

            } elsif ($name eq "use snmp") {
                # NOTE(review): this branch cannot be reached; $name is
                # matched by \w+ and so never contains a space.  It is
                # kept unchanged until the intended spelling of the
                # SNMP switch is confirmed.
                $CF{"SNMP"} = 1;
                eval "use SNMP";
                die "could not use SNMP: $@\n" if ($@ ne "");

            } elsif ($name eq "dtlogging") {
                $CF{"DTLOGGING"} = 0;
                if ($value == 1 || $value eq "yes" || $value eq "true") {
                    $CF{"DTLOGGING"} = 1;
                }

            } else {
                die "cf error: unknown variable, line $.\n";
            }

            next;
        }

        #
        # end of record
        #
        if ($l eq "") {
            $ingroup    = 0;
            $curgroup   = "";
            $inwatch    = 0;
            $watchgroup = "";
            $period = 0;
            undef $aliasReading;
            next;
        }

        #
        # group record
        #
        if ($l =~ /^hostgroup\s+([a-zA-Z0-9_.-]+)\s*(.*)/) {
            $curgroup = $1;
            $hosts = $2;
            %disabled = ();
            # hosts marked with a leading "*" are disabled; remember them
            foreach $h (grep (/^\*/, @{$groups{$curgroup}})) {
                $h =~ s/^\*//;
                $disabled{$h} = 1;
            }
            @{$groups{$curgroup}} = split(/\s+/, $hosts);
            #
            # keep hosts which were previously disabled
            #
            for ($i=0;$i<@{$groups{$curgroup}};$i++) {
                $groups{$curgroup}[$i] = "*$groups{$curgroup}[$i]"
                    if ($disabled{$groups{$curgroup}[$i]});
            }
            $ingroup = 1;
            next;

        } elsif ($ingroup) {
            # continuation line of a hostgroup record: more host names
            push (@{$groups{$curgroup}}, split(/\s+/, $l));
            for ($i=0;$i<@{$groups{$curgroup}};$i++) {
                $groups{$curgroup}[$i] = "*$groups{$curgroup}[$i]"
                    if ($disabled{$groups{$curgroup}[$i]});
            }
            next;
        }

        #
        # alias record
        #
        if ($l =~ /^alias\s+([a-zA-Z0-9_.-]+)\s*$/) {
            $aliasReading = 1;
            $aliasGroup = $1;
            next;

        } elsif ($aliasReading) {
            if ($l =~ /\A(.*)\Z/) {
                push (@{$alias{$aliasGroup}}, $1);
                next;
            }
        }

        #
        # watch record
        #
        if ($l =~ /^watch\s+([a-zA-Z0-9_.-]+)\s*/) {
            $watchgroup = $1;
            # a watch on an undefined group watches a host of that name
            if (!defined ($groups{$watchgroup})) {
                @{$groups{$watchgroup}} = ($watchgroup);
            }
            die "cf error: watch already defined, line $.\n"
                if ($watch{$watchgroup});
            $ingroup    = 0;
            $curgroup   = "";
            $service = "";
            $period = 0;
            $inwatch = 1;
            next;

        } elsif ($inwatch) {
            #
            # env variables
            #
            if ($l =~ /^([A-Z_][A-Z0-9_]*)=(.*)/) {
                die "cf error: environment variable defined without a service, line $.\n"
                    if ($service eq "");
                $watch{$watchgroup}->{$service}->{"ENV"}->{$1} = $2;
                next;
            }

            #
            # non-env variables: "keyword arguments".  A line that does
            # not start with a word is rejected rather than parsed with
            # captures left over from an earlier match.
            #
            ($var, $args) = ($l =~ /^(\w+)\s*(.*)$/)
                or die "cf error: bad syntax, line $.\n";

            #
            # service entry
            #
            if ($var eq "service") {
                $service = $args;
                die "cf error: invalid service tag, line $.\n"
                    if ($service !~ /^[a-zA-Z0-9_.-]+$/);
                $period = 0;
                $sref = \%{$watch{$watchgroup}->{$service}};
                $sref->{"service"} = $args;
                $sref->{"interval"} = undef;
                $sref->{"randskew"} = 0;
                $sref->{"_op_status"} = $STAT_UNTESTED;
                $sref->{"_last_op_status"} = $STAT_UNTESTED;
                $sref->{"_ack"} = 0;
                $sref->{"_ack_comment"} = '';
                $sref->{"_consec_failures"} = 0;
                $sref->{"_failure_count"} = 0 if (!defined($sref->{"_failure_count"}));
                $sref->{"_start_of_monitor"} = time if (!defined($sref->{"_start_of_monitor"}));
                $sref->{"_alert_count"} = 0 if (!defined($sref->{"_alert_count"}));
                $sref->{"_last_failure"} = 0 if (!defined($sref->{"_last_failure"}));
                $sref->{"_last_success"} = 0 if (!defined($sref->{"_last_success"}));
                $sref->{"_last_trap"} = 0 if (!defined($sref->{"_last_trap"}));
                $sref->{"_exitval"} = "undef" if (!defined($sref->{"_exitval"}));
                $sref->{"_last_check"} = undef;
                next;
            }

            if ($service eq "") {
                die "cf error: need to specify service in watch record, line $.\n";
            }


            #
            # period definition: "period [label:] period-spec"
            #
            if ($var eq "period") {
                $period = 1;

                if ($args =~ /^([a-z_]\w*) \s* : \s* (.*)$/ix) {
                    $periodstr = $1;
                    $args = $2;
                } else {
                    # unlabeled: the spec itself is the hash key
                    $periodstr = $args;
                }
                $pref = \%{$sref->{"periods"}->{$periodstr}};

                if (inPeriod (time, $args) == -1) {
                    die "cf error: malformed period, line $.\n";
                }

                $pref->{"period"} = $args;
                $pref->{"alertevery"} = 0;
                @{$pref->{"alerts"}} = ();
                @{$pref->{"upalerts"}} = ();
                @{$pref->{"startupalerts"}} = ();
                next;
            }

            #
            # alert
            #
            if ($var eq "alert" && !$period) {
                die "cf error: need to specify a period for alert, line $.\n";
            } elsif ($var eq "upalert" && !$period) {
                die "cf error: need to specify a period for upalert, line $.\n";
            } elsif ($var eq "alertevery" && !$period) {
                die "cf error: need to specify a period for alertevery, line $.\n";
            } elsif ($var eq "alertafter" && !$period) {
                die "cf error: need to specify a period for alertafter, line $.\n";
            }

            #
            # for each service there can be one or more alert periods
            # this is stored as a hash of hashes named
            #     %{$watch{$watchgroup}->{$service}->{"periods"}}
            # each key of this hash is a period label or something like
            # "wd {Mon-Fri} hr {7am-11pm}"; each value holds the period
            # settings and the lists of alert commands with their
            # arguments
            #
            if ($var eq "alert") {
                push @{$pref->{"alerts"}}, $args;
            } elsif ($var eq "upalert") {
                $sref->{"_upalert"} = 1;
                push @{$pref->{"upalerts"}}, $args;

            } elsif ($var eq "startupalert") {
                push @{$pref->{"startupalerts"}}, $args;

            #
            # non-alert variables
            #
            } else {
                if ($var eq "interval") {
                    $args = &dhmstos ($args) ||
                        die "cf error: invalid time interval, line $.\n";

                } elsif ($var eq "traptimeout") {
                    $args = &dhmstos ($args) ||
                        die "cf error: invalid waitfortrap interval, line $.\n";
                    $sref->{"_trap_timer"} = $args;

                } elsif ($var eq "trapduration") {
                    $args = &dhmstos ($args) ||
                        die "cf error: invalid trapduration interval, line $.\n";

                } elsif ($var eq "randskew") {
                    $args = &dhmstos ($args) ||
                        die "cf error: invalid random skew time, line $.\n";

                } elsif ($var eq "alertevery") {
                    # "alertevery TIME [summary]": with "summary" only
                    # the first output line is compared by do_alert
                    my $summary_flag;
                    if ($args =~ /(\S+)(\s+)summary(\s*)$/i) {
                        $summary_flag = 1;
                        $args = $1;
                    } else {
                        $summary_flag = 0;
                    }
                    $args = &dhmstos ($args) ||
                        die "cf error: invalid time interval, line $.\n";
                    $pref->{"alertevery"} = $args;
                    $pref->{"_alertsum"} = $summary_flag;
                    next;

                } elsif ($var eq "alertafter") {
                    my ($p1, $p2);

                    # "alertafter NUM": NUM consecutive failures
                    if ($args =~ /^(\d+)$/) {
                        $p1 = $1;
                        $pref->{"alertafter_consec"} = $p1;

                    # "alertafter NUM TIME": NUM failures within TIME
                    } elsif ($args =~ /(\d+)\s+(\d+[hms])$/) {
                        ($p1, $p2) = ($1, $2);
                        if (($p1 - 1) * $sref->{"interval"} >= &dhmstos($p2)) {
                            die "cf error:  interval & alertafter not sensible.\n" .
                                "No alerts can be generated with those parameters, line $.\n";
                        }
                        $pref->{"alertafter"} = $p1;
                        $pref->{"alertafterival"} = &dhmstos ($p2);

                        $pref->{"_1stfailtime"} = 0;
                        $pref->{"_failcount"} = 0;

                    } else {
                        die "cf error: invalid interval specification, line $.\n";
                    }

                } elsif ($var eq "upalertafter") {
                    $args = &dhmstos ($args) ||
                        die "cf error: invalid upalertafter specification, line $.\n";
                }

                # every other keyword is stored as-is on the service
                $sref->{$var} = $args;
            }

            next;
        }
    }
    close (CFG);
    1;
}


#
# convert a string like "20m" into seconds
#
sub dhmstos {
    #
    # $str is a number, optionally with a decimal fraction, followed by
    # one unit letter: d (days), h (hours), m (minutes) or s (seconds).
    # Surrounding white space is ignored and the unit may be given in
    # upper or lower case.  Returns the equivalent number of seconds, or
    # undef if $str does not have that form (a bare number without a
    # unit is rejected).
    #
    my ($str) = @_;
    my ($s);

    if ($str =~ /^\s*(\d+(?:\.\d+)?)([dhms])\s*$/i) {
        #
        # the pattern is case-insensitive, so fold the unit before
        # comparing; otherwise "20M" would be taken as 20 seconds
        #
        my ($num, $unit) = ($1, lc ($2));

        if ($unit eq "m") {
            $s = $num * 60;
        } elsif ($unit eq "h") {
            $s = $num * 60 * 60;
        } elsif ($unit eq "d") {
            $s = $num * 60 * 60 * 24;
        } else {
            $s = $num;
        }
    } else {
        return undef;
    }
    $s;
}


#
# reset the state of the server on SIGHUP, and reread config
# file.
#
sub reset {
    #
    # $keepstate: when true, the saved "disabled" state is loaded again
    # after the configuration has been re-read.
    # NOTE(review): when this sub runs as a signal handler Perl passes
    # the signal name as the first argument, so $keepstate is true in
    # that case -- confirm that this is intended.
    #
    my ($keepstate) = @_;

    #
    # terminate every monitor still on record, wait for it to exit
    # and drop it from the bookkeeping
    #
    for my $child (keys %runningpid) {
        kill 15, $child;
        waitpid ($child, 0);
        syslog ('info', "reset killed child $child, exit status $?");
        &remove_proc ($child);
    }

    #
    # throw away the old configuration and build it again
    #
    %watch = ();
    %groups = ();
    $procs = 0;
    syslog ('info', "resetting, and re-reading configuration $CF{CF}");
    &read_cf ($CF{"CF"});
    normalize_paths();
    gen_scriptdir_hash();

    #
    # restart the scheduler bookkeeping
    #
    $lasttm = time;		# the last time(2) the loop started
    $fdset_rbits = $fdset_ebits = '';
    &set_last_test ();

    if ($CF{"RANDSTART"}) {
        &randomize_startdelay();
    }

    if ($keepstate) {
        &load_state ("disabled");
    }

    #
    # start a fresh section in the downtime log
    #
    if ($CF{"DTLOGGING"}) {
        close (DTLOG);
        &open_dtlog();
    }
}


sub open_dtlog {
    #
    # open the downtime log for appending and write a start marker.
    # does nothing unless $CF{"DTLOGGING"} is set; turns downtime
    # logging off if the file cannot be opened.
    #
    my $started = time;

    return unless ($CF{"DTLOGGING"});

    unless (open (DTLOG, ">>$CF{DTLOGFILE}")) {
        syslog ('err', "could not append to $CF{DTLOGFILE}: $!");
        $CF{"DTLOGGING"} = 0;
        return;
    }

    # unbuffer DTLOG, then make STDOUT the selected handle again
    select (DTLOG);
    $| = 1;
    select (STDOUT);

    $CF{"DTLOGGING"} = 1;

    my $banner = "#\n"
        . "# downtime log start $started\n"
        . "# time back up, group, service, first failure, downtime, interval, summary\n"
        . "#\n";
    print DTLOG $banner;
}


#
# remove a process from our state
#
sub remove_proc {
    #
    # $pid is the process id of a monitor; pids that are not on record
    # are ignored.  Clears the monitor's output handle from the read
    # set, closes it, and deletes the bookkeeping entries.
    #
    my ($pid) = @_;

    # %runningpid maps a pid to the key used in %fhandles and %running
    my $tag = $runningpid{$pid};
    return if (!defined $tag);

    my $fd = fileno ($fhandles{$tag});
    vec ($fdset_rbits, $fd, 1) = 0;
    close ($fhandles{$tag});

    delete $fhandles{$tag};
    delete $running{$tag};
    delete $runningpid{$pid};

    $procs--;
}


#
# clean up before exiting
#
sub clean_up {
    # remove the pid file, unless the configured pid file name is empty
    if ($CF{"PIDFILE"} ne '') {
        unlink $CF{"PIDFILE"};
    }
}


#
# exit on SIGTERM
#
sub handle_sigterm {
    # log the shutdown, run clean_up, then leave with exit status 1
    syslog ("info", "caught TERM signal, exiting");

    clean_up();

    exit (1);
}


#
# setup server
#
sub setup_server {
    #
    # create the listening sockets:
    #   SERVER      TCP, port $CF{"SERVPORT"}, for clients
    #   TRAPSERVER  UDP, port $CF{"TRAPPORT"}, non-blocking
    #   SNMPSERVER  UDP, port $CF{"SNMPPORT"}, non-blocking, only when
    #               $CF{"SNMP"} is set
    # every failure is reported through die_die
    #
    my ($proto);

    #
    # client server, such as moncmd
    #
    $proto = getprotobyname ('tcp');
    socket (SERVER, PF_INET, SOCK_STREAM, $proto) ||
        &die_die ("err", "could not create TCP socket: $!");
    setsockopt (SERVER, SOL_SOCKET, SO_REUSEADDR, pack ("l", 1)) ||
        &die_die ("err", "could not setsockopt: $!");
    bind (SERVER, sockaddr_in ($CF{"SERVPORT"}, INADDR_ANY)) ||
        &die_die ("err", "could not bind TCP server port: $!");
    # a socket that is bound but not listening cannot accept clients
    listen (SERVER, SOMAXCONN) ||
        &die_die ("err", "could not listen on TCP server port: $!");

    #
    # remote monitor traps
    #
    $proto = getprotobyname ('udp');
    socket (TRAPSERVER, PF_INET, SOCK_DGRAM, $proto) ||
        &die_die ("err", "could not create UDP socket: $!");
    bind (TRAPSERVER, sockaddr_in ($CF{"TRAPPORT"}, INADDR_ANY)) ||
        &die_die ("err", "could not bind UDP server port: $!");
    &_setup_server_nonblock (\*TRAPSERVER);

    return if (!$CF{"SNMP"});

    #
    # SNMP traps
    #
    $proto = getprotobyname ('udp');
    socket (SNMPSERVER, PF_INET, SOCK_DGRAM, $proto) ||
        &die_die ("err", "could not create UDP socket: $!");
    bind (SNMPSERVER, sockaddr_in ($CF{"SNMPPORT"}, INADDR_ANY)) ||
        &die_die ("err", "could not bind UDP server port: $!");
    &_setup_server_nonblock (\*SNMPSERVER);
}

#
# add O_NONBLOCK to the status flags of the socket $sock (a glob
# reference).  fcntl returns the flags as its result, not through its
# third argument, so the result of F_GETFL is what gets OR-ed with
# O_NONBLOCK and written back; the flags already set are preserved.
#
sub _setup_server_nonblock {
    my ($sock) = @_;

    # fcntl yields undef on failure and "0 but true" for a zero result
    my $fl = fcntl ($sock, F_GETFL, 0);
    defined ($fl) ||
        &die_die ("err", "could not get fd options: $!");

    # "+ 0" makes the "0 but true" string a plain number before the OR
    $fl = ($fl + 0) | O_NONBLOCK;

    fcntl ($sock, F_SETFL, $fl)
        || &die_die ("err", "could not set fd options: $!");
}


#
# set up a client connection if necessary
#
sub client_accept {
    #
    # accept one pending connection on SERVER, switch it to
    # non-blocking, unbuffered mode and register it in %clients under
    # its file descriptor number.  Errors are logged and the connection
    # is dropped.
    #
    my ($handle, $peer, $fd, $flags, $peer_port, $peer_addr);

    # each connection gets its own symbolic handle name (c0, c1, ...);
    # the counter advances even if the accept below fails
    $handle = "c" . $clientcount++;

    $peer = accept ($handle, SERVER);
    if (!defined ($peer)) {
        syslog ('err', "accept returned error: $!");
        return;
    }

&debug(1, "accepted client $handle\n");
    $fd = fileno ($handle);

    #
    # set socket to nonblocking
    #
    $flags = fcntl ($handle, F_GETFL, $flags);
    if (!defined ($flags)) {
        syslog ("err", "could not get fd options for client: $!");
        close ($handle);
        return;
    }

    $flags |= O_NONBLOCK;

    if (!defined (fcntl ($handle, F_SETFL, $flags))) {
        syslog ("err", "could not set fd options for client: $!");
        close ($handle);
        return;
    }

    ($peer_port, $peer_addr) = unpack_sockaddr_in ($peer);
    syslog ('info', "client connection from " . inet_ntoa ($peer_addr) .
            ":" . $peer_port);

    # unbuffer the client handle, then select STDOUT again
    select ($handle);
    $| = 1;
    select (STDOUT);

    #
    # per-client record:
    #   fhandle    handle name of the connection
    #   user       username if authenticated
    #   timeout    idle timeout taken from the configuration
    #   last_read  last time data was read
    #   buf        input not yet processed
    #
    $clients{$fd}{"fhandle"} = $handle;
    $clients{$fd}{"user"} = undef;
    $clients{$fd}{"timeout"} = $CF{"CLIENT_TIMEOUT"};
    $clients{$fd}{"last_read"} = time;
    $clients{$fd}{"buf"} = '';

    $numclients++;
}


#
# do all pending client commands
#
sub client_dopending {
    #
    # for each client whose buffer holds at least one complete line,
    # take the first line off the buffer (together with its CR/LF
    # terminators) and hand it to client_command.  At most one command
    # per client is processed per call.
    #
    for my $fd (keys %clients) {
        next unless ($clients{$fd}->{"buf"} =~ /^([^\r\n]*)[\r\n]+/s);

        my $line = $1;
        $clients{$fd}->{"buf"} =~ s/^[^\r\n]*[\r\n]+//s;

        &client_command ($fd, $line);
    }
}


#
# close a client connection
#
sub client_close {
    #
    # $fd is the key of the client in %clients; $reason, when defined,
    # is written to the log.  Closes the client's handle, forgets the
    # client and clears its bit in $iovec.
    #
    my ($fd, $reason) = @_;

    if (defined $reason) {
        syslog ('info', "closing client $fd: $reason");
    }

    close ($clients{$fd}{"fhandle"});
    delete $clients{$fd};

    vec ($iovec, $fd, 1) = 0;
    $numclients--;
}


#
# Handle a connection from a client
#
sub client_command {
    my ($cl, $l) = @_;
    my ($cmd, $args, $group, $service, $s, $sname, $stchanged);
    my ($var, $value, $msg, @l, $sock, $port, $addr, $sref, $auth, $fh);
    my ($user, $pass, @argsList, $comment);


    syslog ('info', "client command \"$l\"")
	if ($l !~ /^\s*login/i);

    $fh = $clients{$cl}{"fhandle"};

#    &sock_write ($fh, "220 $HOSTNAME mon server ready.\n");

    if ($l !~ /^(login|disable|enable|quit|list|set|get|
		    stop|start|loadstate|savestate|reset|
		    reload|term|test|servertime|ack|version)\s*(.*)?$/ix) {
	&sock_write ($fh, "520 invalid command\n");
	return;
    }
    ($cmd, $args) = ("\L$1", $2);

    $stchanged = 0;

    #
    # quit command
    #
    if ($cmd eq "quit") {
	&sock_write ($fh, "220 quitting\n");
	&client_close ($cl);

    #
    # login
    #
    } elsif ($cmd eq "login") {
	($user, $pass) = split (/\s+/, $args, 2);
	if (!defined &auth ($CF{"AUTHTYPE"}, $user, $pass)) {
	    &sock_write ($fh,  "530 login unsuccessful\n");
	} else {
	    $clients{$cl}{"user"} = $user;
	    syslog ("info", "authenticated $user");
	    &sock_write ($fh,  "220 login accepted\n");
	}

    #
    # reset
    #
    } elsif ($cmd eq "reset" && &check_auth ($clients{$cl}{"user"}, $cmd)) {
	my ($keepstate);
	if ($args =~ /stopped/i) {
	    $STOPPED = 1;
	    $STOPPED_TIME = time;
	}
	if ($args =~ /keepstate/) {
	    $keepstate = 1;
	}
	&reset($keepstate);
	&sock_write ($fh,  "220 reset PID $$@$HOSTNAME\n");

    #
    # reload
    #
    } elsif ($cmd eq "reload" && &check_auth ($clients{$cl}{"user"}, $cmd)) {
	if (!defined &reload (split (/\s+/, $args))) {
	    &sock_write ($fh,  "520 unknown reload command\n");
	} else {
	    &sock_write ($fh,  "220 reload completed\n");
	}
    
    #
    # clear
    #
    } elsif ($cmd eq "clear" && &check_auth ($clients{$cl}{"user"}, $cmd)) {
    	if ($args =~ /^timers \s+ ([a-zA-Z0-9_.-]+) \s+ ([a-zA-Z0-9_.-]+)/ix) {
	    if (!defined $watch{$1}->{$2}) {
		&sock_write ($fh,  "520 unknown group\n");
	    } else {
		clear_timers ($1, $2);
		&sock_write ($fh,  "220 clear timers completed\n");
	    }

	} else {
	    &sock_write ($fh,  "520 unknown clear command\n");
	    next;
	}

    #
    # test
    #
    } elsif ($cmd eq "test" && &check_auth ($clients{$cl}{"user"}, $cmd))  {
	my ($cmd, $args) = split (/\s+/, $args, 2);

	#
	# test monitor
	#
	if ($cmd eq "monitor") {
	    my ($group, $service) = split (/\s+/, $args);

	    if (!defined $watch{$group}->{$service}) {
		&sock_write ($fh,  "$group $service not defined\n");
	    } else {
		$watch{$group}->{$service}->{"_timer"} = 0;
	    }
	    &sock_write ($fh,  "220 test monitor completed\n");
	
	#
	# test alert
	#
	} elsif ($cmd =~ /^alert|startupalert|upalert$/) {
	    my ($group, $service, $retval, $period) = split (/\s+/, $args, 4);

	    if (!defined $watch{$group}->{$service}) {
		&sock_write ($fh,  "520 $group $service not defined\n");

	    } elsif (!defined $watch{$group}->{$service}->{"periods"}->{$period}) {
		    &sock_write ($fh,  "520 period not defined\n");

	    } else {
		my $f = 0;
		my $a;

		if ($cmd eq "alert") {
		    $a = $watch{$group}->{$service}->{"periods"}->{$period}->{"alerts"};
		} elsif ($cmd eq "startupalert") {
		    $f = $FL_STARTUPALERT;
		    $a = $watch{$group}->{$service}->{"periods"}->{$period}->{"startupalerts"};
		} elsif ($cmd eq "upalert") {
		    $f = $FL_UPALERT;
		    $a = $watch{$group}->{$service}->{"periods"}->{$period}->{"upalerts"};
		}

		for (@{$a}) {
		    my ($alert, $args) = split (/\s+/, $_, 2);

		    if ($args =~ /^exit=/) {
		    	$args =~ s/^exit=\S+ \s+//x;
		    }

		    call_alert (
			group	=> $group,
			service	=> $service,
			output	=> "test\ntest detail\n",
			retval	=> $retval,
			flags	=> $f | $FL_TEST,
			alert	=> $alert,
			args	=> $args,
		    );
		}

		&sock_write ($fh,  "220 test alert completed\n");
	    }

	} else {
	    &sock_write ($fh,  "520 test error\n");
	}
    
    #
    # version
    #
    } elsif ($cmd eq "version") {
    	&sock_write ($fh, "version $PROT_VERSION\n");
    	&sock_write ($fh, "220 version completed\n");

    #
    # load state
    #
    } elsif ($cmd eq "loadstate" && &check_auth ($clients{$cl}{"user"}, $cmd)) {
	foreach (split (/\s+/, $args)) {
	    &load_state ($_);
	}
	&sock_write ($fh,  "220 loadstate completed\n");

    #
    # save state
    #
    } elsif ($cmd eq "savestate" && &check_auth ($clients{$cl}{"user"}, $cmd)) {
	foreach (split (/\s+/, $args)) {
	    &save_state ($_);
	}
	&sock_write ($fh,  "220 savestate completed\n");

    #
    # term
    #
    } elsif ($cmd eq "term"  && &check_auth ($clients{$cl}{"user"}, $cmd)) {
	&sock_write ($fh,  "220 terminating server\n");
	&client_close ($fh, "terminated by user command");
	close (SERVER);
	syslog ("info", "terminating by user command");
	&clean_up();
	exit;

    #
    # stop testing
    #
    } elsif ($cmd eq "stop"&& &check_auth ($clients{$cl}{"user"}, $cmd)) {
	$STOPPED = 1;
	$STOPPED_TIME = time;
	&sock_write ($fh,  "220 stop completed\n");

    #
    # start testing
    #
    } elsif ($cmd eq "start" && &check_auth ($clients{$cl}{"user"}, $cmd)) {
	$STOPPED = 0;
	$STOPPED_TIME = 0;
	&sock_write ($fh,  "220 start completed\n");

    #
    # set
    #
    } elsif ($cmd eq "set" && &check_auth ($clients{$cl}{"user"}, $cmd)) {
	if ($args =~ /^maxkeep\s+(\d+)/) {
	    $CF{"MAX_KEEP"} = $1;
	    &sock_write ($fh,  "220 set completed\n");
	} else {
	    ($group, $service, $var, $value) = split (/\s+/, $args);
	    if (!defined $watch{$group}->{$service}) {
		&sock_write ($fh,  "$group,$service not defined\n");
	    } elsif ($var eq "opstatus") {
		if (!defined ($OPSTAT{$value})) {
		    &sock_write ($fh,  "520 undefined opstatus\n");
		} else {
		    set_op_status ($group, $service, $value);
		    &sock_write ($fh,  "220 set completed\n");
		}

	    } else {
		$watch{$group}->{$service}->{$var} = $value;
		&sock_write ($fh,  "$group $service $var = $value\n");
		&sock_write ($fh,  "220 set completed\n");
	    }
	}

    #
    # get
    #
    } elsif ($cmd eq "get" && &check_auth ($clients{$cl}{"user"}, $cmd)) {
	if ($args =~ /^maxkeep\s*$/) {
	    &sock_write ($fh,  "maxkeep = $CF{MAX_KEEP}\n");
	    &sock_write ($fh,  "220 set completed\n");
	} else {
	    ($group, $service, $var) = split (/\s+/, $args);
	    if (!defined $watch{$group}->{$service}) {
		&sock_write ($fh,  "520 $group,$service not defined\n");
	    } else {
		&sock_write ($fh,  "$group $service $var = " .
			"$watch{$group}->{$service}->{$var}\n");
		&sock_write ($fh,  "220 get completed\n");
	    }
	}

    #
    # list
    #
    } elsif ($cmd eq "list" && &check_auth ($clients{$cl}{"user"}, $cmd)) {
	@argsList = split(/\s+/, $args);
	($cmd, $args) = split (/\s+/, $args);

	#
	# list service descriptions
	#
	if ($cmd eq "descriptions") {
	    foreach $group (keys %watch) {
		foreach $service (keys %{$watch{$group}}) {
		    my $d = $watch{$group}->{$service}->{"description"};
		    $d =~ s/"/\\"/g;
		    &sock_write ($fh,  "$group $service" . " \"$d\"\n");
		}
	    }
	    &sock_write ($fh,  "220 list descriptions completed\n");

	#
	# list group members
	#
	} elsif ($cmd eq "group") {
	    if ($groups{$args}) {
		&sock_write ($fh,  "hostgroup $args @{$groups{$args}}\n");
		&sock_write ($fh,  "220 list group completed\n");
	    } else {
		&sock_write ($fh,  "520 list group error, undefined group\n");
	    }

	#
	# list status of all services
	#
	} elsif ($cmd eq "opstatus") {
	    foreach $group (keys %watch) {
		foreach $service (keys %{$watch{$group}}) {
		    client_write_opstatus ($fh, $group, $service);
		}
	    }
	    &sock_write ($fh,  "220 list opstatus completed\n");

	#
	# list disabled hosts and services
	#
	} elsif ($cmd eq "disabled") {
	    foreach $group (keys %groups) {
		@l = grep (/^\*/, @{$groups{$group}});
		if (@l) {
		    grep (s/^\*//, @l);
		    &sock_write ($fh,  "group $group: @l\n");
		}
	    }
	    foreach $group (keys %watch) {
		if ($watch_disabled{$group} == 1) {
		    &sock_write ($fh,  "watch $group\n");
		}
		foreach $service (keys %{$watch{$group}}) {
		    if ($watch{$group}->{$service}->{'disable'} == 1) {
			&sock_write ($fh,  "watch $group service " .
			    "$service\n");
		    }
		}
	    }
	    &sock_write ($fh,  "220 list disabled completed\n");

	#
	# list last alert history
	#
	} elsif ($cmd eq "alerthist") {
	    &sock_write ($fh,  join ("\n", @last_alerts) . "\n") if @last_alerts;
	    &sock_write ($fh,  "220 list alerthist completed\n");

	#
	# list time of last failures for each service
	#
	} elsif ($cmd eq "failures") {
	    foreach $group (keys %watch) {
		foreach $service (keys %{$watch{$group}}) {
		    my $sref = \%{$watch{$group}->{$service}};
		    client_write_opstatus ($fh, $group, $service)
			if ($FAILURE{$sref->{"_op_status"}});
		}
	    }
	    &sock_write ($fh,  "220 list failures completed\n");

	#
	# list the failure history
	#
	} elsif ($cmd eq "failurehist") {
	    &sock_write ($fh,  join ("\n", @last_failures) . "\n")
		if @last_failures;
	    &sock_write ($fh,  "220 list failurehist completed\n");

	#
	# list the time of last successes for each service
	#
	} elsif ($cmd eq "successes") {
	    foreach $group (keys %watch) {
		foreach $service (keys %{$watch{$group}}) {
		    my $sref = \%{$watch{$group}->{$service}};
		    client_write_opstatus ($fh, $group, $service)
			if ($SUCCESS{$sref->{"_op_status"}});
		}
	    }
	    &sock_write ($fh,  "220 list successes completed\n");

	#
	# list warnings
	#
	} elsif ($cmd eq "warnings") {
	    foreach $group (keys %watch) {
		foreach $service (keys %{$watch{$group}}) {
		    my $sref = \%{$watch{$group}->{$service}};
		    client_write_opstatus ($fh, $group, $service)
			if ($WARNING{$sref->{"_op_status"}});
		}
	    }
	    &sock_write ($fh,  "220 list successes completed\n");

	#
	# list process IDs
	#
	} elsif ($cmd eq "pids") {
	    &sock_write ($fh,  "$$ server\n");
	    foreach $value (keys %runningpid) {
		($group, $service) = split (/\//, $runningpid{$value});
		&sock_write ($fh,  "$value $group $service\n");
	    }
	    &sock_write ($fh,  "220 list pids completed\n");

	#
	# list watch groups and services
	#
	} elsif ($cmd eq "watch") {
	    foreach $group (keys %watch) {
		foreach $service (keys %{$watch{$group}}) {
		    if (!defined $watch{$group}->{$service}) {
			&sock_write ($fh,  "$group (undefined service)\n");
		    } else {
			&sock_write ($fh,  "$group $service\n");
		    }
		}
	    }
	    &sock_write ($fh,  "220 list watch completed\n");

	#
	# list server state
	#
	} elsif ($cmd eq "state") {
	    if ($STOPPED) {
		&sock_write ($fh,  "scheduler stopped since $STOPPED_TIME\n");
	    } else {
		&sock_write ($fh,  "scheduler running\n");
	    }
	    &sock_write ($fh,  "220 list state completed\n");
	
	#
	# list aliases
	#
	} elsif ($cmd eq "aliases") {
	    my (@listAliasesRequest) = @argsList;

	    shift (@listAliasesRequest);

	    # if no alias request, all alias are responded
	    unless (@listAliasesRequest) {
	    	@listAliasesRequest = keys (%alias);
	    }

	    foreach $alias (@listAliasesRequest){
	    	&sock_write ($fh, "alias $alias\n");
		foreach $value (@{$alias{$alias}}) {
		    &sock_write ($fh,  "$value\n");
		}
		&sock_write ($fh, "\n");
	    }
	    &sock_write ($fh,  "220 list aliases completed\n");
	
	#
	# list aliasgroups
	#
	} elsif ($cmd eq "aliasgroups") {
	    my (@listAliasesRequest);
	    @listAliasesRequest = keys (%alias);

	    &sock_write ($fh,  "@listAliasesRequest\n");
	    &sock_write ($fh,  "220 list aliasgroups completed\n");

	} else {
	    &sock_write ($fh,  "520 unknown list command\n");
	}


    #
    # acknowledge a failure
    #
    } elsif ($cmd eq "ack" && &check_auth ($clients{$cl}{"user"}, $cmd)) {
	($group, $service, $comment) = split (/\s+/, $args, 3);
	my ($svc);
	my $sref = \%{$watch{$group}->{$service}};

	if (!defined ($watch{$group})) {
	    &sock_write ($fh,  "520 unknown group\n");
	} elsif (!defined $watch{$group}->{$service}) {
	    &sock_write ($fh,  "520 unknown service\n");
	} elsif ($sref->{"_op_status"} == $STAT_OK ||
		$sref->{"_op_status"} == $STAT_UNTESTED ||
		$sref->{"_op_status"} == $STAT_DEPEND) {
	    &sock_write ($fh,  "520 service is in a non-failure state\n");
	} else {
	    $sref->{"_ack"} = 1;
	    $sref->{"_ack_comment"} = $comment;
	    &sock_write ($fh,  "220 ack completed\n");
	}

    #
    # disable watch, service or host
    #
    } elsif ($cmd eq "disable" && &check_auth ($clients{$cl}{"user"}, $cmd)) {
	($cmd, $args) = split (/\s+/, $args, 2);

	#
	# disable watch
	#
	if ($cmd eq "watch") {
	    if (!defined (&disen_watch($args, 0))) {
		&sock_write ($fh,  "520 disable error, unknown watch \"$args\"\n");
	    } else {
		$stchanged++;
		&sock_write ($fh,  "220 disable watch completed\n");
	    }

	#
	# disable service
	#
	} elsif ($cmd eq "service") {
	    ($group, $service) = split (/\s+/, $args, 2);

	    if (!defined (&disen_service ($group, $service, 0))) {
		&sock_write ($fh,  "520 disable error, unknown group or service\n");
	    } else {
		$stchanged++;
		&sock_write ($fh,  "220 disable service completed\n");
	    }

	#
	# disable host
	#
	} elsif ($cmd eq "host") {
	    foreach $var (split (/\s+/, $args)) {
		&disen_host ($var, 0);
	    }
		$stchanged++;
	    &sock_write ($fh,  "220 disable host completed\n");
	}

    #
    # enable watch, service or host
    #
    } elsif ($cmd eq "enable" && &check_auth ($clients{$cl}{"user"}, $cmd)) {
	($cmd, $args) = split (/\s+/, $args, 2);

	#
	# enable watch
	#
	if ($cmd eq "watch") {
	    if (!defined(&disen_watch($args, 1))) {
		&sock_write ($fh,  "520 enable error, unknown watch\n");
	    } else {
		$stchanged++;
		&sock_write ($fh,  "220 enable watch completed\n");
	    }


	#
	# enable service
	#
	} elsif ($cmd eq "service") {
	    ($group, $service) = split (/\s+/, $args, 2);

	    if (!defined (&disen_service ($group, $service, 1))) {
		&sock_write ($fh,  "520 enable error, unknown group\n");
	    } else {
		$stchanged++;
		&sock_write ($fh,  "220 enable completed\n");
	    }

	#
	# enable host
	#
	} elsif ($cmd eq "host") {
	    foreach $var (split (/\s+/, $args)) {
		&disen_host ($var, 1);
		$stchanged++;
	    }
	    &sock_write ($fh,  "220 enable completed\n");

	} else {
	    &sock_write ($fh,  "520 command could not be executed\n");
	}

    #
    # server time
    #
    } elsif ($cmd eq "servertime" && &check_auth ($clients{$cl}{"user"}, $cmd)) {
	&sock_write ($fh,  join ("", time, " ", scalar (localtime), "\n"));
	&sock_write ($fh,  "220 servertime completed\n");

    } else {
	&sock_write ($fh,  "520 command could not be executed\n");
    }

    &save_state ("disabled") if ($stchanged);
}


sub client_write_opstatus {
    #
    # Write one status line for $group/$service to the client on $fh.
    #
    # The line is a space-separated list of key=value pairs read from
    # the service's entry in %watch.  The free-text values (ackcomment,
    # monitor, last_summary) are wrapped in double quotes.  Single and
    # double quotes inside the summary and the ack comment are
    # backslash-escaped, so a quote typed into an ack comment cannot
    # split the quoted field apart.
    #
    my $fh = shift;
    my ($group, $service) = @_;

    my $sref = \%{$watch{$group}->{$service}};

    my $s = $sref->{"_last_summary"};
    $s =~ s/(['"])/\\$1/g;

    # the comment is only reported while the service is acknowledged
    my $comment = "";
    if ($sref->{"_ack"} == 1) {
	$comment = $sref->{"_ack_comment"};
	$comment =~ s/(['"])/\\$1/g;
    }

    &sock_write ($fh,
	"group=$group" .
	" service=$service" .
	" opstatus=$sref->{_op_status}" .
	" last_opstatus=$sref->{_last_op_status}" .
	" exitval=$sref->{_exitval}" .
	" timer=$sref->{_timer}" .
	" last_success=$sref->{_last_success}" .
	" last_failure=$sref->{_last_failure}" .
	" last_trap=$sref->{_last_trap}" .
	" last_check=$sref->{_last_check}" .
	" ack=$sref->{_ack}" .
	" ackcomment=\"$comment\"" .
	" monitor=\"$sref->{monitor}\"" .
	" last_summary=\"$s\"\n");
}


#
# show usage
#
sub usage {
    #
    # Print the command-line summary to standard output.
    #
    # The text is kept in step with the getopts() string at the top of
    # the file: -d takes a value (debug() compares it against each
    # message's level), and -k is not listed because getopts() does
    # not accept it.
    #
    print <<"EOF";
usage: mon [-a dir] [-c config] [-d num] [-f] [-i secs]
	[-m num] [-p num] [-P file] [-r num] [-s dir]
       mon -v

  -a dir	alert script dir
  -A file	authorization file
  -b dir	base directory for alerts and monitors (basedir)
  -B dir	base directory for configuration files (cfbasedir)
  -c config	config file, defaults to "mon.cf"
  -d num	debug level
  -D dir	state directory (statedir)
  -f		fork and become a daemon
  -h		this help
  -i secs	sleep interval (seconds), defaults to 1
  -l		load old state from statedir
  -L dir	log directory (logdir)
  -M            pre-process config file with m4
  -m num	throttle at maximum number of monitor processes
  -o file       on-call schedule
  -p num	server listens on port num
  -P file	PID file
  -r num	randomize startup schedule
  -s dir	monitor script dir
  -S		start with scheduler stopped
  -v		print version

Report bugs to $AUTHOR
$RCSID
EOF
}


#
# become a daemon
#
sub daemon {
    #
    # Put the process in the background: fork, start a new session,
    # fork a second time, change to "/", set the umask to 022 and
    # point stdin, stdout and stderr at /dev/null.  Each parent exits
    # with status 0; only the final child returns from this sub.
    #
    my $child = fork();
    die "could not fork: $!\n" if (!defined $child);

    # the parent goes away all happy and stuff
    exit (0) if ($child);

    setsid();

    #
    # make it so that we cannot regain a controlling terminal
    #
    $child = fork();
    if (!defined $child) {
	syslog ('err', "could not fork: $!");
	exit 1;
    }

    # the parent goes away all happy and stuff
    exit (0) if ($child);

    chdir ('/');
    umask (022);

    unless (open (N, "+</dev/null")) {
	syslog ("err", "could not open /dev/null: %m");
	exit(1);
    }

    # the three redirections are tried in order; the first failure aborts
    unless (open (STDOUT, ">&N") &&
	    open (STDIN, "<&N") &&
	    open (STDERR, ">&N")) {
	syslog ("err", "could not redirect: %m");
	exit(1);
    }

    syslog ('info', "running as daemon");
}


#
# debug
#
sub debug {
    #
    # Emit a debug message made of the strings in @l.
    #
    # Messages whose $level is above $opt_d are dropped.  The rest go
    # to STDERR when debugging is on and -f was not given, and to
    # syslog at priority "debug" otherwise.
    #
    my ($level, @l) = @_;

    return if ($level > $opt_d);

    unless ($opt_d && !$opt_f) {
	return syslog ('debug', join ('', @l));
    }

    print STDERR @l;
}


#
# die_die
#
sub die_die {
    #
    # Fatal error exit.  With debugging on ($opt_d) the message is
    # raised with die; otherwise it is logged to syslog at $level,
    # the log is closed and the process exits with status 1.
    #
    my ($level, $msg) = @_;

    if (!$opt_d) {
	syslog ($level, "fatal, $msg");
	closelog();
	exit (1);
    }

    die "[$level] $msg\n";
}


#
# handle cleanup of exited processes
# trigger alerts on failures (or send no alert if disabled)
# do some accounting
#
sub proc_cleanup {
    #
    # Reap every monitor child that has exited and record its result.
    #
    # For each reaped PID: output still sitting in the pipe is appended
    # to the service's buffer and the exit value is stored in
    # "_exitval".  A non-zero wait status takes the failure path
    # (counters, STAT_FAIL, failure history, do_alert); a zero status
    # takes the success path (optional downtime log line, optional
    # upalert, counters reset, STAT_OK).  Finally the output is saved
    # in "_last_output" and the process is dropped via remove_proc.
    # Does nothing when no monitors are running.
    #
    my ($tmnow, $buf);

    $tmnow = time;
    return if (keys %running == 0);

    while ((my $p = waitpid (-1, &WNOHANG)) >0) {

	#
	# keep a private copy of the wait status so the calls made
	# below cannot change what this iteration sees
	#
	my $status = $?;

	my ($group, $service) = split (/\//, $runningpid{$p});
	my $sref = \%{$watch{$group}->{$service}};

	#
	# suck in any extra data
	#
	my $fh = $fhandles{$runningpid{$p}};
	while (my $z = sysread ($fh, $buf, 8192)) {
	    $ibufs{$runningpid{$p}} .= $buf;
	}

	$sref->{"_exitval"} = int($status>>8);
	&debug (1, "PID $p ($runningpid{$p}) exited with [$sref->{'_exitval'}]\n");

	#
	# error exit value, handle alert
	#
	if ($status) {

	    #
	    # accounting
	    #
	    $sref->{"_failure_count"}++;
	    $sref->{"_consec_failures"}++;
	    $sref->{"_last_failure"} = $tmnow;

	    # a failure following a non-failure state starts a new outage
	    if ($sref->{"_op_status"} == $STAT_OK ||
		    $sref->{"_op_status"} == $STAT_UNKNOWN ||
		    $sref->{"_op_status"} == $STAT_DEPEND ||
		    $sref->{"_op_status"} == $STAT_UNTESTED) {
		$sref->{"_first_failure"} = $tmnow;
	    }
	    set_op_status ($group, $service, $STAT_FAIL);

	    # the summary is the first line of the monitor's output
	    my ($summary) = split("\n", $ibufs{$runningpid{$p}});
	    $summary = "(NO SUMMARY)" if ($summary =~ /^\s*$/m);
	    $sref->{"_last_summary"} = $summary;

	    # the history entry carries the time of this cleanup pass
	    shift @last_failures if (@last_failures > $CF{"MAX_KEEP"});
	    push @last_failures, "$group $service" .
		" $tmnow $summary";
	    &syslog ('crit', "failure for $last_failures[-1]");

	    #
	    # send an alert if necessary
	    #
	    &do_alert ($group, $service, $ibufs{$runningpid{$p}},
		    $status>>8, $FL_MONITOR);

	    $sref->{"_failure_output"} = $ibufs{$runningpid{$p}};

	#
	# record the time of the last success
	#
	} else {

	    #
	    # the service was failing until now: write a downtime record
	    #
	    if ($CF{"DTLOGGING"} && defined ($sref->{"_op_status"}) &&
		   $sref->{"_op_status"} == $STAT_FAIL) {

	       $sref->{"_first_failure"} = $START_TIME
		   if ($sref->{"_first_failure"} == 0);

	       print DTLOG ($tmnow,
		   " $group",
		   " $service",
		   " ", 0 + $sref->{"_first_failure"},
		   " ", 0 + $tmnow - $sref->{"_first_failure"},
		   " $sref->{'interval'}",
		   " $sref->{'_last_summary'}\n") or
		   syslog ('err', "error writing to $CF{DTLOGFILE}: $!");
	    }

	    #
	    # if this service has just come back up and
	    # we are paying attention to this event,
	    # let someone know
	    #
	    if (defined ($sref->{"_op_status"}) &&
		    $sref->{"_op_status"} == $STAT_FAIL) {
		if (defined($sref->{"_upalert"}) && (!defined($sref->{"upalertafter"}) ||
			$tmnow - $sref->{"_first_failure"} >= $sref->{"upalertafter"})) {
		    &do_alert ($group, $service, $sref->{"_last_output"}, 0, $FL_UPALERT);
		}

		syslog ('info', "downtime for $group/$service" .
		    " is $sref->{'_last_downtime'} ");
	    }

	    $sref->{"_ack"} = 0;
	    $sref->{"_ack_comment"} = '';
	    $sref->{"_first_failure"} = 0;
	    $sref->{"_last_failure"} = 0;
	    $sref->{"_consec_failures"} = 0;

	    #
	    # reset the alertevery timer
	    #
	    foreach my $period (keys %{$sref->{"periods"}}) {
		$sref->{"periods"}->{$period}->{"_last_alert"} = 0;
	    }

	    $sref->{"_last_success"} = $tmnow;
	    set_op_status ($group, $service, $STAT_OK);
	}

	#
	# save the output
	#
	$sref->{"_last_output"} = $ibufs{$runningpid{$p}};

	&remove_proc ($p);
    }
}


#
# collect output from running processes
#
sub collect_output {
    #
    # Poll the pipes of all running monitors and append whatever can
    # be read right now to the per-service buffers in %ibufs.
    # Descriptors that report an error or end-of-file are removed
    # from $fdset_rbits.  Does nothing when no monitors are running.
    #
    my $buf;

    return if (!keys %running);

    # timeout 0: select only polls; $rout (a global) receives the ready set
    my $nfound = select($rout=$fdset_rbits, undef, undef, 0);
&debug (1, "select returned $nfound file handles\n");
    if ($nfound) {
	#
	# look for the file descriptors that are readable,
	# and try to read as much as possible from them
	#
	foreach my $k (keys %fhandles) {
	    my $fh = $fhandles{$k};
	    if (vec ($rout, fileno($fh), 1) == 1) {
		my $z = 0;
		# the loop ends with $z undefined (read error) or 0
		while ($z = sysread ($fh, $buf, 8192)) {
		    $ibufs{$k} .= $buf;
&debug (1, "[$buf] from $fh\n");
		}

		#
		# ignore if EAGAIN, since we're nonblocking
		#
		if (!defined($z) && $! == &EAGAIN) {

		#
		# error on this descriptor
		#
		} elsif (!defined($z)) {
&debug (1, "error on $fh: $!\n");
		    &syslog ('err', "error on $fh: $!");
		    vec($fdset_rbits, fileno($fh), 1) = 0;

		#
		# NOTE(review): a read of 0 bytes does not set $!, so this
		# branch appears to test a value left by an earlier call;
		# confirm whether it can hide a real end-of-file
		#
		} elsif ($z == 0 && $! == &EAGAIN) {
&debug (1, "EAGAIN on $fh\n");

		#
		# if EOF encountered, stop trying to
		# get input from this file descriptor
		#
		} elsif ($z == 0) {
&debug (1, "EOF on $fh\n");
		    vec($fdset_rbits, fileno($fh), 1) = 0;

		}
	    }
	}
    }
}




#
# handle forking a monitor process, and set up variables
#
sub run_monitor {
    #
    # Start the monitor for $group/$service as a child process whose
    # stdout and stderr are read through a pipe, and register the pipe
    # and PID in the bookkeeping tables (%fhandles, %running,
    # %runningpid, %ibufs, $fdset_rbits, $procs).
    #
    # Returns undef when the monitor program cannot be found and 0
    # when the fork fails.  It also returns early, without resetting
    # the timer, when the dependency check says the service should
    # not be tested now.  In all other cases the service's "_timer"
    # is reset to its interval, optionally skewed by a random amount.
    #
    my ($group, $service) = @_;
    my (@args, @groupargs, $pid, @ghosts, $monitor, $monitorargs);

    my $sref = \%{$watch{$group}->{$service}};

    # the pipe uses a symbolic handle name derived from group/service
    $fhandles{"$group/$service"} = "\UA${group}_$service";
    $fhandles{"$group/$service"} =~ s/[.-]/_/g;

    ($monitor, $monitorargs) = ($sref->{"monitor"} =~ /^(\S+)(\s+(.*))?$/);

    if (!defined $MONITORHASH{$monitor} || ! -f $MONITORHASH{$monitor}) {
    	syslog ('err', "no monitor found while trying to run [$monitor]");
	return undef;
    } else {
    	$monitor = $MONITORHASH{$monitor};
    }

    $monitor .= " " . $monitorargs if ($monitorargs);

    #
    # if monitor ends with ";;", do not append groups
    # to command line
    #

    @ghosts = ();
    if ($monitor =~ s/\s*;;\s*$//) {
	@args = quotewords ('\s+', 0, $monitor);
	# placeholder so the "no host arguments" test below passes
	@ghosts = (1);

    } else {
	# members with a leading "*" are disabled hosts and are left out
	@ghosts = grep (!/^\*/, @{$groups{$group}});
	@args = (quotewords ('\s+', 0, $monitor), @ghosts);
    }

    if (@ghosts == 0 && !defined ($sref->{"allow_empty_group"})) {
    	syslog ('err', "monitor for $group/$service" .
		" not called because of no host arguments\n");

    } else {

	#
	# dependency check
	#
	if ($sref->{"depend"}) {
	    my $status = &checkDepend ($group, $service);

	    if (!defined $status) {
		return;

	    } elsif (!$status) {
		if ($sref->{"_op_status"} != $STAT_FAIL &&
			$sref->{"_op_status"} != $STAT_DEPEND) {
		    $sref->{'_last_op_status'} = $sref->{'_op_status'};
		    set_op_status ($group, $service, $STAT_DEPEND);
		}
		return;

	    } else {
		$sref->{'_op_status'} = $sref->{'_last_op_status'};
	    }
	}


	# forking open: the parent reads what the child writes to stdout
	$pid = open($fhandles{"$group/$service"}, '-|');
	if (!defined $pid) {
	    syslog ('err', "Could not fork: $!");
	    delete $fhandles{"$group/$service"};
	    return 0;

	} elsif ($pid == 0) {
	    close (TRAPSERVER);
	    close (SNMPSERVER);
	    close (SERVER);

	    open(STDERR, '>&STDOUT')
		or syslog ('err', "Could not dup stderr: $!");
	    open(STDIN, "</dev/null")
		or syslog ('err', "Could not connect stdin to /dev/null: $!");
	    my $v;
	    foreach $v (keys %{$sref->{"ENV"}}) {
	    	$ENV{$v} = $sref->{"ENV"}->{$v};
	    }
	    $ENV{"MON_LAST_SUMMARY"} = $sref->{"_last_summary"};
	    $ENV{"MON_LAST_OUTPUT"} = $sref->{"_last_output"};
	    $ENV{"MON_LAST_FAILURE"} = $sref->{"_last_failure"};
	    $ENV{"MON_FIRST_FAILURE"} = $sref->{"_first_failure"};
	    $ENV{"MON_LAST_SUCCESS"} = $sref->{"_last_success"};
	    $ENV{"MON_STATEDIR"} = $CF{"STATEDIR"};
	    $ENV{"MON_LOGDIR"} = $CF{"LOGDIR"};

	    #
	    # exec only returns on failure.  The child must then exit
	    # whatever syslog returns, otherwise it would carry on
	    # running as a second copy of the server.
	    #
	    exec @args or do {
		syslog ('err', "could not exec '@args': $!");
		exit (1);
	    };
	}

	$sref->{"_last_check"} = scalar (time);

	&debug (1, "watching file handle ", fileno ($fhandles{"$group/$service"}),
	    " for $group/$service\n");

	#
	# set nonblocking I/O and setup bit vector for select(2)
	#
	$fl = fcntl $fhandles{"$group/$service"}, F_GETFL, $fl;
	$fl |= O_NONBLOCK;
	fcntl $fhandles{"$group/$service"}, F_SETFL, $fl;

	vec ($fdset_rbits,
	    fileno($fhandles{"$group/$service"}), 1) = 1;
	$fdset_ebits |= $fdset_rbits;

	#
	# note that this is running
	#
	$running{"$group/$service"} = 1;
	$runningpid{$pid} = "$group/$service";
	$ibufs{"$group/$service"} = "";
	$procs++;
    }

    #
    # set the countdown timer for this service
    #
    if ($sref->{"randskew"} != 0) {
	# interval plus or minus a random 1..randskew seconds
    	$sref->{"_timer"} = $sref->{"interval"} +
	     (int (rand (2)) == 0 ? -int(rand($sref->{"randskew"}) + 1) :
	     	int(rand($sref->{"randskew"})+1));
    } else {
	$sref->{"_timer"} = $sref->{"interval"};
    }
}


#
# randomize the delay before each test
# $CF{"RANDSTART"} is in seconds
#
sub randomize_startdelay {
    #
    # Set the "_timer" of every service in %watch to a random whole
    # number obtained from int (rand ($CF{"RANDSTART"})).
    #
    foreach my $grp (keys %watch) {
	foreach my $svc (keys %{$watch{$grp}}) {
	    my $delay = int (rand ($CF{"RANDSTART"}));
	    $watch{$grp}->{$svc}->{"_timer"} = $delay;
	}
    }

}


#
# return 1 if $val is within $range,
# where $range = "number" or "number-number"
#
sub inRange {
    #
    # Return 1 when $val lies within $range, 0 otherwise.
    # $range is either a single number ("5") or an inclusive span
    # written "low-high"; spaces around the dash are allowed.
    #
    my ($val, $range) = @_;

    # single number
    return 1 if ($range =~ /^(\d+)$/ && $val == $1);

    # "low-high" span, both ends inclusive
    if (my ($low, $high) = $range =~ /^(\d+)\s*-\s*(\d+)$/) {
	return 1 if ($low <= $val && $val <= $high);
    }

    return 0;
}


#
# disable ($cmd==0) or enable a watch
#
sub disen_watch {
    #
    # Disable ($cmd == 0) or enable watch $w by setting its entry in
    # %watch_disabled to 1 or 0.  Returns undef when the watch is not
    # defined, otherwise the value just stored.
    #
    my ($w, $cmd) = @_;

    return undef unless (defined ($watch{$w}));

    $watch_disabled{$w} = ($cmd == 0) ? 1 : 0;
}


#
# disable ($cmd==0) or enable a service
#
sub disen_service {
    #
    # Disable ($cmd == 0) or enable service $s of watch $g by setting
    # its "disable" field to 1 or 0.  Returns undef when the watch or
    # the service is not defined, otherwise the value just stored.
    #
    my ($g, $s, $cmd) = @_;

    # the group is tested first so an unknown group is never created
    return undef unless (defined $watch{$g} && defined $watch{$g}->{$s});

    $watch{$g}->{$s}->{"disable"} = ($cmd == 0) ? 1 : 0;
}


#
# disable ($cmd==0) or enable a host
#
sub disen_host {
    #
    # Disable ($cmd == 0) or enable host $h in every host group.
    # A disabled host is stored in %groups with a leading "*".
    #
    # The host name is matched literally (\Q...\E): the dots in a
    # name such as "a.b" must not act as regex wildcards, or other
    # members of the group would be renamed as well.
    #
    my ($h, $cmd) = @_;

    foreach my $g (keys %groups) {
	# $member aliases the array element, so the edit is in place
	foreach my $member (@{$groups{$g}}) {
	    if ($cmd == 0) {
		$member =~ s/^\Q$h\E$/*$h/;
	    } else {
		$member =~ s/^\*\Q$h\E$/$h/;
	    }
	}
    }
}


#
# save state
#
sub save_state {
    #
    # Write the named pieces of server state to files in
    # $CF{STATEDIR}.  Two names are understood:
    #
    #   disabled  one "disable host|watch|service ..." line for each
    #             disabled host, watch and service
    #   opstatus  one line per service with its op_status and
    #             failure_count
    #
    # Other names are ignored.  A file that cannot be opened is
    # logged and skipped.
    #
    my (@states) = @_;

    foreach my $state (@states) {
	if ($state eq "disabled") {
	    unless (open (STATE, ">$CF{STATEDIR}/disabled")) {
		syslog ("err", "could not write to state file: $!");
		next;
	    }

	    # disabled hosts are the group members with a leading "*"
	    foreach my $group (keys %groups) {
		my @starred = grep (/^\*/, @{$groups{$group}});
		foreach my $host (@starred) {
		    $host =~ s/^\*//;
		    print STATE "disable host $host\n";
		}
	    }

	    foreach my $group (keys %watch) {
		print STATE "disable watch $group\n"
		    if ($watch_disabled{$group} == 1);

		foreach my $service (keys %{$watch{$group}}) {
		    next unless ($watch{$group}->{$service}->{'disable'} == 1);
		    print STATE "disable service $group $service\n";
		}
	    }
	    close (STATE);

	} elsif ($state eq "opstatus") {
	    unless (open (STATE, ">$CF{STATEDIR}/opstatus")) {
		syslog ("err", "could not write to opstatus state file: $!");
		next;
	    }

	    foreach my $group (keys %watch) {
		foreach my $service (keys %{$watch{$group}}) {
		    my $sref = $watch{$group}->{$service};
		    print STATE "group=$group service=$service" .
			" op_status=$sref->{_op_status}" .
			" failure_count=$sref->{_failure_count}" .
			" alert_count=\n";
		}
	    }
	    close (STATE);
	}
    }
}


#
# load state
#
sub load_state {
    #
    # Read saved state back from $CF{STATEDIR}.  Only the "disabled"
    # state is understood: every "disable host|watch|service ..."
    # line in the file is applied through disen_host, disen_watch or
    # disen_service.  Other state names are ignored, and a file that
    # cannot be opened is logged and skipped.
    #
    my (@states) = @_;

    foreach my $state (@states) {
	next unless ($state eq "disabled");

	if (!open (STATE, "$CF{STATEDIR}/disabled")) {
	    syslog ("err", "could not read state file: $!");
	    next;
	}

	while (defined (my $line = <STATE>)) {
	    chomp $line;
	    my ($cmd, $what, $args) = split (/\s+/, $line, 3);

	    next if ($cmd ne "disable");

	    # the disen_* helpers are called without a command argument
	    if ($what eq "host") {
		&disen_host ($args);

	    } elsif ($what eq "watch") {
		if (!defined &disen_watch ($args)) {
		    syslog ("err", "undefined watch reading state file: $line");
		}

	    } elsif ($what eq "service") {
		my ($group, $service) = split (/\s+/, $args, 2);
		if (!defined &disen_service ($group, $service)) {
		    syslog ("err",
			"undefined group or service reading state file: $line");
		}
	    }
	}

	syslog ("info", "state '$state' loaded");
	close (STATE);
    }
}


#
# authenticate a login
#
sub auth {
    #
    # Check a login.  Returns 1 when $plaintext is the password of
    # $user according to the method named by $type, undef otherwise.
    #
    #   getpwnam  compare against the password field from getpwnam()
    #   shadow    not implemented, always fails
    #   userfile  compare against the "user:hash" lines of
    #             $CF{"USERFILE"}
    #
    # A login fails when the user is unknown, when the stored hash
    # is empty, or when crypt() returns nothing.  Without these
    # tests two undefined values would compare as equal and the
    # login would be accepted.
    #
    my ($type, $user, $plaintext) = @_;
    my ($pass, %u, $u, $p);

    if ($user eq "" || $plaintext eq "") {
	# the supplied password is deliberately kept out of the log
	syslog ('err', "an empty username[$user] or password supplied");
    	return undef;
    }

    #
    # standard UNIX passwd
    #
    if ($type eq "getpwnam") {
	(undef, $pass) = getpwnam($user);
	return undef
	    if (!defined $pass || $pass eq "");

	my $hashed = crypt ($plaintext, $pass);
	return undef
	    if (!defined $hashed || $hashed ne $pass);
	return 1;

    #
    # shadow password
    #
    } elsif ($type eq "shadow") {

    #
    # "mon" authentication
    #
    } elsif ($type eq "userfile") {
    	if (!open (U, $CF{"USERFILE"})) {
	    syslog ('err', "could not open user file '$CF{USERFILE}': $!");
	    return undef;
	}
	while (defined (my $line = <U>)) {
	    next if ($line =~ /^\s*#/ || $line =~ /^\s*$/);
	    chomp $line;
	    ($u,$p) = split (/\s*:\s*/, $line, 2);
	    $u{$u} = $p;
	}
	close (U);

	# unknown user, or an entry without a hash
	return undef
	    if (!defined $u{$user} || $u{$user} eq "");

	my $hashed = crypt ($plaintext, $u{$user});
	return undef
	    if (!defined $hashed || $hashed ne $u{$user});
	return 1;

    } else {
    	syslog ('err', "authentication type '$type' not known");
    }

    return undef;
}


#
# load the table of who can do which commands
#
sub load_auth {
    #
    # (Re)load the authorization file $CF{"AUTHFILE"} into three
    # tables:
    #
    #   %AUTHCMDS{command}{user}       = 1         command section
    #   %AUTHTRAPS{host}{user}         = password  trap section
    #   %AUTHSNMPTRAPS{host}{password} = 1         snmp trap section
    #
    # Lines before the first section header belong to the command
    # section.  Trap hosts are stored as dotted-quad strings, or as
    # "*" for "any host".  With a true $startup, errors reported
    # through err_startup are fatal; otherwise they are logged.
    # Returns undef if the file cannot be opened.
    #
    my ($startup) = @_;
    my ($l, $cmd, $users, $u, $host, $user, $password, $sect);

    %AUTHCMDS = ();
    %AUTHTRAPS = ();
    %AUTHSNMPTRAPS = ();
    $sect = "command";

    if (!open (C, $CF{"AUTHFILE"})) {
	&err_startup ($startup, "could not open $CF{AUTHFILE}: $!");
	return undef;
    }

    while (defined ($l = <C>)) {
	next if ($l =~ /^\s*#/ || $l =~ /^\s*$/);
	chomp $l;
	$l =~ s/^\s*//;
	$l =~ s/\s*$//;

	# section headers
	if ($l =~ /^command\s+section/) {
	    $sect = "command";
	    next;
	} elsif ($l =~ /^trap\s+section/) {
	    $sect = "trap";
	    next;
	} elsif ($l =~ /^snmp\s+trap\s+section/) {
	    $sect = "snmptrap";
	    next;
	}

	if ($sect eq "command") {
	    ($cmd, $users) = split (/\s*:\s*/, $l, 2);
	    if (!defined $users) {
		&err_startup ($startup, "could not parse line $. of auth file\n");
		next;
	    }
	    # command names are stored in lower case
	    foreach $u (split (/\s*,\s*/, $users)) {
		$AUTHCMDS{"\L$cmd"}{$u} = 1;
	    }

	} elsif ($sect eq "trap") {
	    if ($l !~ /^(\S+)\s+(\S+)\s+(\S+)$/) {
		syslog ('err', "invalid entry in trap sect of $CF{AUTHFILE}, line $.");
	    	next;
	    }
	    ($host, $user, $password) = ($1, $2, $3);

	    #
	    # "*" allows traps from all hosts and is stored as is;
	    # anything else is resolved and stored as a dotted quad
	    #
	    if ($host ne "*") {
		my $packed;
		if ($host =~ /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/) {
		    $packed = inet_aton ($host);
		} else {
		    $packed = gethostbyname ($host);
		}

		# inet_ntoa croaks on anything but a 4-byte address
		if (!defined ($packed) || length ($packed) != 4) {
		    syslog ('err', "invalid host in $CF{AUTHFILE}, line $.");
		    next;
		}
		$host = inet_ntoa ($packed);
	    }
	    $AUTHTRAPS{$host}{$user} = $password;

	} elsif ($sect eq "snmptrap") {

	    if ($l !~ /^(\S+)\s+(\S+)$/) {
	    	syslog ('err', "invalid line in $CF{AUTHFILE}, line $.");
		next;
	    }

	    ($host, $password) = ($1, $2);
	    $AUTHSNMPTRAPS{$host}{$password} = 1;

	} else {
	    syslog ('err', "unknown section in $CF{AUTHFILE}: $l");
	}
    }
    close (C);
}


#
# return undef if $user isn't permitted to perform $cmd
#
sub check_auth {
    #
    # Return 1 when $user may run $cmd: either the command is open
    # to "all" in %AUTHCMDS or the user is listed for it.  Otherwise
    # the attempt is logged and undef is returned.
    #
    my ($user, $cmd) = @_;

    my $granted = $AUTHCMDS{$cmd}{"all"} ||
	(defined ($user) && $AUTHCMDS{$cmd}{$user});
    return 1 if ($granted);

    syslog ("err", "user '$user' tried '$cmd', not authenticated");
    return undef;
}


#
# reload things
#
sub reload {
    #
    # Reload each item named in @what: "auth" re-reads the
    # authorization table, "oncall" the on-call schedule.  Returns
    # undef as soon as an unknown name is met (earlier items have
    # been reloaded by then), 1 otherwise.
    #
    my (@what) = @_;

    foreach (@what) {
	if ($_ eq "auth") {
	    &load_auth();
	    next;
	}

	if ($_ eq "oncall") {
	    &load_oncall();
	    next;
	}

	return undef;
    }

    1;
}


#
# (re)load the oncall schedule
#
sub load_oncall {
    #
    # (Re)load the on-call schedule from $CF{"OCFILE"}.
    #
    # Each non-blank, non-comment line must read
    #     group service time who
    # where time is a three-letter word followed by h:mm or hh:mm,
    # or "default", or "none".  Group, service and time are
    # lower-cased, and the group and service must already be
    # defined.  Returns 1 on success and undef on the first error;
    # with a true $startup, errors are fatal (see err_startup).
    #
    # NOTE(review): parsed lines are only printed to standard output
    # and %newoncall is never filled in, so %oncall is always left
    # empty.  The parser looks unfinished.
    #
    my ($startup) = @_;
    my ($group, $service, $time, $who, %newoncall);

    if (!open (ONCALL, $CF{"OCFILE"})) {
    	&err_startup ($startup, "could not open $CF{OCFILE}: $!");
	return undef;
    }

    %newoncall = ();
    while (<ONCALL>) {
    	next if (/^\s*$/ || /^\s*#/);
	chomp;
	if (!/^\s* ([a-zA-Z0-9_.-]+) \s+
		([a-zA-Z0-9_.-]+) \s+
		(\w{3} \s+ \d{1,2}:\d\d|default|none) \s+
		(.*) \s*$/xi) {

	    # err_startup takes the message text and logs it itself
	    &err_startup ($startup,
	    	"error in oncall configuration, line $.");
	    close (ONCALL);
	    return undef;
	}

	($group, $service, $time, $who) = ($1, $2, $3, $4);
	$group =~ tr/A-Z/a-z/;
	$service =~ tr/A-Z/a-z/;
	$time =~ tr/A-Z/a-z/;

	if (!defined($groups{$group})) {
	    &err_startup ($startup,
	    	"group $group in oncall line $. not defined in $CF{OCFILE}");
	    close (ONCALL);
	    return undef;
	} elsif (!defined $watch{$group}->{$service}) {
	    &err_startup ($startup,
	    	"service $service in oncall line $. not defined in $CF{OCFILE}");
	    close (ONCALL);
	    return undef;
	}

	print "[$group] [$service] [$time] [$who]\n";
    }
    close (ONCALL);

    %oncall = %newoncall;
    1;
}


sub err_startup {
    #
    # Report an error found while loading a file: fatal (die) when
    # $startup is true, logged to syslog at "err" otherwise.
    #
    my ($startup, $msg) = @_;

    die "$msg\n" if ($startup);

    syslog ('err', $msg);
}


#
# handle SNMP trap
#
sub handle_snmp_trap {
    #
    # Authenticate and decode an SNMP trap datagram.  $buf is the
    # raw packet and $from the packed socket address of the sender.
    # Returns undef when the sender is not listed in %AUTHSNMPTRAPS,
    # when the packet cannot be decoded, or when its community is
    # not authorized for that sender.  Nothing is done with an
    # accepted trap yet.
    #
    # %AUTHSNMPTRAPS{host} is a hash keyed by password (see
    # load_auth), so the community is checked with a hash lookup.
    # NOTE(review): this keeps the original crypt(community,
    # community) form, which assumes the auth file stores the
    # community in that form.  Confirm against the auth file format.
    #
    my ($buf, $from) = @_;
    my ($port, $addr, $fromip);
    my (%traphash);

    ($port, $addr) = sockaddr_in ($from);
    $fromip = inet_ntoa ($addr);

    if (!defined ($AUTHSNMPTRAPS{$fromip})) {
    	syslog ('err', "got SNMP trap from unauthorized agent: $fromip");
	return undef;
    }

    $TRAP_PDU->buffer ($buf);
    %traphash = $TRAP_PDU->decode;

    if (! keys %traphash) {
    	syslog ('err', "error decoding SNMP trap: " . $TRAP_PDU->error);
	return undef;
    }

    # crypt can return undef; treat that as "not authorized"
    my $community = crypt ($traphash{"community"}, $traphash{"community"});
    if (!defined ($community) || !$AUTHSNMPTRAPS{$fromip}{$community}) {
    	syslog ('err', "unauthorized community from agent: $fromip");
	return undef;
    }

    #
    # here's the real meat
    #
}


#
# handle a trap
#
sub handle_trap {
    #
    # Handle one mon trap datagram.  $buf is the packet text, a list
    # of "tag=value" lines (a line without a tag continues the value
    # of the previous tag); $from is the packed socket address of
    # the sender.
    #
    # The sender is checked against %AUTHTRAPS, which load_auth keys
    # by dotted-quad address or "*".  The trap must carry a protocol
    # version, a type and a specific type, and must name a defined
    # group and service.  The service's status is then updated from
    # the specific type, the trap is added to the failure history
    # and do_alert is called.  Returns undef or nothing when the
    # trap is rejected.
    #
    my ($buf, $from) = @_;
    my ($sref, $time, $l, $lasttag);
    my ($port, $addr, $fromip, $noalert, %trap);
    my ($traphost, $trapuser, $trappass);

    $time = time;
    $noalert = 0;
    %trap = ();
    undef $lasttag;

#
# MON-specific tags
# pro	protocol
# aut	auth
# usr	username
# pas	password
# typ	type  ("failure", "up", "startup", "trap", "traptimeout")
# spc	specific type (TRAP_*)
# seq	sequence
# grp	group
# svc	service
# hst	host
# sta	status (opstatus)
# tsp	timestamp as time(2) value
# sum	summary output
# dtl	detail (terminated by \n.\n)
#

    foreach $l (split (/\n/, $buf)) {
    	if ($l =~ /^(\w+)=(.*)/) {
	    # copy the captures; the lines carry no newline after split
	    my ($tag, $value) = ($1, $2);
	    $lasttag = $tag;
	    $trap{$tag} = $value;
	} elsif (defined $lasttag) {
	    $trap{$lasttag} .= "\n$l";
	} else {
	    syslog ('err', "unspecified tag in trap: $l");
	}
    }

    ($port, $addr) = sockaddr_in ($from);
    $fromip = inet_ntoa ($addr);

    #
    # trap authentication
    #
    if (defined ($AUTHTRAPS{"*"})) {
	$traphost = "*";
    } else {
	# the table is keyed by dotted quad, not by packed address
    	$traphost = $fromip;
    }

    #
    # test the host before looking inside its entry: a nested
    # lookup would create $AUTHTRAPS{$traphost} and defeat this test
    #
    if (!defined ($AUTHTRAPS{$traphost})) {
	syslog ('err', "received trap from unauthorized host: $fromip");
    	return undef;
    }

    if (defined ($AUTHTRAPS{$traphost}{"*"})) {
    	$trapuser = "*";
	$trappass = "";
    } else {
    	$trapuser = $trap{"usr"};
	$trappass = $trap{"pas"};
    }

    if ($trapuser ne "*") {
	#
	# an unknown user, an empty stored hash and a failed crypt
	# all count as "not authorized"
	#
	my $stored = defined ($trapuser) ?
	    $AUTHTRAPS{$traphost}{$trapuser} : undef;
	my $hashed = (defined ($stored) && $stored ne "") ?
	    crypt ($trappass, $stored) : undef;

	if (!defined ($hashed) || $hashed ne $stored) {
	    syslog ('err', "received trap from unauthorized user $trapuser, host $traphost");
	    return undef;
	}
    }

    if ($trap{"pro"} < $TRAP_PRO_VERSION) {
    	syslog ('err', "cannot handle traps from version less than $TRAP_PRO_VERSION");
	return undef;
    }

    if (!defined $trap{"typ"} || !defined ($trap{"spc"})) {
	syslog ('err', "no trap type specified from $fromip");
    	return undef;
    }

    if (!defined ($groups{$trap{"grp"}})) {
    	syslog ('err', "trap received for undefined group $trap{grp}");
	return;
    } elsif (!defined $watch{$trap{"grp"}}->{$trap{"svc"}}) {
    	syslog ('err', "trap received for undefined service type $trap{grp}/$trap{svc}");
	return;
    }

    $sref = \%{$watch{$trap{"grp"}}->{$trap{"svc"}}};
    $sref->{"_last_trap"} = $time;

    syslog ('info', "trap $trap{typ} $trap{spc} from " .
    	"$fromip for $trap{grp} $trap{svc}, status $trap{sta}");

    my $group = $trap{"grp"};
    my $service = $trap{"svc"};

    #
    # status before this trap; the "first failure" time is only
    # moved when the service was not already failing
    #
    my $prev_status = $sref->{"_op_status"};

    #
    # Not sure what I want to do with this. It's not done, and
    # just because it's here doesn't mean that it is meant to work
    # how it is coded.
    #
    if (1) {
	if ($trap{"spc"} == $STAT_COLDSTART) {
	    set_op_status ($group, $service, $STAT_COLDSTART);
	    $sref->{"_trap_duration_timer"} = $sref->{"trapduration"}
		if ($sref->{"trapduration"});

	} elsif ($trap{"spc"} == $STAT_WARMSTART) {
	    set_op_status ($group, $service, $STAT_WARMSTART);
	    $sref->{"_trap_duration_timer"} = $sref->{"trapduration"}
		if ($sref->{"trapduration"});
	    $sref->{"_last_uptrap"} = $time;

	} elsif ($trap{"spc"} == $STAT_LINKDOWN) {
	    set_op_status ($group, $service, $STAT_LINKDOWN);
	    $sref->{"_failure_count"}++;
	    $sref->{"_first_failure"} = $time if ($prev_status != $STAT_FAIL);
	    $sref->{"_trap_duration_timer"} = $sref->{"trapduration"}
		if ($sref->{"trapduration"});

	} elsif ($trap{"spc"} == $STAT_OK) {
	    set_op_status ($group, $service, $STAT_OK);
	    $sref->{"_last_uptrap"} = $time;
	    $sref->{"_trap_duration_timer"} = $sref->{"trapduration"}
		if ($sref->{"trapduration"});

	} elsif ($trap{"spc"} == $STAT_FAIL) {
	    set_op_status ($group, $service, $STAT_FAIL);
	    $sref->{"_first_failure"} = $time if ($prev_status != $STAT_FAIL);
	    $sref->{"_trap_duration_timer"} = $sref->{"trapduration"}
	    	if ($sref->{"trapduration"});

	} elsif ($trap{"spc"} == $STAT_WARN) {
	    set_op_status ($group, $service, $STAT_WARN);

# 	} elsif ($trap{"spc"} == $STAT_HEARTBEAT) {
# 	    set_op_status ($group, $service, $STAT_OK);
# 	    $sref->{"_last_uptrap"} = $time;
# 	    $noalert++;

	} else {
	    syslog ('err', "trap received from $fromip" .
		    " for undefined type $trap{typ} $trap{spc} $trap{grp}");
	    return;
	}
    }

    # the history entry carries the time the trap was received
    shift @last_failures if (@last_failures > $CF{"MAX_KEEP"});
    push @last_failures, "$trap{grp} $trap{svc}" .
	" $time $trap{typ} $trap{spc} $trap{sum}";

    &do_alert ($trap{"grp"}, $trap{"svc"},
    	$trap{"sum"} . $trap{"dtl"}, $trap{"sta"}, $FL_TRAP) unless ($noalert);
}


#
# trap timeout
#
# Handles a trap timeout for $group/$service: counts the failure and
# stamps its time on the service, sets the operational status to
# $STAT_FAIL, appends a "group service time summary" entry to
# @last_failures, logs it at 'crit' and calls do_alert with the
# $FL_TRAPTIMEOUT flag. Returns whatever do_alert returns.
#
sub handle_trap_timeout {
    my ($group, $service) = @_;

    my $tmnow = time;
    my $sref = \%{$watch{$group}->{$service}};

    $sref->{"_failure_count"}++;
    $sref->{"_last_failure"} = $tmnow;

    #
    # the start of the outage is only stamped when the service was not
    # already failing, so this has to be tested before the status is
    # overwritten by set_op_status
    #
    $sref->{"_first_failure"} = $tmnow if ($sref->{"_op_status"} != $STAT_FAIL);
    set_op_status ($group, $service, $STAT_FAIL);
    $sref->{"_last_summary"} = "trap timeout";

    #
    # the failure record carries the timestamp taken at the top of
    # this sub
    #
    shift @last_failures if (@last_failures > $CF{"MAX_KEEP"});
    push @last_failures, "$group $service $tmnow $sref->{_last_summary}";
    syslog ('crit', "failure for $last_failures[-1]");

    &do_alert ($group, $service, undef, undef, $FL_TRAPTIMEOUT);
}


#
# dependency check
#
# Evaluates the "depend" expression of $group/$service. Each
# "group:service" token found in the expression is checked recursively
# and replaced in the expression by the status that check returned, or
# by the string 'undef' when that status is $STAT_DEPEND,
# $STAT_UNTESTED or $STAT_UNKNOWN, or when the dependency's latest
# result is not newer than this service's latest result. The rewritten
# expression is then eval'ed and its value returned.
#
# $depth is supplied by the recursion; it is treated as 0 when omitted.
#
# A service without a "depend" setting returns its own _op_status. So
# does a service reached through recursion ($depth > 0) whose status is
# $STAT_OK, $STAT_DEPEND or $STAT_UNKNOWN.
#
# return -1, if "infinite" loop type is "O"
# return undef, for unknown status of dependent service
#      or a loop type of -O
# return 0, if dependent service failed
# return op_status > 0, if dependent service is successful
#
sub checkDepend {
    my ($group, $service, $depth) = @_;

    my $sref = \%{$watch{$group}->{$service}};
    my $dsref = \%{$watch{$group}->{$service}};
    my $depend = $dsref->{'depend'};
    my $dstatus = $dsref->{'_op_status'};
    chomp $depend;

    #
    # MAKE THIS IGNORE DISABLED SERVICES AND WATCHES
    #

    #
    # time of this service's most recent result. This must be a
    # lexical: the recursive calls below compute the same value for
    # the services they inspect, and a shared variable would hold the
    # dependency's time (not ours) by the time it is compared.
    #
    my $dlastChecked;
    if ($dsref->{'_last_success'} > $dsref->{'_last_failure'}) {
        $dlastChecked = $dsref->{'_last_success'};
    } else {
        $dlastChecked = $dsref->{'_last_failure'};
    }

    #
    # NOTE(review): @traverse is created afresh on every call, so it
    # never holds more than the entry pushed below and the loop check
    # further down cannot find a repeated entry. Detecting real cycles
    # would need the path to be carried through the recursion.
    #
    my @traverse = ();
    if (!$depth) {
        $depth = 0;
        undef @traverse;
    }
    push (@traverse, "$group.$service");    # -O,

    #
    # we have reached a watch:service without any dependencies, or,
    # below the top level, one whose op_status is already known.
    # return it.
    #
    if (!defined $depend || (
            ($dstatus == $STAT_OK || $dstatus == $STAT_DEPEND ||
             $dstatus == $STAT_UNKNOWN) && $depth > 0)) {
        return $dstatus;
    }

    #
    # check for loops in the dependency
    #
    for (my $i = 0; $i < @traverse; $i++) {
        for (my $j = $i + 1; $j < @traverse; $j++) {
            #
            # is there a loop?
            #
            if ($traverse[$i] eq $traverse[$j]) {
                #
                # -O loop
                #
                if ($j > 2 && $i > 0) {
                    #
                    # uninitialize op status for type -O
                    #
                    set_op_status ($group, $service, $STAT_DEPEND)
                        if ($sref->{"_op_status"} == $STAT_OK);
                    return undef;

                #
                # O loop
                #
                } else {
                    return -1;
                }
            }
        }
    }

    #
    # recursively evaluate the dependencies
    #
    my @dservices = $depend =~ /[a-zA-Z0-9_.-]+:[a-zA-Z0-9_.-]+/g;
    foreach my $str (@dservices) {
        my ($sublastChecked, $subsref);
        next if ($str =~ /^\d+$/);
        next if ($str =~ /^$/);

        my ($dgroup, $dservice) = split (':', $str);

        $subsref = \%{$watch{$dgroup}->{$dservice}};

        if ($subsref->{'_last_success'} > $subsref->{'_last_failure'}) {
            $sublastChecked = $subsref->{'_last_success'};
        } else {
            $sublastChecked = $subsref->{'_last_failure'};
        }

        #
        # do it recursively
        #
        $dstatus = &checkDepend ($dgroup, $dservice, ++$depth);

        #
        # this is bad,  break deadlock
        #
        return $dstatus if ($dstatus < 0);

        $dstatus = 'undef' if ($dstatus == $STAT_DEPEND ||
            $dstatus == $STAT_UNTESTED || $dstatus == $STAT_UNKNOWN);

        #
        # This is a dependency chain A<-B<-C where A or B was not
        # checked BEFORE C. In this case we just set the result to
        # undef. The better way would be to force a check of the
        # other services, i.e. of A and B.
        #
        $dstatus = 'undef' if ($dlastChecked >= $sublastChecked);

        #
        # substitute the status for the token. The token is quoted so
        # that the "." allowed in names only matches itself.
        #
        $depend =~ s/^\Q${str}\E([^[\w\._\-:]*)/${dstatus}$1/g;          # head sub
        $depend =~ s/([^\w\._\-:]+)\Q${str}\E([^[\w\._\-:]+)/$1${dstatus}$2/g;
        $depend =~ s/([^\w\._\-:]+)\Q${str}\E$/$1${dstatus}/g;           # trail sub
    }

    #
    # NOTE(review): $depend comes from the service's "depend" setting
    # and is evaluated as Perl code here.
    #
    return eval ($depend);
}


#
# write to a socket
#
# Pushes all of $buf out to $sock with syswrite, continuing after short
# writes until nothing is left. A write that fails with EAGAIN is
# retried after a 100000 microsecond pause; any other failure makes the
# sub return undef right away.
#
sub sock_write {
    my ($sock, $buf) = @_;

    my $offset = 0;
    my $remaining = length ($buf);

    while ($remaining) {
        my $sent = syswrite ($sock, $buf, $remaining, $offset);

        if (defined ($sent)) {
            # partial or complete write: step past what went out
            $offset += $sent;
            $remaining -= $sent;
            next;
        }

        return undef if ($! != EAGAIN);
        usleep (100000);
    }
}


#
# do I/O processing for traps and client connections
#
# Selects on the mon trap socket, the client listen socket, the SNMP
# trap socket (only when $CF{"SNMP"} is set) and every entry of
# %clients, and services whatever is readable:
#
#   - a mon trap is received and passed to handle_trap
#   - an SNMP trap is received and passed to handle_snmp_trap
#   - a pending connection is handed to client_accept
#   - client data is appended to that client's buffer, after which
#     client_dopending is run
#
# The loop ends when select returns 0, when select fails with anything
# other than EINTR, or when $SLEEPINT seconds have elapsed after a
# round of client processing. Finally clients that have been idle for
# $CF{"CLIENT_TIMEOUT"} seconds or more are closed.
#
sub handle_io {

    #
    # build iovec for server connections, traps, and clients
    #
    $iovec = '';
    my $niovec = '';
    vec ($iovec, fileno (TRAPSERVER), 1) = 1;
    vec ($iovec, fileno (SERVER), 1) = 1;
    vec ($iovec, fileno (SNMPSERVER), 1) = 1 if ($CF{"SNMP"});
    foreach my $cl (keys %clients) {
        vec ($iovec, $cl, 1) = 1;
    }

    #
    # handle client I/O while there is some to handle
    #
    my $sleep = $SLEEPINT;
    my $tm0 = [gettimeofday];
    my $n;
    while ($n = select ($niovec = $iovec, undef, undef, $sleep)) {
        #
        # select reports failure as -1, which is true, so errors end
        # up in here. $niovec is not trustworthy in that case and
        # must not be used to pick handles to read from.
        #
        if ($n < 0) {
            if ($! != EINTR) {
                syslog ('err', "select returned an error for I/O loop: $!");
                last;
            }

            #
            # interrupted by a signal: wait again, but only for
            # what is left of this interval
            #
            my $elapsed = tv_interval ($tm0, [gettimeofday]);
            last if ($elapsed >= $SLEEPINT);
            $sleep = $SLEEPINT - $elapsed;
            next;
        }

        my $tm1 = [gettimeofday];

        #
        # mon trap
        #
        if (vec ($niovec, fileno (TRAPSERVER), 1)) {
            if (!defined ($from = recv (TRAPSERVER, $trapbuf, 65536, 0))) {
                syslog ('err', "error trying to recv a trap: $!");
            } else {
                &handle_trap ($trapbuf, $from);
            }
            next;

        #
        # SNMP trap
        #
        } elsif ($CF{"SNMP"} && vec ($niovec, fileno (SNMPSERVER), 1)) {
            my ($from, $trapbuf);
            if (!defined ($from = recv (SNMPSERVER, $trapbuf, 65536, 0))) {
                syslog ('err', "error trying to recv an SNMP trap: $!");
            } else {
                &handle_snmp_trap ($trapbuf, $from);
            }
            next;

        #
        # client connections
        #
        } elsif (vec ($niovec, fileno (SERVER), 1)) {
            &client_accept ();
        }

        #
        # read data from clients if any exists
        #
        if ($numclients) {
            foreach my $cl (keys %clients) {
                next if (!vec ($niovec, $cl, 1));

                my $buf = '';
                my $nread = sysread ($clients{$cl}{"fhandle"}, $buf, 8192);

                #
                # undef has to be tested first: it compares equal to
                # 0 numerically and would be mistaken for end of file
                #
                if (!defined ($nread)) {
                    # nothing to read after all; try again next round
                    next if ($! == EAGAIN || $! == EINTR);
                    &client_close ($cl, "read error: $!");
                } elsif ($nread == 0) {
                    # end of file, the client went away
                    &client_close ($cl);
                } else {
                    $clients{$cl}{"buf"} .= $buf;
                    $clients{$cl}{"timeout"} = $CF{"CLIENT_TIMEOUT"};
                    $clients{$cl}{"last_read"} = time;
                }
            }
        }

        #
        # execute client commands which have been read
        #
        &client_dopending () if ($numclients);
        last if (tv_interval ($tm0, $tm1) >= $SLEEPINT);
        $sleep = $SLEEPINT - tv_interval ($tm0, $tm1);
    }

    if (!defined ($n)) {
        syslog ('err', "select returned an error for I/O loop: $!");
    }

    #
    # count down client inactivity timeouts and close expired connections
    #
    if ($numclients) {
        foreach my $cl (keys %clients) {
            $clients{$cl}{"timeout"} = time - $clients{$cl}{"last_read"};
            if ($clients{$cl}{"timeout"} >= $CF{"CLIENT_TIMEOUT"}) {
                &client_close ($cl, "timeout after $CF{CLIENT_TIMEOUT}s");
            }
        }
    }
}


#
# generate alert and monitor path hashes
#
# Rebuilds %MONITORHASH and %ALERTHASH from scratch. Every monitor named
# in %watch is looked up in the directories of $CF{"SCRIPTDIR"}, and
# every alert, upalert and startupalert of every period in the
# directories of $CF{"ALERTDIR"} (both are ":" separated lists). The
# first directory holding an executable file of that name wins and the
# full path is stored under the program's name. Unusable directories
# and programs that cannot be found are reported through syslog.
#
sub gen_scriptdir_hash {
    my (@scriptdirs, @alertdirs);

    %MONITORHASH = ();
    %ALERTHASH = ();

    foreach my $d (split (/\s*:\s*/, $CF{"SCRIPTDIR"})) {
        if (-d $d && -x $d) {
            push (@scriptdirs, $d);
        } else {
            syslog ('err', "scriptdir $d is not usable");
        }
    }

    foreach my $d (split (/\s*:\s*/, $CF{"ALERTDIR"})) {
        if (-d $d && -x $d) {
            push (@alertdirs, $d);
        } else {
            syslog ('err', "alertdir $d is not usable");
        }
    }

    #
    # monitors
    #
    foreach my $group (keys %watch) {
        foreach my $service (keys %{$watch{$group}}) {
            next if (!defined $watch{$group}->{$service}->{"monitor"});

            # the program name is the first word of the monitor line
            my $monitor = (split (/\s+/, $watch{$group}->{$service}->{"monitor"}))[0];
            my $found = 0;
            foreach my $dir (@scriptdirs) {
                if (-x "$dir/$monitor") {
                    $MONITORHASH{$monitor} = "$dir/$monitor"
                        unless (defined $MONITORHASH{$monitor});
                    $found++;
                    last;
                }
            }
            if (!$found) {
                syslog ('err', "$monitor not found in one of (" .
                    join (":", @scriptdirs) . ")");
            }
        }
    }

    #
    # alerts
    #
    foreach my $group (keys %watch) {
        foreach my $service (keys %{$watch{$group}}) {
            foreach my $period (keys %{$watch{$group}->{$service}->{"periods"}}) {
                foreach my $my_alert (
                        @{$watch{$group}->{$service}->{"periods"}->{$period}->{"alerts"}},
                        @{$watch{$group}->{$service}->{"periods"}->{$period}->{"upalerts"}},
                        @{$watch{$group}->{$service}->{"periods"}->{$period}->{"startupalerts"}},
                            ) {
                    #
                    # work on a copy so the configured alert line is
                    # left alone; strip leading "name=value " words
                    # and everything after the program name
                    #
                    my $alert = $my_alert;
                    $alert =~ s/^(\S+=\S+ )*(\S+).*$/$2/;
                    my $found = 0;
                    foreach my $dir (@alertdirs) {
                        if (-x "$dir/$alert") {
                            $ALERTHASH{$alert} = "$dir/$alert"
                                unless (defined $ALERTHASH{$alert});
                            $found++;
                            last;
                        }
                    }
                    if (!$found) {
                        syslog ('err', "$alert not found in one of (" .
                            join (":", @alertdirs) . ")");
                    }
                }
            }
        }
    }

}


#
# do some processing on dirs
#
# Turns the path settings in %CF into absolute paths and reports, via
# syslog, the ones that are expected to exist but do not:
#
#   STATEDIR, LOGDIR            relative to BASEDIR
#   AUTHFILE, OCFILE, USERFILE  relative to CFBASEDIR (USERFILE only
#                               when AUTHTYPE is "userfile")
#   DTLOGFILE, HISTORICFILE     relative to LOGDIR (HISTORICFILE only
#                               when it is not empty)
#   SCRIPTDIR, ALERTDIR         ":" separated lists, each entry
#                               relative to BASEDIR
#
sub normalize_paths {
    # a path that already starts with "/" is kept, anything else is
    # placed below $base
    my $anchor = sub {
        my ($base, $path) = @_;
        return ($path =~ m{^/}) ? $path : "$base/$path";
    };

    #
    # do some sanity checks on dirs
    #
    $CF{"STATEDIR"} = $anchor->($CF{"BASEDIR"}, $CF{"STATEDIR"});
    unless (-d $CF{"STATEDIR"}) {
        syslog ('err', "$CF{STATEDIR} does not exist");
    }

    $CF{"LOGDIR"} = $anchor->($CF{"BASEDIR"}, $CF{"LOGDIR"});
    unless (-d $CF{"LOGDIR"}) {
        syslog ('err', "$CF{LOGDIR} does not exist");
    }

    $CF{"AUTHFILE"} = $anchor->($CF{"CFBASEDIR"}, $CF{"AUTHFILE"});
    unless (-f $CF{"AUTHFILE"}) {
        syslog ('err', "$CF{AUTHFILE} does not exist");
    }

    $CF{"OCFILE"} = $anchor->($CF{"CFBASEDIR"}, $CF{"OCFILE"});

    if ($CF{"AUTHTYPE"} eq "userfile") {
        $CF{"USERFILE"} = $anchor->($CF{"CFBASEDIR"}, $CF{"USERFILE"});
        unless (-f $CF{"USERFILE"}) {
            syslog ('err', "$CF{USERFILE} does not exist");
        }
    }

    # LOGDIR is absolute by now, so the log files end up absolute too
    $CF{"DTLOGFILE"} = $anchor->($CF{"LOGDIR"}, $CF{"DTLOGFILE"});

    if ($CF{"HISTORICFILE"} ne "") {
        $CF{"HISTORICFILE"} = $anchor->($CF{"LOGDIR"}, $CF{"HISTORICFILE"});
    }

    #
    # script and alert dirs may have multiple paths
    #
    foreach my $setting ("SCRIPTDIR", "ALERTDIR") {
        my @resolved = ();
        foreach my $entry (split (/\s*:\s*/, $CF{$setting})) {
            my $path = $entry;
            $path =~ s{/$}{};
            $path = $anchor->($CF{"BASEDIR"}, $path);
            if (! -d $path) {
                syslog ('err', "$path does not exist, check your alertdir and mondir paths");
            }
            push (@resolved, $path);
        }
        $CF{$setting} = join (":", @resolved);
    }
}


#
# set opstatus and save old status
#
# Stores $status as the "_op_status" of $group/$service in %watch after
# copying the value it replaces into "_last_op_status". Returns $status.
#
sub set_op_status {
    my ($group, $service, $status) = @_;

    my $entry = \%{$watch{$group}->{$service}};

    # the right-hand side is read before either slot is written, so
    # the old status is captured before it is replaced
    @{$entry}{"_last_op_status", "_op_status"} = ($entry->{"_op_status"}, $status);

    return $status;
}


#
# print the path related settings of %CF to the currently selected
# output handle, followed by one "M name=[path]" line per entry of
# %MONITORHASH and one "A name=[path]" line per entry of %ALERTHASH
#
sub debug_dir {
    #
    # label, including the padding that lines the values up, and the
    # %CF key printed next to it. An undef entry stands for the empty
    # separator line.
    #
    my @layout = (
        ["basedir\t",    "BASEDIR"],
        ["cfbasedir\t",  "CFBASEDIR"],
        undef,
        ["cf\t\t",       "CF"],
        ["statedir\t",   "STATEDIR"],
        ["logdir\t",     "LOGDIR"],
        ["authfile\t",   "AUTHFILE"],
        ["ocfile\t",     "OCFILE"],
        ["userfile\t",   "USERFILE"],
        ["dtlogfile\t",  "DTLOGFILE"],
        ["historicfile", "HISTORICFILE"],
        ["scriptdir\t",  "SCRIPTDIR"],
        ["alertdir\t",   "ALERTDIR"],
    );

    my $report = "";
    foreach my $row (@layout) {
        if (defined $row) {
            my ($label, $key) = @{$row};
            $report .= "    " . $label . "[" . $CF{$key} . "]\n";
        } else {
            $report .= "\n";
        }
    }
    print $report;

    foreach my $name (keys %MONITORHASH) {
        print "M $name=[$MONITORHASH{$name}]\n";
    }
    foreach my $name (keys %ALERTHASH) {
        print "A $name=[$ALERTHASH{$name}]\n";
    }
}


#
# globals affected by config file are
# all stored in %CF
#
# Loads the built-in defaults into %CF. The -b, -B, -c and -L command
# line options ($opt_b, $opt_B, $opt_c, $opt_L) take the place of the
# defaults for BASEDIR, CFBASEDIR, CF and LOGDIR. Keys of %CF that are
# not listed here are left alone.
#
sub init_cf_globals {
    #
    # locations which may come from the command line
    #
    my $basedir = $opt_b || "/usr/lib/mon";
    $basedir =~ s{/$}{};

    my $cfbasedir = $opt_B || "/etc/mon";

    # a relative config file name is taken relative to $PWD
    my $cf = $opt_c || "$cfbasedir/mon.cf";
    $cf = "$PWD/$cf" unless ($cf =~ /^\//);

    #
    # use the first of the usual system directories which exists on
    # this machine, with a fallback when none of them does
    #
    my ($statedir) = grep { -d $_ } ("/var/state/mon", "/var/lib/mon");
    $statedir = "state.d" if (!defined $statedir);

    my ($piddir) = grep { -d $_ } ("/var/run/mon", "/var/run");
    $piddir = "/etc" if (!defined $piddir);

    my %default = (
        "BASEDIR"        => $basedir,
        "CFBASEDIR"      => $cfbasedir,
        "CF"             => $cf,
        "SCRIPTDIR"      => "mon.d",
        "ALERTDIR"       => "alert.d",
        "LOGDIR"         => $opt_L || "log.d",
        "STATEDIR"       => $statedir,
        "AUTHFILE"       => "auth.cf",
        "AUTHTYPE"       => "getpwnam",
        "USERFILE"       => "monusers.cf",
        "OCFILE"         => "oncall.cf",
        "PIDFILE"        => $piddir . "/mon.pid",
        "DTLOGFILE"      => "downtime.log",
        "DTLOGGING"      => 0,
        "MAX_KEEP"       => 100,
        "CLIENT_TIMEOUT" => 30,
        "SERVPORT"       => (scalar (getservbyname ("mon", "tcp")) || 2583),
        "TRAPPORT"       => (scalar (getservbyname ("mon", "udp")) || 2583),
        "MAXPROCS"       => 0,
        "SNMP"           => 0,
        "SNMPPORT"       => 34000,
    );

    foreach my $key (keys %default) {
        $CF{$key} = $default{$key};
    }

    # no alert history file and no history time limit by default
    $CF{"HISTORICFILE"} = "";
    $CF{"HISTORICTIME"} = 0;
}


#
# globals not affected by config file
#
# Sets the protocol versions, the scheduler state, host and OS
# information, the $FL_* alert flags, the $TRAP_* trap types, the
# $STAT_* operational statuses with their classification hashes, empty
# monitor/alert lookup hashes, and a Mon::SNMP object in $TRAP_PDU.
#
sub init_globals {
    $TRAP_PRO_VERSION = 0.3807;
    $SLEEPINT = 1;
    $STOPPED = 0;
    $STOPPED_TIME = 0;
    $START_TIME = time;
    $PROT_VERSION = "0.38.9";
    $HOSTNAME = hostname ();

    # fall back to "Unknown" when uname prints nothing
    $OS = `uname -s 2>/dev/null` || "Unknown";
    chomp $OS;
    $PWD = getcwd ();

    #
    # flags, one bit each so that they can be or'ed together
    #
    ($FL_MONITOR, $FL_UPALERT, $FL_TRAP, $FL_TRAPTIMEOUT,
        $FL_STARTUPALERT, $FL_TEST) = map { 1 << $_ } (0 .. 5);

    #
    # specific trap types
    #
    $TRAP_COLDSTART       = 0;
    $TRAP_WARMSTART       = 1;
    $TRAP_LINKDOWN        = 2;
    $TRAP_LINKUP          = 3;
    $TRAP_AUTHFAIL        = 4;
    $TRAP_EGPNEIGHBORLOSS = 5;
    $TRAP_ENTERPRISE      = 6;
    $TRAP_HEARTBEAT       = 7;

    #
    # operational statuses
    #
    $STAT_FAIL      = 0;
    $STAT_OK        = 1;
    $STAT_COLDSTART = 2;
    $STAT_WARMSTART = 3;
    $STAT_LINKDOWN  = 4;
    $STAT_UNKNOWN   = 5;
    $STAT_TIMEOUT   = 6;
    $STAT_UNTESTED  = 7;
    $STAT_DEPEND    = 8;
    $STAT_WARN      = 9;

    #
    # classification of the statuses, keyed by status value
    #
    %FAILURE = map { $_ => 1 } ($STAT_FAIL, $STAT_LINKDOWN, $STAT_TIMEOUT);
    %SUCCESS = map { $_ => 1 } ($STAT_OK);
    %WARNING = map { $_ => 1 } ($STAT_COLDSTART, $STAT_WARMSTART,
        $STAT_UNKNOWN, $STAT_DEPEND, $STAT_WARN);

    #
    # status names and the status each one stands for
    #
    %OPSTAT = (
        "fail"       => $STAT_FAIL,
        "ok"         => $STAT_OK,
        "coldstart"  => $STAT_COLDSTART,
        "warmstart"  => $STAT_WARMSTART,
        "linkdown"   => $STAT_LINKDOWN,
        "unknown"    => $STAT_UNKNOWN,
        "timeout"    => $STAT_TIMEOUT,
        "untested"   => $STAT_UNTESTED,
        "dependency" => $STAT_DEPEND,
    );

    #
    # fast lookup hashes for alerts and monitors
    #
    %MONITORHASH = ();
    %ALERTHASH = ();

    $TRAP_PDU = Mon::SNMP->new;
}


#
# clear timers
#
# Reloads the running timers of $group/$service from their configured
# values and zeroes the alert bookkeeping of each of its periods. Only
# items whose controlling setting is set (true) are touched. Returns
# undef when the service is not defined in %watch.
#
sub clear_timers {
    my ($group, $service) = @_;

    return undef unless (defined $watch{$group}->{$service});

    my $sref = \%{$watch{$group}->{$service}};

    # running timer => the setting it is reloaded from
    my %reload_from = (
        "_trap_timer"          => "traptimeout",
        "_trap_duration_timer" => "trapduration",
        "_timer"               => "interval",
    );

    foreach my $timer (keys %reload_from) {
        my $configured = $sref->{$reload_from{$timer}};
        $sref->{$timer} = $configured if ($configured);
    }

    # per-period counter => the setting which makes it relevant
    my %zeroed_when = (
        "_last_alert"      => "alertevery",
        "_consec_failures" => "alertafter_consec",
        "_1stfailtime"     => "alertafterival",
    );

    foreach my $period (keys %{$sref->{"periods"}}) {
        my $pref = \%{$sref->{"periods"}->{$period}};

        foreach my $counter (keys %zeroed_when) {
            $pref->{$counter} = 0 if ($pref->{$zeroed_when{$counter}});
        }
    }
}


#
# load some amount of the alert history into memory
#
# Appends the lines of $CF{"HISTORICFILE"} to @last_alerts, skipping
# blank lines and "#" comments. When $CF{"HISTORICTIME"} is not 0, only
# lines whose fourth whitespace separated field (an epoch time) lies
# within that many seconds of now are taken. Afterwards @last_alerts is
# cut down from the front to its last $CF{"MAX_KEEP"} entries. Does
# nothing when no history file is configured; a file that cannot be
# opened is reported through syslog.
#
sub readhistoricfile {
    my $histfile = $CF{"HISTORICFILE"};
    return if ($histfile eq "");

    unless (open (HISTFILE, $histfile)) {
        syslog ('err',  "Could not read history from $CF{HISTORICFILE} : $!");
        return;
    }

    # oldest alert time which is still of interest, 0 meaning no limit
    my $epochLimit = ($CF{"HISTORICTIME"} != 0)
        ? time - $CF{"HISTORICTIME"}
        : 0;

    while (<HISTFILE>) {
        next if (/^\s*#/ || /^\s*$/);
        chomp;

        my (undef, undef, undef, $epochAlert) = split (/\s+/);
        next unless ($epochAlert >= $epochLimit);

        push (@last_alerts, $_);
    }

    close (HISTFILE);

    #
    # drop the surplus from the front. When there is no surplus the
    # length is zero or negative and splice removes nothing.
    #
    if (defined $CF{"MAX_KEEP"}) {
        splice (@last_alerts, 0, scalar (@last_alerts) - $CF{"MAX_KEEP"});
    }
}


#
# This routine simply calls an alert.
#
# call with %args = (
#       group		=> "name of group",
#       service		=> "name of service",
#       pref		=> "optional period reference",
#	alert		=> "alert script",
#	args		=> "args to alert script",
# 	flags		=> "flags, as in $FL_*",
#	retval		=> "return value of monitor",
#	output		=> "output of monitor",
# )
#
# group, service, flags, retval, alert and output are mandatory.
#
# The alert program is looked up in %ALERTHASH and run in a forked
# child with the MON_* environment variables set, "-s service -g group
# -h hosts -t time" plus the alert's own args on its command line, and
# the monitor output on its standard input. The parent waits for the
# alert to finish and then records it: in the period's _last_alert (when
# pref is given), in the service's _alert_count, in @last_alerts, and in
# $CF{"HISTORICFILE"} when that is set. Test alerts ($FL_TEST) are run
# but not recorded.
#
# Returns 1 once the alert has been run. Returns undef when a mandatory
# argument is missing, the alert program is not known, or the fork
# fails.
#
sub call_alert {
    my (%args) = @_;

    foreach my $mandatory_arg (qw(
                group service flags
                retval alert output
            )) {
        return (undef) if (!defined $args{$mandatory_arg});
    }

    # hosts marked with a leading "*" are not passed to the alert
    my @groupargs = grep (!/^\*/, @{$groups{$args{"group"}}});

    my $tmnow = time;

    # the summary is the first line of the monitor output
    my ($summary) = split("\n", $args{"output"});
    $summary = "(NO SUMMARY)" if ($summary =~ /^\s*$/m);

    my $sref = \%{$watch{$args{"group"}}->{$args{"service"}}};

    my $alert = "";
    if (!defined $ALERTHASH{$args{"alert"}} ||
            ! -f $ALERTHASH{$args{"alert"}}) {
        syslog ('err', "no alert found while trying to run $args{alert}");
        return undef;
    } else {
        $alert = $ALERTHASH{$args{"alert"}};
    }

    my $alerttype = "";           # sent to syslog and stored in @last_alerts
    my $alert_type = "failure";   # MON_ALERTTYPE set to this
    if ($args{"flags"} & $FL_UPALERT) {
        $alerttype = "upalert";
        $alert_type = "up";
    } elsif ($args{"flags"} & $FL_STARTUPALERT) {
        $alerttype = "startupalert";
        $alert_type = "startup";
    } elsif ($args{"flags"} & $FL_TRAPTIMEOUT) {
        $alerttype = "traptimeoutalert";
        $alert_type = "traptimeout";
    } elsif ($args{"flags"} & $FL_TRAP) {
        $alerttype = "trapalert";
        $alert_type = "trap";
    } elsif ($args{"flags"} & $FL_TEST) {
        $alerttype = "testalert";
        $alert_type = "test";
    } else {
        $alerttype = "alert";
    }

    #
    # log why we are triggering an alert
    #
    syslog ("alert", "calling $alerttype $alert for" .
        " $args{group}/$args{service} ($alert,$args{args}) $summary");

    #
    # fork with the child's standard input connected to ALERT
    #
    my $pid = open (ALERT, "|-");
    if (!defined $pid) {
        syslog ('err', "could not fork: $!");
        return undef;
    }

    #
    # child, the actual alert
    #
    if ($pid == 0) {
        #
        # set env variables to pass to the alert
        #
        foreach my $v (keys %{$sref->{"ENV"}}) {
            $ENV{$v} = $sref->{"ENV"}->{$v};
        }

        $ENV{"MON_LAST_SUMMARY"}	= $sref->{"_last_summary"};
        $ENV{"MON_LAST_OUTPUT"}		= $sref->{"_last_output"};
        $ENV{"MON_LAST_FAILURE"}	= $sref->{"_last_failure"};
        $ENV{"MON_FIRST_FAILURE"}	= $sref->{"_first_failure"};
        $ENV{"MON_LAST_SUCCESS"}	= $sref->{"_last_success"};
        $ENV{"MON_DESCRIPTION"}		= $sref->{"description"};
        $ENV{"MON_GROUP"}		= $args{"group"};
        $ENV{"MON_SERVICE"}		= $args{"service"};
        $ENV{"MON_RETVAL"}		= $args{"retval"};
        $ENV{"MON_OPSTATUS"}		= $sref->{"_op_status"};
        $ENV{"MON_ALERTTYPE"}		= $alert_type;
        $ENV{"MON_STATEDIR"}		= $CF{"STATEDIR"};
        $ENV{"MON_LOGDIR"}		= $CF{"LOGDIR"};

        # later flags take precedence when more than one is set
        my $t;
        $t = "-u" if ($args{"flags"} & $FL_UPALERT);
        $t = "-T" if ($args{"flags"} & $FL_TRAP);
        $t = "-O" if ($args{"flags"} & $FL_TRAPTIMEOUT);

        my @execargs = (
            $alert,
            "-s", "$args{service}",
            "-g", "$args{group}",
            "-h", "@groupargs",
            "-t", "$tmnow",
        );

        if ($t) {
            push @execargs, $t;
        }

        if ($args{"args"} ne "") {
            push @execargs, quotewords('\s+',0,$args{"args"});
        }

        if (!exec @execargs) {
            syslog ('err', "could not exec alert $alert: $!");
        }

        #
        # exec only comes back when it failed. This process is a
        # forked copy of the caller, so it has to end right here;
        # returning would let it carry on running the caller's code.
        # _exit is used so that no END blocks or buffered output
        # inherited from the parent are run or flushed a second time.
        #
        POSIX::_exit (1);
    }

    #
    # this will block if the alert is sucking gas
    #
    print ALERT $args{"output"};
    close (ALERT);
    waitpid $pid, 0;

    #
    # test alerts don't count
    #
    return (1) if ($args{"flags"} & $FL_TEST);

    #
    # tally this alert
    #
    if (defined $args{"pref"}) {
        $args{"pref"}->{"_last_alert"} = $tmnow;
    }
    $sref->{"_alert_count"}++;

    #
    # store this in the log
    #
    shift @last_alerts if (@last_alerts > $CF{"MAX_KEEP"});

    my $alertline = "$alerttype $args{group} $args{service}" .
        " $tmnow $alert ($args{args}) $summary";
    push @last_alerts, $alertline;

    #
    # append to alert history file
    #
    if ($CF{"HISTORICFILE"} ne "") {
        if (!open (HISTFILE, ">>$CF{HISTORICFILE}")) {
            syslog ('err',  "Could not append alert history to $CF{HISTORICFILE}: $!");
        } else {
            print HISTFILE $alertline, "\n";
            close (HISTFILE);
        }
    }

    return 1;
}
