#!/usr/bin/perl -w
# See copyright, etc in below POD section.
######################################################################

require 5.005;
use IO::File;
use Pod::Usage;
use Getopt::Long;
use Schedule::Load::Hosts;
use Time::localtime;
use strict;

######################################################################

use vars qw($Debug
	    $Debug_User
	    $Nice19 $Sendmail
	    $Opt_Renice_Min $Opt_Cpu_Min $Opt_Reserved_Min
	    %Complaints
	    @Opt_Cmds
	    );

$Nice19 = "renice19 -only_if_at";
$Sendmail = "/usr/lib/sendmail";

######################################################################

my %opt_server_params = ();
$Debug_User = $ENV{USER};   # Who to send --debug mail to

Getopt::Long::config ("no_auto_abbrev");
if (!GetOptions (
		 "help"		=> \&usage,
		 "debug!"	=> \$Debug,
		 "debug-user=s"	=> \$Debug_User,
		 "cpu_min=i"	=> \$Opt_Cpu_Min,	# Old _'s
		 "cpu-min=i"	=> \$Opt_Cpu_Min,
		 "renice_min=i"	=> \$Opt_Renice_Min,	# Old _'s
		 "renice-min=i"	=> \$Opt_Renice_Min,
		 "reserved_min=i"=> \$Opt_Reserved_Min,	# Old _'s
		 "reserved-min=i"=> \$Opt_Reserved_Min,
		 "port=i"	=> sub {shift; $opt_server_params{port} = shift;},
		 "dhost=s"	=> sub {shift; push @{$opt_server_params{dhost}}, split(':',shift);},
		 "<>"		=> \&parameter,
		 )) {
    die "%Error: Bad usage, try 'slpolice --help'\n";
}

my $scheduler = Schedule::Load::Hosts->fetch(%opt_server_params);
gather_loads($scheduler);
gather_reserveds($scheduler);
complain();

#----------------------------------------------------------------------

sub usage {
    print "Version: $Schedule::Load::VERSION\n";
    pod2usage(-verbose=>2, -exitval => 2);
    exit(1);
}

sub parameter {
    my $param = shift;
    if ($param =~ /^(.*)=([0-9]+)$/) {
	my $re = $1;  my $minutes = $2;
	if ($re !~ /[.?*^\$]/) { $re = '^'.$re.'$'; }
	print "Regexp for cmd: $re  $minutes\n" if $Debug;
	push @Opt_Cmds, [qr/$re/, $minutes];
    } else {
	die "%Error: Unknown argument (missing=): $param\n";
    }
}

######################################################################

sub gather_loads {
    my $scheduler = shift;

    (my $FORMAT =           "%-12s   %6s    %-8s     %4s    %6s     %-5s    %6s     %5s%%    %s\n") =~ s/\s\s+/ /g;
    foreach my $host ($scheduler->hosts_sorted) {
	foreach my $p ( sort {$b->pctcpu <=> $a->pctcpu}
			@{$host->top_processes} ) {
	    my $mach = $host->hostname;
	    my $name = $p->uname;
	    #print "ck $line\n";
	    next if !$p->time;
	    next if $name eq "root";
	    my $pid = $p->pid;
	    my $cmd = $p->fname||"";
	    my $min = $p->time/60.0;
	    $min = 0  if ($Debug);
	    $min = 9999 if ($Debug && $p->uid eq $<);  # Test... Every one of runner's violates
	    my $line = sprintf ($FORMAT,
				$host->hostname,
				$p->pid,
				$p->uname,		$p->nice0,
				int(($p->size||0)/1024/1024)."M",
				$p->state,		$p->time_hhmm,
				sprintf("%3.1f", $p->pctcpu),
				$p->fname);

	    print "Min $min  Name $name   Cmd $cmd  Pid $pid\n" if $Debug;
	    my $renice_limit = $Opt_Renice_Min;
	    my $cpu_limit    = $Opt_Cpu_Min;
	    foreach my $cmdminref (@Opt_Cmds) {
		my $re = $cmdminref->[0];  my $minutes = $cmdminref->[1];
		if ($cmd =~ /$re/ || $p->uname =~ /$re/) {
		    print "  Command_line_regexp match\n" if $Debug;
		    if ($Opt_Renice_Min) { $renice_limit = $minutes; }
		    if ($Opt_Cpu_Min) { $cpu_limit = $minutes; }
		}
	    }

	    if ($renice_limit
		&& $min >= $renice_limit) {
		my $lowered = 0;
		my $succ = `ssh $mach $Nice19 $pid 2>&1`;
		print "Lowering $mach $pid $succ\n" if $Debug;
		if ($succ !~ /%/) {
		    $Complaints{$name}{niced}{$pid}
		    = {one_subject => sprintf("Reniced Process %5d on $mach", $pid),
		       many_subject => sprintf("Reniced Processes"),
		       body_header => ("The following processes were reniced to 19\n"
				       ."Use `renice10 <pid>` to prevent this.\n"),
		       body_line => "   ".$line,
		       };
		}
	    }

	    if ($cpu_limit
		&& $min >= $cpu_limit) {
		$Complaints{$name}{cpu}{$pid}
		    = {one_subject => sprintf("High CPU Time Process %5d on $mach", $pid),
		       many_subject => sprintf("CPU Consuming Processes"),
		       body_header => ("The following processes have large CPU times\n"
				       ."Please consider killing them.\n"),
		       body_line => "   ".$line,
		       };
	    }
	}
    }
}

sub gather_reserveds {
    my $scheduler = shift;

    (my $FORMAT =             "%-12s   %-25s  %s\n") =~ s/\s\s+/ /g;
    foreach my $host ($scheduler->hosts_sorted) {
	if ($host->reserved) {
	    my $ostype = $host->archname ." ". $host->osvers;
	    $ostype =~ s/enterprise//;
	    $ostype .= " (on ".$host->slreportd_hostname.")" if $host->slreportd_hostname ne $host->hostname;

	    # Really the scheduler should provide preparsed information....
	    if ($host->reserved !~ /(\S+) at (\d\d)-(\S+) (\d\d):(\d\d)(;?.*)$/) {
		print "Res Parse Failed: ".$host->reserved."\n" if $Debug;
		next;
	    }

	    my ($name,$mday,$mon, $hr,$dmin,$cmt) = ($1, $2,$3,  $4,$5,    $6);
	    print "Compare then $mday $hr:$dmin  now ",localtime->mday," ",localtime->hour,":",localtime->min,"\n"  if $Debug;
	    my $min = localtime->min - $dmin;
	    $min += 60 * (localtime->hour - $hr);
	    $min += 24 * 60 * (localtime->mday - $mday);
	    $min = 0 if $min < 0;	# Too lazy to check months.  Let user get away with it.
	    #$min = 9999 if ($Debug);  # Test... Every one of runner's violates

	    my $res = "Reserved: ".$host->reserved;
	    my $line = sprintf ($FORMAT,
				$host->hostname,
				$ostype,
				$res);
	    print "Res: $min: $line\n" if $Debug;

	    if ($name ne "root"
		&& $Opt_Reserved_Min
		&& !$cmt
		&& $min >= $Opt_Reserved_Min) {
		$Complaints{$name}{reserved}{$host->hostname}
		    = {one_subject => sprintf("Long Reservation of %s", $host->hostname),
		       many_subject => sprintf("Long Reservations"),
		       body_header => ("The following reservations have been around for a long time\n"
				       ."Please consider releasing them.  Or, use a --comment with\n"
				       ."the reservation explaining your reservation reason.\n"),
		       body_line => "   ".$line,
		       };
	    }
	}
    }
}

sub complain {
    for my $to (sort (keys %Complaints)) {
	my $body = "";
	$body .= "DEBUGGING.  Really-To: $to\n\n" if $Debug;
	my $mailto = $to;
	$mailto = $Debug_User if $Debug;

	my $subj = undef;
	for my $topic (sort (keys %{$Complaints{$to}})) {
	    my $newtopic = 1;
	    for my $proc (sort (keys %{$Complaints{$to}{$topic}})) {
		my $procref = $Complaints{$to}{$topic}{$proc};
		if (!defined $subj) {
		    $subj = $procref->{one_subject};
		} else {
		    $subj = $procref->{many_subject};
		}
		if ($newtopic) {
		    $newtopic = 0;
		    $body .= "\n";
		    $body .= $procref->{body_header};
		    $body .= "\n";
		}
		$body .= $procref->{body_line};
	    }
	}

	# Some cleanups
	$body =~ s/\n\n+/\n\n/mg;
	print "To: $to\nSubject: $subj\n$body\n" if $Debug;

	# Send the mail
	my $cmd = "$Sendmail -F 'Rschedule Police' -f root $mailto";
	my $fh = IO::File->new("|$cmd") or die "%Error: $! $cmd";
	print $fh "To: $mailto\n";
	print $fh "From: Rschedule Police <root>\n";
	print $fh "Subject: Rschedule Police: $subj\n";
	print $fh "\n";
	print $fh "To see latest status, use:  rloads or rhosts\n";
	print $fh "\n";
	print $fh "$body\n";
	$fh->close;
    }
}

######################################################################
######################################################################
__END__

=pod

=head1 NAME

slpolice - Warn and renice top CPU hogs

=head1 SYNOPSIS

B<slpolice>
[ B<--help> ]
[ B<--port=>I<port> ]
[ B<--dhost=>I<host> ]
[ B<--cpu-hours> ]
[ B<--version> ]
[ B<--version> ]

=head1 DESCRIPTION

slpolice will determine the top cpu users across a cluster of hosts.
It will send mail if a process has over a specified amount of cpu time.

It will also mail if a user has a reservation for a long period of time.

Usually slpolice is run with a crontab entry similar to:

    5 8-21 * * * /usr/local/bin/slpolice --cpu_min 120 --reserved_min 120 long=999 >/dev/null 2>&1

This sends warnings each hour after 2 hours of CPU time.  It does not check
at night so that long overnight jobs will not receive warnings.

Additional non-parameter arguments specify specific command regular
expressions.  When a process' command matches that regexp, the specified
number of minutes will be used to determine when to send mail instead of
the default.

This program is most valuable when used with the L<nicercizerd> program, or
a operating system where nice 19 processes get only leftover cpu resources.
It requires a program called L<nice19> which is a version of nice that is
setgid root and renices a job to 19.  This comes with L<nicercizerd>.

=head1 ARGUMENTS

=over 4

=item --help

Displays this message and program version and exits.

=item --debug-user

With --debug, who to send the mail to instead of the process owner.

=item --port <portnumber>

Specifies the port number that slchoosed uses.

=item --dhost <hostname>

Specifies the host name that slchoosed uses.  May be specified multiple
times to specify backup hosts.  Defaults to SLCHOOSED_HOST environment
variable, which contains colon separated host names.

=item --cpu-min

Number of cpu minutes the job should have before being reported to the user.
Defaults to 0, which is off.

=item --renice-min

Number of minutes after which the nice value of a high cpu using process
that is not at 1 or 10 is reniced to 19.  Defaults to 0, which is off.

=item --reserved-min

Number of minutes a host may be reserved before reporting it to the user.
Defaults to 0, which is off.

=item --version

Displays program version and exits.

=back

=head1 DISTRIBUTION

The latest version is available from CPAN and from L<http://www.veripool.org/>.

L<nicercizerd> is available from L<http://www.veripool.org>.

Copyright 1998-2011 by Wilson Snyder.  This package is free software; you
can redistribute it and/or modify it under the terms of either the GNU
Lesser General Public License Version 3 or the Perl Artistic License Version 2.0.

=head1 AUTHORS

Wilson Snyder <wsnyder@wsnyder.org>

=head1 SEE ALSO

L<Schedule::Load>, L<nicercizerd>,  L<nice19>,

=cut
######################################################################
### Local Variables:
### compile-command: "./slpolice --debug --cpu_min 120 --reserved_min 120 "
### End:
