#! /bin/perl --    # -*-Perl-*-
# 
# inflow-collect - Flush & rotate batchfile produced by news server.
#                  Extract article count & volume data per sending site and
#                  per hierarchy. Update tab spaced datafile.
#
# newsfeeds entry required to provide batchfile:
#
#       INFLOW:*:Tf,Wsgbt:
#     
# batchfile format: remote_site group size arrival_time
# ex:               news.inter.net misc.test 502 835740023
#
# History: The inflow-package is based on the ideas of counter.pl, a perl
#          script written by Juan Garcia, Rediris, 1995-96.
#
# 960806 V1.0 released
# 960826 V1.0.1 fixed resolv.conf parsing (Gerhard Winkler)
# 960919 V1.1.0 stores inflow-collect command line options in result files
#               for later use (Juan Garcia)
#               removes bug in collecting data for different -A and -L options 
# 961215 V1.1.1   optional hourly summaries added (Felix Kugler)
# 961216 V1.1.2b2 collects now monthly data too (Juan Garcia)
# 970113 V1.2     some defaults & paths slightly modified
#
# Copyright and release identification; all three appear in the usage text.
$Copy    = '(c) 1996 Felix.Kugler@switch.ch';
$RELEASE = 'V1.2';
$RELDATE = 'Tue Jan 14 01:03:57 MET 1997';

# ---- begin config section -------------------------------------------------

# The news server writes its batchfile to $BATCHDIR/$CHANNEL.
$CHANNEL  = 'INFLOW';
$BATCHDIR = '/var/spool/news/out.going';

# Result files: summaries live in $SUMDIR/$SUMFILE, hourly plot data
# (option -p) in $HOURDIR.
$SUMFILE = 'inflow.sum';
$SUMDIR  = '/usr/local/news/stat';
$HOURDIR = '/usr/local/news/stat/hour';

# ctlinnd binary and the timeout passed to it via -t.
$CTLINND = '/usr/local/news/inn/bin/ctlinnd';
$TIMEOUT = 180;

# Minutes after midnight after which the result file is cycled.  Sounds
# complicated, but this allows logs to be processed normally right after
# midnight, with the result file being cycled on the following run.
$MAXAGE = 20;

# Script (with its arguments) started from the directory of this script
# when option -s is given.
$STATSCRIPT = 'inflow-stat -wVFp';

# ---- end config section ----------------------------------------------------

require "getopts.pl";
require "ctime.pl";
#require "timelocal.pl";

# Split the invocation name into its directory ($path) and basename ($0).
# When the script is started without any directory component (e.g.
# "perl inflow-collect"), the pattern does not match; in that case $0 is
# left untouched and $path defaults to the current directory, instead of
# both variables being wiped out by an empty list assignment.
if ($0 =~ /^(.*)\/([^\/]+)$/) {
    ($path,$0) = ($1,$2);
} else {
    $path = ".";
}

# Command line switches end up in $opt_<letter>; -A and -L take a value.
&Getopts('cdhpsvA:DL:');

# Help text printed for -h.
$usage="$0      -  $Copy

release:  $RELEASE  of $RELDATE

usage:    $0 [-cdDhps][-A<level>][-L<level>]

Collects and preprocesses inbound news traffic on a News server.
$0 flushes channel $CHANNEL and resets the batchfile 
$BATCHDIR/$CHANNEL. 

Traffic data extracted from batchfiles is used to update the preprocessed 
data stored in $SUMDIR/$SUMFILE.

Options:  -h:         this help
          -c:         cycle resultfile after midnight i.e. reset resultfile
                      and label and save a copy of yesterday's results on
                      $SUMDIR
          -p:         prepare non-cumulative data files for plots which can
                      be postprocessed offline; files saved on
                      $HOURDIR
          -s:         process statistics when terminated i.e. start inflow-stat
          -A<level>:  resolve 'alt'-groups down to <level> hierarchy
                      levels. Default is to collect info about toplevel
                      alt-hierarchies only.
          -L<level>:  hierarchy levels to resolve. Default is to collect info
                      about toplevel hierarchies only. 

debug:   -d:          print debug info to STDERR
         -D:          developement mode: preserve batchfile, all created files
                      are stamped with a trailing \"D\".

The News server has to be configured to write a batchfile containing
site, group, size, and time information for every received article..
With INN, this can easily be achieved with a newsfeeds line

       $CHANNEL:*:Tf,Wsgbt:

which writes a batchfile with format

       remote_site group size arrival_time

It is common to configure inflow to produce hourly snapshots. A typical way
to achieve this is to start this script every hour by cron with a line like

0 * * * * $path/$0 -cs

\n";

if ($opt_h) { print "$usage"; exit 0; }

# init some variables
#$MONFILE='';		# init later when needed

# Type of the regular result file: 'perday' when it is cycled daily (-c),
# 'continuous' otherwise.  ('permonth' is used by updmonthresults.)
$defaulttype = $opt_c ? "perday" : "continuous";

# default per-hierarchy options
$opt_A=1 unless ($opt_A);
$opt_L=1 unless ($opt_L);
$options="-A$opt_A -L$opt_L";

&gethostandfqdn;

if ($opt_D) {			# Development mode
    # Work on a copy of the batchfile of the last period and leave the
    # live batchfile alone; result files get a trailing "D".
    unless (-f "$BATCHDIR/$CHANNEL.done") {
	die "unable to locate data file of last period: $!\n";
    }
    system("/bin/cp", "$BATCHDIR/$CHANNEL.done", "$BATCHDIR/$CHANNEL.doneD");
    unless ($? == 0) {		# in case we could not copy INFLOW data
	# $! is not meaningful after system(); report the exit status
	die "unable to copy data file of last period (cp status " .
	    ($? >> 8) . ")\n";
    }
    $rawbatchfile = "$BATCHDIR/$CHANNEL.doneD";
    $SUMFILE .= "D";
} else {			# normal operation mode
    # Flush first if no batchfile is present, then give the server a
    # moment before moving the file.
    unless (-e "$BATCHDIR/$CHANNEL") {
	system("$CTLINND -t $TIMEOUT flush $CHANNEL>/dev/null 2>&1");
	sleep 2;
    }

    # Move the batchfile aside.  The .done file of the previous run is
    # never removed, so if this move fails we must stop here: otherwise
    # the old .done file would be processed (and counted) a second time.
    system("/bin/mv", "$BATCHDIR/$CHANNEL", "$BATCHDIR/$CHANNEL.done");
    unless ($? == 0) {
	die "unable to move $BATCHDIR/$CHANNEL to $BATCHDIR/$CHANNEL.done" .
	    " - nothing processed\n";
    }

    # Flush again after the batchfile has been moved aside.
    system("$CTLINND -t $TIMEOUT flush $CHANNEL>/dev/null 2>&1");
    unless ($? == 0) {		# in case we could not flush INFLOW channel
	# move back the batchfile to avoid data loss
	system("/bin/mv", "$BATCHDIR/$CHANNEL.done", "$BATCHDIR/$CHANNEL");
	# try again later...
	die "unable to flush $CHANNEL - keeping data\n" ;
    }
    $rawbatchfile = "$BATCHDIR/$CHANNEL.done";
}

$timenowstr = &MakeTimeStr;	# time of processing newlogs
$timenow = time;

if (-f "$rawbatchfile") {          # new data ready to process
    &readoldresults($defaulttype,"$SUMDIR/$SUMFILE");
    &readnewdata;
    &writeresults($defaulttype,"$SUMDIR/$SUMFILE");
    # readoldresults sets $update_monfile when it cycled the result file
    &updmonthresults if ($update_monfile == 1);

    # start making statistics
    if ($opt_s && $STATSCRIPT ne "") {
	exec("$path/$STATSCRIPT");
	# exec only returns on failure
	warn "unable to start $path/$STATSCRIPT: $!\n";
    }
}
else { warn "no data file $rawbatchfile\n"; }


# updmonthresults
# ----------------------------------------------------------------------
# Accumulate monthly results (file type 'permonth').
#
# Takes no arguments.  Adds the contents of $SUMDIR/$SUMFILE.yesterday to
# the monthly file $SUMDIR/$SUMFILE.<suffix>, where <suffix> is what
# MakeShortTimeStr returns for the time 24 hours ago, and writes the
# monthly file back.  Works entirely on the global counters, which are
# reset first.
#
sub updmonthresults {
    $MONFILE = join(".", $SUMFILE, &MakeShortTimeStr(time - 86400));
    if ($opt_d) { warn "updating monthly summaries in $MONFILE...\n"; }

    # start from empty tables and zeroed counters
    %arts     = ();
    %bytes    = ();
    %artsph   = ();
    %bytesph  = ();
    %artspsh  = ();
    %bytespsh = ();
    %artspah  = ();
    %bytespah = ();
    $totalarts  = 0;
    $totalbytes = 0;
    $_tmp       = 0;
    $lastperiod = 0;
    $firstart   = 0;

    # load what has been accumulated for this month so far, if anything
    if (-e "$SUMDIR/$MONFILE") {
	if ($opt_d) { warn "read current month's results from $MONFILE...\n"; }
	&readoldresults('permonth',"$SUMDIR/$MONFILE");
	# remember the month's first article; the next read overwrites
	# $firstart
	if ($firstart > 0) { $firstartinmonth = $firstart; }
    }

    # add yesterday's results on top
    if ($opt_d) {
	warn "append yesterday's results from $SUMFILE.yesterday...\n";
    }
    &readoldresults('permonth',"$SUMDIR/$SUMFILE.yesterday");
    if ($firstartinmonth > 0) { $firstart = $firstartinmonth; }

    # store the updated monthly data
    if ($opt_d) {
	warn "write updated month's results back to $MONFILE...\n";
    }
    &writeresults('permonth',"$SUMDIR/$MONFILE");
}


# readoldresults
# ----------------------------------------------------------------------
# Read an existing result file and add its contents to the global
# counters.
#
# Arguments:
#   $type  result file type; only 'perday' triggers the cycling logic
#   $FILE  path of the result file to read
#
# Reads the '#' header lines into $oldhost, $firstart, $lastart,
# $oldtimenow, $oldperiod, $options, $oldtotalarts and $oldtotalbytes,
# overrides $opt_A / $opt_L with the levels found in the stored options,
# adds the stored totals to $totalarts / $totalbytes, and adds the data
# records to %arts, %bytes, %artsph, %bytesph, %artspsh, %bytespsh,
# %artspah and %bytespah.
#
# Warns and returns without touching anything if $FILE cannot be opened.
# For type 'perday' the file may be cycled instead of read (see below).
#
sub readoldresults {
    local($type,$FILE) = @_;
    local($par,$val,$val2);

    unless (open(OLD,"$FILE")) {
	warn "missing $FILE\n";
	return;
    }
    warn "reading old results from $FILE...\n" if $opt_d;
    $firstart = $lastart = 0;

    while(<OLD>) {		#  read header
	chop;
	# The first non-comment line ends the header.  The handle is rewound
	# to the start of the file, so the data loop below sees the header
	# lines again; their first field is '#', which matches no record
	# type there.
	unless (/^\#/) { seek(OLD,0,0); last; }
	next if (/^\#\s*$/);
	# "# name: value rest" -> $par, $val, $val2
	($par,$val,$val2)  = /^\# ([^:]+): +(\S+) *(.*)$/;
	warn "par=$par    val=$val\n" if $opt_d;
	if ($par eq 'host') { $oldhost = $val; }
	elsif ($par eq 'firstart') { $firstart = $val; }
	elsif ($par eq 'lastart') { $lastart = $val; }
	elsif ($par eq 'timenow') { $oldtimenow = $val; }
	elsif ($par eq 'period') { $oldperiod = $val; }
	elsif ($par eq 'options') { $options = "$val". " $val2"; }
	elsif ($par eq 'totalarts') { $oldtotalarts = $val; }
	elsif ($par eq 'totalbytes') { $oldtotalbytes = $val; }
    }

    # Only accept  compatible results (same options)
    # i.e. the levels stored in the file replace the current -A / -L values
    $options =~ /-A(\d+)/ && ($opt_A = $1);
    $options =~ /-L(\d+)/  && ($opt_L  = $1);

    # sum total arts & bytes
    # NOTE(review): the $old* variables are globals that are not reset at
    # the start of this sub, so a header lacking these fields would reuse
    # the values of an earlier call.
    $totalarts += $oldtotalarts;
    $totalbytes += $oldtotalbytes;

    if ("$FILE" eq "$SUMDIR/$SUMFILE.yesterday" ) {
        # extract last period info for monthly result file
    	$lastarts=$oldtotalarts;
	$lastbytes=$oldtotalbytes;
	$lastperiod = $oldperiod;
    } elsif ("$FILE" eq "$SUMDIR/$MONFILE" ) {
	# monthly file: nothing extra to derive
    } else  {
	 # any other file: derive the time since the file was last written
	 # and the calendar fields used by the cycling check below
   	 $lastperiod = $timenow - $oldtimenow;
   	 $startday = (localtime($firstart))[3];
   	 $thisweekday = (localtime($timenow))[6];
   	 $thisday = (localtime($timenow))[3];
   	 $thishour = (localtime($timenow))[2];
   	 $thisminute = (localtime($timenow))[1];
    }	
    if ($opt_d) {
	# NOTE(review): the scalars $p_arts / $p_bytes are not assigned
	# anywhere in this file (only the hashes %p_arts / %p_bytes are).
	warn "readoldresults_1: type=$type firstart=$firstart lastart=$lastart\n";
	warn "readoldresults_1: should be 0: p_arts=$p_arts  p_bytes=$p_bytes\n";
	warn "readoldresults_1: lastperiod=$lastperiod thisday=$thisday\n";
	warn "readoldresults_1: thishour=$thishour thisminute=$thisminute\n"; 
	warn "readoldresults_1: thisweekday=$thisweekday\n"; 
    }

    # cycle resultfile if a new day has begun and $type == 'perday'
    if  ($type eq 'perday') {  # for monthly results don't run this block
	# Cycle when -c is set, the day of month of the first article
	# differs from today's, and more than $MAXAGE minutes have passed
	# since midnight.
	# NOTE(review): the second alternative cycles on every run whenever
	# -d and -c are given together -- presumably a debugging aid; confirm.
        if ( ( $opt_c && ($startday ne $thisday) && 
	      ($thishour*60 + $thisminute>$MAXAGE) ) || ($opt_d && $opt_c) ) { 
	    close(OLD);

	    # update $MONFILE after cycling resultfiles
	    $update_monfile = 1;

	    unless (rename("$SUMDIR/$SUMFILE", "$SUMDIR/$SUMFILE.yesterday")) {
		warn "could not rename $SUMFILE to $SUMFILE.yesterday: $!\n";
	    }
	    # start the new day from scratch; the old data records are
	    # deliberately not read
	    $firstart = $lastart = $totalarts = $totalbytes = 0;
	    return;
        } 
    }

    while (<OLD>) {		# read summary data
	# record layout: type, key, count, volume (whitespace separated)
	($par,$rsite,$cnt,$vol) = split;
	# single-value record types: only the third field is added
	# (presumably an older file layout -- confirm)
	if ($par eq 'arts') { $arts{$rsite} += $cnt; }
	elsif ($par eq 'bytes') { $bytes{$rsite} += $cnt; }
	elsif ($par eq 'artsph') { $artsph{$rsite} += $cnt; }
	elsif ($par eq 'bytesph') { $bytesph{$rsite} += $cnt; }
	elsif ($par eq 'artspsh') { $artspsh{$rsite} += $cnt; }
	elsif ($par eq 'bytespsh') { $bytespsh{$rsite} += $cnt; }
	elsif ($par eq 'artspah') { $artspah{$rsite} += $cnt; }
	elsif ($par eq 'bytespah') { $bytespah{$rsite} += $cnt; }

	# combined record types as written by writeresults: article count
	# and byte volume on one line
	elsif ($par eq 'pn') { $arts{$rsite} += $cnt; $bytes{$rsite} += $vol; }
	elsif ($par eq 'ph') { 
	    $artsph{$rsite} += $cnt; $bytesph{$rsite} += $vol;
	}
	elsif ($par eq 'psh') { 
	    $artspsh{$rsite} += $cnt; $bytespsh{$rsite} += $vol;
	}
	elsif ($par eq 'pah') { 
	    $artspah{$rsite} += $cnt; $bytespah{$rsite} += $vol;
	}
    }
    close(OLD);
}


# readnewdata
# ----------------------------------------------------------------------
# Extract new data from the batchfile $rawbatchfile.
#
# Every line is expected to look like
#       remote_site group size arrival_time
# (fields separated by a single blank).  Each valid line is added to the
# running totals (%arts, %bytes, %artsph, %bytesph, $totalarts,
# $totalbytes), to the figures of the last period (%p_arts, %p_bytes,
# $lastarts, $lastbytes, $p_firstart, $p_lastart) and, depending on
# $opt_A / $opt_L, to the subhierarchy tables (%artspah, %bytespah,
# %artspsh, %bytespsh).  $firstart is set if still unset, $lastart is
# advanced.
#
# Blank or malformed lines (missing fields, non-numeric size or arrival
# time) are skipped, so that they neither create bogus entries nor
# overwrite $lastart with an empty value.
#
# Dies if the batchfile cannot be opened.  Calls writehourdata when
# option -p is set.
#
sub readnewdata {
    open(NEW,"$rawbatchfile") ||
	die "cannot open $rawbatchfile: $!\n";

    $p_firstart = $p_lastart = 0;  # first & last article in last period
    %p_arts = %p_bytes = ();       # articles & bytes in last period, per rsite
    $lastarts = $lastbytes = 0;    # articles & bytes in last period

    while (<NEW>) {
	# strip the line terminator only if there is one; a plain chop
	# would eat the last digit of an unterminated final line
	s/\n$//;
	@newsgroup=();
	$hier='';
	$subhier='';
	($rsite,$group,$size,$arr) = split(/ /);

	# ignore anything that is not a complete, well-formed record
	next unless ($rsite ne '' && $group ne '' &&
		     $size =~ /^\d+$/ && $arr =~ /^\d+$/);

	$lastart=$p_lastart=$arr;
	$firstart=$arr unless $firstart;
	$p_firstart=$arr unless $p_firstart;
	$lastperiod='' unless $lastperiod;

	@newsgroup=split(/\./,$group);
	$hier=$newsgroup[0];

	$bytes{$rsite} += $size;                # total summaries
	$arts{$rsite}++;
	$bytesph{$hier} += $size;
	$artsph{$hier}++;
	$totalarts++;
	$totalbytes += $size;

	$lastarts++;                            # last period summaries
	$lastbytes += $size;
	$p_bytes{$rsite} += $size;
	$p_arts{$rsite}++;

	if ($opt_A>1 && $hier eq "alt") {	# resolve alt subhierarchies
	    $subhier=join('.',splice(@newsgroup,0,$opt_A));
	    $bytespah{$subhier}+=$size;
	    $artspah{$subhier}++;
	}
	if ($opt_L>1 && $hier ne "alt") {	# resolve down to $opt_L
						# hierarchy levels
	    $subhier=join('.',splice(@newsgroup,0,$opt_L));
	    $bytespsh{$subhier}+=$size;
	    $artspsh{$subhier}++;
	}
    }
    close(NEW);
    if ($opt_d) {
	warn "readnewdata: firstart=$firstart lastart=$lastart\n";
	warn "readnewdata: p_firstart=$p_firstart p_lastart=$p_lastart\n";
	warn "readnewdata: lastarts=$lastarts lastbytes=$lastbytes\n";
	warn "readnewdata: totalarts=$totalarts totalbytes=$totalbytes\n";
    }
    if ($opt_p) {		             # prepare hourly data for plots
	&writehourdata;
    }
}


# writehourdata
# ----------------------------------------------------------------------
# Write the non-cumulative data of the last period (option -p) to
#       $HOURDIR/$SUMFILE.<weekday>-<hh><mm>
# where weekday, hour and minute are taken from the processing time
# $timenow.  The file holds a '#' header followed by one "pn" record per
# sending site with the article count and byte volume of the last period.
#
# The time fields are computed here rather than taken from the globals
# set by readoldresults, because those are only set when the regular
# result file could be read; on a first run every snapshot would
# otherwise be named "...0-0000".
#
# Dies if the file cannot be opened.
#
sub writehourdata {
    local($now) = $timenow ? $timenow : time;
    local(@now) = localtime($now);

    $hourfile = sprintf("%s.%01d-%02d%02d",
			"$HOURDIR/$SUMFILE", $now[6], $now[2], $now[1]);

    $start = &MakeTimeStr($p_firstart);
    $end = &MakeTimeStr($p_lastart);

    # write new summary
    open(RES,">$hourfile") || die "cannot open $hourfile: $!\n";

    # convert $lastperiod (in seconds) to a more readable form
    $_lastperiod = &MakeTimeOnlyStr("$lastperiod");

    # print header
    print RES "\# file:     $hourfile\n";
    print RES "\# filetype: plotdata\n";
    print RES "\# host:     $fqdn\n";
    print RES "\# timenow:  $timenow\t$timenowstr\n"; # time_t yymmdd.hh:mm
    print RES "\#\n";
    print RES "\# lastperiod:  $lastperiod\t$_lastperiod\n"; # sec
    print RES "\# p_firstart:  $p_firstart\t$start\n";  # time_t yymmdd.hh:mm
    print RES "\# p_lastart:   $p_lastart\t$end\n";     # time_t yymmdd.hh:mm
    print RES "\# lastarts:    $lastarts\n";          # articles in last period
    print RES "\# lastbytes:   $lastbytes\n";         # bytes in last period
    print RES "\#\n";

    # print data
    foreach $rsite (keys(%p_arts)) {                     # per node
	print RES "pn\t$rsite\t$p_arts{$rsite}\t$p_bytes{$rsite}\n";
    }

    # buffered write errors (e.g. disk full) only show up on close
    close(RES) || warn "error closing $hourfile: $!\n";
}


# writeresults
# ----------------------------------------------------------------------
# Write back the updated result file.
#
# Arguments:
#   $type  result file type, stored in the header ('perday',
#          'continuous' or 'permonth')
#   $FILE  path of the result file to (over)write
#
# For the regular result files ('perday' / 'continuous') an existing
# file is first copied to $FILE.old.  The file then receives a '#'
# header built from the global counters, followed by one tab separated
# record per sending site ("pn"), per hierarchy ("ph") and - if the
# corresponding level is greater than 1 - per subhierarchy ("psh") and
# per alt subhierarchy ("pah").
#
# Dies if $FILE cannot be opened for writing.
#
sub writeresults {
    local($type,$FILE) = @_;
    $start = &MakeTimeStr($firstart);
    $end = &MakeTimeStr($lastart);

    # make backup of regular files
    # ('continous' is the historic misspelling this test used to look
    # for; the caller passes 'continuous', so both are accepted)
    if (($type eq 'perday' || $type eq 'continuous' ||
	 $type eq 'continous') && -e "$FILE") {
	# list form: no shell involved, so odd characters in the path are
	# harmless
	system("cp", $FILE, "$FILE.old");
	warn "could not back up $FILE to $FILE.old\n" unless ($? == 0);
    }

    # write new summary
    open(RES,">$FILE") || die "cannot open $FILE: $!\n";

    # convert $lastperiod (in seconds) to a more readable form
    $_lastperiod = &MakeTimeOnlyStr("$lastperiod");

    # compute total time period (UNIX time + human readable format)
    $totalperiod = $lastart - $firstart;
    $_totalperiod = &MakeTimeOnlyStr("$totalperiod");

    # fall back to the current levels, using the same "-A<n> -L<n>"
    # layout as everywhere else
    unless($options) {
	$options = "-A$opt_A" if ($opt_A);
	$options .= ($options ne '' ? " " : "") . "-L$opt_L" if ($opt_L);
    }

    # print header
    print RES "\# file:     $FILE\n";
    print RES "\# filetype: $type\n";
    print RES "\# options:  $options\n";              # options
    print RES "\# host:     $fqdn\n";
    print RES "\# timenow:  $timenow\t$timenowstr\n"; # time_t yymmdd.hh:mm
    print RES "\#\n";
    print RES "\# firstart: $firstart\t$start\n";     # time_t yymmdd.hh:mm
    print RES "\# lastart:  $lastart\t$end\n";        # time_t yymmdd.hh:mm
    print RES "\# period:   $totalperiod\t$_totalperiod\n"; # sec d+hh:mm:ss
    print RES "\# totalarts:   $totalarts\n";         #
    print RES "\# totalbytes:  $totalbytes\n";        #
    print RES "\#\n";
    print RES "\# lastperiod:  $lastperiod\t$_lastperiod\n"; # sec
    print RES "\# lastarts:    $lastarts\n";          # articles in last period
    print RES "\# lastbytes:   $lastbytes\n";         # bytes in last period
    print RES "\#\n";

    # print data
    foreach $rsite (keys(%arts)) {                     # per node
	print RES "pn\t$rsite\t$arts{$rsite}\t$bytes{$rsite}\n";
    }
    foreach $hier (keys(%artsph)) {                    # per hierarchy
	print RES "ph\t$hier\t$artsph{$hier}\t$bytesph{$hier}\n";
    }
    if ($opt_L>1) {                    # per subhierarchy data
	foreach $subhier (keys(%artspsh)) {
	    print RES "psh\t$subhier\t$artspsh{$subhier}\t$bytespsh{$subhier}\n";
	}
    }
    if ($opt_A>1) {                    # alt.* hierarchy data
	foreach $subhier (keys(%artspah)) {
	    print RES "pah\t$subhier\t$artspah{$subhier}\t$bytespah{$subhier}\n";
	}
    }

    # buffered write errors (e.g. disk full) only show up on close
    close(RES) || warn "error closing $FILE: $!\n";
}


# MakeDateStr
# ----------------------------------------------------------------------
# Make a date string yymmdd corresponding to the actual (local) time.
# Optional arg is the number of days to subtract from the current time.
#
# localtime reports the year as "years since 1900", so it is reduced
# modulo 100 to keep the two-digit yy field from 2000 onwards.
#
sub MakeDateStr {
    local($minusdays) = @_;
    local($tstr);
    local(@tarr) = localtime(time-$minusdays*86400);
    $tstr = sprintf ("%02d%02d%02d", $tarr[5] % 100, $tarr[4]+1, $tarr[3]);
    return $tstr;
}
 

# MakeTimeStr
# ----------------------------------------------------------------------
# Make a time string yymmdd.hh:mm (local time) from the current time (no
# arg given, or arg 0) or from the UNIX time (long int) passed as
# argument.
#
# localtime reports the year as "years since 1900", so it is reduced
# modulo 100 to keep the two-digit yy field from 2000 onwards.
#
sub MakeTimeStr {
    local($arg) = @_;
    local($tstr);
    if ($arg == 0) { $arg = time; }
    local(@tarr) = localtime($arg);
    $tstr = sprintf ("%02d%02d%02d.%02d:%02d",
		     $tarr[5] % 100, $tarr[4]+1, $tarr[3],
		     $tarr[2], $tarr[1]);
    return $tstr;
}


# MakeTimeOnlyStr
# ----------------------------------------------------------------------
# Convert a number of seconds to a time string d+hh:mm:ss.
#
# Returns '' for 0 (or an empty argument).  The day count is computed
# arithmetically, so periods of 31 days and more (monthly totals) come
# out right; deriving it from the day-of-month field of gmtime wraps
# around at the end of January 1970.  Fractions of a second are dropped.
# A negative period is formatted from its absolute value with a leading
# '-'.
#
sub MakeTimeOnlyStr {
    local($arg) = @_;
    local($tstr,$sign,$days,$rest);
    return '' if ($arg == 0);

    $sign = '';
    if ($arg < 0) { $sign = '-'; $arg = -$arg; }
    $arg = int($arg);

    $days = int($arg / 86400);
    $rest = $arg - $days * 86400;	# seconds within the last day
    $tstr = sprintf ("%s%d+%02d:%02d:%02d", $sign, $days,
		     int($rest / 3600), int(($rest % 3600) / 60),
		     $rest % 60);
    return $tstr;
}

# MakeShortTimeStr
# ----------------------------------------------------------------------
# Build the month stamp (intended layout: yymm, local time) for the UNIX
# time passed as argument, or for the current time when the argument is
# missing or 0.
#
# NOTE(review): the year field is localtime's "years since 1900", so from
# 2000 on the result has a three digit year (e.g. "10001").  The string is
# used as suffix of the monthly result file, so changing it would rename
# the files on disk; left as is on purpose -- confirm before fixing.
#
sub MakeShortTimeStr {
    local($when) = @_;
    local($month,$year);
    $when = time if ($when == 0);
    ($month,$year) = (localtime($when))[4,5];
    return sprintf("%02d%02d", $year, $month + 1);
}


# gethostandfqdn
#----------------------------------------------------------------------
# Determine the short host name ($hostname) and the fully qualified
# domain name ($fqdn) of this machine.
#
# The name reported by "uname -n" is used as is when it already contains
# a dot.  Otherwise the domain is taken from the last "domain" line of
# /etc/resolv.conf.  Only lines that start with the keyword are
# considered, so e.g. "search mydomain.com" is not mistaken for a domain
# entry.  If no domain can be found, $fqdn is simply the host name
# (rather than the host name followed by whatever an earlier pattern
# match happened to leave in $1).
#
sub gethostandfqdn {
    local($str,$domain);

    $str = `uname -n`;
    $str =~ s/\s+$//;			# strip the trailing newline
    if ($str =~ /\./) {             # str is fqdn
	$fqdn = $str;
	($hostname) = ($str =~ /^([^.]+)\./);
    } else {                        # str is simple hostname
	$hostname = $str;
	$domain = '';
	if (open(RESOLV,"/etc/resolv.conf")) {
	    while (<RESOLV>) {
		$domain = $1 if (/^\s*domain\s+(\S+)/);
	    }
	    close(RESOLV);
	}
	$domain =~ s/\.$//;		# drop a trailing root dot
	$fqdn = ($domain ne '') ? $hostname . "." . $domain : $hostname;
    }
}
