#!/bin/perl
#
# flowcons - consolidate news flow summary files into daily summaries, from
# the potentially numerous and sometimes large files created not only when
# midnight is reached but any time the channel feed is flushed (e.g. by
# various maintenance operations and by group creation/deletion).
#
# The input files should be named on the command line, and while they need not
# be in the current directory, the filenames (as distinct from path) should be
# of the form flowsum.yymmdd.hhmmss.dayname. The information from all input
# files corresponding to a particular day will be extracted and consolidated 
# into a file with the name flowsum.yymmdd.all in the current directory. [To
# avoid confusing the reporting scripts, these should *not* be in the main
# summary-file directory *unless* the raw summary files are removed - leaving
# just the consolidated files - before a reporting run which will look at 
# the relevant days. In general, daily, weekly and perhaps monthly reports
# will be based on the raw data, which will then be consolidated in order that
# the data needed for annual reports will not consume an unreasonable amount
# of disc space while retaining the daily information needed to regenerate
# any of the reports (with minor variations in totals due to the original
# values being rounded to the precision saved in the files).]
#
# Created:
#       15-Oct-1995 JML
# 
# Modified:
#       17-Feb-1996 JML V1.5: no changes, but give it a version number 
#                       matching the other scripts.
#

use strict;
use warnings;

(my $PROGNAME = $0) =~ s/^.*\///;       # program name for messages

unless (@ARGV) { die "Usage: $PROGNAME file [file...]\n"; }

# Check that every filename matches the expected pattern (though not to the
# extent of rejecting names that would refer to non-existent days of the
# month, etc.) BEFORE any file is read or written, so that a bad argument
# cannot leave a partially completed run behind. The pattern is applied to
# the final path component only, so that a directory which happens to be
# named like a summary file cannot supply the date.

my @inputs = ();                        # one { path, name, date } per file

foreach my $file (@ARGV)
{
    unless ($file =~ /(?:^|\/)(flowsum\.(\d{6})\.\d{6}\.[^\/]*day)$/)
    {
        die "$PROGNAME: invalid input filename $file\n";
    }
    push(@inputs, { 'path' => $file, 'name' => $1, 'date' => $2 });
}

# Sort on the filename rather than the full path: the filename starts with
# yymmdd.hhmmss, so all files for one day become adjacent (whatever directory
# they live in) and are in time order within the day. Sorting on the path
# would allow a day to be met twice, the second visit overwriting the
# consolidated file written by the first.

@inputs = sort { $a->{'name'} cmp $b->{'name'} ||
                 $a->{'path'} cmp $b->{'path'} } @inputs;

# Accumulate each day's files, writing the consolidated file whenever the
# date changes and once more after the final input file.

my $date = '';                          # date currently being accumulated
my $day;                                # accumulated data for that date

foreach my $input (@inputs)
{
    if ($input->{'date'} ne $date)
    {
        if ($date ne '') { write_summary($date, $day); }
        $date = $input->{'date'};       # note new current date
        $day = new_day();
    }
    read_flow_file($input->{'path'}, $day);
}

write_summary($date, $day);             # flush the final day

exit 0;

# new_day - return a reference to a fresh, empty set of accumulators for one
# day: the start/end timestamps (empty strings until a header line has been
# seen) and the per-group primary/secondary article counts and volumes.

sub new_day
{
    return {
        'start'  => '',                 # start timestamp from first file
        'end'    => '',                 # end timestamp from last file
        'pcount' => {},                 # primary count
        'pvol'   => {},                 # primary volume
        'scount' => {},                 # secondary count
        'svol'   => {},                 # secondary volume
    };
}

# read_flow_file - add the contents of one flow summary file to a day's
# accumulators.
#
#   $file   path of the summary file to read
#   $day    accumulator reference as returned by new_day (updated in place)
#
# Blank lines and comments are skipped; unrecognised lines are reported with
# a warning and otherwise ignored. Dies if the file cannot be opened, or if
# it does not contain a header line followed by a trailer line (which would
# suggest a truncated or foreign file).

sub read_flow_file
{
    my ($file, $day) = @_;

    # Three-argument open: the name comes from the command line and must
    # never be interpreted as a mode or a pipe.
    open(my $in, '<', $file) ||
        die "$PROGNAME: unable to open $file for reading: $!\n";

    my $hdr = 0;                        # 0 = nothing, 1 = header, 2 = both

    while (my $line = <$in>)
    {
        if ($line =~ /^\s*$/ || $line =~ /^#/)  # comment or blank line
        {
            next;
        }
        elsif ($line =~ /^> (\d+) (\d+)/)       # header line
        {
            $hdr = 1;                   # have seen header for this file
            if ($day->{'start'} eq '') { $day->{'start'} = $1; }
            $day->{'end'} = $2;         # always save end - this may be last
        }
        elsif ($line =~ /^</)           # trailer line
        {
            # Only counts when it follows a header; a trailer on its own
            # leaves $hdr unchanged and is rejected by the check below.
            if ($hdr == 1) { $hdr = 2; }
        }
        elsif ($line =~
            /^(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\S+)/)
        {                               # should be data line...
            my $group = $5;
            $day->{'pcount'}{$group} += $1;
            $day->{'pvol'}{$group}   += $2;
            $day->{'scount'}{$group} += $3;
            $day->{'svol'}{$group}   += $4;
        }
        else
        {
            warn "$PROGNAME: $file: unexpected $line";
        }
    }
    close($in);

    unless ($hdr == 2)
    {
        die "$PROGNAME: $file: header/trailer missed? hdr=$hdr\n";
    }
}

# write_summary - write the consolidated file flowsum.<date>.all in the
# current directory.
#
#   $date   yymmdd date string, used in the filename and the title comment
#   $day    accumulator reference holding that day's data
#
# The file is only opened once all of the day's input has been read, so a
# failure while reading cannot truncate an existing consolidated file. Dies
# if the file cannot be opened or if closing it reports an error.

sub write_summary
{
    my ($date, $day) = @_;
    my $outfile = "flowsum.$date.all";

    open(my $out, '>', $outfile) ||
        die "$PROGNAME: unable to open $outfile for writing: $!\n";

    print {$out} "> $day->{'start'} $day->{'end'}\n\n";
    print {$out} "# Consolidated news flow statistics for $date\n\n";
    print {$out} "# Primary groups Secondary groups\n";
    print {$out} "# count bytes count bytes\n";

    my $pgrp = 0;                       # count of primary groups
    my $sgrp = 0;                       # count of secondary groups
    my $totvol = 0;                     # total news volume
    my $totart = 0;                     # total articles

    foreach my $group (sort(keys(%{$day->{'scount'}})))
    {
        $sgrp++;
        $totvol += $day->{'svol'}{$group};
        $totart += $day->{'scount'}{$group};
        if ($day->{'pcount'}{$group}) { $pgrp++; }

        printf {$out} "%.2f %.2f %.2f %.2f %s\n",
            $day->{'pcount'}{$group}, $day->{'pvol'}{$group},
            $day->{'scount'}{$group}, $day->{'svol'}{$group}, $group;
    }

    printf {$out} "# Total news volume = %.3fMB in %d articles\n\n",
        $totvol/(1024*1024), $totart;
    print {$out} "# Primary groups: $pgrp\n";
    print {$out} "# Secondary groups: $sgrp\n\n";
    print {$out} "< $day->{'start'} $day->{'end'}\n\n";

    # Buffered write errors (e.g. disc full) only surface at close, and the
    # raw files may be deleted once the consolidated file exists.
    close($out) || die "$PROGNAME: error writing $outfile: $!\n";
}
