#!/usr/bin/perl -w
#
# rule-hits-over-time - produce graphs of rule hits over time, using gnuplot
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

use GD;
use Statistics::DEA;

use strict;
use warnings;
use Fcntl;
use Getopt::Long;
use SDBM_File;

sub usage {
  die q{
usage: rule-hits-over-time [options] --rule rulename log1 [log2 ...]

  --rule=rulename       specify rule to map
  --period=secs         specify period (default: 1 day)
  --ignore_older=days   ignore hits older than N days (default: 0 = none)
  --scale_period=n      scale period up to N items of data, 0=no scaling
                        (default: 0)
  --size_x=pixels       width of output graphs, in pixels (def: 800)
  --size_y=pixels       height of ONE of the output graphs, in pixels
                        (default: 400)
  --cgi                 CGI output, to stdout with HTTP headers
  --text                text output only
};
}

use vars qw(
        $opt_rule $opt_size_x $opt_size_y $opt_text $opt_cgi
        $opt_period $opt_scale_period $opt_ignore_older
        $opt_debug
);

GetOptions(
        'rule=s',
        'size_x=i',
        'size_y=i',
        'text',
        'cgi',
        'scale_period=i',
        'ignore_older=i',
        'period=i',
        'debug'
) or usage();

usage() unless $opt_rule;

my $DEBUG_TMPDIR = $opt_debug; # keep the tmpdir around after exiting, for debug
# $DEBUG_TMPDIR = 1;

# fix PATHs for sucky Solaris compatibility.
$ENV{PATH} = "/local/gnuplot-4.0.0/bin:/opt/sfw/bin:".$ENV{PATH};
$ENV{LD_LIBRARY_PATH} .= ":/local/gd-2.0.33/lib";

my $rule_re = qr/[, ]${opt_rule}[, ]/;

# my $period = $opt_period || (24 * 60 * 60 * 1);
my $period = $opt_period || 3600;

my $graph_x                         = $opt_size_x || 800;
my $graph_y                         = $opt_size_y || 400;

my $fname_counter = 1;
my %graph_png_data = ();

my %allbuckets = ();
my %allresults = ();
my @allfiles = ();

my $graph_times = [];
my $graph_data = [];

my $this_file_results;
my $lastbucket;
my $nextbucket;
my $seen_y;
my $seen_n;

my $tmpdir = "/tmp/rulehits.$$";
if ($DEBUG_TMPDIR) { $tmpdir = "/tmp/rulehits.tmp"; system("rm -rf $tmpdir"); }

mkdir ($tmpdir) or die "collided on $tmpdir";

my $outdir = ".";
if ($opt_cgi) {
  $outdir = $tmpdir;
}

my $file_sets = [ ];    # split into ham and spam
$file_sets = [ [ 'TITLE:hits in spam' ], [ 'TITLE:hits in ham' ] ];

foreach my $file (@ARGV) {
  if ($file =~ /\bham\b/) {
    push @{$file_sets->[1]}, $file;
  } else {
    push @{$file_sets->[0]}, $file;
  }
}

foreach my $set (@{$file_sets}) {
  @allfiles = ();
  %allbuckets = ();
  %allresults = ();

  my $settitle = '';
  if ($set->[0] =~ /^TITLE:(.*)$/) {
    $settitle = $1; shift(@{$set});
  }

  create_gp("$opt_rule $settitle");

  foreach my $file (@{$set}) {
    if (!$opt_text) {
      my $title = $file;
      $title =~ s/^.*\///;
    }
    push (@allfiles, $file);

    if (1) {
      # use an on-disk file to avoid massive VM usage for this hash
      # on huge datasets
      unlink("$tmpdir/graph.tmp.dir");
      unlink("$tmpdir/graph.tmp.pag");
      tie (%{$allresults{$file}}, 'SDBM_File', "$tmpdir/graph.tmp",
                O_RDWR|O_CREAT, 0600) or die "tie failed: $!";
    }
    else {
      %{$allresults{$file}} = ();
    }

    $this_file_results = $allresults{$file};
    read_logs($file);

    $graph_times = [];
    $graph_data = [];
    summarise();
  }

  $opt_scale_period and collapse_periods();

  plot_gp();
}

my $format = "gif";

{
  my $both = GD::Image->new($graph_x, 15 + ($graph_y * 2));
  my $file01 = GD::Image->newFromPngData($graph_png_data{"file01"}, 1);
  my $file02 = GD::Image->newFromPngData($graph_png_data{"file02"}, 1);

  if (!$file01 || !$file02) {
    warn "bad input.  leaving graph blank";
  }
  else {
    $both->copy($file01, 0, 5, 0, 0, $graph_x-1, $graph_y-1);
    $both->copy($file02, 0, 10 + $graph_y, 0, 0, $graph_x-1, $graph_y-1);
  }

  if ($opt_cgi) {
    use CGI qw(:standard);
    print header("image/$format"); binmode STDOUT;
    print STDOUT $both->$format();
  }
  else {
    open(IMG, ">both.$format") or die $!; binmode IMG;
    print IMG $both->$format();
    close IMG;
  }

  $both->gif();
}

if (!$DEBUG_TMPDIR) {
  unlink(<$tmpdir/*.*>); rmdir $tmpdir;
} else {
  system ("ls -l $tmpdir/*.* 1>&2");
}

exit;

sub summarise {
  foreach my $bucket (sort keys %allbuckets) {
    my @cols = ();
    foreach my $file (@allfiles) {
      my $res = $allresults{$file}->{$bucket};
      my $sy;
      my $sn;

      if (!$res) {
        $sn = $sy = -1;
      }
      elsif ($res !~ /^y(\d+)n(\d+)$/) {
        warn "bad results: $res for $file $bucket";
        next;
      }
      else {
        $sy = $1;
        $sn = $2;
      }

      if (!defined $sy && !defined $sn) {
        $sn = $sy = -1;
      } elsif (!defined $sy || !defined $sn) {
        # assert: enforce both < 0, if either is
        warn "oops? sy=$sy sn=$sn, should be both < 0";
        $sn = $sy = -1;
      }

      if (($sy+$sn) > 0) {
        push @cols, ($sy / ($sy + $sn)) * 100.0;
      }
      else {
        push @cols, -1;
      }
    }

    if ($opt_text) {
      print $bucket," ".join(' ',@cols)."\n";
    }
    else {
      push (@{$graph_times}, $bucket);
      push (@{$graph_data}, \@cols);
    }
  }
}


sub collapse_periods {
  while (scalar @{$graph_data} > $opt_scale_period) {
    my $num_files = (scalar @allfiles - 1);
    my $newtimes = [ ];
    my $newdata = [ ];
    my $i;
    for ($i = 0; $i < (scalar @{$graph_data}); $i += 2) {
      $newtimes->[$i >> 1] = $graph_times->[$i];
      foreach my $j (0 .. $num_files)
      {
        my $v1 = $graph_data->[$i]->[$j];
        my $v2 = $graph_data->[$i+1]->[$j];
        if (!defined $v2) { $v2 = -1; }

        if ($v1 >= 0.0 && $v2 >= 0.0) {
          # both are valid.  take their mean
          $v1 = ($v1 + $v2) / 2.0;
        }
        elsif ($v2 >= 0.0) {
          # only one is valid; use it and ignore the invalid one
          $v1 = $v2;
        }
        else {
          # we're good, v1 is the valid one anyway
        }

        $newdata->[$i >> 1]->[$j] = $v1;
      }
    }
    @{$graph_times} = @{$newtimes};
    @{$graph_data} = @{$newdata};
    $period *= 2;
  }
}


sub read_logs {
  my $file = shift;

  # limit to a range from [4 years ago, today] to avoid OOM craziness
  # from corrupt input
  #
  if ($opt_ignore_older <= 0) {
    $opt_ignore_older = 365 * 4;
  }
  my $limit_hi = time;
  my $limit_lo = $limit_hi - (24*60*60*$opt_ignore_older);

  $lastbucket = undef;
  $nextbucket = undef;
  $seen_y = 0;
  $seen_n = 0;

  if ($file =~ /\.gz$/) {
    open (IN, "gunzip -cd '$file'|") or die "cannot gunzip $file";
  } else {
    open (IN, "<$file") or die "cannot read $file";
  }

  while (<IN>) {
    next if /^#/;

    my $t;
    /\btime=(\d+),/ and $t = $1;
    next unless $t;

    if ($t < $limit_lo || $t > $limit_hi) {
      warn "ignoring out-of-range time $t (limit: $limit_lo < t < $limit_hi)";
      next;
    }

    my $found = ($_ =~ $rule_re);
    
    if (!defined $lastbucket) {
      $lastbucket = $t - ($t % $period);
      $nextbucket = $lastbucket + $period;
    }

    if ($t < $nextbucket) {
      if ($found) {
        $seen_y++;
      } else {
        $seen_n++;
      }
    }
    else {
      while ($t >= $nextbucket) {
        completeline();
        $lastbucket = $nextbucket;
        $nextbucket += $period;
      }
    }
  }
  close IN;
  completeline();
}

sub completeline {
  return unless ($lastbucket);
  $allbuckets{$lastbucket} = undef;
  $this_file_results->{$lastbucket} = "y".$seen_y."n".$seen_n;
  $seen_y = 0;
  $seen_n = 0;
}

sub create_gp {
  my $title = shift;

  my $mailtype = 'mail';
  if ($title =~ /\b(ham|spam)\b/) { $mailtype = $1; }
  my $y_label = "\%age of $mailtype in period";

  $SIG{PIPE} = sub {
            die "unexpected SIGPIPE received!";
        };

  open (GP, "| gnuplot - > $tmpdir/gp.log 2>&1") or die "cannot run gnuplot";

  # eye-candy
  my $niceperiod = "$period secs";
  if ($period % (24*60*60) == 0) {
    $niceperiod = ($period / (24*60*60))." days";
  }

  # (NOTE: -1% hitrate means no data for that time period)'
  print GP qq{

    set terminal png small \\
        interlace size $graph_x,$graph_y \\
        xffffff x444444 x33cc00 \\
        xff3300 x0000cc x99cc00 xff9900 \\
        xcccc00 x333333 x999999 x9500d3

    set out '$tmpdir/out.png'

    set grid back xtics ytics

    set xlabel 'Time, in blocks of $niceperiod'
    set xdata time
    set timefmt "%Y-%m-%d-%H"
    set format x "%04Y%02m%02d"

    set ylabel '$y_label'
    set yrange [0:*]

    set title "$title"
    set key left top Left nobox

  };
}

sub fmt_time_t {
  my $tt = shift;
  use POSIX qw(strftime);
  return strftime "%Y-%m-%d-%H", gmtime($tt);
}

sub plot_gp {
  my $num_files = (scalar @allfiles - 1);
  my $num_datapoints = (scalar @{$graph_data} - 1);

  # specify a number of alphas for Statistics::DEA.  Right now,
  # the graph is pretty unreadable with more than one.
  my $dea_alphas = [ 0.9 ];
  my $num_alphas = (scalar @{$dea_alphas} - 1);

  my $times = [ ];
  my $avgs = [ ];

  my $graphname = sprintf("file%02d", $fname_counter++);

  if (!$opt_text)
  {
    if (@{$graph_data}) {
      my $deas = ();
      foreach my $i (0 .. $num_files) {
        foreach my $a (0 .. $num_alphas) {
          $deas->[$a]->[$i] =
                Statistics::DEA->new($dea_alphas->[$a], $period * 3);
        }
      }

      foreach my $j (0 .. $num_datapoints) {
        my (@datas) = @{$graph_data->[$j]};
        $times->[$j] = fmt_time_t($graph_times->[$j]);

        foreach my $i (0 .. $num_files) {
          my $d = $datas[$i];

          foreach my $a (0 .. $num_alphas) {
            if ($d >= 0) {
              $deas->[$a]->[$i]->update($d, $j);
            }

            my $avg;
            eval {
              # this can die if it hasn't received enough data!
              # so trap with an eval.
              $avg = $deas->[$a]->[$i]->average();
            };
            $avgs->[$a]->[$j]->[$i] = (defined $avg) ? $avg : -1;
          }
        }
      }
    }

    # write the data plotfile
    open (DATA, ">$tmpdir/plot.$graphname.data") or die;
    if (@{$graph_data})
    {
      foreach my $j (0 .. $num_datapoints) {
        print DATA $times->[$j]," ",join(' ', @{$graph_data->[$j]}),"\n";
      }
    } else {
      # a fake datapoint so gnuplot doesn't puke on us
      print DATA fmt_time_t(0)," 0 0\n";
    }
    close DATA or die;


    # write the avgs plotfiles
    foreach my $a (0 .. $num_alphas) {
      open (DATA, ">$tmpdir/avgs$a.$graphname.data") or die;
      if (@{$graph_data}) {
        foreach my $j (0 .. $num_datapoints) {
          print DATA $times->[$j]," ",
              defined $avgs->[$a]->[$j] ? join ' ', @{$avgs->[$a]->[$j]} : '0',
              "\n";
        }
      } else {
        # a fake datapoint so gnuplot doesn't puke on us
        print DATA fmt_time_t(0)," 0 0\n";
      }
      close DATA or die;
    }


    # and the commands file
    my @plot = ();
    foreach my $i (0 .. $num_files) {
      my $legend = filename_to_legend ($allfiles[$i]);
      my $style = $i+1;
      my $col = $i+2;

      push @plot,
        qq{ '$tmpdir/plot.$graphname.data' using }.
            qq{ 1:(\$$col >= 0 ? \$$col : 1/0) }.
            # note: using "lt $style" gives us points in the same
            # colour as the lines in the smoothed graph below
            qq{ with points lt $style pt $style ps 1 }.
            qq{ title '$legend' };

      foreach my $a (0 .. $num_alphas) {
        push @plot,
          qq{ '$tmpdir/avgs$a.$graphname.data' using }.
              qq{ 1:(\$$col >= 0 ? \$$col : 1/0) }.
              qq{ with lines lt $style lw 3 }.
              qq{ title '  (DEA a=$dea_alphas->[$a])' };
      }
    }

    print GP "plot ",join(", ", @plot), "\n";
    close GP
        or warn "gnuplot command exited: $?";

    $graph_png_data{$graphname} = readfile("$tmpdir/out.png");
  }
}

sub readfile {
  open (IN, "<$_[0]") or die "cannot read $_[0]";
  binmode IN;
  my $str = join('',<IN>);
  close IN;
  return $str;
}

sub filename_to_legend {
  my $f = shift;

  $f =~ s/^.*\///;
  $f =~ s/LOGS\.all-//;
  $f =~ s/\.log\.\S+$//;
  return $f;
}
