#!perl
use strict;
use warnings;
use Getopt::Long qw(GetOptions);
use Pod::Usage qw(pod2usage);
use Math::SimpleHisto::XS;

GetOptions(
  my $opt = {},
  'nbins=i',
  'min=f',
  'max=f',
  'desc=s',
  'help|h',
  'xw',
  'cumulative',
  'dump_as_input|dump-as-input|dumpasinput|d',
  'dump:s',
  'man',
  'soot', # This is undocumented on purpose. Use at your own risk.
);

pod2usage({-verbose => 2}) if $opt->{man};
pod2usage({-verbose => 0}) if $opt->{help};
if ($opt->{dump_as_input} and grep $opt->{$_}, qw(xw max min nbins)) {
  pod2usage({
    -exitval => 1,
    -verbose => 0,
    -message => '--dump-as-input is not compatible with the --xw, --max, --min, or --nbins options'
  });
}
$opt->{nbins} = 10 if not defined $opt->{nbins} and not $opt->{dump_as_input};

=pod

=head1 NAME

histify - generate simple histograms from streamed data

=head1 SYNOPSIS

  generator | histify [--nbins=X] [--min=X] [--max=X] \
                      [--cumulative] \
                      [--desc=<center|left|right|number|none>] \
                      [--xw] [--dump-as-input]

Reads whitespace-separated numbers from STDIN and generates a
histogram. If no histogram boundaries are specified using
options, the number of bins defaults to 10 and the min/max are
extracted from the data. That means reading all data into
memory. If you specify min/max, the program works with constant
memory overhead.

Prints the resulting histogram contents one bin per line.

Using --desc=<type> adds an extra column to the output before the
histogram content (separated by a tab) that can be any one of:
The bin "number", the bin "center", the "left" bin boundary, or
the "right" bin boundary.

The --xw option will cause histify to read alternating X values
and weights instead of just X values from STDIN. This is useful
for re-binning partially aggregated input data.

The --dump-as-input (or -d) option indicates that the input will
not be of the form outlined above, but instead be the dump of a
L<Math::SimpleHisto::XS> histogram of any format supported by the
module. At this time, this option is not compatible with the
C<--xw, --max, --min, --nbins> options.

The --cumulative option causes C<histify> to calculate the cumulative
histogram of the input.

=cut

my $readall = (!defined($opt->{min}) || !defined($opt->{max}));

my $hist;
my $pos_weight = $opt->{xw};

# We don't know either min or max or neither
if ($opt->{dump_as_input}) {
  my $dump = join '', <STDIN>;
  #require Math::SimpleHisto::XS::Named; # TODO implement & test using this
  foreach my $type (qw(json yaml simple native_pack)) {
    binmode(STDIN) if $type eq 'native_pack';
    eval {$hist = Math::SimpleHisto::XS->new_from_dump($type, $dump);};
    last if defined $hist;
  }
  die "Could not recreate histogram from input histogram dump string"
    if not defined $hist;
}
else {
  if ($readall) {
    my (@d, @w);
    my $i = 0;
    while (<STDIN>) {
      chomp;
      s/^\s+//; s/\s+$//;
      if ($pos_weight) {
        push @{ (++$i % 2) ? \@d : \@w }, $_ for split " ", $_;
      }
      else {
        push @d, split " ", $_;
      }
    }
    exit(0) if not @d;
    my ($min, $max) = minmax(@d);
    $opt->{min} = $min if not defined $opt->{min};
    $opt->{max} = $max if not defined $opt->{max};

    $hist = Math::SimpleHisto::XS->new(map {$_ => $opt->{$_}} qw(nbins min max));
    $hist->fill($pos_weight ? (\@d, \@w) : (\@d));
  }
  else { # we have proper histogram boundaries
    use constant BATCHSIZE => 1000;
    $hist = Math::SimpleHisto::XS->new(map {$_ => $opt->{$_}} qw(nbins min max));
    my (@d, @w);
    my $i = 0;
    while (<STDIN>) {
      chomp;
      my @row = split " ", $_;
      if ($pos_weight) {
        push @{ (++$i % 2) ? \@d : \@w }, $_ for split " ", $_;
      }
      else {
        push @d, split " ", $_;
      }
      if (@d >= BATCHSIZE) {
        my $tmp;
        $tmp = pop(@w) if @d != @w;
        $hist->fill($pos_weight ? (\@d, \@w) : (\@d));
        @d = (); @w = (defined($tmp) ? ($tmp) : ());
      }
    }
    $hist->fill($pos_weight ? (\@d, \@w) : (\@d)) if @d;
  }
}

$hist = $hist->cumulative() if $opt->{cumulative};

if ($opt->{soot}) {
  my $h = $hist->to_soot;
  my $cv = TCanvas->new;
  $h->Draw();
  my $app = $SOOT::gApplication = $SOOT::gApplication; # silence warnings
  $app->Run();
  exit;
}

if (exists $opt->{dump}) {
  my $type = $opt->{dump};
  $type = 'json' if not defined $type or $type eq '';
  print $hist->dump($type),"\n";
  exit(0);
}

my $desc = lc($opt->{desc}||'');
if ($desc eq '' or $desc eq 'none') {
  print "$_\n" for @{ $hist->all_bin_contents };
}
elsif ($desc =~ /^(?:center|number|left|right)$/) {
  my $content = $hist->all_bin_contents;
  my $descriptions;
  if    ($desc eq 'center') { $descriptions = $hist->bin_centers; }
  elsif ($desc eq 'number') { $descriptions = [0..$hist->nbins-1]; }
  elsif ($desc eq 'left')   { $descriptions = $hist->bin_lower_boundaries; }
  elsif ($desc eq 'right')  { $descriptions = $hist->bin_upper_boundaries; }

  foreach my $bin (0..$hist->nbins-1) {
    print "$descriptions->[$bin]\t$content->[$bin]\n";
  }
}
else {
  die "Invalid description mode";
}



sub minmax {
  my ($min, $max);
  for (@_) {
    $min = $_ if not defined $min or $min > $_;
    $max = $_ if not defined $max or $max < $_;
  }
  return($min, $max);
}

