#!perl
use strict;
use warnings;

package histify;
our $VERSION = '1.05';

use Getopt::Long qw(GetOptions);
use Pod::Usage qw(pod2usage);
use Math::SimpleHisto::XS;

GetOptions(
  my $opt = {},
  'nbins=i',
  'min=f',
  'max=f',
  'desc=s',
  'help|h',
  'xw',
  'cumulative',
  'dump_as_input|dump-as-input|dumpasinput|d',
  'dump:s',
  'rebin=i',
  'random:i',
  'pipe',
  'man',
  'soot', # This is undocumented on purpose. Use at your own risk.
);

pod2usage({-verbose => 2}) if $opt->{man};
pod2usage({-verbose => 0}) if $opt->{help};
if ($opt->{dump_as_input} and grep exists($opt->{$_}), qw(xw max min nbins random)) {
  pod2usage({
    -exitval => 1,
    -verbose => 0,
    -message => '--dump-as-input is not compatible with the --xw, --max, --min, --nbins or --random options'
  });
}
$opt->{nbins} = 10 if not defined $opt->{nbins} and not $opt->{dump_as_input};
$opt->{dump} = $opt->{dump_as_input} = 1 if $opt->{pipe};
$opt->{random} = 1000 if exists($opt->{random})
                      and not $opt->{random};

=pod

=head1 NAME

histify - generate simple histograms from streamed data

=head1 SYNOPSIS

  generator | histify [--nbins=X] [--min=X] [--max=X] \
                      [--cumulative] \
                      [--desc=<center|left|right|number|range|none>] \
                      [--xw] [--dump-as-input] [--dump] [--pipe] \
                      [--rebin=X] [--random=X]

Reads whitespace-separated numbers from STDIN and generates a
histogram. If no histogram boundaries are specified using
options, the number of bins defaults to 10 and the min/max are
extracted from the data. That means reading all data into
memory. If you specify min/max, the program works with constant
memory overhead.

Prints the resulting histogram contents one bin per line.

Using --desc=<type> adds an extra column to the output before the
histogram content (separated by a tab) that can be any one of:
The bin "number", the bin "center", the "left" bin boundary,
the "right" bin boundary, or the bin "range" (lower and upper
boundary separated by a comma).

The --xw option will cause histify to read alternating X values
and weights instead of just X values from STDIN. This is useful
for re-binning partially aggregated input data.

The --dump-as-input (or -d) option indicates that the input will
not be of the form outlined above, but instead be the dump of a
L<Math::SimpleHisto::XS> histogram of any format supported by the
module. At this time, this option is not compatible with the
C<--xw, --max, --min, --nbins> options. The --dump option
changes the output from a TSV format to a JSON dump that will
be readable with --dump-as-input. The --pipe option enables both
--dump-as-input and --dump.
When the --dump-as-input option is enabled, then each line on STDIN
may contain a histogram dump. If there is more than one, then
histify will attempt to add the histograms. They must contain data in
identical binning.

The --cumulative option causes C<histify> to calculate the cumulative
histogram of the input.

The --rebin option causes C<histify> to rebin the histogram after
the fact by a given factor which must be a divisor of the original
number of bins.

The --random option makes C<histify> create a new histogram with the
supplied parameters (default: 10 bins between 0 and 1) and the
provided number of random fills (default: 1000).

=cut

my $readall = (!defined($opt->{min}) || !defined($opt->{max}));

my $hist;
my $pos_weight = $opt->{xw};

# We don't know either min or max or neither
if ($opt->{dump_as_input}) {
  my $tmphist;
  #require Math::SimpleHisto::XS::Named; # TODO implement & test using this
  while (my $dump = <STDIN>) {
    next if not $dump =~ /\S/;
    foreach my $type (qw(json yaml simple)) {
      eval {$tmphist = Math::SimpleHisto::XS->new_from_dump($type, $dump);};
      last if defined $tmphist;
    }
    if (defined $tmphist) {
      if ($hist) { $hist->add_histogram($tmphist) }
      else { $hist = $tmphist }
    }
  }
  die "Could not recreate histogram from input histogram dump string"
    if not defined $hist;
}
elsif ($opt->{random}) {
  $opt->{min} ||= 0;
  $opt->{max} ||= 1;
  $hist = Math::SimpleHisto::XS->new(
    min => $opt->{min},
    max => $opt->{max},
    nbins => $opt->{nbins},
  );
  my $min = $hist->min;
  my $width = $hist->width;
  $hist->fill($min + rand($width)) for 1..$opt->{random};
}
else {
  if ($readall) {
    my (@d, @w);
    my $i = 0;
    while (<STDIN>) {
      chomp;
      s/^\s+//; s/\s+$//;
      if ($pos_weight) {
        push @{ (++$i % 2) ? \@d : \@w }, $_ for split " ", $_;
      }
      else {
        push @d, split " ", $_;
      }
    }
    exit(0) if not @d;
    my ($min, $max) = minmax(@d);
    $opt->{min} = $min if not defined $opt->{min};
    $opt->{max} = $max if not defined $opt->{max};

    $hist = Math::SimpleHisto::XS->new(map {$_ => $opt->{$_}} qw(nbins min max));
    $hist->fill($pos_weight ? (\@d, \@w) : (\@d));
  }
  else { # we have proper histogram boundaries
    use constant BATCHSIZE => 1000;
    $hist = Math::SimpleHisto::XS->new(map {$_ => $opt->{$_}} qw(nbins min max));
    my (@d, @w);
    my $i = 0;
    while (<STDIN>) {
      chomp;
      my @row = split " ", $_;
      if ($pos_weight) {
        push @{ (++$i % 2) ? \@d : \@w }, $_ for split " ", $_;
      }
      else {
        push @d, split " ", $_;
      }
      if (@d >= BATCHSIZE) {
        my $tmp;
        $tmp = pop(@w) if @d != @w;
        $hist->fill($pos_weight ? (\@d, \@w) : (\@d));
        @d = (); @w = (defined($tmp) ? ($tmp) : ());
      }
    }
    $hist->fill($pos_weight ? (\@d, \@w) : (\@d)) if @d;
  }
}

$hist = $hist->rebin($opt->{rebin}) if $opt->{rebin};

$hist = $hist->cumulative() if $opt->{cumulative};

if ($opt->{soot}) {
  my $h = $hist->to_soot;
  my $cv = TCanvas->new;
  $h->Draw();
  my $app = $SOOT::gApplication = $SOOT::gApplication; # silence warnings
  $app->Run();
  exit;
}

if (exists $opt->{dump}) {
  my $type = $opt->{dump};
  $type = 'json' if not defined $type or $type eq '';
  print $hist->dump($type),"\n";
  exit(0);
}

my $desc = lc($opt->{desc}||'');
if ($desc eq '' or $desc eq 'none') {
  print "$_\n" for @{ $hist->all_bin_contents };
}
elsif ($desc =~ /^(?:center|number|left|right|range)$/) {
  my $content = $hist->all_bin_contents;
  my $descriptions;
  if    ($desc eq 'center') { $descriptions = $hist->bin_centers; }
  elsif ($desc eq 'number') { $descriptions = [0..$hist->nbins-1]; }
  elsif ($desc eq 'left')   { $descriptions = $hist->bin_lower_boundaries; }
  elsif ($desc eq 'right')  { $descriptions = $hist->bin_upper_boundaries; }
  elsif ($desc eq 'range')  { $descriptions = [ map $hist->bin_lower_boundary($_).",".$hist->bin_upper_boundary($_), 0..($hist->nbins-1) ] }

  foreach my $bin (0..$hist->nbins-1) {
    print "$descriptions->[$bin]\t$content->[$bin]\n";
  }
}
else {
  die "Invalid description mode";
}



sub minmax {
  my ($min, $max);
  for (@_) {
    $min = $_ if not defined $min or $min > $_;
    $max = $_ if not defined $max or $max < $_;
  }
  return($min, $max);
}

