#!perl
use strict;
use warnings;

package drawasciihist;
our $VERSION = '1.05';
use Getopt::Long qw(GetOptions);
use Term::Size ();
use Pod::Usage qw(pod2usage);

GetOptions(
  my $opt = {},
  'sort',
  'min=s',
  'max=s',
  'width|w=i',
  'delimiter|d=s',
  'numeric-format|nf=s',
  'style=s',
  'timestamp|ts',
  'log',
  'man',
  'help|h',
);

pod2usage({-verbose => 2}) if $opt->{man};
pod2usage({-verbose => 0}) if $opt->{help};

=pod

=head1 NAME

drawasciihist - draw simple text histograms

=head1 SYNOPSIS

  generator | drawasciihist [--sort] [--width=<ncols>] [--style=<ident>]
                            [--numeric-format|nf=<printf format>]
                            [--delimiter|d=<regexp>] [--timestamp|ts]
                            [--min=X] [--max=X]

Together with the C<histify> tool, this can easily produce simple
visualizations for quickly analyzing simple data in a console. Run the
script with the C<--man> option to see an example.

The tool expects to read histogram data from STDIN, one bin per line.
If a given input line has two or more columns (see the C<--delimiter>
option), the first column will be used as the bin name, the second as
the numeric bin content. If an input line has only one column, the
bin description will be the bin number.

By default, the output histogram will have a range from 0 to the maximum
value in any bin. You can change the histogram range with the C<--min=X>
and C<--max=X> options, providing numeric values for minimum and maximum.
The C<--max> option also allows special values of C<auto> and C<total>.
C<auto> is the default behaviour of --max and corresponds to the maximum
bin content. C<--max=total> will use the sum of all histogram contents
instead. The C<--min> option only understands C<auto> which forces it to
use the smallest bin content as the lower display boundary.

The output histogram width is determined automatically from your
terminal size, if any. Otherwise assumes 80 columns. You can set the
width explicitly using C<--width=ncols>. The C<--sort> option sorts
the bins by content instead of input order.

If the C<--numeric-format> option is present, then the actual numeric
value is included in the histogram using the given C<printf> format.
For positive integers, you would use C<--nf='%u'>, for signed integers,
use C<--nf='%i'> and for fixed precision floats, you can use something
like C<--nf='%.2f'>.

The delimiter for splitting an input line into columns defaults to
any whitespace. You can change that by supplying a Perl regular
expression to the C<--delimiter=...> option.

You can choose the character to be used for drawing histograms with the
C<--style=[character]> option. The characters '-', '=', '~' are special
cased to use an arrow-like appearance.

The C<--timestamp> option will case bin descriptions to be passed
through C<localtime()> to convert from Unix timestamps to
human-readable time strings.

The C<--log> option draws the histogram on a logarithmic scale. It is
advised to use the C<--min> option to specify an explicit minimum in
this case, as the logarithm of C<0> is not drawable. If no C<--min>
option is present, the C<--log> option will change the default minimum
from C<0> to C<--min=auto>.

=head1 EXAMPLE OUTPUT

  $ perl -e 'print $_*(rand()), "\n" for 1..100000' \
          | histify | drawasciihist -nf='%u' -w=70

   1:   33155|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~>|
   2:   19010|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~>                         |
   3:   13972|~~~~~~~~~~~~~~~~~~~~~~~>                                  |
   4:   10476|~~~~~~~~~~~~~~~~~>                                        |
   5:    7919|~~~~~~~~~~~~>                                             |
   6:    5959|~~~~~~~~~>                                                |
   7:    4446|~~~~~~>                                                   |
   8:    2913|~~~~>                                                     |
   9:    1611|~>                                                        |
  10:     538|                                                          |

=cut


my $input_col_delimiter = "\\s+";
$input_col_delimiter = qr/$opt->{delimiter}/ if defined($opt->{delimiter});
$opt->{style} = '~' if not defined $opt->{style};

# Determine the style to use for drawing the histogram
my %styles = (
  '-' => {character => '-', end_character => '>'},
  '=' => {character => '=', end_character => '>'},
  '~' => {character => '~', end_character => '>'},
);

if (not exists $styles{$opt->{style}}) {
  if (length($opt->{style}) == 1) {
    $styles{$opt->{style}} = {character => $opt->{style}, end_character => $opt->{style}};
  }
  else {
    die "Invalid histogram style '$opt->{style}'. Valid styles: '"
        . join("', '", keys %styles), "' and any single character.\n";
  }
}

my $styledef = $styles{$opt->{style}};

# read all input
my @rows;
my $rowcount = 0;
while (<STDIN>) {
  ++$rowcount;
  chomp;
  s/^\s+//;
  my @columns = split /$input_col_delimiter/o, $_;
  next if @columns == 0;
  if (@columns == 1) {
    push @rows, [$rowcount, $columns[0]];
  } else {
    push @rows, [@columns[0,1]];
  }
}

my $show_numeric = defined($opt->{"numeric-format"});
my $numeric_format = $show_numeric ? $opt->{"numeric-format"} : "%.2f";

my $convert_timestamps = $opt->{timestamp};


# extract min/max/width info from input data
my $logscale = $opt->{log};
# The $v_ prefixed variables below refer to "visible" widths in columns.
my $v_desc_width = 0;
my $v_numeric_value_width  = 0;
my $hist_total = 0;
my ($hist_max, $hist_min);
foreach my $row (@rows) {
  my ($description, $value) = @$row;
  $row->[0] = $description = localtime(int($description)) if $convert_timestamps;

  my $formatted_value = sprintf($numeric_format, $value);

  $v_desc_width = length($description) if length($description) > $v_desc_width;
  $v_numeric_value_width  = length($formatted_value) if length($formatted_value) > $v_numeric_value_width;
  $hist_min = $value if !defined $hist_min or $value < $hist_min;
  $hist_max = $value if !defined $hist_max or $value > $hist_max;
  $hist_total += $value;
  # extend each row by the formatted numeric value -- just in case.
  push @$row, $show_numeric ? $formatted_value : '';
}

# sort by value if desired
@rows = sort {$a->[1] <=> $b->[1]} @rows if $opt->{sort};

# figure out output width
my ($terminal_columns, $terminal_rows);
if (-t *STDOUT) {
  ($terminal_columns, $terminal_rows) = Term::Size::chars(*STDOUT{IO});
}
else {
  $terminal_columns = 80;
}

my $v_total_width = $opt->{width} || $terminal_columns-2;

if ($v_total_width < $v_desc_width + 3) {
  warn "Terminal or desired width is insufficient.\n";
  $v_total_width = $v_desc_width + 3;
}

$v_numeric_value_width = $show_numeric ? $v_numeric_value_width+2 : 0;
# The total output width is comprised of the bin description, possibly
# the width of the numeric bin content, and the width of the actual
# histogram.
my $v_hist_width = $v_total_width - $v_desc_width - $v_numeric_value_width - 3;

# figure out the range of values in the visible part of the histogram
my $min_display_value = $opt->{min} || 0;
if ($min_display_value =~ /^auto$/i) {
  $min_display_value = $hist_min;
}
$min_display_value = log($min_display_value||$hist_min*0.99||1e-9) if $logscale;

my $max_display_value = $opt->{max};
if (not defined $max_display_value or $max_display_value =~ /^auto$/) {
  $max_display_value = $hist_max;
}
elsif ($max_display_value =~ /^total$/i) {
  $max_display_value = $hist_total;
}
$max_display_value = log($max_display_value) if $logscale;

my $display_value_range = $max_display_value - $min_display_value;

# format the output
my $format = "%${v_desc_width}s: %${v_numeric_value_width}s|%-${v_hist_width}s|\n";
my $hchar_body = $styledef->{character};
my $hchar_end = $styledef->{end_character};
my $hchar_end_len = length($hchar_end);

# The actual output loop
foreach my $row (@rows) {
  my ($desc, $value, $formatted_value) = @$row;
  $value = log($value) if $logscale;

  my $hlen = int(($value-$min_display_value) / $display_value_range * $v_hist_width);
  $hlen = 0 if $hlen < 0;
  $hlen = $v_hist_width if $hlen > $v_hist_width;

  if ($hlen >= $hchar_end_len) {
    printf($format, $desc, $formatted_value, ($hchar_body x ($hlen-$hchar_end_len)) . $hchar_end);
  }
  else {
    printf($format, $desc, $formatted_value, ($hchar_body x $hlen));
  }
}

