#!perl

our $DATE = '2018-05-01'; # DATE
our $VERSION = '0.006'; # VERSION

use 5.010001;
use strict;
use warnings;

use Getopt::Long qw(:config bundling no_ignore_case);

my %Opts = (
    action => 'rank',
    ignore_leading_blanks => 0,
    calc_percentile => [],
    reverse => 0,
    sort => 'ascii',
    field_separator => "\t",
    sort_field => 0,
    show_rank => 1,
    show_percentile => 0,
    rank => 'default',
    # TODO: --dictionary-order, -d
    # TODO: --ignore-nonprinting, -i
    # TODO: --human-numeric-sort, -h
    # TODO: --version-sort, -V
    # TODO: --percentile-format=s
);

sub parse_cmdline {
    my $res = GetOptions(
        'action=s' => \$Opts{action},
        'calc-percentile=s' => sub {
            $Opts{action} = 'calc-percentile';
            $Opts{calc_percentile} = [split /\s*,\s*/, $_[1]];
        },
        'ignore-leading-blanks|b' => \$Opts{ignore_leading_blanks},
        'ignore-case|f' => \$Opts{ignore_case},
        'reverse|r' => \$Opts{reverse},
        'field-separator|t=s' => \$Opts{field_separator},
        'numeric-sort|n' => sub { $Opts{sort} = 'numeric' },
        'sort=s' => \$Opts{sort},
        'sort-field=i' => \$Opts{sort_field},
        'rank=s' => \$Opts{rank},
        'show-rank!' => \$Opts{show_rank},
        'show-percentile!' => \$Opts{show_percentile},
        '-p' => sub { $Opts{show_percentile} = 1 },
        'help|h' => sub {
            print <<USAGE;
Usage:
  rank [OPTIONS]... [INPUT]...
  rank --help

For more details, see the manpage/documentation.
USAGE
            exit 0;
        },
        'version|v' => sub {
            no warnings 'once';
            say "rank version ".($main::VERSION // 'dev');
            exit 0;
        },
    );
    exit 99 if !$res;
}

sub run {
    my $fs = $Opts{field_separator};

    my @data; # elem: [sortkey, $orig_line]
    my $re_split_fields = qr/\Q$fs\E/;
    while (defined (my $line = <>)) {
        if ($Opts{ignore_leading_blanks}) {
            $line =~ s/\A\s+//;
        }
        my @fields = split $re_split_fields, $line;
        my $sortkey = $fields[$Opts{sort_field}] // '';
        if ($Opts{ignore_case}) { $sortkey = lc $sortkey };
        push @data, [$sortkey, $line];
    }

    my $sortsub;
    if ($Opts{sort} eq 'numeric') {
        if ($Opts{reverse}) {
            $sortsub = sub { $_[0] <=> $_[1] };
        } else {
            $sortsub = sub { $_[1] <=> $_[0] };
        }
    } else {
        if ($Opts{reverse}) {
            $sortsub = sub { $_[1] cmp $_[0] };
        } else {
            $sortsub = sub { $_[0] cmp $_[1] };
        }
    }

    @data = sort { $sortsub->($a->[0], $b->[0]) } @data;

    # exact percentiles to calculate
    my @percentiles;
    for (@{ $Opts{calc_percentile} }) {
        my $p = $_+0;
        die "rank: Invalid percentile '$_', must be (0,100]\n"
            unless $p > 0 && $p <= 100;
        push @percentiles, $p;
    }
    @percentiles = sort {$b <=> $a} @percentiles;

    # if we're using the 'no-skip' ranking system, we'll need to calculate
    # the lowest ranking (highest number)first
    my $lowest_rank;
    if ($Opts{rank} eq 'no-skip') {
        my $prev_sortkey;
        for my $row_num (0..$#data) {
            my $item = $data[$row_num];
            my $sortkey = $item->[0];
            if (!defined($prev_sortkey) ||
                    $sortsub->($sortkey, $prev_sortkey) > 0) {
                $lowest_rank //= 0;
                $lowest_rank++;
            }
            $prev_sortkey = $sortkey;
        }
    }

    my $rank;
    my $prev_sortkey;
    my $prev_percentile;
    for my $row_num (0..$#data) {
        my $item = $data[$row_num];
        my $sortkey = $item->[0];

        my $percentile;
        if ($Opts{rank} eq 'default') {
            if (!defined($prev_sortkey) ||
                    $sortsub->($sortkey, $prev_sortkey) > 0) {
                $rank = $row_num+1;
            }
            $percentile = (@data - $rank + 1)/@data * 100;
        } elsif ($Opts{rank} eq 'no-skip') {
            if (!defined($prev_sortkey) ||
                    $sortsub->($sortkey, $prev_sortkey) > 0) {
                $rank //= 0;
                $rank++;
            }
            $percentile = ($lowest_rank - $rank+1) / $lowest_rank * 100;
        } else { # no-same
            $rank = $row_num+1;
            $percentile = (@data - $rank + 1)/@data * 100;
        }

        if ($Opts{action} eq 'rank') {
            print $rank, $fs if $Opts{show_rank};
            printf "%.3f%s", $percentile, $fs
                if $Opts{show_percentile};
            print $item->[1];
        } elsif ($Opts{action} eq 'calc-percentile') {
            while (1) {
                last unless @percentiles;
                if ($percentile == $percentiles[0]) {
                    printf "%.3f\t%.8f\n", $percentile, $sortkey;
                    shift @percentiles;
                } elsif ($percentile < $percentiles[0] || $row_num == $#data) {
                    if (defined $prev_percentile) {
                        # linear interpolation
                        my $val = $prev_sortkey +
                            ($percentiles[0]-$prev_percentile) / ($percentile-$prev_percentile) * ($sortkey - $prev_sortkey);
                        printf "%.3f\t%.8f\n", $percentiles[0], $val;
                    } else {
                        printf "%.3f\t%s\n", $percentile, "N/A";
                    }
                    shift @percentiles;
                } else {
                    last;
                }
            }
            $prev_percentile = $percentile;
        }

        $prev_sortkey = $sortkey;
    }
}

# MAIN

parse_cmdline();
run();

1;
# ABSTRACT: Rank lines of text
# PODNAME: rank

__END__

=pod

=encoding UTF-8

=head1 NAME

rank - Rank lines of text

=head1 VERSION

This document describes version 0.006 of rank (from Perl distribution App-rank), released on 2018-05-01.

=head1 SYNOPSIS

 rank [OPTION]... [FILE]...

=head1 DESCRIPTION

C<rank> ranks lines of text, by default using the first field as the sort key
(can be changed with C<--sort-field> option). When there are multiple lines that
have the same sort key, C<rank> will assign the same rank to the lines. Finally,
the lines will be displayed with the ranks, starting from the highest ranked
(1).

Sample input:

 21    ujang
 30    budi
 50    atang
 75    robi
 89    parjiyem
 77    nono
 75    tedi

Sample output using C<rank -n> (a.k.a. C<rank --numeric-sort>):

 1    89    parjiyem
 2    77    nono
 3    75    robi
 3    75    tedi
 5    50    atang
 6    30    budi
 7    21    ujang

Sample output using C<rank -np> (a.k.a. C<rank --numeric-sort
--show-percentile>):

 1    100.000   89    parjiyem
 2    85.714    77    nono
 3    71.429    75    robi
 3    71.429    75    tedi
 5    42.857    50    atang
 6    28.571    30    budi
 7    14.286    21    ujang

Sample output using C<rank -n --calc-percentile 5,25,50,75,95>):

 95.000  84.80000000
 75.000  75.50000000
 50.000  56.25000000
 25.000  27.75000000
 5.000   15.15000000

=head1 EXIT CODES

0 on success.

255 on I/O error.

99 on command-line options error.

=head1 OPTIONS

=over

=item * --action=s (default: rank)

Valid values: C<rank>, C<calc-percentile>

The default action is to show ranking (C<rank>).

Action C<calc-percentile> will calculate specific percentiles. Usually you just
specify C<--calc-percentile a,b,c,...> to set action to C<calc-percentile>.

=item * --calc-percentile=f1,f2,...

Imply C<--action=calc-percentile>. Calculate specific percentile(s). Will use
linear interpolation to get the percentile values.

=item * --reverse, -r

=item * --ignore-leading-blanks, -b

=item * --ignore-case, -i

=item * --field-separator, -f (default: Tab)

=item * --sort=s

Valid values: C<numeric>, C<ascii>.

=item * --numeric-sort, -n

Shortcut for C<--sort=numeric>. By default numeric sort order is descending
instead of ascending. To do ascending sort, add C<-r>.

=item * --sort-field=i (default: 0)

=item * rank=s (default: default)

Change ranking system. The default is to have all lines that have the same sort
keys to be ranked the same, but the next ranking will skip numbers:

 1    89    parjiyem
 2    77    nono
 3    75    robi
 3    75    tedi
 5    30    budi
 6    21    ujang

The C<no-skip> ranking system will not do these skips:

 1    89    parjiyem
 2    77    nono
 3    75    robi
 3    75    tedi
 4    30    budi
 5    21    ujang

The C<no-same> ranking system will assign different ranks to lines that have the
same sort key:

 1    89    parjiyem
 2    77    nono
 3    75    robi
 4    75    tedi
 5    30    budi
 6    21    ujang

=item * --no-show-rank

=item * --show-percentile, -p

=item * --help, -h

=item * --version, -v

=back

=head1 FAQ

=head1 HOMEPAGE

Please visit the project's homepage at L<https://metacpan.org/release/App-rank>.

=head1 SOURCE

Source repository is at L<https://github.com/perlancar/perl-App-rank>.

=head1 BUGS

Please report any bugs or feature requests on the bugtracker website L<https://rt.cpan.org/Public/Dist/Display.html?Name=App-rank>

When submitting a bug or request, please include a test-file or a
patch to an existing test-file that illustrates the bug or desired
feature.

=head1 SEE ALSO

B<sort>

=head1 AUTHOR

perlancar <perlancar@cpan.org>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2018 by perlancar@cpan.org.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut
