package Statistics::Multtest;

use List::Vectorize;
use Carp;
use strict;

require Exporter;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw(bonferroni holm hommel hochberg BH BY);
our %EXPORT_TAGS = (all => [qw(bonferroni holm hommel hochberg BH BY)]);

our $VERSION = '0.11';

1;

sub initial {
	my $p = shift;
	
	if(!defined(ref($p))) {
		croak "ERROR: P-values should be array ref or hash ref.\n";
	}
	
	unless(ref($p) eq "ARRAY" or ref($p) eq "HASH") {
		croak "ERROR: P-values should be array ref or hash ref.\n";
	}
	
	my $name = [];
	if(ref($p) eq "HASH") {
		$name = [ keys %{$p} ];
		$p = [ values %{$p} ];
	}
	
	if(max($p) > 1 or min($p) < 0) {
		croak "ERROR: P-values should between 0 and 1.\n";
	}
	
	return ($name, $p);
}


sub get_result {
	my ($adjp, $name) = @_;
	
	if(is_empty($name)) {
		return $adjp;
	}
	else {
		my $result;
		for (0..$#$name) {
			$result->{$name->[$_]} = $adjp->[$_];
		}
		return $result;
	}
}

sub bonferroni {
    my $p = shift;
    my $name;
    ($name, $p) = initial($p);
    my $adjp = _bonferroni($p);
	return get_result($adjp, $name);
}

sub holm {
    my $p = shift;
    my $name;
    ($name, $p) = initial($p);
    my $adjp = _holm($p);
	return get_result($adjp, $name);
}

sub hommel {
    my $p = shift;
    my $name;
    ($name, $p) = initial($p);
    my $adjp = _hommel($p);
	return get_result($adjp, $name);
}

sub hochberg {
    my $p = shift;
    my $name;
    ($name, $p) = initial($p);
    my $adjp = _hochberg($p);
	return get_result($adjp, $name);
}

sub BH {
    my $p = shift;
    my $name;
    ($name, $p) = initial($p);
    my $adjp = _BH($p);
	return get_result($adjp, $name);
}

sub BY {
    my $p = shift;
    my $name;
    ($name, $p) = initial($p);
    my $adjp = _BY($p);
	return get_result($adjp, $name);
}

# R code: pmin(1, n * p)
sub _bonferroni {
    my $p = shift;
    my $n = len($p);
    
    my $adjp = multiply($n, $p);
    
    return pmin(1, $adjp);
}

# R code: i = 1:n
#         o = order(p)
#         ro = order(o)
#         pmin(1, cummax((n - i + 1) * p[o]))[ro]
sub _holm {
    my $p = shift;
    my $n = len($p);
	
	my $i = seq(1, $n);
	my $o = order($p);
	my $ro = order($o);
    
    my $adjp = multiply(minus($n + 1, $i), subset($p, $o));
    $adjp = cumf($adjp, \&max);
	$adjp = pmin(1, $adjp);
    
    return subset($adjp, $ro);
}

# R code: i = 1:n
#         o = order(p)
#         p = p[o]
#         ro = order[o]
#         q = pa = rep.int(min(n * p/i), n)
#         for (j in (n - 1):2) {
#             ij = 1:(n - j + 1)
#             i2 = (n - j + 2):n
#             q1 <- min(j * p[i2]/(2:j))
#             q[ij] <- pmin(j * p[ij], q1)
#             q[i2] <- q[n - j + 1]
#             pa <- pmax(pa, q)
#         }
#         pmax(pa, p)[ro]
sub _hommel {
    my $p = shift;
    my $n = len($p);
	
	my $i = seq(1, $n);
	my $o = order($p);
	$p = subset($p, $o);
	my $ro = order($o);
	
	my $pa = rep(min(divide(multiply($n, $p), $i)), $n);
	my $q = copy($pa);
	
	# set the first index as 1
    unshift(@$p, 0);
    unshift(@$q, 0);
    unshift(@$pa, 0);
	
	my $ij;
	my $i2;
	my $q1;
    for my $j (@{seq($n - 1, 2)}) {
        
		$ij = seq(1, $n - $j + 1);
		$i2 = seq($n - $j + 2, $n);
		$q1 = min(divide(multiply($j, subset($p, $i2)), seq(2, $j)));
		subset_value($q, $ij, pmin(multiply($j, subset($p, $ij)), $q1));
		subset_value($q, $i2, $q->[$n - $j + 1]);
        $pa = pmax($pa, $q);
    }
    
    shift(@$p);
    shift(@$q);
    shift(@$pa);
	
	my $adjp = pmax($pa, $p);
	return subset($adjp, $ro);    
}

# R code: i = n:1
#         o <- order(p, decreasing = TRUE)
#         ro <- order(o)
#         pmin(1, cummin((n - i + 1) * p[o]))[ro]
sub _hochberg {
    
    my $p = shift;
    my $n = len($p);
    my $i = seq($n, 1);
    
    my $o = order($p, sub {$_[1] <=> $_[0]});
	my $ro = order($o);
	
    my $adjp = multiply(minus($n+1, $i), subset($p, $o));
    $adjp = cumf($adjp, \&min);
	$adjp = pmin(1, $adjp);
    return subset($adjp, $ro);
}

# R code: i <- n:1
#         o <- order(p, decreasing = TRUE)
#         ro <- order(o)
#         pmin(1, cummin(n/i * p[o]))[ro]
sub _BH {
    my $p = shift;
    my $n = len($p);
    my $i = seq($n, 1);
    
    my $o = order($p, sub {$_[1] <=> $_[0]});
	my $ro = order($o);
	
    my $adjp = multiply(divide($n, $i), subset($p, $o));
    $adjp = cumf($adjp, \&min);
	$adjp = pmin(1, $adjp);
    return subset($adjp, $ro);

}

# R code: i <- n:1
#         o <- order(p, decreasing = TRUE)
#         ro <- order(o)
#         q <- sum(1/(1L:n))
#         pmin(1, cummin(q * n/i * p[o]))[ro]
sub _BY {
    
    my $p = shift;
    my $n = len($p);
    my $i = seq($n, 1);
    
    my $o = order($p, sub {$_[1] <=> $_[0]});
	my $ro = order($o);
	
    my $q = sum(divide(1, seq(1, $n)));
    my $adjp = multiply(divide($q*$n, $i), subset($p, $o));
    $adjp = cumf($adjp, \&min);
    $adjp = pmin(1, $adjp);
    return subset($adjp, $ro);
}

sub pmin {
	my $array1 = shift;
	my $array2 = shift;
	
	return mapply($array1, $array2, sub {min(\@_)});
}

sub pmax {
	my $array1 = shift;
	my $array2 = shift;
	
	return mapply($array1, $array2, sub {max(\@_)});
}

__END__

=pod

=head1 NAME

Statistics::Multtest - Control false discovery rate in multiple test problem

=head1 SYNOPSIS

  use Statistics::Multtest qw(bonferroni holm hommel hochberg BH BY);
  use Statistics::Multtest qw(:all);
  use strict;
  
  my $p;
  # p-values can be stored in an array by reference
  $p = [0.01, 0.02, 0.05,0.41,0.16,0.51];
  # @$res has the same order as @$p
  my $res = BH($p);
  print join "\n", @$res;
  
  # p-values can also be stored in a hash by reference
  $p = {"a" => 0.01,
        "b" => 0.02,
        "c" => 0.05,
        "d" => 0.41,
        "e" => 0.16,
        "f" => 0.51 };
  # $res is also a hash reference which is the same as $p
  $res = holm($p);
  foreach (sort {$res->{a} <=> $res->{$b}} keys %$res) {
      print "$_ => $res->{$_}\n";
  }

=head1 DESCRIPTION

For statistical test, p-value is the probability of false positives. While there
are many hypothesis for testing simultaneously, the probability of getting at least one
false positive would be large. Therefore the origin p-values should be adjusted to decrease
the false discovery rate.

Six procedures to controlling false positive rates is provided. 
The names of the methods are derived from C<p.adjust> in 
C<stat> package in R. Code is translated directly from R to Perl using L<List::Vectorize> module.

All six subroutines receive one argument which can either be an array reference
or a hash reference, and return the adjusted p-values in corresponding data structure. The order
of items in the array does not change after the adjustment.

=head2 Subroutines

=over 4

=item C<bonferroni($pvalue)>

Bonferroni single-step process.

=item C<hommel($pvalue)>

Hommel singlewise process.

Hommel, G. (1988). A stagewise rejective multiple test procedure based on a modified Bonferroni test. Biometrika, 75, 383C386. 

=item C<holm($pvalue)>

Holm step-down process.

Holm, S. (1979). A simple sequentially rejective multiple test procedure. Scandinavian Journal of Statistics, 6, 65C70. 

=item C<hochberg($pvalue)>

Hochberg step-up process.

Hochberg, Y. (1988). A sharper Bonferroni procedure for multiple tests of significance. Biometrika, 75, 800C803. 

=item C<BH($pvalue)>

Benjamini and Hochberg, controlling the FDR.

Benjamini, Y., and Hochberg, Y. (1995). Controlling the false discovery rate: a practical and powerful approach to multiple testing. Journal of the Royal Statistical Society Series B, 57, 289C300. 

=item C<BY($pvalue)>

Use Benjamini and Yekutieli.

Benjamini, Y., and Yekutieli, D. (2001). The control of the false discovery rate in multiple testing under dependency. Annals of Statistics 29, 1165C1188. 

=back

=head1 AUTHOR

Zuguang Gu E<lt>jokergoo@gmail.comE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright 2012 by Zuguang Gu

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.12.1 or,
at your option, any later version of Perl 5 you may have available.

=head1 SEE ALSO

List::Vectorize

=cut
