#!/usr/bin/perl -w

# overlap - print overlap between test pairs
#
# Copyright (C) 2002  Daniel Quinlan
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of either the Artistic License or the GNU General
# Public License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.

use vars qw($opt_a $opt_h);
use Getopt::Std;

# Program name with any leading directory stripped; used in the usage
# text and in error messages.  It is computed before the options are
# parsed so that usage() can already print it if parsing fails.
my $prog = $0;
$prog =~ s@.*/@@;

# getopts() sets $opt_a / $opt_h and returns false (after printing a
# warning) when it meets a switch it does not know.  Treat that as a
# usage error instead of silently carrying on with the bad command line.
getopts("ah") || usage(1);

# usage($status) - print the help text and terminate the program.
#
# $status becomes the process exit code.  With a zero status (help was
# asked for) the text goes to STDOUT; with any non-zero status (an
# error) it goes to STDERR.  This sub never returns.
sub usage {
    my ($status) = @_;

    # Pick the handle as a glob reference.  The bare words STDERR/STDOUT
    # would only be strings used as symbolic handle names, which stops
    # compiling as soon as "use strict" is enabled.
    my $out = $status ? \*STDERR : \*STDOUT;
    print {$out} <<EOF;
usage: $prog [options] [mass-check results files]

 -a    show all entries (normally, reverses of pairs are not shown)
 -h    print this help

Do not abuse this tool.  Just because a test highly correlates with
another test does not mean you can simply remove one or merge them
without further consideration.  You need to also look at hit rates,
false positives, false negatives, and actually compare the tests.
Some overlap is often good, especially if the tests have different
characteristics.

EOF
    exit($status);
}

# -h: print the help text on STDOUT and exit successfully.
usage(0) if $opt_h;

# With no file arguments, read the results from standard input.
push(@ARGV, "-") unless @ARGV;

# Tallies filled in by read_file():
#   %solo  test name => number of input lines that list the test
#   %pair  "A B" (two test names joined by one space) => number of
#          input lines that list both tests
my %solo;
my %pair;

foreach my $file (@ARGV) {
    read_file($file);
}

print "COUNT\tPAIR/A\tPAIR/B\tA,B\n";

# Most frequent pairs first.  Pairs with the same count are ordered by
# key so that the report is identical from run to run instead of
# following the hash's internal ordering.
my @ranked = sort { $pair{$b} <=> $pair{$a} || $a cmp $b } keys %pair;

foreach my $key (@ranked) {
    # Named $first/$second rather than $a/$b so the special variables
    # used by sort are not masked by lexicals.
    my ($first, $second) = split(/ /, $key);
    my $count = $pair{$key};

    # Fraction of each test's own hits that are shared with the other.
    my $first_pct  = $count / $solo{$first};
    my $second_pct = $count / $solo{$second};

    # Without -a each pair is printed once, with the test that has the
    # larger overlap fraction first (on a tie, the name that sorts
    # first).  With -a every stored ordering is printed as it is.
    my $keep_order = $opt_a
        || ($first_pct > $second_pct)
        || ($first_pct == $second_pct && $first lt $second);
    unless ($keep_order) {
        ($first, $second)         = ($second, $first);
        ($first_pct, $second_pct) = ($second_pct, $first_pct);
    }

    printf "%d\t%.3f\t%.3f\t%s,%s\n",
        $count, $first_pct, $second_pct, $first, $second;
}

# read_file($input) - tally test hits from one mass-check results file.
#
# $input is a file name, or "-" for standard input.  Lines starting
# with "#" are skipped.  Every other line must have the form
#
#     <Y or .> <integer> <non-blank field> <comma-separated test names>
#
# For each such line %solo is incremented once per listed test and
# %pair once per pair of listed tests, under the key "A B".  Without -a
# each pair is counted once, with the two names in sorted order; with
# -a both orderings of the pair are counted.
#
# Dies if the file cannot be opened or a line does not have that form.
sub read_file {
    my ($input) = @_;

    # Three-argument open with a lexical handle: the name comes from the
    # command line, and with two-argument open a leading "<", ">" or a
    # "|" in it would be taken as a mode or a command.  Because "-" no
    # longer gets special treatment from open, map it to STDIN here.
    my $fh;
    if ($input eq "-") {
        $fh = \*STDIN;
    }
    else {
        open($fh, '<', $input)
            || die "$prog: open failed: $input: $!\n";
    }

    while (my $record = <$fh>) {
        chomp $record;
        next if $record =~ /^#/;

        unless ($record =~ /^[Y.]\s+-?\d+\s+\S+\s+(.*)/) {
            die "$prog: error in input format in $input, line $.\n";
        }

        # Sorting the names makes the key of a pair independent of the
        # order in which the tests appear on the line.
        my @tests = sort(split(/,/, $1));

        for my $i (0 .. $#tests) {
            $solo{$tests[$i]}++;

            # Normally only pair a test with those after it, so each
            # pair is counted once; -a starts from 0 so the reverse
            # ordering gets its own entry as well.
            my $start = $opt_a ? 0 : $i + 1;
            for my $j ($start .. $#tests) {
                next if $i == $j;
                $pair{"$tests[$i] $tests[$j]"}++;
            }
        }
    }

    # STDIN was not opened here, so leave it open.
    close($fh) unless $input eq "-";
}
