#!/usr/bin/env perl
#-*-perl-*-
#
#    Classifier for language discrimination based on blacklists v0.1
#    Copyright 2012 Joerg Tiedemann
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU Lesser General Public License as published
#    by the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#    GNU Lesser General Public License for more details.
#
#    You should have received a copy of the GNU Lesser General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
#-----------------------------------------------------------------------------
# USAGE:
#-----------------------------------------------------------------------------
# classification:
#   blacklist_classifier.pl [OPTIONS] lang1 lang2 ... < file
#
# training:
#   blacklist_classifier.pl -n [OPTIONS] text1 text2 > blacklist.txt
#   blacklist_classifier.pl [OPTIONS] -t "t1.txt t2.txt ..." lang1 lang2 ...
#
# run experiments:
#   blacklist_classifier.pl -t "t1.txt t2.txt ..." \
#                           -e "e1.txt e2.txt ..." \
#                           lang1 lang2 ...
#
#-----------------------------------------------------------------------------
#
# - lang1 lang2 ... are language ID's
# - blacklists are expected in <BlackListDir>/<lang1-lang2.txt
# - t1.txt t2.txt ... are training data files (in UTF-8)
# - e1.txt e2.txt ... are training data files (in UTF-8)
# - the order of languages needs to be the same for training data, eval data
#   as given by the command line arguments (lang1 lang2 ..)
#
#-----------------------------------------------------------------------------
#
# OPTIONS:
#
# -a <freq> ...... min freq for common words
# -b <freq> ...... max freq for uncommon words
# -c <score> ..... min difference score to be relevant
# -d <dir> ....... directory of black lists
# -i ............. classify each line separately
# -m <number> .... use approximately <number> tokens to traing/classify
# -n ............. train a new black list
# -v ............. verbose mode
#
# -U ............. don't lowercase
# -S ............. don't tokenize (use the string as it is)
# -A ............. don't discard tokens with non-alphabetic characters


use strict;
use vars qw($opt_a $opt_b $opt_c $opt_m $opt_n $opt_d $opt_v $opt_i
            $opt_t $opt_e $opt_F $opt_T $opt_L $opt_U $opt_S $opt_A $opt_M);
use Getopt::Std;
use FindBin qw($Bin);

use lib "$Bin/../lib";
use Lingua::Identify::Blacklists qw/:all/;


getopts('a:b:c:d:im:nvt:e:F:T:L:USAM:');

binmode(STDIN,":encoding(UTF-8)");
binmode(STDOUT,":encoding(UTF-8)");
binmode(STDERR,":encoding(UTF-8)");

my %blacklists=();

my $min_high = defined $opt_a ? $opt_a : 10;
my $max_low  = defined $opt_b ? $opt_b : 3;
my $min_diff = defined $opt_c ? $opt_c : 0.8;


$Lingua::Identify::Blacklists::VERBOSE      = 1 if ($opt_v);
$Lingua::Identify::Blacklists::BLACKLISTDIR = $opt_d if ($opt_d);

unless (-d $Lingua::Identify::Blacklists::BLACKLISTDIR){
    $Lingua::Identify::Blacklists::BLACKLISTDIR = "$Bin/../share/blacklists";
}

my $BlackListDir = $Lingua::Identify::Blacklists::BLACKLISTDIR;





# run experiments with a given set of training corpora and 
# a set of evaluation corpora

if ($opt_e){
    my @langs = @ARGV;
    # run an experiment with exponentially increasing training sizes
    # from $opt_F to $opt_T
    if ($opt_F && $opt_T && $opt_L){
        $opt_m = $opt_F;
        while ($opt_m < $opt_T){
            %blacklists=();
            print "train with ca $opt_m tokens\n";
            &run_experiment($opt_t,$opt_e,@langs);
            $opt_m *= $opt_L;
        }
    }
    &run_experiment($opt_t,$opt_e,@langs);
    exit;
}


# train new black lists

if ($opt_t){
    my @traindata = split(/\s+/,$opt_t);
    my @langs = @ARGV;
    &batch_train(\@langs,\@traindata);
}
elsif ($opt_n){
    my $file1=shift(@ARGV);
    my $file2=shift(@ARGV);
    &train($file1,$file2);
}


# classify

else{
    my @langs = @ARGV;
    @ARGV = ();
    my @predictions = &identify_stdin( langs => \@langs, every_line => $opt_i );
    print join("\n",@predictions);
    print "\n";
}





sub batch_train{
    my $langs = shift;
    my $traindata = shift;

    die "no languages given\n" unless (ref($langs) eq 'ARRAY');
    die "no training data given\n" unless (ref($traindata) eq 'ARRAY');
    die "number of languages and training data does not match\n" 
        unless ($#{$traindata} == $#{$langs});

    for my $s (0..$#{$langs}){
        for my $t ($s+1..$#{$langs}){
            print STDERR "traing blacklist for $$langs[$s]-$$langs[$t]\n";
            &train($$traindata[$s],$$traindata[$t],
                   "$Lingua::Identify::Blacklists::BLACKLISTDIR/$$langs[$s]-$$langs[$t].txt");
        }
    }
}


sub train{
    my ($file1,$file2,$outfile) = @_;

    my %dic1=();
    my %dic2=();

    my ($total1,$total2);

    my $total1 = &Lingua::Identify::Blacklists::read_file($file1,\%dic1,$opt_m);
    my $total2 = &Lingua::Identify::Blacklists::read_file($file2,\%dic2,$opt_m);

    if ($outfile){
        system("mkdir -p $Lingua::Identify::Blacklists::BLACKLISTDIR") unless (-d $Lingua::Identify::Blacklists::BLACKLISTDIR);
        open O,">$outfile" || die "cannot write to $outfile\n";
        binmode(O,":encoding(UTF-8)");
    }

    foreach my $w (keys %dic1){
	next if ($dic1{$w}<$min_high && $dic2{$w}<$min_high);
	next if ($dic1{$w}>$max_low && $dic2{$w}>$max_low);

        my $s1 = $dic1{$w} * $total2;
        my $s2 = $dic2{$w} * $total1;
        my $diff = ($s1 - $s2) / ($s1 + $s2);

	if (abs($diff) > $min_diff){
            if ($outfile){
                print O "$diff\t$w\t$dic1{$w}\t$dic2{$w}\n";
            }
            else{
                print "$diff\t$w\t$dic1{$w}\t$dic2{$w}\n";
            }
	}
    }
    # don't forget words that do NOT appear in dic1!!!
    foreach my $w (keys %dic2){
	next if (exists $dic1{$w});
	next if ($dic2{$w}<10);
        if ($outfile){
            print O "-1\t$w\t$dic1{$w}\t$dic2{$w}\n";
        }
        else{
            print "-1\t$w\t$dic1{$w}\t$dic2{$w}\n";
        }
    }
    close O if ($outfile);
}






sub run_experiment{

    use Benchmark;

    my $trainfiles = shift;
    my $evalfiles = shift;

    my @traindata = split(/\s+/,$trainfiles);
    my @evaldata = split(/\s+/,$evalfiles);
    my @langs = @_;

    die "no languages given!\n" unless (@langs);
    die "no training nor evaluation data given!\n" 
        unless ($#traindata == $#evaldata || $#traindata == $#langs);

    # train blacklists

    if ($#traindata == $#langs){
        $Lingua::Identify::Blacklists::BLACKLISTDIR = $opt_d || "blacklist-experiment";
        my $t1 = new Benchmark;
        &batch_train(\@langs,\@traindata);
        print STDERR "training took: ".
            timestr(timediff(new Benchmark, $t1)).".\n";
    }

    # classify test data

    if ($#evaldata == $#langs){
        print STDERR "classify ....\n";

        my $correct=0;
        my $count=0;
        my %guesses=();

        my %correct_lang=();
        my %count_lang=();

        my $t1 = new Benchmark;
        foreach my $i (0..$#langs){
            open IN,"<:encoding(UTF-8)",$evaldata[$i] || die "...";
            while (<IN>){
                chomp;
                my %dic = ();
                &Lingua::Identify::Blacklists::process_string($_,\%dic);
                my $guess = &Lingua::Identify::Blacklists::classify(\%dic,@langs);
                $count++;
                $count_lang{$langs[$i]}++;
                if ($guess eq $langs[$i]){
                    $correct++;
                    $correct_lang{$langs[$i]}++;
                }
                $guesses{$langs[$i]}{$guess}++;
            }
            close IN;
        }
        print STDERR "classification took: ".
            timestr(timediff(new Benchmark, $t1)).".\n";

        printf "accuracy: %6.4f\n   ",$correct/$count;
        foreach my $c (@langs){
            print "  $c";
        }
        print "\n";
        foreach my $c (@langs){
            print "$c ";
            foreach my $g (@langs){
                printf "%4d",$guesses{$c}{$g};
            }
            printf "  %6.4f",$correct_lang{$c}/$count_lang{$c};
            print "\n";
        }
    }
    system("wc -l $Lingua::Identify::Blacklists::BLACKLISTDIR/*.txt");
}

