#!/usr/bin/perl

use strict;
use warnings;
use utf8;

use Getopt::Long;
use Pod::Usage;
use File::Basename;

# use Data::Dumper;
use Lingua::BioYaTeA;
use Lingua::BioYaTeA::Corpus;
use Lingua::BioYaTeA::PreProcessing;
use Lingua::BioYaTeA::PostProcessing;

# use Lingua::YaTeA::Corpus;
# use Lingua::YaTeA::Config;
use Config::General;


# my $verbose;

# Command-line state. The two help flags start out false; every other
# option stays undefined unless it is supplied on the command line.
my ($man, $help) = (0, 0);
my ($rcfile, $postProcessingFile, $preProcessingFile, $ppConfigFile);
my $extractionStep;
my $fh;    # output file handle, opened further down when pre-processing is requested

# Option specifications paired with the variable receiving each value.
my @option_spec = (
    'help|?|h'                   => \$help,
    'man|m'                      => \$man,
    'extraction|e'               => \$extractionStep,
    'rcfile=s'                   => \$rcfile,
    'pre-processing|f=s'         => \$preProcessingFile,
    'post-processing|p=s'        => \$postProcessingFile,
    'post-processing-config|C=s' => \$ppConfigFile,
);

# Unknown or malformed options: print the usage and exit with status 2.
unless (GetOptions(@option_spec)) {
    pod2usage(2);
}

# --man prints the full documentation and exits successfully.
if ($man) {
    pod2usage(-exitstatus => 0, -verbose => 2);
}

# --help, or no remaining argument at all (the input file is mandatory),
# prints the usage message.
if ($help || !@ARGV) {
    pod2usage(1);
}

# my $current_dir = `pwd`;

# The input file is always the last command-line argument.
my $corpus_path = $ARGV[-1];

# Optional pre-processing step: the input file is handed to
# Lingua::BioYaTeA::PreProcessing, which writes its result to the file
# given with --pre-processing. That file then replaces the original input
# for the rest of the run.
if (defined $preProcessingFile) {
    # Opening the output with '>' truncates it immediately, so writing onto
    # the input file would destroy the corpus before it is read.
    if ($corpus_path eq $preProcessingFile) {
        die "pre-processing output file $preProcessingFile must differ from the input file\n";
    }

    my $preProc = Lingua::BioYaTeA::PreProcessing->new();

    # Three-argument open: the file name can never be interpreted as part
    # of the open mode. The system error is reported on failure.
    open($fh, '>', $preProcessingFile)
        or die "can not open file $preProcessingFile to record corrected file: $!";
    $preProc->process_file($corpus_path, $fh);
    close($fh);
    $corpus_path = $preProcessingFile;
}

# Run the post-processing step (Lingua::BioYaTeA::PostProcessing) on a
# YaTeA XML output file.
#
# Arguments:
#   $input_file  - path passed to the post-processor as 'input-file'
#   $output_file - path passed to the post-processor as 'output-file'
#   $config_file - path passed to the post-processor as 'configuration'
#
# The post-processor log file is placed in the directory of the output
# file and named term-filtering-tmp-date-<year>-<month>-<day>_<hour>_<minute>.log.
sub _run_post_processing {
    my ($input_file, $output_file, $config_file) = @_;

    my %options = (
        'input-file'    => $input_file,
        'output-file'   => $output_file,
        'configuration' => $config_file,
    );
    warn "Loading YaTeA output file " . $options{'input-file'} . "\n";
    my $postProc = Lingua::BioYaTeA::PostProcessing->new(\%options);

    # localtime() returns the month as 0..11 and the year as an offset from
    # 1900; both are corrected so that the log name carries the real date.
    my ($minute, $hour, $day, $month, $year) = (localtime(time))[1 .. 5];
    my $stamp = sprintf('date-%d-%d-%d_%d_%d',
                        $year + 1900, $month + 1, $day, $hour, $minute);

    $postProc->logfile(dirname($postProc->output_file) . '/term-filtering-tmp-' . $stamp . '.log');
    $postProc->_printOptions(\*STDERR);
    $postProc->load_configuration;
    $postProc->defineTwigParser;
    $postProc->filtering;
    $postProc->printResume;
#    $postProc->rmlog;
    return;
}

if (defined $extractionStep) {
    # Term extraction, optionally followed by post-processing of the
    # candidate terms that the extraction produced.
    my %config = Lingua::BioYaTeA->load_config($rcfile);
    my $bioyatea = Lingua::BioYaTeA->new($config{"OPTIONS"}, \%config);

    if (!defined $corpus_path) {
        die( $bioyatea->getMessageSet->getMessage("NO_FILE_ARG")->getContent($bioyatea->getOptionSet->getDisplayLanguage) . "\n");
    }
    if (! -f $corpus_path) {
        die("\"".$corpus_path . "\"". $bioyatea->getMessageSet->getMessage("NO_FILE")->getContent($bioyatea->getOptionSet->getDisplayLanguage) . "\n");
    }

    my $corpus = Lingua::BioYaTeA::Corpus->new($corpus_path,$bioyatea->getOptionSet,$bioyatea->getMessageSet);
    $bioyatea->termExtraction($corpus);

    # Post-processing needs both the output file and its configuration.
    if ((defined $postProcessingFile) && (defined $ppConfigFile)) {
        _run_post_processing(
            $corpus->getOutputFileSet->getFile("candidates")->getPath,
            $postProcessingFile,
            $ppConfigFile,
        );
    }

    # When pre-processing was requested, $corpus_path designates the
    # pre-processed file; it is removed once the extraction is done.
    if (defined $preProcessingFile) {
        unlink($corpus_path)
            or warn "can not remove pre-processed file $corpus_path: $!\n";
    }
}
elsif ((defined $corpus_path) && (-f $corpus_path) && (defined $postProcessingFile) && (defined $ppConfigFile)) {
    # Post-processing only: the input file is used directly as the
    # post-processor input.
    _run_post_processing($corpus_path, $postProcessingFile, $ppConfigFile);
}
elsif (!defined $preProcessingFile) {
    # Nothing was requested (no extraction, no usable post-processing
    # setup, no pre-processing): show the usage message.
    pod2usage(1);
}

=encoding utf8

=head1 NAME

bioyatea - Perl script for extracting terms from a corpus of biomedical texts (based on the module Lingua::YaTeA).

=head1 SYNOPSIS

bioyatea [--help] [--man] [--extraction] [--rcfile=file] [--pre-processing=file] [--post-processing=file] [--post-processing-config=file] file

=head1 OPTIONS

=over 4

=item    B<--help>, B<-h>, B<-?>     brief help message

=item    B<--man>, B<-m>          full documentation

=item    B<--rcfile=file>      load the given configuration file

=item    B<--extraction>, B<-e>      perform the term extraction

=item    B<--post-processing=file>, B<-p file>    set the filename for the output in case of post-processing

=item    B<--pre-processing=file>, B<-f file>     set the filename for the output in case of pre-processing

=item    B<--post-processing-config=file>, B<-C file>      set the configuration file for the post-processing

=item    I<file>               corpus of texts in TreeTagger output format.
                               If only post-processing is set, the file is a YaTeA XML output 

=back


=head1 DESCRIPTION

BioYaTeA is an adaptation of YaTeA (C<Lingua::YaTeA>) for biomedical
text.  The tuning concerns the configuration files (in the directory
C<share/BioYaTeA>), the pre-processing of the input file and the
post-processing of the XML output.

=head1 USE OF BIOYATEA

Using BioYaTeA requires the output of TreeTagger
(<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/DecisionTreeTagger.html>)
or GeniaTagger (<http://www.nactem.ac.uk/GENIA/tagger/>). This output
is the input of BioYaTeA.

To run bioyatea, a configuration file is needed (usually bioyatea.rc
in /etc/bioyatea). This file describes the behaviour of the term
extractor. You have to indicate the language of the configuration file
you use (see section CONFIGURATION FILE FORMAT of C<Lingua::YaTeA> for
more details). It also indicates the path of the configuration files
for the linguistic analysis. You have to adapt the path if your
configuration is not standard.

An example of the configuration file is available in
C<etc/bioyatea/bioyatea.rc> from the archive directory.

=over 4

=item The most common command line to run BioYaTeA is 

C<bioyatea -e TreeTaggerOutputFile.ttg>

It is assumed that the directory containing the program bioyatea is in
your PATH variable and that the configuration file is
C</etc/bioyatea/bioyatea.rc>.

=item If you are not allowed to copy the configuration file
C<bioyatea.rc> into the directory C</etc/bioyatea> (or to create this
directory), or if you want to use your own configuration file, you can
specify the file with its path by using the option C<--rcfile>

C<bioyatea -e --rcfile MyBioYaTeAConfig.rc TreeTaggerOutputFile.ttg>

More examples of the use of the bioyatea script are given below.

=back

=head1 INPUT/OUTPUT FILE FORMATS

See Documentation in Lingua::YaTeA

=head1 EXAMPLES

Processing of a file without post-processing, with the default
configuration file (C</etc/bioyatea/bioyatea.rc>):

   bioyatea -e sampleEN.ttg

Processing of a file without post-processing. The configuration file
is given in the option C<--rcfile>:

   bioyatea -e --rcfile etc/bioyatea.rc sampleEN.ttg

Processing of a file with post-processing:

   bioyatea -e --rcfile etc/bioyatea.rc --post-processing-config etc/post-processing-filtering.conf --post-processing sampleEN-PP.xml sampleEN.ttg

Only post-processing a file (XML YaTeA output format):

   bioyatea --post-processing-config etc/post-processing-filtering.conf --post-processing sampleEN-PP.xml sampleEN-output.xml

Processing of a file with pre-processing:

   bioyatea -e --rcfile etc/bioyatea.rc --pre-processing sampleEN-prepro.ttg sampleEN.ttg

Only pre-processing a file (TreeTagger output format):

   bioyatea --pre-processing sampleEN-prepro.ttg sampleEN.ttg


Processing of a file with pre-processing and post-processing:

   bioyatea -e --rcfile etc/bioyatea.rc --post-processing-config etc/post-processing-filtering.conf --post-processing sampleEN-PP.xml --pre-processing sampleEN-prepro.ttg sampleEN.ttg


=head1 SEE ALSO

Documentation of Lingua::YaTeA

=head1 AUTHORS

Wiktoria Golik <wiktoria.golik@jouy.inra.fr>, Zorana Ratkovic <Zorana.Ratkovic@jouy.inra.fr>, Robert Bossy <Robert.Bossy@jouy.inra.fr>, Claire Nédellec <claire.nedellec@jouy.inra.fr>, Thierry Hamon <thierry.hamon@univ-paris13.fr>

=head1 LICENSE

Copyright (C) 2012 Wiktoria Golik, Zorana Ratkovic, Robert Bossy, Claire Nédellec and Thierry Hamon

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.6 or,
at your option, any later version of Perl 5 you may have available.


=cut
