package Kea;

# This function is called by BasPlug.pm when a flag in a collection
# configuration document specifies that keyphrase metadata must be gathered
# for that collection.
# It is passed the document's text and, possibly, some options describing how
# the keyphrase data is to be collected (if the keyphrase option flag was set
# in the collection configuration file).  This module then writes the
# document's text to a file, because the stand-alone program Kea, which is
# called to do the actual extraction of the keyphrases, expects a file
# argument.
# Once Kea has been run, the file containing the keyphrase data it gathered
# should be stored in gsdl/tmp.  This file is read, the data we are
# interested in is extracted, and the result is passed back to BasPlug.pm in
# an appropriate format.

# extract_KeyPhrases($doc, $args)
#
# Runs the stand-alone Kea program over a document's text and returns the
# keyphrases (and their stems) that Kea wrote to its output file.
#
# Arguments:
#   $doc  - the document's text; HTML tags are stripped before it is written
#           to $GSDLHOME/tmp/doc.txt for Kea to read.
#   $args - optional string of space-separated options.  Each option has the
#           form "LETTER" or "LETTER,VALUE" and is passed to Kea as
#           "-LETTER VALUE".  The value of the E option (e.g. "E,kea") is
#           also used as the extension of the output file that is read back.
#
# Returns:
#   A two-element list: the keyphrases joined with ", " and the stems joined
#   with ", ".  An empty list is returned when the output file
#   $GSDLHOME/tmp/doc.<suffix> cannot be opened (an option was wrongly
#   specified or no keyphrases were found).
#
# Dies if doc.txt cannot be created or completely written.
sub extract_KeyPhrases {

    my $gsdlhome = $ENV{'GSDLHOME'};
    my $doc = shift(@_);  #document's text
    my $args = shift(@_); #any options
    my $suffix = 'kea';   #default output file will be called doc.kea
    my $command = "";
    my @keylist;
    my @stemlist;

    # Declare first and assign afterwards: "my ... if ..." in one statement
    # has undefined behaviour.  Empty tokens (produced by leading spaces in
    # the option string) are dropped so that no bare "-" is passed to Kea.
    my @optionlist;
    @optionlist = grep { $_ ne '' } split(/ +/, $args) if (defined($args));

    print STDERR "optionlist: @optionlist\n";

    foreach my $element (@optionlist) {
        #split option letter and its value (if a value was given)
        my ($option, $file) = split(/,/, $element);

        $option = "" if (!defined($option)); #element was just ","
        $option = "-" . $option;             #place dash in front of option
        $file = "" if (!defined($file));     #no value specified
        $suffix = $file if ($option eq '-E'); #extension (suffix) option
        $command .= " $option $file ";       #add to list of commands
    }

    print STDERR "Using output suffix: $suffix\n";

    # remove all HTML tags; paragraph and heading tags become line breaks
    $doc =~ s/<P[^>]*>/\n/sgi;
    $doc =~ s/<H[^>]*>/\n/sgi;
    $doc =~ s/<[^>]*>//sgi;
    $doc =~ tr/\n/\n/s; #squeeze runs of newlines into one

    my $textfile = "$gsdlhome/tmp/doc.txt";
    my $keyfile  = "$gsdlhome/tmp/doc.$suffix";

    #write text to a file eg doc.txt
    open(my $out, '>', $textfile)
        or die "In Kea.pm doc.txt could not be created\n";
    print $out $doc;
    # buffered write errors only surface at close; do not hand Kea a
    # truncated document
    close($out) or die "In Kea.pm doc.txt could not be written\n";

    # Remove any output left behind by an earlier run, so that if Kea
    # produces nothing this time we do not return another document's
    # keyphrases.  (Skipped if the suffix makes it the input file itself.)
    unlink($keyfile) if ($keyfile ne $textfile);

    #call Kea with specified options
    # NOTE: the options are interpolated into a shell command line, so they
    # must come from a trusted collection configuration.
    `$gsdlhome/perllib/Kea-1.1.4/Kea $command $textfile`;

    #read doc.kea with keywords
    my @emptykeylist;
    open(my $in, '<', $keyfile) or return @emptykeylist;
                                  #this means doc.kea does not exist
                                  #either because an option was wrongly
                                  #specified or no keyphrases were found

    # a named lexical is used so the caller's $_ is not clobbered
    while (defined(my $line = <$in>)) {
        chomp($line);
        my ($key, $stem) = split(/\t/, $line); #keyphrase <TAB> stem
        push(@keylist, defined($key) ? $key : "");    #add to list of keywords
        push(@stemlist, defined($stem) ? $stem : ""); #add to list of stems
    }
    close($in);

    #put data into appropriate format
    my $keylist = join(", ", @keylist);
    my $stemlist = join(", ", @stemlist);

    #delete doc.extension so that in future it will not be opened and read
    unlink($keyfile);

    #return keywords + stems to basplug
    my @keystemlist = ($keylist, $stemlist);
    return @keystemlist;

}



1;


