#!/usr/bin/perl -w

#    prepare-clauses.pl
#    Version 1.1

#    Kea -- Automatic Keyphrase Extraction
#    Copyright 1998-1999 by Gordon Paynter and Eibe Frank
#    Contact gwp@cs.waikato.ac.nz or eibe@cs.waikato.ac.nz
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

# Version history
#
# 1.0   Witten et al.
# 1.0.1 Bug: Sentences ending in ".)" 
# 1.0.2 Bug: Sentences ending in ".'" and so on have the same problem.
# 1.0.3 All "." characters removed from the start of tokens.
# 1.1   First Distribution.  GPL added.

# Both file names are required.  Test the argument count and length rather
# than truthiness, so that a file literally named "0" is accepted.
if (@ARGV < 2 || !length($ARGV[0]) || !length($ARGV[1])) {
    die "usage: prepare-clauses.pl <input-text> <output-clauses>\n";
}

# $infile:  the text to process; it is also the prefix of the intermediate
#           files "$infile.1" and "$infile.2" created further down.
# $outfile: the file that receives the final result.
# Both stay package globals because the rest of the script reads them.
($infile, $outfile) = @ARGV;

# Progress banner on standard output.
print "prepare-clauses.pl\n";
print "Input text: $infile\n";
print "Output clauses: $outfile\n";

# step 1: put every sentence on a new line.
#
# Reads $infile line by line, normalises whitespace and punctuation, and
# writes the result to the intermediate file "$infile.1".  A newline is
# only written where a sentence ends (or for an empty input line), so a
# sentence that spans several input lines ends up on one output line.
# The bare block keeps the lexical filehandles private to this step.
{
    # Three-argument open: the file name can never be mistaken for a mode,
    # and a failure stops the script instead of silently producing nothing.
    open(my $text_fh, '<', $infile)
        or die "prepare-clauses.pl: cannot open $infile for reading: $!\n";
    open(my $sentence_fh, '>', "$infile.1")
        or die "prepare-clauses.pl: cannot open $infile.1 for writing: $!\n";

    while (<$text_fh>) {

        # remove a hyphen at the end of the line (together with the line
        # break), so a word split across lines is joined to the next line
        s/-\s+$//;

        # replace \n and other whitespace with a single space
        s/\s+/ /g;

        # one sentence per line: a ".", "!" or "?" followed by at least one
        # non-alphanumeric character ends a sentence ("3.5" is left alone,
        # but an abbreviation followed by a space is treated as an end)
        s/[.!?][^A-Za-z0-9]+/. /g;
        s/[.!?] +/ \n/g;

        # space out the punctuation
        # double-hyphens (equivalent to an M-dash) replaced with a clause
        # breaker
        s/--/ : /g;
        # delete apostrophe marks
        tr/'//d;
        # remove a "." character from the start of a token
        s/ \./ /g;

        # space out any other character that is not something we are
        # interested in (anything but word characters, "." and newline)
        s/([^\w.\n])/ $1 /g;
        # allow for underscore, which perl considers a "word" character
        s/_/ _ /g;

        # squeeze runs of spaces, then drop a space at the very start
        tr/ //s;
        s/^ //;

        # put a fullstop at the end of each line/sentence
        s/\n/.\n/g;

        # print; a line with nothing left on it (e.g. an empty input line)
        # still produces a sentence break
        if (/./) {
            print $sentence_fh $_;
        } else {
            print $sentence_fh ".\n";
        }
    }

    close($text_fh);
    # Buffered write errors (e.g. a full disk) only show up at close time.
    close($sentence_fh)
        or die "prepare-clauses.pl: error writing $infile.1: $!\n";
}

# step 2: the tagger
# `tag $infile.lines > $infile.tagged`;

# step 3: put a clause on each line and get rid of punctuation
#
# Reads the intermediate file "$infile.1" and writes "$infile.2".  Each
# input line is split into whitespace-separated tokens:
#   - a single clause-breaking punctuation mark, or a "pure" number
#     (digits, optionally with one decimal point), starts a new line;
#   - a token containing at least one letter is written, preceded by a
#     space;
#   - anything else is dropped.
# A newline is written at the end of every input line.
# The bare block keeps the lexical variables private to this step.
{
    # Three-argument open with explicit checks, so a missing or unwritable
    # intermediate file stops the script instead of being ignored.
    open(my $sentence_fh, '<', "$infile.1")
        or die "prepare-clauses.pl: cannot open $infile.1 for reading: $!\n";
    open(my $clause_fh, '>', "$infile.2")
        or die "prepare-clauses.pl: cannot open $infile.2 for writing: $!\n";

    while (my $sentence = <$sentence_fh>) {
        foreach my $token (split(/\s+/, $sentence)) {
            if ($token =~ /^[:;,\[\]{}()]$/ || $token =~ /^\d+(?:\.\d+)?$/) {
                # clause breakers (. and ? and ! were handled in step 1)
                # and "pure" numbers both start a new line
                print $clause_fh "\n";
            } elsif ($token =~ /[A-Za-z]/) {
                # keep anything that contains at least one letter
                print $clause_fh " $token";
            }
            # whatever is left over is simply ignored
        }
        print $clause_fh "\n";
    }

    close($sentence_fh);
    # Buffered write errors only show up at close time.
    close($clause_fh)
        or die "prepare-clauses.pl: error writing $infile.2: $!\n";
}

# step 4: pretty things up a tad
#
# Copies "$infile.2" to $outfile, removing leading spaces from every line
# and squeezing each run of consecutive newlines into one, then deletes the
# two intermediate files.  This is done in Perl rather than through a shell
# pipeline (cat | perl | tr, then rm) so that file names containing spaces
# or shell metacharacters are handled safely and failures are reported.
# The bare block keeps the lexical variables private to this step.
{
    open(my $clause_fh, '<', "$infile.2")
        or die "prepare-clauses.pl: cannot open $infile.2 for reading: $!\n";
    open(my $result_fh, '>', $outfile)
        or die "prepare-clauses.pl: cannot open $outfile for writing: $!\n";

    # True while the last character written was a newline.  It starts out
    # false, so a blank first line is still written once, exactly as
    # "tr -s '\n'" would leave it.
    my $after_newline = 0;

    while (my $line = <$clause_fh>) {
        $line =~ s/^ +//;

        # A line that is now empty would extend a run of newlines; skip it.
        next if $line eq "\n" && $after_newline;

        print $result_fh $line;
        $after_newline = ($line =~ /\n\z/) ? 1 : 0;
    }

    close($clause_fh);
    # Buffered write errors only show up at close time.
    close($result_fh)
        or die "prepare-clauses.pl: error writing $outfile: $!\n";

    # Remove the intermediate files; a failure here is worth a warning but
    # does not invalidate the output that was just written.
    foreach my $temp_file ("$infile.1", "$infile.2") {
        unlink($temp_file)
            or warn "prepare-clauses.pl: cannot remove $temp_file: $!\n";
    }
}
