#! /usr/bin/perl -w

# cstr-to-text.pl: change a cstr html file to a text file.
# Gordon Paynter (gwp@cs.waikato.ac.nz) 

# On the lucy/rose/borg change first line to: #! /usr/local/bin/perl -w

# Version 1    1998 Oct 23 First distributed, give or take a version.
# Version 1.01 1998 Nov 17 New page number form: "<p><!--Page No-->..."
# Version 1.02 1998 Nov 17 Bug: <p> at end of line.

# Main script body.
#
# Usage: cstr-to-text.pl <input-cstr-file> <output-text-file>
#
# 1. Copy the input HTML to a private temporary file, dropping the
#    page-number / end-of-page lines inserted by prescript and removing the
#    "<p>" that would otherwise split a paragraph across a page break or a
#    hyphenated line end.
# 2. Render the temporary file as plain text with "lynx -force_html -dump"
#    and store lynx's output in the output file.
#
# Dies with a diagnostic if a file cannot be opened or written, or if lynx
# cannot be run or reports failure.

use strict;
use File::Temp qw(tempfile);

# Both file names are required.  Test for defined/non-empty rather than for
# truth so that a file literally named "0" is accepted.
my ($infile, $outfile) = @ARGV;
if (   !defined $infile  || !length $infile
    || !defined $outfile || !length $outfile) {
    die "Usage: cstr-to-text.pl <input-cstr-file> <output-text-file>\n";
}

# Three-argument open: the file name is never interpreted as a mode or a
# command, and a failure is reported instead of silently reading nothing.
open(my $in, '<', $infile)
    or die "cstr-to-text.pl: cannot read $infile: $!\n";

# Use an exclusively-created, unpredictably-named temporary file instead of
# a guessable /tmp/c2t.<pid>.  UNLINK => 1 removes it at exit even if we die.
my ($tmp, $tmpfile) = tempfile('c2t.XXXXXX', TMPDIR => 1, UNLINK => 1);

# True when the next line's leading "<p>" must be removed because the
# previous line was a page marker or ended in a hyphenated word.
my $ignore_next_paragraph_marker = 0;

while (my $line = <$in>) {

    # Page number generated by prescript before 2.1: drop the line.
    if ($line =~ /^<!--Page No-->/) {
        $ignore_next_paragraph_marker = 1;
        next;
    }

    # Page number generated by prescript 2.1, or an end-of-page marker
    # (prescript up to and including 2.1): drop the line.  If the marker
    # line itself ends with "<p>", that paragraph break has already been
    # discarded along with the line, so the next "<p>" must be kept.
    if (   $line =~ /^<p><!--Page No-->/
        || $line =~ /^<!--End Of Page-->/) {
        $ignore_next_paragraph_marker = ($line =~ /<p>\s*$/) ? 0 : 1;
        next;
    }

    # Page number written as a paragraph holding a single number ("<p>12")
    # or a dash-delimited number ("<p>- 12 -"): drop the line.
    if ($line =~ /^<p>\d+$/ || $line =~ /^<p>- \d+ -$/) {
        $ignore_next_paragraph_marker = 1;
        next;
    }

    # Remove the paragraph mark that follows a page marker or hyphenation.
    if ($ignore_next_paragraph_marker && $line =~ s/^<p>//) {
        $ignore_next_paragraph_marker = 0;
    }

    # A line ending in "<alphanumeric>-" is treated as a hyphenated word
    # that continues on the next line, so that line's "<p>" is ignored.
    if ($line =~ /[A-Za-z0-9]-$/) {
        $ignore_next_paragraph_marker = 1;
    }

    print {$tmp} $line
        or die "cstr-to-text.pl: cannot write $tmpfile: $!\n";
}

close($in);

# Buffered write errors only surface at close, so check it.
close($tmp)
    or die "cstr-to-text.pl: cannot write $tmpfile: $!\n";

open(my $out, '>', $outfile)
    or die "cstr-to-text.pl: cannot write $outfile: $!\n";

# List-form pipe open runs lynx directly, without a shell, so unusual
# characters in file names cannot be interpreted as shell syntax.
open(my $lynx, '-|', 'lynx', '-force_html', '-dump', $tmpfile)
    or die "cstr-to-text.pl: cannot run lynx: $!\n";

while (my $text = <$lynx>) {
    print {$out} $text
        or die "cstr-to-text.pl: cannot write $outfile: $!\n";
}

# close() on a pipe waits for the child; it returns false if lynx exited
# with a non-zero status ($! is 0 in that case) or if the close itself
# failed ($! is set).  Capture the reason before any other call changes it.
my $lynx_problem = '';
if (!close($lynx)) {
    $lynx_problem = $! ? "error closing pipe: $!"
                       : 'exit status ' . ($? >> 8);
}

close($out)
    or die "cstr-to-text.pl: cannot write $outfile: $!\n";

unlink($tmpfile);

if ($lynx_problem) {
    die "cstr-to-text.pl: lynx failed ($lynx_problem)\n";
}

