package Biblio::DocParser::Utils;

######################################################################
#
# Biblio::DocParser::Utils
#
######################################################################
#
#  This file is part of ParaCite Tools (http://paracite.eprints.org/developers/)
#
#  Copyright (c) 2002 University of Southampton, UK. SO17 1BJ.
#
#  ParaTools is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  ParaTools is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with ParaTools; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
######################################################################


use utf8;
use strict;
require Exporter;
use LWP::UserAgent;
use File::Temp qw/ tempfile tempdir /;
use URI;
use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAG $CHAR_MATCHES %CHAR_TRANSFORMS %CONVERTERS $DEBUG);

# Inherit import() from Exporter. Nothing is exported by default;
# normalise_multichars may be requested explicitly by the caller.
@ISA       = ('Exporter');
@EXPORT_OK = ('&normalise_multichars');

# Debug switch consulted once, while the module is being loaded.
$DEBUG     = 0;

=pod

=head1 NAME

B<Biblio::DocParser::Utils> - utility module for handling international characters and document conversion

=head1 DESCRIPTION

Biblio::DocParser::Utils provides some utility functions for handling international
characters and for conversion of documents to plaintext.

=head1 SYNOPSIS

	use Biblio::DocParser::Utils qw( normalise_multichars );

	print normalise_multichars( $str );

=head1 METHODS

=over 4

=item $str = normalise_multichars( $str )

Convert multi-char international characters into single UTF-8 chars, e.g.:
	¨o => ö
These appear in pdftotext output from PDFs generated by pdflatex.

=cut

# Regex source (single-quoted, so the \x{..} escapes are interpreted by the
# regex engine) matching one spacing accent followed by one base letter.
# The class is broader than the table below: pairs such as "~a" match the
# pattern but have no entry in %CHAR_TRANSFORMS.
$CHAR_MATCHES = '[\x{5e}\x{60}\x{a8}\x{b4}\x{7e}][aeounzn]';

# Two-character sequence (accent + base letter) => single precomposed
# character, grouped by accent.
%CHAR_TRANSFORMS = (
	# U+005E circumflex accent
	"\x{5e}a" => "\x{e2}",		# a with circumflex
	"\x{5e}e" => "\x{ea}",		# e with circumflex
	"\x{5e}o" => "\x{f4}",		# o with circumflex
	"\x{5e}u" => "\x{fb}",		# u with circumflex

	# U+0060 grave accent
	"\x{60}a" => "\x{e0}",		# a with grave
	"\x{60}e" => "\x{e8}",		# e with grave
	"\x{60}o" => "\x{f2}",		# o with grave
	"\x{60}u" => "\x{f9}",		# u with grave

	# U+00A8 diaeresis
	"\x{a8}a" => "\x{e4}",		# a with diaeresis
	"\x{a8}e" => "\x{eb}",		# e with diaeresis
	"\x{a8}o" => "\x{f6}",		# o with diaeresis
	"\x{a8}u" => "\x{fc}",		# u with diaeresis

	# U+00B4 acute accent
	"\x{b4}a" => "\x{e1}",		# a with acute
	"\x{b4}e" => "\x{e9}",		# e with acute
	"\x{b4}o" => "\x{f3}",		# o with acute
	"\x{b4}u" => "\x{fa}",		# u with acute
	"\x{b4}n" => "\x{144}",		# n with acute
	"\x{b4}z" => "\x{17a}",		# z with acute

	# U+007E tilde
	"\x{7e}n" => "\x{f1}",		# n with tilde
);

# File extension => shell command that turns a document of that type into
# plain text. _IN_ and _OUT_ are placeholders that get_content() replaces
# with the names of its temporary input and output files before handing the
# command to system(). Customise the commands for the tools on your system.
%CONVERTERS = (
	doc => "wvText _IN_ _OUT_",
	pdf => "pdftotext -raw _IN_ _OUT_",
	ps  => "pstotext -output _OUT_ _IN_",

	# Both HTML extensions share the same command.
	( map { ( $_ => "links --dump _IN_ > _OUT_" ) } qw( htm html ) ),
);

# With debugging enabled, print the whole transform table to STDOUT (as
# UTF-8), one "sequence => replacement" line per entry in sorted key order.
if ($DEBUG)
{
	binmode(STDOUT, ":utf8");
	foreach my $sequence (sort keys %CHAR_TRANSFORMS)
	{
		print "$sequence => $CHAR_TRANSFORMS{$sequence}\n";
	}
}

# Replace every accent + letter pair matched by $CHAR_MATCHES with the single
# character listed for it in %CHAR_TRANSFORMS and return the resulting string.
#
# $CHAR_MATCHES is broader than the table (it also matches e.g. "~a" or
# "^n"); such pairs have no replacement and are left exactly as they were
# instead of being substituted with undef, which would delete them.
#
# An undefined argument is returned unchanged.
sub normalise_multichars {
	my $str = shift;
	return $str unless defined $str;

	# No /o: the pattern is only recompiled when $CHAR_MATCHES changes, so a
	# caller adjusting $CHAR_MATCHES after the first call is still honoured.
	$str =~ s{($CHAR_MATCHES)}
	         { exists $CHAR_TRANSFORMS{$1} ? $CHAR_TRANSFORMS{$1} : $1 }sge;

	return $str;
}


=pod

=item $content = Biblio::DocParser::Utils::get_content($location)

This function takes either a filename or a URL as a parameter, and
aims to return a string containing the contents of the file as plain
text. A hash of converters (C<%CONVERTERS>) is provided in
Biblio/DocParser/Utils.pm, which should be customised for your system.

For URLs, the file is first downloaded to a temporary directory, then
converted, whereas local files are copied straight into the temporary
directory. For this reason, some care should be taken when handling very
large files.

=cut

# Fetch $location (a local file name or an http/https URL), convert it to
# plain text with the matching %CONVERTERS command unless it already is
# text, and return the text as one string.
#
# The type is the lower-cased extension of the file name or, for URLs, of
# the URL path only. Without an extension, URLs are assumed to be HTML and
# local files plain text.
#
# Returns nothing (undef / empty list) when no location is given, when the
# download fails, when no converter exists for the type, or when the
# resulting text file cannot be opened. Dies when a local file cannot be
# copied or the text file cannot be closed.
#
# All temporary files live in a private directory that is removed as soon
# as this function returns.
sub get_content
{
	my($location) = @_;

	if (!defined $location || $location !~ /\S/)
	{
		print STDERR "No location given\n";
		return;
	}

	# Ignore surrounding whitespace so that e.g. an unchomped input line
	# still names the intended file or URL.
	$location =~ s/^\s+//;
	$location =~ s/\s+$//;

	my $is_remote = ($location =~ m{^https?://}i) ? 1 : 0;

	# Work out the type. For URLs only the path component is inspected, so
	# that host names ("example.com") and query strings cannot be mistaken
	# for, or hide, the extension.
	my $name = $is_remote ? URI->new($location)->path : $location;
	my $type;
	if (defined $name && $name =~ /\.(\w+)$/)
	{
		$type = lc $1;
	}

	if (!defined $type)
	{
		if ($is_remote)
		{
			print STDERR "Unknown type - assuming HTML\n";
			$type = "html";
		}
		else
		{
			print STDERR "Unknown type - assuming plaintext\n";
			$type = "txt";
		}
	}

	# Get some temporary files ready. The directory object deletes the
	# directory and its contents when it goes out of scope.
	my $workdir = File::Temp->newdir();
	my (undef, $tofile)   = tempfile(DIR => $workdir->dirname, SUFFIX => ".txt");
	my (undef, $fromfile) = tempfile(DIR => $workdir->dirname, SUFFIX => ".$type");

	# Now we know the type, grab the file.
	if ($is_remote)
	{
		# Download straight into $fromfile. mirror() is not used because
		# it sends If-Modified-Since based on the (brand new) temp file
		# and may therefore receive "304 Not Modified" and no content.
		my $ua = LWP::UserAgent->new();
		my $response = $ua->get($location, ':content_file' => $fromfile);
		if (!$response->is_success)
		{
			print STDERR "Unable to fetch $location: " . $response->status_line . "\n";
			return;
		}
	}
	else
	{
		# Binary-safe copy by exact file name: characters such as '|',
		# '<' or '>' in $location are never treated as an open() mode.
		require File::Copy;
		File::Copy::copy($location, $fromfile)
			or die "Cannot copy $location to $fromfile: $!";
	}

	if ($type ne "txt")
	{
		# Convert from the $fromfile to the $tofile.
		my $converter = $CONVERTERS{$type};
		if (!$converter)
		{
			print STDERR "Sorry, no converters available for type $type\n";
			return;
		}

		# Fill in both placeholders in one pass so that a temp file name
		# can never be re-scanned for placeholder text.
		$converter =~ s/_(IN|OUT)_/$1 eq 'IN' ? $fromfile : $tofile/ge;

		# Report a failing converter, but still return whatever output
		# it managed to produce.
		if (system($converter) != 0)
		{
			print STDERR "Converter failed for type $type: $converter\n";
		}
	}
	else
	{
		# If we have text, just use the fromfile.
		$tofile = $fromfile;
	}

	open(my $input, '<', $tofile) or return;
	my $content = do { local $/; <$input> };
	close($input) or die "Cannot close $tofile: $!";

	return defined $content ? $content : "";
}

=pod

=item $escaped_url = Biblio::DocParser::Utils::url_escape($string)

Simple function to convert a string into an encoded
URL (i.e. spaces to %20, etc). Takes the unencoded
URL as a parameter, and returns the encoded version.

=cut

# Percent-encode the characters < > # ; & in the given string, then pass the
# result through URI->new and return the URI's string form.
sub url_escape
{
	my( $url ) = @_;

	# Character => percent-encoded form. None of the replacements contains
	# a character that is itself escaped, so one pass is sufficient.
	my %escape_for = (
		'<' => '%3C',
		'>' => '%3E',
		'#' => '%23',
		';' => '%3B',
		'&' => '%26',
	);
	$url =~ s/([<>#;&])/$escape_for{$1}/g;

	return URI->new( $url )->as_string;
}

1;

__END__

=pod

=back

=head1 AUTHOR

Tim Brody <tdb01r@ecs.soton.ac.uk>
Mike Jewell <moj@ecs.soton.ac.uk> (packaging)

=cut
