###########################################################################
#
# GMLPlug.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# plugin which processes a GML format document
# assumes that gml tags are all in lower-case.

package GMLPlug;

use BasPlug;
use util;
use doc;

sub BEGIN {
    @ISA = ('BasPlug');
}

sub new {
    my ($class) = @_;
    my $self = new BasPlug ("GMLPlug", @_);

    return bless $self, $class;}

sub get_default_process_exp {
    my $self = shift (@_);

    return q^(?i)\.gml?$^;
}

# return number of files processed, undef if can't process
# Note that $base_dir might be "" and that $file might 
# include directories
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    my $outhandle = $self->{'outhandle'};

    my $filename = &util::filename_cat($base_dir, $file);
    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
	return undef;
    }
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    print $outhandle "GMLPlug: processing $file\n";

    my $parent_dir = $file;
    $parent_dir =~ s/[^\\\/]*$//;
    $parent_dir = &util::filename_cat ($base_dir, $parent_dir);

    if (!open (INFILE, $filename)) {
	print $outhandle "GMLPlug::read - couldn't read $filename\n";
	return 0;
    }

    undef $/;
    my $gml = <INFILE>;
    $/ = "\n";
    close (INFILE);
    
    my @gml_sections = split("</gsdlsection>",$gml);
    $gml = shift(@gml_sections);

    my $no_docs = 0;

    while (1) {
	# create a new document
	my $doc_obj = new doc ();
	my $section = $doc_obj->get_top_section();

	# process the document
	my $firstsection = 1;
	while (1) {
	    my ($tags, $text) = ("", "");

	    my @indenting_sections = split("<gsdlsection", $gml);
	    shift(@indenting_sections); # first entry is trivially empty

	    foreach $gml (@indenting_sections) {

		if ($gml =~ /^\s*([^>]*)>(.*)$/so) {
		    $tags = $1 if defined $1;
		    $text = &GMLPlug::_unescape_text($2);

		} else {
		    print $outhandle "GMLPlug::read - error in file $filename\n";
		    print $outhandle "text: \"$gml\"\n";
		    last;
		}

		# create the section (unless this is the first section)
		if ($firstsection) {
 		    $firstsection = 0;
#		    $tags =~ /gsdlsourcefilename\s*=\s*(?:\"([^\"]*)\")/o;
#		    $src_filename = $2 || $3;

		} else {

		    $tags =~ s/gsdlnum\s*=\s*\"?(\d+)\"?//o;
		    if (defined $1) {
			$section .= ".$1";
			$doc_obj->create_named_section($section);
		    } else {
			$section = $doc_obj->insert_section($doc_obj->get_end_child($section));
		    }
		}
	    
		# add the tags
		while ((defined $tags) && ($tags =~ s/^\s*(\S+)=\"([^\"]*)\"//o)) {
		    $doc_obj->add_utf8_metadata($section, $1, &GMLPlug::_unescape_text($2)) 
			if (defined $1 and defined $2);

		}
		
		# add the text
		$doc_obj->add_utf8_text($section, $text) 
		    if ((defined $text) && ($text ne ""));		
	    }

	    $gml = shift(@gml_sections); # get next bit of data
	    last unless defined $gml;
 	    last if $section eq ""; # back to top level again (more than one document in gml file)
	    $section = $doc_obj->get_parent_section ($section);
	} # while (1) section level

	# add the associated files
	my $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile");
	my ($assoc_file_info);

	foreach $assoc_file_info (@$assoc_files) 
	{
	    my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
	    my $real_dir = &util::filename_cat($parent_dir, $assoc_file),
	    my $assoc_dir = (defined $dir && $dir ne "") 
		? &util::filename_cat($dir, $assoc_file) : $assoc_file;
	    $doc_obj->associate_file($real_dir, $assoc_dir, $mime_type);
	    
	}
	$doc_obj->delete_metadata($doc_obj->get_top_section(), "gsdlassocfile");

	# add metadata passed in from elsewhere
	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
	
	# do any automatic metadata extraction
	$self->auto_extract_metadata ($doc_obj);

	# assume the document has an OID already

	# process the document
	$processor->process($doc_obj, $file);
	
	$no_docs++;
	last if ($maxdocs > -1 && $no_docs >= $maxdocs);
	last unless defined $gml && $gml =~ /\w/;
    } # while(1) document level
    
    return $no_docs; # no of docs processed
}

sub _unescape_text {
    my ($text) = @_;

    # special characters in the gml encoding
    $text =~ s/&lt;/</g;
    $text =~ s/&gt;/>/g;
    $text =~ s/&quot;/\"/g;
    $text =~ s/&amp;/&/g; # this has to be last...

    return $text;
}

1;
