#!/usr/bin/perl -w

###########################################################################
#
# import.pl --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


# This program will import a number of files into a particular collection

package import;

BEGIN {
    # Refuse to start unless the environment variables this script
    # checks for are present; GSDLHOME is needed to build the paths below.
    foreach my $required ('GSDLHOME', 'GSDLOS') {
	die "$required not set\n" unless defined $ENV{$required};
    }

    # unshift puts its list at the front of @INC in the order given, so
    # the search order becomes classify, then plugins, then perllib.
    my $perllib = "$ENV{'GSDLHOME'}/perllib";
    unshift (@INC, "$perllib/classify", "$perllib/plugins", $perllib);
}

use arcinfo;
use colcfg;
use plugin;
use docprint;
use util;
use parsargv;
use FileHandle;

# Writes the command-line help text for import.pl to STDERR.
# Takes no arguments; the default collection directory shown in the
# -collectdir entry is built from $ENV{'GSDLHOME'}.
sub print_usage {
    my $default_collectdir = &util::filename_cat ($ENV{'GSDLHOME'}, "collect");

    # One element per chunk of output, emitted in order below.
    my @usage_text = (
	"\n",
	"import.pl: Converts documents in collections -importdir directory into\n",
	"           gml documents which are written to the -archivedir directory.\n\n",
	"  usage: $0 [options] collection-name\n\n",
	"  options:\n",
	"   -verbosity number      0=none, 3=lots\n",
	"   -importdir directory   Where the original material lives\n",
	"   -archivedir directory  Where the converted material ends up\n",
	"   -keepold               Will not destroy the current contents of the\n",
	"                          archives directory (the default)\n",
	"   -removeold             Will remove the old contents of the archives\n",
	"                          directory -- use with care\n",
	"   -gzip                  Use gzip to compress resulting gml documents\n",
	"                          (don't forget to include ZIPPlug in your plugin\n",
	"                          list when building from compressed documents)\n",
	"   -maxdocs number        Maximum number of documents to import\n",
	"   -groupsize number      Number of GML documents to group into one file\n",
	"   -sortmeta metadata     Sort documents alphabetically by metadata for\n",
	"                          building. This will be disabled if groupsize > 1\n",
	"   -debug                 Print imported text to STDOUT\n",
	"   -collectdir directory  Collection directory (defaults to " .
	    $default_collectdir . ")\n",
	"   -out                   Filename or handle to print output status to.\n",
	"                          The default is STDERR\n\n",
    );

    foreach my $chunk (@usage_text) {
	print STDERR $chunk;
    }
}

# Run the import. The sub is defined further down the file, which is fine
# because the call is written with parentheses.
main();

# Entry point of import.pl.
#
# Parses the command line, selects the collection, reads the collection's
# etc/collect.cfg, loads the configured plugins and runs them over the
# import directory. Unless -debug is given, documents are handed to a
# docsave object and the archive information file (archives.inf) is
# loaded before and saved after the run; with -debug a docprint object
# is used instead and no archive information is read or written.
#
# Takes no arguments (reads @ARGV and %ENV) and returns nothing useful.
# Dies when the options cannot be parsed, when no collection is selected,
# when the -out file cannot be opened, when collect.cfg is missing, or
# when no plugins could be loaded.
sub main {
    my ($verbosity, $importdir, $archivedir, $keepold, 
	$removeold, $gzip, $groupsize, $debug, $maxdocs, $collection,
	$configfilename, $collectcfg, $pluginfo, $sortmeta,
	$archive_info_filename, $archive_info, $processor, 
	$out, $collectdir);
    if (!parsargv::parse(\@ARGV, 
			 'verbosity/\d+/2', \$verbosity,
			 'importdir/.*/', \$importdir,
			 'archivedir/.*/', \$archivedir,
			 'keepold', \$keepold,
			 'removeold', \$removeold,
			 'gzip', \$gzip,
			 'groupsize/\d+/1', \$groupsize,
			 'sortmeta/.*/', \$sortmeta,
			 'debug', \$debug,
			 'maxdocs/^\-?\d+/-1', \$maxdocs,
			 'collectdir/.*/', \$collectdir,
			 'out/.*/STDERR', \$out)) {
	&print_usage();
	die "\n";
    }

    # $out is used as a symbolic filehandle name from here on: either one
    # of the standard handles or the file named by -out.
    my $close_out = 0;
    if ($out =~ /^(STDERR|STDOUT)$/i) {
	# The check above ignores case, but inside a package only the
	# upper-case names refer to the standard handles, so normalise
	# the name before it is used as a handle.
	$out = uc ($out);
    } else {
	# three-argument open so that characters in the filename can never
	# be taken as part of the open mode
	open (OUT, '>', $out) || die "Couldn't open output file $out: $!\n";
	$out = 'import::OUT';
	$close_out = 1;
    }
    $out->autoflush(1);

    # -keepold takes precedence over -removeold
    $removeold = 0 if ($keepold);

    # get and check the collection name
    # Only the first remaining argument is handed over. Passing @ARGV
    # itself would flatten it into the argument list, so with no
    # collection name given $collectdir would slide into the first
    # position, and any extra argument would displace $collectdir.
    # NOTE(review): assumes util::use_collection takes
    # (collection, collectdir) in that order -- confirm against util.pm.
    my $collection_arg = scalar(@ARGV) ? $ARGV[0] : "";
    if (($collection = &util::use_collection($collection_arg, $collectdir)) eq "") {
	&print_usage();
	die "\n";
    }

    # check sortmeta: a blank value counts as "not given", and sorting
    # is not available when documents are grouped
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
	print $out "WARNING: import.pl cannot sort documents when groupsize > 1\n";
	print $out "         sortmeta option will be ignored\n\n";
	$sortmeta = undef;
    }

    # dynamically load the 'docsave' module so that a collection-specific
    # docsave.pm, if one exists in the collection's perllib directory,
    # is picked up ahead of the default one.

    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    require docsave;


    # get the list of plugins for this collection, along with any
    # settings from collect.cfg that were not given on the command line
    my $plugins = [];
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
	$collectcfg = &colcfg::read_collect_cfg ($configfilename);
	if (defined $collectcfg->{'plugin'}) {
	    $plugins = $collectcfg->{'plugin'};
	}
	# command-line directories win over the ones in collect.cfg
	if (defined $collectcfg->{'importdir'} && $importdir eq "") {
	    $importdir = $collectcfg->{'importdir'};
	}
	if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
	    $archivedir = $collectcfg->{'archivedir'};
	}
	if (defined $collectcfg->{'removeold'}) {
	    # "removeold true" in collect.cfg only applies when -keepold
	    # was not given on the command line
	    if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
		$removeold = 1;
	    }
	    # "removeold false" never overrides a -removeold that is
	    # already in effect
	    if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
		$removeold = 0;
	    }
	}
    } else {
	die "Couldn't find the configuration file $configfilename\n";
    }
    
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out);
    if (scalar(@$pluginfo) == 0) {
	print $out "No plugins were loaded.\n";
	die "\n";
    }
	
    # remove the old contents of the archives directory if needed
    if ($removeold && -e $archivedir) {
	print $out "Warning - removing current contents of the archives directory\n";
	print $out "          in preparation for the import\n";
	sleep(5); # just in case... (gives the user a moment to interrupt)
	&util::rm_r ($archivedir);
    }
    
    # read the archive information file
    if (!$debug) {
	$archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
	$archive_info = arcinfo->new ();
	$archive_info->load_info ($archive_info_filename);

	# create a docsave object to process the documents
	$processor = docsave->new ($collection, $archive_info, $verbosity, $gzip, $groupsize, $out);
	$processor->setarchivedir ($archivedir);
	$processor->set_sortmeta ($sortmeta) if defined $sortmeta;
    } else {
	# in debug mode a docprint object stands in for docsave
	$processor = docprint->new ();
    }

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

    # process the import directory
    &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs);
    
    &plugin::end($pluginfo, $processor);
    
    # write out the archive information file
    if (!$debug) {
	$processor->close_file_output() if $groupsize > 1;
	$archive_info->save_info($archive_info_filename);
    }
    close OUT if $close_out;
}
