#!/usr/bin/perl -w

###########################################################################
#
# unindex.pl --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# this program will decompress all the text from a built index
# and return it to gml format - may be slightly broken at present

BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
    $FileHandle = 'FH000';
}

use doc;
use docsave;
use util;
use parsargv;
use GDBM_File;
use FileHandle;
use English;

select STDERR; $| = 1;
select STDOUT; $| = 1;


# globals
$collection = "";
$index = "";
$textdir = "";
$classinfo = {};
$mgread = ++$FileHandle;
$mgwrite = ++$FileHandle;



sub print_usage {
    print STDERR "\n";
    print STDERR "unindex.pl: Attempts to generate gml files from a built Greenstone index.\n\n";
    print STDERR "  usage: $0 [options]\n\n";
    print STDERR "  options:\n";
    print STDERR "   -verbosity number      0=none, 3=lots\n";
    print STDERR "   -indexdir directory    The index to be decompressed (defaults to ./index)\n";
    print STDERR "   -archivedir directory  Where the converted material ends up (defaults to ./archives\n";
    print STDERR "   -removeold             Will remove the old contents of the archives\n";
    print STDERR "                          directory -- use with care\n\n";
}

&main ();

sub main {
    if (!parsargv::parse(\@ARGV, 
			 'verbosity/\d+/2', \$verbosity,
			 'indexdir/.*/index', \$indexdir,
			 'archivedir/.*/archives', \$archivedir,
			 'removeold', \$removeold)) {
	&print_usage();
	die "\n";
    }

    die "indexdir $indexdir does not exist\n\n" unless (-d $indexdir);
    $indexdir =~ s/\/$//;
    if (-d $archivedir) {
	if ($removeold) {
	    print STDERR "Warning - removing current contents of the archives directory $archivedir\n";
	    print STDERR "          in preparation for the import\n";
	    sleep(5); # just in case...
	    &util::rm_r ($archivedir);
	}
    } else {
	&util::mk_all_dir ($archivedir);
    }

    my $gdbmfile = &get_gdbmfile ($indexdir);
    &set_index ();
    &get_classinfo ($gdbmfile);
    &openmg ();

    # read the archive information file
    my $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
    my $archive_info = new arcinfo ();
    $archive_info->load_info ($archive_info_filename);

    # create a docsave object to process the documents
    my $processor = new docsave ("", $archive_info, $verbosity);
    $processor->setarchivedir ($archivedir);

    # create a document object for the classification file
    my $clsudoc_obj = new doc($gdbmfile, "classification");
    $clsudoc_obj->set_OID ("CLSU");

    my ($doc_obj);
    foreach $classification (keys %$classinfo) {
	my $count = 0;
	foreach $section (sort numerically keys %{$classinfo->{$classification}}) {
	    print STDERR "\n$classification.$section" if $verbosity >= 2;
	    if ($classification =~ /^B$/i) {
		last if $count > 10;
		# create a new document
		$doc_obj = new doc ($classinfo->{$classification}->{$section}->{'9999999'}->{'jobnum'}, 
				    "indexed_doc");
		$doc_obj->set_OID();
		my $cursection = $doc_obj->get_top_section();
		&add_section_content ($doc_obj, $cursection,
				      $classinfo->{$classification}->{$section});
		&recurse_classinfo ($doc_obj, $classinfo->{$classification}->{$section}, 
				    $doc_obj->get_end_child($cursection), 0);
		$processor->process($doc_obj);
	    } else {
		my $classifier = &int_classification ($classification);
		$clsudoc_obj->create_named_section("$classifier.$section");
		&add_section_content ($clsudoc_obj, "$classifier.$section",
				      $classinfo->{$classification}->{$section});
		&recurse_classinfo ($clsudoc_obj, $classinfo->{$classification}->{$section}, 
				    "", "$classifier.$section");
	    }
	    $count ++;
	}
    }
    $processor->process($clsudoc_obj);
    &closemg();

    # write out the archive information file
    $archive_info->save_info($archive_info_filename);
}

# returns the path to the gdbm info database - also
# sets the $collection and $textdir global variable
sub get_gdbmfile {
    my ($indexdir) = @_;
    my ($gdbmfile);

    opendir (DIR, $indexdir) || die "Couldn't open directory $indexdir\n\n";
    my @conts = readdir DIR;
    close DIR;

    foreach $file (@conts) {
	if ($file =~ /text$/) {
	    $textdir = $file;
	    last;
	}
    }
    die "No text directory found in $indexdir\n\n" 
	unless defined $textdir && $textdir =~ /text$/;

    $gdbmfile = &util::filename_cat ($indexdir, $textdir);

    opendir (DIR, $gdbmfile) || die "Couldn't open directory $gdbmfile\n\n";
    @conts = readdir DIR;
    close DIR;

    foreach $file (@conts) {
	if ($file =~ /^(.*?)\.(?:ldb|bdb)$/) {
	    $collection = $1;
	    $gdbmfile = &util::filename_cat ($gdbmfile, $file);
	    last;
	}
    }
    
    if (defined $collection && $collection =~ /\w/) {
	$textdir = &util::filename_cat ($textdir, $collection);
    } else {
	die "collection global wasn't set\n";
    }
    return $gdbmfile if (-e $gdbmfile);
    die "Couldn't find gdbm info database in $indexdir\n\n";
}

sub get_classinfo {
    my ($gdbmfile) = @_;
    my ($class);

    open (DB2TXT, "db2txt $gdbmfile |") || die "couldn't open pipe to db2txt\n";

    print STDERR "Generating classification table from $gdbmfile\n" if $verbosity >= 2;
    my $count = 0;

    $/ = '-' x 70;
    my $entry = "";
    while (defined ($entry = <DB2TXT>)) {
	next unless $entry =~ /\w/;
	$entry =~ s/\n+/\\n/g;
	my ($key, $value) = $entry =~ /\[([^\]]*)\](.*)/;

	print STDERR "." if (($count % 100) == 99 && $verbosity >= 2);

	next if ($key !~ /\./); # ignore the docnums and top level of classifications

	die "Badly formatted key $key in $gdbmfile\n\n" 
	    unless (($class, $section) = $key =~ /^(.)\.(.*)$/);	
	$classinfo->{$class} = {} unless defined $classinfo->{$class};
	my @sections = split /\./, $section;

	my $hashptr = $classinfo->{$class};
	foreach $sec (@sections) {
	    $hashptr->{$sec} = {} unless defined $hashptr->{$sec};
	    $hashptr = $hashptr->{$sec};
	}
	&get_metadata ($key, \$value, $hashptr);

	$count ++;
    }

    $/ = "\n";
    print STDERR "\n" if $verbosity >= 2;
}

sub get_metadata {
    my ($key, $gdb_str_ref, $hashsection) = @_;
    
    my ($title) = $$gdb_str_ref =~ /(?:<t>|<title>)(.*?)(?:\\n|$)/i; 
    my ($docnum) = $$gdb_str_ref =~ /(?:<d>|<docnum>)(.*?)(?:\\n|$)/i; 
    my ($jobnum) = $$gdb_str_ref =~ /(?:<j>|<jobnumber>)(.*?)(?:\\n|$)/i; 
    my ($classifications) = $$gdb_str_ref =~ /(?:<x>|<classification>)(.*?)(?:\\n|$)/i; 
    my $hastext = 1;
    $hastext = 0 if ($$gdb_str_ref =~ /(?:<c>|<contains>)(.*?)(?:\\n|$)/i);
    my ($creator) = $$gdb_str_ref =~ /<a>(.*?)(?:\\n|$)/i; 
    my ($date) = $$gdb_str_ref =~ / <i>(.*?)(?:\\n|$)/i; 
			
    # just in case there are empty classifications			     
    if ($hastext && !defined $docnum) {
	print STDERR "\nwarning: $key entry has no contents\n" if $verbosity;
	if ($verbosity >= 2) {
	    my $valuestr = $$gdb_str_ref;
	    $valuestr =~ s/\\n/\n/g;
	    print STDERR "$valuestr\n";
	}
	$hastext = 0;
    }

    # shove metadata in 9999999 to keep it numeric and prevent 
    # sorting from being a pain. Watch out for documents with
    # 9999999 subsections ;-)
    if (defined $hashsection->{'9999999'}) {
	print STDERR "\nwarning: $key appears multiple times\n" if $verbosity;
    } else {
	$hashsection->{'9999999'}->{'Title'} = $title if defined $title;
	$hashsection->{'9999999'}->{'docnum'} = $docnum if defined $docnum;
	$hashsection->{'9999999'}->{'jobnum'} = $jobnum if defined $jobnum;
	$hashsection->{'9999999'}->{'classifications'} = $classifications if defined $classifications;
	$hashsection->{'9999999'}->{'hastext'} = $hastext;
	$hashsection->{'9999999'}->{'Creator'} = $creator if defined $creator;
	$hashsection->{'9999999'}->{'Date'} = $date if defined $date;
    }
}

sub recurse_classinfo {
    my ($doc_obj, $hashsection, $cursection, $class) = @_;
    foreach $section (sort numerically keys %$hashsection) {
	next if $section == 9999999;
	if ($class) {
	    $doc_obj->create_named_section("$class.$section");
	    &add_section_content ($doc_obj, "$class.$section", $hashsection->{$section});
	    &recurse_classinfo ($doc_obj, $hashsection->{$section}, "", "$class.$section");
	} else {
	    $cursection = 
		$doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_parent_section($cursection)));
	    &add_section_content ($doc_obj, $cursection, $hashsection->{$section});
	    &recurse_classinfo ($doc_obj, $hashsection->{$section}, $doc_obj->get_end_child($cursection));
	}
    }
}

sub add_section_content {
    my ($doc_obj, $cursection, $hashsection) = @_;

    $doc_obj->add_metadata ($cursection, "Title", $hashsection->{'9999999'}->{'Title'})
	if defined $hashsection->{'9999999'}->{'Title'};
    $doc_obj->add_metadata ($cursection, "Creator", $hashsection->{'9999999'}->{'Creator'})
	if defined $hashsection->{'9999999'}->{'Creator'};
    $doc_obj->add_metadata ($cursection, "Date", $hashsection->{'9999999'}->{'Date'})
	if defined $hashsection->{'9999999'}->{'Date'};

    if (defined $hashsection->{'9999999'}->{'classifications'}) {
	my @classifications = split /:/, $hashsection->{'9999999'}->{'classifications'};
	map {$doc_obj->add_metadata ($cursection, 'Subject', $_); } @classifications;
    }

    $doc_obj->add_text ($cursection, &get_text ($hashsection->{'9999999'}->{'docnum'}))
	if ($hashsection->{'9999999'}->{'hastext'});
}

sub set_index {
    # check that $collection has been set
    die "collection global was not set\n"
	unless defined $collection && $collection =~ /\w/;

    # find an index (just use first non-text directory we come across in $indexdir)
    opendir (INDEXDIR, $indexdir) || die "couldn't open directory $indexdir\n";
    my @indexes = readdir INDEXDIR;
    close INDEXDIR;
    foreach $i (@indexes) {
	next if $i =~ /text$/i || $i =~ /\./ ;
	$index = &util::filename_cat ($i, $collection);
	last;
    }
}

sub get_text {
    my ($docnum) = @_;

    print STDERR "." if $verbosity >= 2;
    &mgcommand ($docnum);

    <$mgread>;	# eat the document separator

    my $text = "";
    my $line = "";

    while (defined ($line = <$mgread>))
    {
	last if $line =~ /^<\/mg>/;
	$text .= $line;
    }

    # Read in the last statement, which should be:
    #  "dd documents retrieved."
    <$mgread>;

    return $text;
}

sub numerically {$a <=> $b;}

# converts leading letter of a classification into its ascii equivalent
# i.e C.2.4 becomes 67.2.4
sub int_classification {
    my ($classification) = @_;
    my $c = ord($classification);
    $classification =~ s/^./$c/;

    return $classification;
}


# mg stuff

sub openmg {
    
    die "Unable to start mgquery." unless
	openpipe($mgread, $mgwrite,
		 "mgquery -d $indexdir -f $index -t $textdir");

    $mgwrite->autoflush();

    &mgcommand('.set expert true');
    &mgcommand('.set terminator "</mg>\n"');
    &mgcommand('.set mode text');
    &mgcommand('.set query docnums');
    &mgcommand('.set term_freq off');
}

sub closemg {
    &mgcommand (".quit");
    close($mgread);
    close($mgwrite);
}

sub mgcommand {
    my ($command) = @_;

    return if $command =~ /^\s*$/;
    print $mgwrite "$command\n";

    # eat up the command executed which is echoed
    <$mgread>;
}

# openpipe(READ, WRITE, CMD)
# 
# Like open2, except CMD's stderr is also redirected.
# 
sub openpipe
{
    my ($read, $write, $cmd) = @_;
    my ($child_read, $child_write);

    $child_read = ++$FileHandle;
    $child_write = ++$FileHandle;

    pipe($read, $child_write) || die "Failed pipe($read, $child_write): $!";
    pipe($child_read, $write) || die "Failed pipe($child_read, $write): $!";
    my $pid;

    if (($pid = fork) < 0) {
        die "Failed fork: $!";
    } elsif ($pid == 0) {
        close($read);
        close($write);
        open(STDIN, "<&$child_read");
        open(STDOUT, ">&$child_write");
        open(STDERR, ">&$child_write");
        exec($cmd);
        die "Failed exec $cmd: $!";
    }

    close($child_read);
    close($child_write);

    $write->autoflush();
    $read->autoflush();

    return 1;
}
