###########################################################################
#
# mgbuilder.pm -- MGBuilder object
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package mgbuilder;

use classify;
use cfgread;
use colcfg;
use plugin;
use util;
use FileHandle;

BEGIN {
    # Switch autoflush on for both standard streams so that output
    # written by mg does not get out of sync with the plugins' output.
    foreach my $stream (\*STDOUT, \*STDERR) {
	$stream->autoflush(1);
    }
}

END {
    # Switch autoflush back off for both standard streams at exit.
    foreach my $stream (\*STDOUT, \*STDERR) {
	$stream->autoflush(0);
    }
}

# Value handed to mg_passes through its -b option (the maximum
# document size; described elsewhere in this file as 12 meg).
$maxdocsize = 12000;

# Suffixes of the files in an index directory that are kept once an
# index has been built; build_index deletes files with any other suffix.
%wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);


# Creates an mgbuilder object for a collection.
#
# Arguments (each is stored on the object under the same name):
#   $collection         - collection name; also used to look for a
#                         collection specific "<collection>buildproc.pm"
#   $source_dir         - handed to the plugins as the directory to read
#   $build_dir          - directory the build is written into
#   $verbosity          - progress messages are printed when >= 1
#   $maxdocs            - passed through to the plugins
#   $debug              - when true, output goes to STDOUT instead of
#                         being piped into the mg programs
#   $keepold            - when true, init() leaves an existing build alone
#   $allclassifications - passed through to classify::output_classify_info
#   $outhandle          - handle for messages; defaults to STDERR
#
# The constructor reads collect.cfg, expands the configured indexes by
# subcollection and language, removes duplicate indexes, and loads the
# plugins, the classifiers and the document processor.
#
# Dies if collect.cfg cannot be found, if no plugins were loaded, or if
# the document processor cannot be loaded or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity, 
	$maxdocs, $debug, $keepold, $allclassifications, $outhandle) = @_;

    $outhandle = STDERR unless defined $outhandle;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
		      'source_dir'=>$source_dir,
		      'build_dir'=>$build_dir,
		      'verbosity'=>$verbosity,
		      'maxdocs'=>$maxdocs,
		      'debug'=>$debug,
		      'keepold'=>$keepold,
		      'allclassifications'=>$allclassifications,
		      'outhandle'=>$outhandle,
		      'notbuilt'=>[]    # indexes not built
		      }, $class;


    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
	die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes: every index is repeated once per
    # configured subcollection as "index:subcollection"
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
	my $indexes = $self->{'collect_cfg'}->{'indexes'};
	$self->{'collect_cfg'}->{'indexes'} = [];
	foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
	    foreach my $index (@$indexes) {
		push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
	    }
	}
    }

    # sort out language subindexes: every index is repeated once per
    # configured language
    if (defined $self->{'collect_cfg'}->{'languages'}) {
	my $indexes = $self->{'collect_cfg'}->{'indexes'};
	$self->{'collect_cfg'}->{'indexes'} = [];
	foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
	    foreach my $index (@$indexes) {
		if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
		    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
		}
		else { # add in an empty subcollection field
		    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
		}
	    }
	}
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins, order is preserved)
    my %tmphash = ();
    my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}};
    $self->{'collect_cfg'}->{'indexes'} = [];
    foreach my $i (@tmparray) {
	if (!defined ($tmphash{$i})) {
	    push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
	    $tmphash{$i} = 1;
	}
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
	$plugins = $self->{'collect_cfg'}->{'plugin'};
    }
    
    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
	print $outhandle "No plugins were loaded.\n";
	die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
	$classifiers = $self->{'collect_cfg'}->{'classify'};
    }
    
    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
	foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
	    $self->{'dontgdbm'}->{$dg} = 1;
	}
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
	$buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
	$buildproctype = "${collection}buildproc";
    } else {
	$buildprocdir = "$ENV{'GSDLHOME'}/perllib";
	$buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the class name held in
    # $buildproctype.  This avoids compiling a string that contains the
    # collection name; any error raised by the constructor propagates
    # to the caller.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
						$build_dir, $verbosity,
						$outhandle);

    return $self;
}

# Prepares the build directory.  Unless the builder is in debug mode or
# was asked to keep the old build, the build directory is removed and
# recreated together with its "text" subdirectory.
sub init {
    my ($self) = @_;

    if (!$self->{'debug'} && !$self->{'keepold'}) {
	my $build_dir = $self->{'build_dir'};

	# throw away whatever an earlier build left behind
	&util::rm_r($build_dir);
	&util::mk_all_dir($build_dir);

	# the text directory lives directly below the build directory
	&util::mk_all_dir("$build_dir/text");
    }
}

# Creates the compressed text for the collection under <build_dir>/text.
#
# $textindex is handed to the document processor through set_index().
#
# The documents are read through the plugins twice: the first read is
# piped into "mg_passes -T1" to collect text statistics, then
# mg_compression_dict builds the compression dictionary, and the second
# read is piped into "mg_passes -T2" to compress the text.
#
# In debug mode nothing is piped to the mg programs: both reads write
# to STDOUT and the compression dictionary step is skipped.
#
# Dies if mg_passes or mg_compression_dict is missing from
# $GSDLHOME/bin/$GSDLOS or if a pipe to mg_passes cannot be opened.
# The exit statuses of the external programs are not checked.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    # on windows the prefix is given with backslashes; everywhere else
    # the mg programs are additionally passed "-d /"
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
	$fulltextprefix =~ s/\//\\/g;
    } else {
	$osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n    collecting text statistics\n"  if ($self->{'verbosity'} >= 1);

    # NOTE: the existence test below uses the full path in
    # $mg_passes_exe, but the command that is run is the bare program
    # name, so the program is presumably expected to be found on the PATH
    my ($handle);
    if ($self->{'debug'}) {
	$handle = STDOUT;
    } else {
	if (!-e "$mg_passes_exe" || 
#	    !open (PIPEOUT, "| \"$mg_passes_exe\" -f \"$fulltextprefix\" -b $maxdocsize -T1 $osextra")) {
	    !open (PIPEOUT, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 $osextra")) {
	    die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
	}
	$handle = mgbuilder::PIPEOUT;
    }

    # first pass: the document processor writes the text to $handle.
    # plugin::begin and plugin::end are called around this pass only.
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'}, 
		   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 
		   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
	print $outhandle "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
	if (!-e "$mg_compression_dict_exe") {
	    die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
	}
#	system ("\"$mg_compression_dict_exe\" -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");
	system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

	# -b $maxdocsize sets the maximum document size to be 12 meg
	# $handle still names the pipe closed above; it is reopened here
	# for the second pass
	if (!-e "$mg_passes_exe" || 
#	    !open ($handle, "| \"$mg_passes_exe\" -f \"$fulltextprefix\" -b $maxdocsize -T2 $osextra")) {
	    !open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 $osextra")) {
	    die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
	}
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text\n"  if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 
		   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}

# Decides whether $index should be built.
#
# Each entry of the "dontbuild" configuration list is used as a regular
# expression that has to match the whole index description.  On the
# first match the directory name mapped to $index is appended to the
# object's 'notbuilt' list and 0 is returned; otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $patterns = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($patterns);

    foreach my $pattern (@$patterns) {
	next unless $index =~ /^$pattern$/;

	# remember the directory name of the index that is being skipped
	push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
	return 0;
    }

    return 1;
}

# Builds either the single index named by $indexname (when it contains
# at least one word character) or every index listed in the collection
# configuration.  The index-to-directory mapping is recreated first and
# stored on the object; indexes rejected by want_built() are reported
# and skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
	? [$indexname]
	: $self->{'collect_cfg'}->{'indexes'};

    # work out which directory each index description is built in
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build the indexes one after another
    foreach my $index (@$indexes) {
	unless ($self->want_built($index)) {
	    print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
	    next;
	}

	print $outhandle "\n*** building index $index in subdirectory " . 
	    "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
	$self->build_index($index);
    }
}

# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions, each made of
# up to four ":" separated fields: level, fields, subcollection and
# language.
#
# Returns a reference to a hash that contains
#   - the full index description            => directory name
#   - "level:fields", the subcollection and the language
#                                           => their processed short names
#   - 'indexmap', 'subcollectionmap', 'languagemap'
#                                           => hashes of the same short names
#   - 'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                                           => lists giving first-seen order
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];
    
    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
		    'extra'=>'extra');

    # pnames remembers, per component, which original value each
    # processed name was produced from.  Each entry must be its own
    # hash: an empty string used as a hash reference would make all
    # three entries share one package level hash that also survives
    # between calls.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
	my ($level, $gran, $subcollection, $languages) = split (":", $index);

	# the directory name starts with the first character of the index level
	my ($pindex) = $level =~ /^(.)/;

	# next comes a processed version of the index
	$pindex .= $self->process_field ($gran); 
	$pindex = lc ($pindex);

	# next comes a processed version of the subcollection if there is one.
	my $psub = $self->process_field ($subcollection);
	$psub = lc ($psub);

	# next comes a processed version of the language if there is one.
	my $plang = $self->process_field ($languages);
	$plang = lc ($plang);

	my $dirname = $pindex . $psub . $plang;

	# check to be sure all index names are unique
	while (defined ($dirnames{$dirname})) {
	    $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
	}
	$mapping{$index} = $dirname;

	# store the mapping orders as well as the maps
	# also put index, subcollection and language fields into the mapping thing - 
	# (the full index name (eg document:text:subcol:lang) is not used on
	# the query page) -these are used for collectionmeta later on
	if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
	    $mapping{'indexmap'}{"$level:$gran"} = $pindex;
	    push (@{$mapping{'indexmaporder'}}, "$level:$gran");
	    if (!defined $mapping{"$level:$gran"}) {
		$mapping{"$level:$gran"} = $pindex;
	    }
	}
	if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
	    $mapping{'subcollectionmap'}{$subcollection} = $psub;
	    push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
	    $mapping{$subcollection} = $psub;
	}
	if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
	    $mapping{'languagemap'}{$languages} = $plang;
	    push (@{$mapping{'languagemaporder'}}, $languages);
	    $mapping{$languages} = $plang;
	}

	# record what has been used so that later indexes can detect clashes
	$dirnames{$dirname} = $index;
	$pnames{'index'}{$pindex} = "$level:$gran";
	$pnames{'subcollection'}{$psub} = $subcollection;
	$pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}

# Returns a shortened version of a field.
# A field without any word character (or an undefined field) yields
# the empty string.  A field made of two or more comma separated
# components yields the first character of each of its first two
# components.  A field with a single component yields its first
# character followed by the next consonant, or its first two
# characters when no consonant follows.
sub process_field {
    my ($self, $field) = @_;
 
    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;

    if (scalar @components < 2) {
	# first character plus the first consonant after it ...
	my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
	unless (defined $first && defined $second) {
	    # ... or simply the first two characters
	    ($first, $second) = $field =~ /^(.)(.)/;
	}
	return "$first$second";
    }

    # only the first two components contribute, one character each
    my $processed = "";
    foreach my $component (@components[0, 1]) {
	(my $initial = $component) =~ s/^(.).*$/$1/;
	$processed .= $initial;
    }
    return $processed;
}

# Changes one component of a clashing directory name and returns the
# new name (index, subcollection and language components concatenated).
#
#   $namehash - reference to a hash with the keys 'index',
#               'subcollection' and 'languages'; each maps processed
#               names already in use to the value they were made from
#   $index    - the full index description being named
#   $indexref, $subref, $langref
#             - references to the three processed components; the one
#               that gets changed is modified in place
#
# The first component whose processed name is already registered for a
# different original value is advanced with get_next_version().  If no
# component differs (the clash comes from somewhere else, for example
# one of the reserved directory names), the index component is advanced
# so that repeated calls still make progress instead of returning the
# same name forever.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    if ($namehash->{'index'}->{$$indexref} ne "$level:$gran") {
	$self->get_next_version ($indexref);
    } elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
	$self->get_next_version ($subref);
    } elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
	$self->get_next_version ($langref);
    } else {
	# nothing identified the clashing component; without a change
	# the caller would be handed the same name again and loop forever
	$self->get_next_version ($indexref);
    }
    return "$$indexref$$subref$$langref";
}	

# Advances the name that $nameref points to, in place, to its next
# numbered version:
#   - a name ending in two digits has those two digits incremented
#   - a name ending in a single digit has that digit incremented,
#     with 9 becoming 10
#   - any other name has its last character replaced by 0
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
	my $num = $1; $num ++;
	$$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
	my $num = $1;
	# only one trailing digit is present in this branch, so that
	# single digit is what has to be replaced when rolling over to 10
	if ($num == 9) {$$nameref =~ s/\d$/10/;}
	else {$num ++; $$nameref =~ s/\d$/$num/;}
    } else {
	$$nameref =~ s/.$/0/;
    }
}

# Builds one index in its own subdirectory of the build directory.
#
# $index is an index description of up to four ":" separated fields
# (level, fields, subcollection, language); its directory name is
# looked up in $self->{'index_mapping'}.
#
# The documents are read through the plugins twice: the first read is
# piped into "mg_passes -N1" to create the index dictionary, then
# mg_perf_hash_build is run, and the second read is piped into
# "mg_passes -N2" to invert the text.  Afterwards mg_weights_build,
# mg_invf_dict and mg_stem_idx (three times, -s1 to -s3) are run and
# every file in the index directory whose suffix is not listed in
# %wanted_index_files is deleted.
#
# In debug mode both reads write to STDOUT and none of the mg programs
# is run.
#
# Dies if one of the mg programs is missing from $GSDLHOME/bin/$GSDLOS,
# if a pipe to mg_passes cannot be opened, or if the index directory
# cannot be read.  The exit statuses of the external programs are not
# checked.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir, 
					       $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text", 
					       $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe = 
	&util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe = 
	&util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe = 
	&util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
	&util::filename_cat ($exedir, "mg_stem_idx$exe");

    # on windows the index prefix is given with backslashes; everywhere
    # else the mg programs are additionally passed "-d /"
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
	$fullindexprefix =~ s/\//\\/g;
    } else {
	$osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info. 
    my ($level, $fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    # subcollections without an entry in the configuration are silently skipped
    foreach $subcollection (@subcollections) {
	if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
	    push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
	} 
    }
    
    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the 
    # ones we want in the index

    # a leading "!" on a language negates its expression; a language is
    # only used when it is one of the configured languages
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach $language (@languages) {
	my $not=0;
	if ($language =~ s/^\!//) {
	    $not = 1;
	}
	foreach $lang (@{$self->{'collect_cfg'}->{'languages'}}) {
	    if ($lang eq $language) {
		if($not) {
		    push (@$indexexparr, "!Language/$language/");
		} else {
		    push (@$indexexparr, "Language/$language/");
		}
		last;
	    }
	}
    }

    # Build index dictionary. Uses verbatim stem method
    # NOTE: the existence tests in this function use the full paths
    # computed above, but the commands that are run use the bare
    # program names, so the programs are presumably expected to be
    # found on the PATH
    print $outhandle "\n    creating index dictionary\n"  if ($self->{'verbosity'} >= 1);
    my ($handle);
    if ($self->{'debug'}) {
	$handle = STDOUT;
    } else {
	if (!-e "$mg_passes_exe" || 
#	    !open (PIPEOUT, "| \"$mg_passes_exe\" -f \"$fullindexprefix\" -b $maxdocsize " .
	    !open (PIPEOUT, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
		   "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
	    die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
	}
	$handle = mgbuilder::PIPEOUT;
    }
	
    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);

    # first pass over the documents
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 
		   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
	# create the perfect hash function
	if (!-e "$mg_perf_hash_build_exe") {
	    die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
	}
#	system ("\"$mg_perf_hash_build_exe\" -f \"$fullindexprefix\" $osextra");
	system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

	# $handle still names the pipe closed above; it is reopened here
	# for the second pass
	if (!-e "$mg_passes_exe" || 
#	    !open ($handle, "| \"$mg_passes_exe\" -f \"$fullindexprefix\" -b $maxdocsize " .
	    !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
		   "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
	    die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
	}
    }
    
    # invert the text
    print $outhandle "\n    inverting the text\n"  if ($self->{'verbosity'} >= 1);

    # second pass over the documents
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 
		   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

	close ($handle);
	
	# create the weights file
	print $outhandle "\n    create the weights file\n"  if ($self->{'verbosity'} >= 1);
	if (!-e "$mg_weights_build_exe") {
	    die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
	}
#	system ("\"$mg_weights_build_exe\" -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");
	system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

	# create 'on-disk' stemmed dictionary
	print $outhandle "\n    creating 'on-disk' stemmed dictionary\n"  if ($self->{'verbosity'} >= 1);
	if (!-e "$mg_invf_dict_exe") {
	    die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
	}
#	system ("\"$mg_invf_dict_exe\" -f \"$fullindexprefix\" $osextra");
	system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");


	# creates stem index files for the various stemming methods
	print $outhandle "\n    creating stem indexes\n"  if ($self->{'verbosity'} >= 1);
	if (!-e "$mg_stem_idx_exe") {
	    die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
	}
#	system ("\"$mg_stem_idx_exe\" -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
	system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
#	system ("\"$mg_stem_idx_exe\" -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
	system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
#	system ("\"$mg_stem_idx_exe\" -b 4096 -s3 -f \"$fullindexprefix\" $osextra");
	system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

    
	# remove unwanted files
	# (files without a suffix and files starting with "." are left alone)
	my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
	opendir (DIR, $tmpdir) || die
	    "mgbuilder::build_index - couldn't read directory $tmpdir\n";
	foreach $file (readdir(DIR)) {
	    next if $file =~ /^\./;
	    my ($suffix) = $file =~ /\.([^\.]+)$/;
	    if (defined $suffix && !defined $wanted_index_files{$suffix}) {
		# delete it!
		print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
		&util::rm (&util::filename_cat ($tmpdir, $file));
	    }
	}
	closedir (DIR);
    }
}

# Creates the collection's info database in <build_dir>/text and has
# the document processor handle associated files in <build_dir>/assoc.
#
# Everything is written to a pipe into txt2db (or to STDOUT in debug
# mode): first an optional "[collection]" record holding the
# collectionmeta settings, then whatever the document processor emits
# in 'infodb' mode while the plugins read the documents, and finally
# the classification information.
#
# Dies if txt2db is missing from $GSDLHOME/bin/$GSDLOS or if the pipe
# to it cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    # both output directories live directly below the build directory
    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # the database extension depends on the byte order of the machine
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
	$fulldbname =~ s/\//\\/g;
    }

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    if ($self->{'verbosity'} >= 1) {
	print $outhandle "\n*** creating the info database and processing associated files\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # decide where the database text goes
    my ($handle);
    if ($self->{'debug'}) {
	$handle = STDOUT;
    } else {
#	if (!-e "$txt2db_exe" || !open (PIPEOUT, "| \"$txt2db_exe\" \"$fulldbname\"")) {
	if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
	    die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
	}
	$handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->reset();

    # write the collection level metadata as a record of its own
    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    if (defined $collectionmeta) {

	if (!defined $self->{'index_mapping'}) {
	    $self->{'index_mapping'} = 
		$self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
	}

	print $handle "[collection]\n";

	foreach my $key (keys (%$collectionmeta)) {
	    my $value = $collectionmeta->{$key};

	    # ordinary settings are written under their own name
	    if ($key !~ /^\./) {
		print $handle "<$key>$value\n";
		next;
	    }

	    # a key starting with "." names an entry of the index
	    # mapping and is written under the mapped name instead
	    (my $name = $key) =~ s/^\.//;
	    my $mapped = $self->{'index_mapping'}->{$name};
	    if (defined $mapped) {
		print $handle "<$mapped>" . $value . "\n";
	    } else {
		print $outhandle "mgbuilder: warning bad collectionmeta option '$name' - ignored\n";
	    }
	}
	print $handle "\n" . ('-' x 70) . "\n";

    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 
		   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
				     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}

# Does nothing in this class beyond taking the object off the
# argument list (presumably a hook for collection specific builders).
sub collect_specific {
    my $self = shift;
}

# Writes <build_dir>/build.cfg.
#
# The file records the build date, the number of documents and bytes
# reported by the document processor, the word and section counts
# parsed from the output of mgstat (when it can be run), the index,
# subcollection and language maps taken from $self->{'index_mapping'},
# and the list of indexes that were not built.
#
# A failure to run mgstat only produces a warning on the output handle.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # The build configuration is collected in a hash that is private to
    # this call, so optional entries (numwords, numsections,
    # subcollectionmap, languagemap, notbuilt) written by an earlier
    # call can never leak into this one.
    my $build_cfg = {};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
#    if (!-e "$mgstat_exe" || !open (PIPEIN, "\"$mgstat_exe\" -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
	print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
	# pick the two counts of interest out of mgstat's report
	my $line = "";
	while (defined ($line = <PIPEIN>)) {
	    if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
		($build_cfg->{'numwords'}) = $1;
	    } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
		($build_cfg->{'numsections'}) = $1;
	    }
	}
	close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
	push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # the subcollection and language maps are only written when non-empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
	push (@subcollectionmap, "$subcollection\-\>" .
	      $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
	push (@languagemap, "$language\-\>" .
	      $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
			     '^(builddate|numdocs|numbytes|numwords|numsections)$', 
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}

# Does nothing in this class beyond taking the object off the
# argument list (presumably the counterpart of init for subclasses).
sub deinit {
    my $self = shift;
}

# Prints the byte counts reported by the document processor for the
# pass that has just finished to the object's output handle.  When
# fewer than 50 bytes were processed, a warning is added that the
# following build or compression step may fail.
sub print_stats {
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    my $proc = $self->{'buildproc'};
    my $building_index = $proc->get_indexing_text();
    my $index_name = $proc->get_index();
    my $total_bytes = $proc->get_num_bytes();
    my $processed_bytes = $proc->get_num_processed_bytes();

    # the heading tells index building and text compression apart
    my $heading = $building_index
	? "Stats (Creating index $index_name)\n"
	: "Stats (Compressing text from $index_name)\n";
    print $out $heading;
    print $out "Total bytes in collection: $total_bytes\n";
    print $out "Total bytes in $index_name: $processed_bytes\n";

    if ($processed_bytes < 50) {
	my $consequence = $building_index
	    ? "This may cause an error while attempting to build the index\n"
	    : "This may cause an error while attempting to compress the text\n";
	print $out "***************\n";
	print $out "WARNING: There is very little or no text to process for $index_name\n";
	print $out $consequence;
	print $out "***************\n";
    }
}

1;


