###########################################################################
#
# mgppbuilder.pm -- MGBuilder object
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package mgppbuilder;

use classify;
use cfgread;
use colcfg;
use plugin;
use util;
use FileHandle;


# While this module is loaded, STDOUT and STDERR are unbuffered so that
# output from mg doesn't get out of sync with output from the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush back off for both streams when the program exits.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

# Maximum document size. Nothing in this file reads it; only a comment in
# compress_text refers to it.
$maxdocsize = 12000;


# Suffixes of the files that are kept in an index directory. build_index
# deletes every file in the directory whose suffix is not listed here.
%wanted_index_files = map { $_ => 1 }
    qw(td t tl ti idb ib1 ib2 ib3 i il tw w wa);

# change this so a user can add their own ones in via a file or cfg
# Maps each known metadata field name to its two-letter index field name.
# Every two-letter name is additionally stored as a key with the value 1.
%static_indexfield_map = ();
foreach my $pair (['Title',        'TI'],
                  ['Subject',      'SU'],
                  ['Creator',      'CR'],
                  ['Organization', 'OR'],
                  ['Source',       'SO'],
                  ['Howto',        'HT'],
                  ['ItemTitle',    'IT'],
                  ['ProgNumber',   'PN'],
                  ['People',       'PE'],
                  ['TextOnly',     'TX']) {
    my ($longname, $shortname) = @$pair;
    $static_indexfield_map{$longname} = $shortname;
    $static_indexfield_map{$shortname} = 1;
}

# Constructor.
#
# Arguments (after the class name): collection name, source directory,
# build directory, verbosity, maximum number of documents, debug flag,
# keepold flag, allclassifications flag and an optional output handle
# (STDERR is used when none is given).
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes by
# subcollection and by language, removes duplicate indexes, records the
# configured levels and dontgdbm fields, loads the plugins and classifiers
# and creates the document processor (a collection specific
# "<collection>buildproc" if one exists, otherwise mgppbuildproc).
#
# Dies if collect.cfg is missing, if no plugins were loaded or if the
# document processor could not be created.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications, $outhandle) = @_;

    $outhandle = STDERR unless defined $outhandle;

    # create an mgppbuilder object
    my %state = (
        'collection'         => $collection,
        'source_dir'         => $source_dir,
        'build_dir'          => $build_dir,
        'verbosity'          => $verbosity,
        'maxdocs'            => $maxdocs,
        'debug'              => $debug,
        'keepold'            => $keepold,
        'allclassifications' => $allclassifications,
        'outhandle'          => $outhandle,
        'notbuilt'           => [],    # indexes not built
        'indexfieldmap'      => \%static_indexfield_map,
    );
    my $self = bless \%state, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n"
        unless -e $colcfgname;
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # $cfg refers to the very same hash as $self->{'collect_cfg'}
    my $cfg = \%{$self->{'collect_cfg'}};

    # one copy of every index for each subcollection
    if (defined $cfg->{'indexsubcollections'}) {
        my $plain = $cfg->{'indexes'};
        my @expanded = ();
        foreach my $subcoll (@{$cfg->{'indexsubcollections'}}) {
            push (@expanded, map { "$_:$subcoll" } @$plain);
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # one copy of every (possibly already expanded) index for each language
    if (defined $cfg->{'languages'}) {
        my $plain = $cfg->{'indexes'};
        my @expanded = ();
        foreach my $lang (@{$cfg->{'languages'}}) {
            push (@expanded, map { "$_:$lang" } @$plain);
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # drop repeated index specifications, keeping the first occurrence
    my %seen = ();
    my @unique = grep { !$seen{$_}++ } @{$cfg->{'indexes'}};
    $cfg->{'indexes'} = \@unique;

    # the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    if (defined $cfg->{'levels'}) {
        $self->{'levels'}->{$_} = 1 foreach @{$cfg->{'levels'}};
    }

    # load the plugins listed for this collection
    my $plugins = defined $cfg->{'plugin'} ? $cfg->{'plugin'} : [];
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # load the classifiers listed for this collection
    my $classifiers = defined $cfg->{'classify'} ? $cfg->{'classify'} : [];
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # remember any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        $self->{'dontgdbm'}->{$_} = 1 foreach @{$cfg->{'dontgdbm'}};
    }

    # load up the document processor for building: a buildproc class
    # written for this collection wins over the standard mgpp buildproc
    my $custom_buildproc = "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm";
    my ($buildprocdir, $buildproctype) = (-e $custom_buildproc)
        ? ("$ENV{'GSDLCOLLECTDIR'}/perllib", "${collection}buildproc")
        : ("$ENV{'GSDLHOME'}/perllib", "mgppbuildproc");
    require "$buildprocdir/$buildproctype.pm";

    eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
	 "\$source_dir, \$build_dir, \$verbosity, \$outhandle)");
    die "$@" if $@;

    return $self;
}

# Prepares the build directory: unless the builder is in debug mode or was
# asked to keep the old build, the build directory is removed, recreated
# and given a "text" subdirectory.
sub init {
    my ($self) = @_;

    # debug and keepold runs leave whatever is already there untouched
    return if ($self->{'debug'} || $self->{'keepold'});

    my $build_dir = $self->{'build_dir'};

    # remove any old builds
    &util::rm_r($build_dir);
    &util::mk_all_dir($build_dir);

    # make the text directory
    &util::mk_all_dir("$build_dir/text");
}

# Records the strip_html setting on the builder and passes it on to the
# document processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}

# Creates the compressed text for the collection.
#
# $textindex is handed to the document processor as the index to process.
#
# The documents are run through the plugins twice:
#   1. piped into "mg_passes -T1" to collect the text statistics, after
#      which mg_compression_dict builds the compression dictionary;
#   2. piped into "mg_passes -T2" to compress the text.
# In debug mode nothing is executed; the document processor writes to
# STDOUT instead and no dictionary is built.
#
# Dies if one of the executables is missing or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $outhandle = $self->{'outhandle'};
    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    # No Windows path conversion is done for these two names: mgpp cannot
    # work on windows at the moment.
    my $builddir = $self->{'build_dir'};
    my $basefilename = "text/$self->{'collection'}";

    # define the section names for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mg_passes_sections = "";
    if ($self->{'levels'}->{'Section'}) {
        $mg_passes_sections .= "-K Section ";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics (mg_passes -T1)\n"  if ($self->{'verbosity'} >= 1);

    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe $mg_passes_sections  -d $builddir -f $basefilename -T1")) {
            die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # $handle names PIPEOUT here, so a single close finishes the first pass
    close ($handle) unless $self->{'debug'};

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_compression_dict_exe") {
            die "mgppbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("$mg_compression_dict_exe -d $builddir -f $basefilename -S -H -2 -k 5120");

        # the second pass gets the same section options as the first one
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe $mg_passes_sections -f $basefilename -d $builddir -T2")) {
            die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text (mg_passes -T2)\n"  if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}

# Decides whether $index should be built.
#
# Every entry of the collection's "dontbuild" configuration is used as a
# pattern that has to match the whole index name. On the first match the
# directory name of the index is appended to the builder's 'notbuilt' list
# and 0 is returned; if nothing matches (or nothing is configured) the
# result is 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}

# Builds either the single index named by $indexname (when it contains a
# word character) or every index listed in the collection configuration.
#
# The mapping from index descriptions to directory names is recreated for
# the chosen indexes and stored in $self->{'index_mapping'}; indexes that
# want_built rejects are reported and skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $wanted (@$indexes) {
        unless ($self->want_built($wanted)) {
            print $outhandle "\n*** ignoring index $wanted\n" if ($self->{'verbosity'} >= 1);
            next;
        }
        print $outhandle "\n*** building index $wanted in subdirectory " .
            "$self->{'index_mapping'}->{$wanted}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($wanted);
    }
}

# creates directory names for each of the index descriptions
#
# $indexes is a reference to a list of index descriptions of the form
# "fields[:subcollection[:languages]]".
#
# Returns a reference to a hash containing
#   <index description>       => directory name for that index
#   'indexmap'                => { fields        => processed name }
#   'subcollectionmap'        => { subcollection => processed name }
#   'languagemap'             => { languages     => processed name }
#   'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                             => the keys of those maps in the order in
#                                which they were first seen
# A directory name is the lower-cased processed fields, subcollection and
# languages joined together; colliding names are changed with make_unique
# until they are unique.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # for each kind of component: processed name => the original value that
    # currently owns it. Each kind gets its own hash so that equal processed
    # names of different kinds cannot overwrite each other.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        my ($pindex) = $self->process_field($fields);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # the order list must hold the same key as languagemap
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}

# returns a processed (abbreviated) version of a field.
# - an undefined field, or one without any word character, gives ""
# - a field with two or more comma separated components gives the first
#   character of each of the first two components
# - otherwise the result is the first character followed by the next
#   consonant of the field, or simply its first two characters when no
#   consonant follows the first character
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        my $abbrev = "";
        foreach my $component (@components[0, 1]) {
            (my $initial = $component) =~ s/^(.).*$/$1/;
            $abbrev .= $initial;
        }
        return $abbrev;
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    unless (defined $first && defined $second) {
        ($first, $second) = $field =~ /^(.)(.)/;
    }
    return "$first$second";
}

# Changes one component of a colliding directory name and returns the new
# name (processed index . processed subcollection . processed languages).
#
# $namehash holds, for the kinds 'index', 'subcollection' and 'languages',
# the original value that currently owns each processed name. $index is
# the index description being named; $indexref, $subref and $langref are
# references to its processed components and are modified in place.
#
# The first component whose processed name is owned by a different
# original value is advanced with get_next_version. If no component
# differs from its recorded owner the index component is advanced anyway,
# so that every call changes the name and a caller that loops until the
# name is free cannot spin on an unchanged name.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    if ($namehash->{'index'}->{$$indexref} ne "$fields") {
        $self->get_next_version ($indexref);
    } elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $self->get_next_version ($subref);
    } elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $self->get_next_version ($langref);
    } else {
        # nothing distinguishes this index from the recorded owners:
        # still have to produce a different name
        $self->get_next_version ($indexref);
    }
    return "$$indexref$$subref$$langref";
}

# Advances the name referenced by $nameref to its next version, in place.
#
# A name that ends in digits has that number increased by one ("ti9"
# becomes "ti10", "t99" becomes "t100"); any other name has its last
# character replaced by "0" ("ti" becomes "t0").
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d+)$/) {
        # the whole trailing number is replaced, so the counter never
        # sticks at 9 and never wraps around after 99
        my $num = $1 + 1;
        $$nameref =~ s/\d+$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}

# Builds the index described by $index ("fields[:subcollection[:...]]") in
# its directory below the build directory. $self->{'index_mapping'} must
# already contain the directory name for $index.
#
# Steps (everything except the two plugin passes is skipped in debug mode,
# where the document processor writes to STDOUT instead of a pipe):
#   1. pipe the documents into "mg_passes -I1" (index dictionary)
#   2. mg_perf_hash_build (perfect hash function)
#   3. pipe the documents into "mg_passes -I2" (invert the text)
#   4. mg_weights_build, mg_invf_dict and mg_stem_idx (-s1, -s2, -s3)
#   5. delete every file in the index directory whose suffix is not
#      listed in %wanted_index_files
#
# Dies if an executable is missing, a pipe cannot be opened or the index
# directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $builddir = $self->{'build_dir'};

    my $basefilename = &util::filename_cat ($indexdir,
                                            $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";

    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");

    # define the section names for mgpasses
    my $mg_passes_sections = "";
    foreach my $level (keys (%{$self->{'levels'}})) {
        if ($level eq "Section" || $level eq "Paragraph") {
            $mg_passes_sections .= "-K $level ";
        }
    }

    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    # (no Windows path conversion of $builddir / $basefilename is done)

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($fields, $subcollection) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for the configured languages.
    # The loop runs over a copy: stripping the leading "!" from the
    # configuration entries themselves would turn "!xx" into "xx" for
    # every index built after this one.
    # NOTE(review): every configured language is added, not only the
    # language part of $index - confirm that this is intended.
    my @languages = ();
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        @languages = @{$self->{'collect_cfg'}->{'languages'}};
    }
    foreach my $language (@languages) {
        if ($language =~ s/^\!//) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mg_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe $mg_passes_sections  -d $builddir -f $basefilename -I1")) {
            die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("$mg_perf_hash_build_exe -d $builddir -f $basefilename");

        # reopen the pipe (same handle name) for the second pass
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe $mg_passes_sections  -d $builddir -f $basefilename -I2")) {
            die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mg_passes -I2)\n"  if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n"  if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_weights_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("$mg_weights_build_exe -d $builddir -f $basefilename");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n"  if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_invf_dict_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("$mg_invf_dict_exe -d $builddir -f $basefilename");

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n"  if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_stem_idx_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        foreach my $stem_method (1, 2, 3) {
            system ("$mg_stem_idx_exe -b 4096 -s$stem_method -d $builddir -f $basefilename");
        }

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
}

# Creates the collection's info database and processes associated files.
#
# Makes sure the "text" and "assoc" directories exist below the build
# directory, initialises the classifiers and then writes, in this order,
#   - a "[collection]" record with the collectionmeta entries and the
#     index field map (only if collectionmeta is configured),
#   - whatever the document processor emits in 'infodb' mode while the
#     plugins read the source directory,
#   - the classification information
# to a pipe into "txt2db <database file>". In debug mode the same text
# goes to STDOUT and txt2db is not run.
#
# Dies if txt2db does not exist or the pipe cannot be opened.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};


    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    # the extension is ".ldb" when util::is_little_endian() is true,
    # ".bdb" otherwise
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    # on windows the database name is given to txt2db with backslashes
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n" 
	if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
	$handle = STDOUT;
    } else {
	if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
	    die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
	}
	$handle = mgppbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});

    $self->{'buildproc'}->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
	
	# the index mapping may not exist yet if no index has been built
	# by this object
	if (!defined $self->{'index_mapping'}) {
	    $self->{'index_mapping'} = 
		$self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
	}

	print $handle "[collection]\n";
	
	foreach $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
	    # a key with a leading "." is looked up (without the dot) in
	    # the index mapping and written under the mapped name
	    if ($cmeta =~ s/^\.//) {
		if (defined $self->{'index_mapping'}->{$cmeta}) {
		    print $handle "<$self->{'index_mapping'}->{$cmeta}>" .
			$self->{'collect_cfg'}->{'collectionmeta'}->{".$cmeta"} . "\n";
		# NOTE(review): this message is printed regardless of the
		# verbosity setting - looks like leftover debugging output
		print $outhandle  "have .section entry in collect file\n";
		} else {
		    print $outhandle "mgppbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
		}
	    } else {
		print $handle "<$cmeta>$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}\n";
	    }
	}
	#print out the indexfield mapping
	# each entry is written as "<value>key".
	# NOTE(review): the static map also holds "XX => 1" entries, which
	# come out as "<1>XX" lines here - confirm that this is wanted
	foreach $field (keys(%{$self->{'indexfieldmap'}})) {
	    $shortname = $self->{'indexfieldmap'}->{$field};
	    print $handle "<$shortname>$field\n";
	}
	# a line of 70 dashes ends the [collection] record
	print $handle "\n" . ('-' x 70) . "\n";

    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 
		   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
				     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}

# Does nothing in this builder. The invocant is handed back, which is
# also what the call evaluated to before.
sub collect_specific {
    my ($self) = @_;
    return $self;
}

# Writes build.cfg into the build directory.
#
# The file records the build date, the build type ("mgpp"), the number of
# documents and bytes reported by the document processor, the index,
# subcollection and language maps (as "name->directory" strings, in the
# order stored in the index mapping), the list of indexes that were not
# built, the index field map and the sorted list of indexed fields.
#
# All of this is collected in a hash that is private to the call, so
# calling the method again does not add to the lists of an earlier call.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};

    my $outhandle =  $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp";

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my $index_mapping = $self->{'index_mapping'};

    my @indexmap = ();
    foreach my $index (@{$index_mapping->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$index_mapping->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # the subcollection and language maps are only written when not empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$index_mapping->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $index_mapping->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$index_mapping->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $index_mapping->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # store the indexfieldmap information for every field the document
    # processor recorded in its 'indexfields' table
    my $indexfields = $self->{'buildproc'}->{'indexfields'};
    my @indexfieldmap = ();
    foreach my $field (keys %$indexfields) {
        push (@indexfieldmap, "$field\-\>$self->{'buildproc'}->{'indexfieldmap'}->{$field}");
    }
    $build_cfg->{'indexfieldmap'} = \@indexfieldmap;

    # store the indexed field information
    foreach my $field (sort keys %$indexfields) {
        push (@{$build_cfg->{'indexfields'}}, $field);
    }

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields)$');

}

# Does nothing in this builder. The invocant is handed back, which is
# also what the call evaluated to before.
sub deinit {
    my ($self) = @_;
    return $self;
}

# Prints the statistics of the pass that just finished to the builder's
# output handle: which index was processed (and whether it was indexed or
# compressed), the total number of bytes in the collection and the number
# of bytes processed for the index. A warning block follows when fewer
# than 50 bytes were processed.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text       = $buildproc->get_indexing_text();
    my $index               = $buildproc->get_index();
    my $num_bytes           = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    print $outhandle ($indexing_text
                      ? "Stats (Creating index $index)\n"
                      : "Stats (Compressing text from $index)\n");
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50) {
        my $consequence = $indexing_text
            ? "This may cause an error while attempting to build the index\n"
            : "This may cause an error while attempting to compress the text\n";

        print $outhandle "***************\n";
        print $outhandle "WARNING: There is very little or no text to process for $index\n";
        print $outhandle $consequence;
        print $outhandle "***************\n";
    }

}

1;


