#!/usr/bin/perl -w

# Remove tokens from the bayes_probs db.  Don't remove tokens from the
# bayes_toks db, since feeding the learner more messages might change
# the state of the token so it wouldn't have been removed; just re-run
# this tool after each learning sessions

use strict;
use Fcntl;

BEGIN { @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File); }
use AnyDBM_File;

# for the DB format...
use constant FORMAT_FLAG        => 0xc0;        # 11000000
use constant ONE_BYTE_FORMAT    => 0xc0;        # 11000000
use constant TWO_LONGS_FORMAT   => 0x00;        # 00000000

use constant ONE_BYTE_SSS       => 0x38;        # 00111000
use constant ONE_BYTE_HHH       => 0x07;        # 00000111

use vars qw{
    %prob_db %toks_db
    $opt_dbpath $opt_regexp $opt_min_hits
    $opt_min_prob_strength
    $opt_help
  };

sub usage {
  print "
Usage: trim_bayes_db [--dbpath=path] [--regexp=regexp] [--min-hits=int]
                     [--min-prob-strength==float]\n";
  exit(1);
} # usage()

use Getopt::Long;
GetOptions("dbpath=s", "regexp=s", "min-hits=i", "min-prob-strength=f",
           "help");

usage() if ($opt_help);
# At least one of the filtering options must be set
if (!$opt_regexp && !$opt_min_hits && !$opt_min_prob_strength) {
  print "At least one of the filtering options must be set\n";
  usage();
}

my ($MPS1, $MPS2);

if ($opt_min_prob_strength) {
  $MPS1 = 0.5 - $opt_min_prob_strength;
  $MPS2 = 0.5 + $opt_min_prob_strength;
}

my $path = $opt_dbpath;
$path ||= $ENV{HOME}."/.spamassassin/bayes";


my $toks_name = "${path}_toks";
tie %toks_db, "AnyDBM_File", $toks_name, O_RDONLY, 0600
  or die "Cannot open file $toks_name: $!\n";

my $prob_name = "${path}_probs";
tie %prob_db, "AnyDBM_File", $prob_name, O_RDWR, 0666
  or die "Cannot open file $prob_name: $!\n";

foreach my $key ( keys(%prob_db) ) {
  if ($opt_regexp && ($key =~ m/$opt_regexp/o)) {
    delete $prob_db{$key};
    next;
  }

  if ($opt_min_hits) {
    my ($ts, $th) = tok_unpack ($toks_db{$key});

    my $hits = ($ts || 0) + ($th || 0);

    if ($hits < $opt_min_hits) {
      delete $prob_db{$key};
      next;
    }
  } # if ($opt_min_hits)

  if ($opt_min_prob_strength) {
    my $prob = unpack ('f', $prob_db{$key});

    if (($MPS1 < $prob) && ($prob < $MPS2)) {
      delete $prob_db{$key};
      next;
    }
  }
} # foreach my $key ( keys(%prob_db) )

untie %prob_db;
untie %toks_db;

if ($AnyDBM_File::ISA[0] eq "GDBM_File") {
  # GDBM_File::reorganize() can't be perfomed on a AnyDBM_File
  # tied hash, even if the underlying implementation is GDBM,
  # so we have to tie it again; bleh.
  print "Re-tieing db as GDBM_File to reduce db size\n";
  tie %prob_db, "GDBM_File", $prob_name, O_RDWR, 0666
    or die "Cannot open file $prob_name: $!\n";

  GDBM_File::reorganize(tied(%prob_db));
  untie %prob_db;
}

##################################################

sub tok_unpack {
  my ($packed, $ts, $th) = unpack("CLL", $_[0] || 0);

  if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) {
    $ts = ($packed & ONE_BYTE_SSS) >> 3;
    $th = ($packed & ONE_BYTE_HHH);
  }
  # else use $ts and $th we just unpacked.

  return ($ts, $th);
} # tok_unpack()
