#!/usr/bin/perl

use strict;
use Fcntl ':DEFAULT',':flock';

BEGIN { @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File); }
use AnyDBM_File;

# for the DB format...
use constant FORMAT_FLAG        => 0xc0;        # 11000000
use constant ONE_BYTE_FORMAT    => 0xc0;        # 11000000
use constant TWO_LONGS_FORMAT   => 0x00;        # 00000000

use constant ONE_BYTE_SSS_BITS  => 0x38;        # 00111000
use constant ONE_BYTE_HHH_BITS  => 0x07;        # 00000111

use constant ATIME_EPOCH_START  => 1038000000;  # Fri Nov 22 21:20:00 2002
use constant ATIME_GRANULARITY  => 21600;       # 6 hours

use vars qw{
    %h $k $v @DBNAMES
    $NSPAM_MAGIC_TOKEN $NHAM_MAGIC_TOKEN $LAST_EXPIRE_MAGIC_TOKEN
    $NTOKENS_MAGIC_TOKEN $OLDEST_TOKEN_AGE_MAGIC_TOKEN
    $SCANCOUNT_BASE_MAGIC_TOKEN
    $opt_dbpath $opt_min_hits $opt_max_hits $opt_regexp
    $robinson_x $robinson_s_dot_x 
  };

use Getopt::Long;
GetOptions("dbpath=s", "min-hits=i", "max-hits=i", "regexp=s");

@DBNAMES = qw(toks);

$NSPAM_MAGIC_TOKEN = '**NSPAM';
$NHAM_MAGIC_TOKEN = '**NHAM';
$OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE';
$LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE';
$NTOKENS_MAGIC_TOKEN = '**NTOKENS';
$SCANCOUNT_BASE_MAGIC_TOKEN = '**SCANBASE';


use constant ROBINSON_S_CONSTANT => 0.53;
$robinson_x = 0.43;
$robinson_s_dot_x = ($robinson_x * ROBINSON_S_CONSTANT);

my $path = $opt_dbpath;
$path ||= $ENV{HOME}."/.spamassassin/bayes";

foreach my $dbname (@DBNAMES) {
  my $name = $path.'_'.$dbname;
  my $db_var = 'db_'.$dbname;
  if (!tie %{$h{$db_var}}, "AnyDBM_File",$name, O_RDONLY, 0600) {
    warn "Cannot open file $name: $!\n";
  }
}

my $dbformat = 'on-the-fly probs, expiry, scan-counting';
my $ns = $h{db_toks}->{$NSPAM_MAGIC_TOKEN} || 0;
my $nn = $h{db_toks}->{$NHAM_MAGIC_TOKEN} || 0;
my $nt = $h{db_toks}->{$NTOKENS_MAGIC_TOKEN} || 0;
my $le = $h{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} || 0;
my $oa = $h{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} || 0;

my $sb = $h{db_toks}->{$SCANCOUNT_BASE_MAGIC_TOKEN} || 0;
if (!defined $sb) {
  $dbformat = 'on-the-fly probs, expiry, no scan-counting';
}
$sb ||= 0;
$sb += (-s $path."_msgcount");

printf ("%3.3f %8d %8d %8d  %s\n", 0.0, 0, 0, 0,
			'non-token data: db format = '.$dbformat);
printf ("%3.3f %8d %8d %8d  %s\n", 0.0, 0, $ns, 0, 'non-token data: nspam');
printf ("%3.3f %8d %8d %8d  %s\n", 0.0, 0, $nn, 0, 'non-token data: nham');
printf ("%3.3f %8d %8d %8d  %s\n", 0.0, 0, $nt, 0, 'non-token data: ntokens');
printf ("%3.3f %8d %8d %8d  %s\n", 0.0, 0, $oa, 0, 'non-token data: oldest age');

printf ("%3.3f %8d %8d %8d  %s\n", 0.0, 0, $sb, 0,
			'non-token data: current scan-count');
printf ("%3.3f %8d %8d %8d  %s\n", 0.0, 0, $le, 0,
			'non-token data: last expiry scan-count');

my %seen = ();
my $prob;
my $now = time;

for my $key ( keys(%{$h{db_toks}}) ) {
  next if ($key eq $NSPAM_MAGIC_TOKEN
          || $key eq $NHAM_MAGIC_TOKEN
          || $key eq $LAST_EXPIRE_MAGIC_TOKEN
          || $key eq $NTOKENS_MAGIC_TOKEN
          || $key eq $SCANCOUNT_BASE_MAGIC_TOKEN
          || $key eq $OLDEST_TOKEN_AGE_MAGIC_TOKEN);

  next if (exists $seen{$key});
  $seen{$key} = 1;

  my ($ts, $th, $atime) = tok_unpack ($h{db_toks}->{$key});
  $ts ||= 0;
  $th ||= 0;

  next if ($opt_min_hits && (($ts + $th) < $opt_min_hits));
  next if ($opt_max_hits && (($ts + $th) > $opt_max_hits));
  next if ($opt_regexp && ($key !~ /$opt_regexp/o));

  $prob = compute_prob_for_token ($ts, $th, $ns, $nn);

  my $age = $atime;

  $prob ||= 0.5;
  printf ("%3.3f %8d %8d %8d  %s\n", $prob, $ts, $th, $age, $key);
}

foreach my $dbname (@DBNAMES) {
  my $name = $path.'_'.$dbname;
  my $db_var = 'db_'.$dbname;
  untie %{$h{$db_var}};
}

sub tok_unpack {
  my ($packed, $atime) = unpack("CS", $_[0] || 0);

  if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) {
    return (($packed & ONE_BYTE_SSS_BITS) >> 3,
                $packed & ONE_BYTE_HHH_BITS,
                $atime || 0);
  }
  elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) {
    my ($packed, $ts, $th, $atime) = unpack("CLLS", $_[0] || 0);
    return ($ts || 0, $th || 0, $atime || 0);
  }
  # other formats would go here...
  else {
    warn "unknown packing format for Bayes db, please re-learn: $packed";
    return (0, 0, 0);
  }
}


sub compute_prob_for_token {
  my ($s, $n, $ns, $nn) = @_;

  return if ($s == 0 && $n == 0);
  return if ($ns== 0 || $nn== 0);
  my $ratios = ($s / $ns);
  my $ration = ($n / $nn);
  my $prob;
  if ($ratios == 0 && $ration == 0) {
    warn "oops? ratios == ration == 0";
    return 0.5;
  } else {
    $prob = ($ratios) / ($ration + $ratios);
  }

  my $robn = $s+$n;
  return ($robinson_s_dot_x + ($robn * $prob)) /
		(ROBINSON_S_CONSTANT + $robn);
}

###########################################################################

# 2-byte time format: expiry is after the time_t epoch, so time_t calculations
# will fail before this will.

sub atime_to_time_t {
  my ($self, $atime) = @_;
  return ($atime * ATIME_GRANULARITY) + ATIME_EPOCH_START;
}

sub time_t_to_atime {
  my ($self, $tt) = @_;
  return int (($tt - ATIME_EPOCH_START) / ATIME_GRANULARITY);
}

###########################################################################

