#!/usr/bin/perl -w

# Rules file to read test names and scores from: the first command-line
# argument, or the default path when none (or a false value) is given.
my $cffile = shift(@ARGV) || "../spamassassin.cf";

# Tests that writescores_c() marks with is_mutatable = 0 in the generated
# C header; every other test is marked 1.
my @unmutated_tests = qw(
  BALANCE_FOR_LONG
  COPYRIGHT_CLAIMED
  SIRCAM_SIGNATURE
  USER_IN_WHITELIST
  NO_MX_FOR_FROM

  INVALID_DATE_NO_TZ
  UNDISC_RECIPS
  MAY_BE_FORGED
  HTML_WITH_BGCOLOR
  FORGED_RCVD_FOUND
  NO_REAL_NAME
  PLING
  PLING_PLING
  USER_IN_BLACKLIST
  SUBJ_ENDS_IN_Q_MARK
  SUBJ_HAS_Q_MARK
  RAZOR_CHECK
  RCVD_IN_RELAYS_ORDB_ORG
  RCVD_IN_OSIRUSOFT_COM
  X_OSIRU_SPAM_SRC
  X_OSIRU_SPAMWARE_SITE

  PORN_5
);

# Not referenced by any sub visible in this file.
my $threshold = 5;
my $iterlimit = 0;

# Filled in by readlogs(), keyed by message index:
#   %is_spam   -> 1 for messages from spam.log, 0 for nonspam.log
#   %tests_hit -> array ref of the test names that message hit
my (%is_spam, %tests_hit);

# Hash ref of test name -> score, populated by readscores().
my $scores;
readscores();
my $origscores = $scores;

print "Reading per-message hit stat logs and scores...\n";

# Message counters, set by readlogs().
my ($total, $totspam, $totnonspam);
readlogs();

print "Writing logs and current scores as C code...\n";

writescores_c();
writetests_c();

exit 0;


# readlogs() -- load the per-message hit logs.
#
# Reads "spam.log" and then "nonspam.log" from the current directory.  Each
# usable line has the shape
#     <one char> <digits> <non-blank field> <comma-separated test names>
# and describes one message.  Messages are numbered from 0 in the order they
# are read.  For every message this records:
#     $tests_hit{$n}  array ref of the test names that are present in $scores
#     $is_spam{$n}    1 if the line came from spam.log, 0 otherwise
# and it sets $total, $totspam and $totnonspam to the message counts.
#
# Test names that are not in $scores are reported with a warning and dropped.
# Lines that do not match the shape above are skipped.
#
# Dies if either log file cannot be opened: carrying on would silently
# produce an empty (or half-empty) corpus for the generated C code.
sub readlogs {
  my $count = $totspam = $totnonspam = 0;

  foreach my $file ("spam.log", "nonspam.log") {
    open (my $in, '<', $file) or die "cannot read $file: $!\n";
    my $from_spam_log = ($file eq "spam.log");

    while (my $line = <$in>) {
      # Capture the test list explicitly instead of going through $', and
      # keep the global $_ untouched.  /s lets (.*) take the rest of the
      # line including its newline, exactly what $' used to hold.
      $line =~ /^.\s+\d+\s+\S+\s*(.*)/s or next;
      my $list = $1;
      $list =~ s/^\s+//;
      $list =~ s/\s+$//;

      my @tests = ();
      foreach my $tst (split (/,/, $list)) {
        next if ($tst eq '');       # empty field from ",," or a leading ","
        if (!defined $scores->{$tst}) {
          warn "unknown test in $file, ignored: $tst\n";
          next;
        }
        push (@tests, $tst);
      }

      $tests_hit{$count} = \@tests;

      if ($from_spam_log) {
        $totspam++;
        $is_spam{$count} = 1;
      } else {
        $totnonspam++;
        $is_spam{$count} = 0;
      }
      $count++;
    }
    close $in;
  }
  $total = $count;
}


# readscores() -- (re)build $scores from the rules file named by $cffile.
#
# $scores is reset to an empty hash ref, then for every line of the file
# (after stripping "#" comments and surrounding whitespace):
#     header|body|full NAME ...   gives NAME a score of 1 unless it already
#                                 has a true value
#     score NAME VALUE            sets NAME's score to VALUE (the rest of
#                                 the line, kept as text)
#
# If the file cannot be opened a warning including the OS error is printed
# and $scores is left empty; the sub does not die.
sub readscores {
  $scores = { };

  print "Reading scores from \"$cffile\"...\n";

  # Three-argument open: $cffile can come straight from the command line,
  # so it must never be able to influence the open mode.
  my $in;
  if (!open ($in, '<', $cffile)) {
    warn "cannot read $cffile: $!\n";
    return;                     # nothing to read from; leave $scores empty
  }

  while (my $line = <$in>) {
    $line =~ s/#.*$//g;
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;

    if ($line =~ /^(header|body|full)\s+(\S+)\s+/) {
      $scores->{$2} ||= 1;
    } elsif ($line =~ /^score\s+(\S+)\s+(.+)$/) {
      $scores->{$1} = $2;
    }
  }
  close $in;
}

# writescores_c() -- write tmp/scores.h, the C view of the score table.
#
# Inputs (file-level state): $scores (test name -> score) and
# @unmutated_tests (tests to flag with is_mutatable = 0).
#
# Score ranges are read from tmp/ranges.data, whose usable lines are
# "<lo> <hi> <test name>"; if that file is missing, "make tmp/ranges.data"
# is attempted first.
#
# Output: tmp/scores.h, declaring arrays sized (number of tests + 1) and a
# loadscores() function with one line of assignments per test, in sorted
# name order.  As a side effect the package hashes %is_mutatable, %range_lo,
# %range_hi and %score_c_index are filled in; %score_c_index (test name ->
# array index) is what writetests_c() uses to refer to tests.
#
# Dies if tmp/ranges.data cannot be read, if any test in $scores has no
# range (the generated assignment would be "range_lo[N] = ;", which is not
# valid C), or if tmp/scores.h cannot be written.
sub writescores_c {
  %is_mutatable = ();
  foreach my $t (@unmutated_tests) { $is_mutatable{$t} = 0; }

  my @names = sort keys %{$scores};
  my $size = scalar (@names) + 1;

  if (!-f 'tmp/ranges.data') {
    system ("make tmp/ranges.data");
  }

  open (my $in, '<', 'tmp/ranges.data')
      or die "need to run score-ranges-from-freqs first!";

  while (my $line = <$in>) {
    $line =~ /^(\S+) (\S+) (\S+)$/ or next;
    $range_lo{$3} = $1+0;
    $range_hi{$3} = $2+0;
  }
  close $in;

  # Check every test has a range before touching the output file, so a
  # failure never leaves a truncated tmp/scores.h behind.
  my @unranged = grep {
    !defined $range_lo{$_} || !defined $range_hi{$_}
  } @names;
  if (@unranged) {
    die "no score range in tmp/ranges.data for test(s): @unranged\n";
  }

  open (my $out, '>', 'tmp/scores.h')
      or die "cannot write tmp/scores.h: $!\n";
  print $out "

int num_scores = $size;
unsigned char is_mutatable[$size]; 	/* er, is_mutable I think ;) */
float range_lo[$size];
float range_hi[$size];
float bestscores[$size];
float scores[$size];
char *score_names[$size];

void loadscores (void) {

";
  my $count = 0;
  foreach my $name (@names) {
    if (!defined $is_mutatable{$name}) { $is_mutatable{$name} = 1; }

    print $out join ("\t",
      " bestscores[$count] = ".$scores->{$name}.";",
      " is_mutatable[$count] = ".$is_mutatable{$name}.";",
      " range_lo[$count] = ".$range_lo{$name}.";",
      " range_hi[$count] = ".$range_hi{$name}.";",
      " score_names[$count] = \"".$name."\";\n");

    $score_c_index{$name} = $count;
    $count++;
  }

  print $out "\n}\n";
  # Buffered write errors (e.g. disk full) only surface at close.
  close $out or die "error writing tmp/scores.h: $!\n";
}

# writetests_c() -- write tmp/tests.h and tmp/tests.data.
#
# Inputs (file-level state): $total, $totspam, $totnonspam, %is_spam and
# %tests_hit as filled in by readlogs(), plus %score_c_index as filled in by
# writescores_c().
#
# tmp/tests.h gets the message counts, array declarations sized from them,
# and then the text following __DATA__ in this script (the C loadtests()
# function) copied verbatim.
#
# tmp/tests.data gets one record group per message, in this order:
#     .<message index>
#     n<number of tests hit>
#     s<is_spam flag>
#     t<C index of a test>      (one line per test hit)
#
# A test with no entry in %score_c_index is reported with a warning and left
# out of the record (and out of the n count); writing it would produce a bare
# "t" line, which the C loader parses as test index 0.
#
# Dies if either output file cannot be written.
sub writetests_c {
  # figure out max hits per message (one more than the largest hit count)
  my $max_hits_per_msg = 0;
  for my $msg (0 .. $total - 1) {
    my $hits = scalar (@{$tests_hit{$msg}}) + 1;
    if ($hits > $max_hits_per_msg) { $max_hits_per_msg = $hits; }
  }

  open (my $top, '>', 'tmp/tests.h')
      or die "cannot write tmp/tests.h: $!\n";
  print $top "

int num_tests = $total;
int num_spam = $totspam;
int num_nonspam = $totnonspam;
int max_hits_per_msg = $max_hits_per_msg;
unsigned char num_tests_hit[$total];
unsigned char is_spam[$total];
unsigned short tests_hit[$total][$max_hits_per_msg];

";
  # Copy the C loader through a lexical rather than the global $_.
  my $c_loader = join ('', <DATA>);
  print $top $c_loader;
  close $top or die "error writing tmp/tests.h: $!\n";

  open (my $dat, '>', 'tmp/tests.data')
      or die "cannot write tmp/tests.data: $!\n";

  for my $msg (0 .. $total - 1) {
    my $out = "s".$is_spam{$msg}."\n";

    my $num_tests_hit = 0;
    foreach my $test (@{$tests_hit{$msg}}) {
      next if ($test eq '');

      if (!defined $score_c_index{$test}) {
        warn "test with no C index: $test\n";
        next;                   # do not count or emit an index-less test
      }

      $num_tests_hit++;
      $out .= "t".$score_c_index{$test}."\n";

      if ($num_tests_hit >= $max_hits_per_msg) {
        die "Need to increase \$max_hits_per_msg";
      }
    }

    # The n record is only known after the loop, but must precede s and t.
    print $dat ".".$msg."\n", "n".$num_tests_hit."\n".$out;
  }
  # Buffered write errors only surface at close.
  close $dat or die "error writing tmp/tests.data: $!\n";
}

__DATA__

/*
 * loadtests -- fill num_tests_hit[], is_spam[] and tests_hit[][] from
 * tmp/tests.data.
 *
 * Each line of the data file is a one-character command followed by a
 * decimal number:
 *   .N  select message N for the records that follow
 *   nN  message hit N tests; restart the per-message test slot at 0
 *   sN  is_spam flag for the message
 *   tN  index of one test the message hit, stored in the next free slot
 *
 * Exits with status 1 if the data file cannot be opened.  This text is
 * copied into tmp/tests.h, so it relies on the including file providing
 * <stdio.h> and <stdlib.h>, as its use of FILE and strtol already does.
 */
void loadtests (void) {
  FILE *fin = fopen ("tmp/tests.data", "r");
  char buf[256];
  int file = 0;
  int tnum = 0;

  /* Without this check a missing data file reaches fgets() with a NULL
   * stream. */
  if (fin == NULL) {
    perror ("tmp/tests.data");
    exit (1);
  }

  while (fgets (buf, 255, fin) != NULL) {
    char cmd;
    long arg;

    cmd = (char) *buf;
    arg = strtol (buf+1, NULL, 10);

    if (cmd == '.') {
      file = arg;

    } else if (cmd == 'n') {
      tnum = 0;
      num_tests_hit[file] = arg;

    } else if (cmd == 's') {
      is_spam[file] = arg;

    } else if (cmd == 't') {
      tests_hit[file][tnum] = arg; tnum++;
    }
  }
  fclose(fin);

  /* Reports the last message index read plus one. */
  printf ("Read test results for %d messages.\n", file+1);
}

