#!/usr/bin/perl
use strict;
$|++;

my $VERSION = '0.17';

#----------------------------------------------------------------------------

=head1 NAME

cpanstats-verify - script to verify the contents of the cpanstats database.

=head1 SYNOPSIS

  perl cpanstats-verify [-c|-m] [-v|-s --file=$file] [--start=0] [--end=0]

=head1 DESCRIPTION

Reads the cpanstats database and verifies the contents. Three kinds of
verification can be applied; Check (-c), Missing (-m) and Verify (-v).
The start and end counts for the NNTP server can be specified, otherwise
values within the database and the script will be assumed.

=head1 OPTIONS

=over 4

=item -c | --check

Looks up the entries stored in the database between the START and END NNTP ids,
checking the NNTP server for the subjects of any that are missing. Also
highlights any subjects that have been marked as bad during processing.

=item -m | --missing

Looks up the entries stored in the database between the START and END NNTP ids,
checking each to ensure all the fields are complete for each type.

=item -v | --verify

Looks up the entries stored in the database as stored in the named file, and
either prints the entry or highlights that it is missing.

=item -s | --search

Looks up the NNTP entries directly as listed in the named file.

=item --database

Specify the exact path to the cpanstats database if not ./cpanstats.db.

=item --file

Named file used when verifying a list of NNTP ids.

=item --start --end

Start and end NNTP ids.

=back

=cut

# -------------------------------------
# Library Modules

use DBI;
use Email::Simple;
use Net::NNTP;
use Getopt::ArgvFile default=>1;
use Getopt::Long;
use IO::File;

Use CPAN::WWW::Testers::Generator::Database;

# -------------------------------------
# Variables

my (%log,%options);
my $PROGRESS = 0;

use constant    DATABASE    => 'cpanstats.db';
use constant    NNTPSTART   => 872391;

# -------------------------------------
# Program

##### INITIALISE #####

progress("init");

GetOptions( \%options,
            "database=s",
            "missing|m",
            "check|c",
            "verify|v",
            "search|s",
            "start=i",
            "end=i",
            "file=s",
            "log=s",
            "out=s"
);

$options{database} ||= DATABASE;
$options{start}    ||= NNTPSTART;
$options{end}      ||= get_lastid();

my $dbi = CPAN::WWW::Testers::Generator::Database->new(database => $options{database});
print STDERR "Cannot connect to database [$options{database}]\n"	unless($dbi);

my $nntp = Net::NNTP->new("nntp.perl.org")
        || die "Cannot connect to nntp.perl.org";
my($num, $first, $last) = $nntp->group("perl.cpan.testers");


##### MAIN #####

_clear_log();

_log("FROM: $options{start} - $options{end}") if($options{missing} || $options{check});

#load_log();

progress("start");

check_stats()   if($options{check});
missing_stats() if($options{missing});
verify_stats()  if($options{verify});
search_stats()  if($options{search});

progress("finish");

# -------------------------------------
# Subroutines

=item get_lastid

Returns the last NNTP id recorded in the database.

=cut

sub get_lastid {
    my @rows = $dbi->get_query("SELECT MAX(id) FROM cpanstats");
    return $rows[0]->[0];
}

=item missing_stats

Report on the database entries with missing field values.

=cut

# insert_report($id,$state,$date,$from,$dist,$version,$platform,$perl);

sub missing_stats {
    progress("start - missing stats");

    $" = ",";
    my $count = 0;
    my $iterator = $dbi->get_query_iterator("SELECT * FROM cpanstats WHERE id >= $options{start} AND id <= $options{end}");
    while(my $row = $iterator->()) {
        next    if($row->[1] =~ /^na|pass|fail|unknown|cpan$/);
        _log("BADPARSE:") unless($count++);
        _log("@$row");
    }

    $count = 0;
    $iterator = $dbi->get_query_iterator("SELECT * FROM cpanstats WHERE state in ('na','pass','fail','unknown') AND id >= $options{start} AND id <= $options{end}");
    while(my $row = $iterator->()) {
        next    if( defined $row->[2] &&
                    defined $row->[3] &&
                    defined $row->[4] &&
                    defined $row->[5] &&
                    defined $row->[6]);
        _log("\nBADREPORTS:") unless($count++);
        _log("@$row");
    }

    $count = 0;
    $iterator = $dbi->get_query_iterator("SELECT * FROM cpanstats WHERE state in ('cpan') AND id >= $options{start} AND id <= $options{end}");
    while(my $row = $iterator->()) {
        next    if( defined $row->[2] &&
                    defined $row->[4] &&
                    defined $row->[5]);
        _log("\nBADUPLOADS:") unless($count++);
        _log("@$row");
    }
}

=item check_stats

Report on the database entries which are either missing, or have reported bad
processing.

=cut

sub check_stats {
    progress("start - check stats");

    my $count = $options{start};
    my $iterator = $dbi->get_query_iterator("SELECT * FROM cpanstats WHERE id >= $options{start} AND id <= $options{end} ORDER BY id");
    while(my $row = $iterator->()) {
        #progress("$count - $row->[0]");

        # missing entries (ignore human replies)
        while($count < $row->[0]) {
            my ($subj,$from) = get_subject($count);
            _log("$count,$from,missing - $subj")  unless($subj =~ /Re:/i);
            $count++;
        }

        # badly parsed entries
        if($row->[1] =~ /bad/) {
            my ($subj,$from) = get_subject($row->[0]);
            _log("$row->[0],$from,$row->[1] - $subj");
#           print join(",",@$row) . "\n";
        }

        # missing fields
        elsif($row->[1] =~ /(na|pass|fail|unknown)/) {
            unless( defined $row->[2] &&
                    defined $row->[3] &&
                    defined $row->[4] &&
                    defined $row->[5] &&
                    defined $row->[6]) {
                my ($subj,$from) = get_subject($row->[0]);
                _log("$row->[0],$from,$row->[1] - $subj");
            }
        }
        $count++;
    }
}

=item verify_stats

Report on the given database entries highlighting those which are missing.

=cut

sub verify_stats {
    progress("start - verify stats");

    $" = ",";
    my @list = get_list();
    for my $id (@list) {
        my @rows = $dbi->get_query("SELECT * FROM cpanstats WHERE id=$id");
        if(@rows) {
            my $row = $rows[0];
            _log("@$row");
        } else {
            _log("$id,missing");
        }
    }
}

=item search_stats

Report on the given database entries highlighting those which are missing.

=cut

sub search_stats {
    progress("start - search stats");

    $" = ",";
    my @list = get_list();
    for my $id (@list) {
        my ($subj,$from) = get_subject($id);
        _log("$id,$from,$subj");
    }
}

=item get_subject

Access the NNTP server to get the real subject recorded for the article,
unless we can short cut the network by accessing the information from the
project log file.

=cut

sub get_subject {
  my $id = shift;

  # can we short cut?
  find_id($id)                                  unless($log{$id});
  return($log{$id}{subject},$log{$id}{from})    if($log{$id});

  # talk NNTP
  my $article = join "", @{$nntp->article($id) || []};
  return "" unless($article);

  # parse the resulting headers
  my $mail = Email::Simple->new($article);
  return($mail->header("Subject"),$mail->header("From"));
}

=item get_list

Returns the list of NNTP ids from the named file.

=cut

sub get_list {
    my @list;
    my $file = $options{file} || die "--file not specified";
    die "file [$file] not found"    unless(-f $file);

    my $fh = IO::File->new($file)   or die "Cannot open file [$file]: $!";
    while(<$fh>) {
        chomp;
        my ($num) = (m/^(\d+)/);
        push @list, $num;
    }
    $fh->close;
    return @list;
}

=item load_log

Load log file, the output of cpanstats.pl.

=cut

sub load_log {
    my $fh = IO::File->new($options{log},'r')    or die "Cannot read log file [$options{log}]: $!\n";
    while(<$fh>) {
        next    unless(/^ID \[(\d+)\] \[([^\]]+)\] (.*?)\s*$/);
        next    unless($1 >= $options{start});
        $log{$1} = {from => $2, subject => $3};
    }
}

# note read the whole file in case it has been reparsed

sub find_id {
    my $id = shift || return;

    my $fh = IO::File->new($options{log},'r')    or die "Cannot read log file [$options{log}]: $!\n";
    while(<$fh>) {
        next    unless(/^ID \[$id\] \[([^\]]+)\] (.*?)\s*$/);
        $log{$id} = {from => $1, subject => $2};
    }
    return;
}

sub _clear_log {
    my $fh = IO::File->new($options{out},'w')   or die "Cannot write to file [$options{out}]: $!\n";
    print $fh '';
    $fh->close;
}

sub _log {
    my $msg = shift;

    my $fh = IO::File->new($options{out},'a+')   or die "Cannot write to file [$options{out}]: $!\n";
    print $fh "$msg\n";
    $fh->close;
}

=item progress

Simple audit logging function.

=cut

my $lasttime = time;

sub progress {
    return  unless($PROGRESS);

    my $msg = shift;
    my $time = time;
    my @localtime = localtime($time);
    my $secs = $time - $lasttime;
    printf STDERR "%02d:%02d:%02d\t%03d\t%s\n", $localtime[2], $localtime[1], $localtime[0], $secs, $msg;
    $lasttime = $time;
}


__END__

=back

=head1 BUGS, PATCHES & FIXES

There are no known bugs at the time of this release. However, if you spot a
bug or are experiencing difficulties, that is not explained within the POD
documentation, please send bug reports and patches to the RT Queue (see below).

Fixes are dependant upon their severity and my availablity. Should a fix not
be forthcoming, please feel free to (politely) remind me.

RT Queue -
http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-WWW-Testers-Generator

=head1 SEE ALSO

F<http://www.cpantesters.org/>,
F<http://stats.cpantesters.org/>

=head1 AUTHOR

  Barbie, <barbie@cpan.org>
  for Miss Barbell Productions <http://www.missbarbell.co.uk>.

=head1 COPYRIGHT AND LICENSE

  Copyright (C) 2005-2008 Barbie for Miss Barbell Productions.

  This module is free software; you can redistribute it and/or
  modify it under the same terms as Perl itself.

=cut

