#!/usr/bin/perl
use strict;
$|++;

my $VERSION = '0.24';

#----------------------------------------------------------------------------

=head1 NAME

cpanstats-verify - script to verify the contents of the cpanstats database.

=head1 SYNOPSIS

  perl cpanstats-verify                 \
     --config=<file>                    \
     --database=<dbcode>                \
     --log=<file> --out=<file>          \
    [-c|-m] [-v|-s --file=$file] [--start=0] [--end=0]

=head1 DESCRIPTION

Reads the cpanstats database and verifies the contents. Three kinds of
verification can be applied; Check (-c), Missing (-m) and Verify (-v).
The start and end counts for the NNTP server can be specified, otherwise
values within the database and the script will be assumed.

=head1 OPTIONS

=over 4

=item --config

Configuration file contain database access details.

=item --database

Specify the database to use, CPANSTATS or LITESTATS

=item --log

Specify the logfile generated by the 'cpanstats' program.

=item --out

Specify the output for this run.

=item -c | --check

Looks up the entries stored in the database between the START and END NNTP ids,
checking the NNTP server for the subjects of any that are missing. Also
highlights any subjects that have been marked as bad during processing.

=item -m | --missing

Looks up the entries stored in the database between the START and END NNTP ids,
checking each to ensure all the fields are complete for each type.

=item -v | --verify

Looks up the entries stored in the database as stored in the named file, and
either prints the entry or highlights that it is missing.

=item -s | --search

Looks up the NNTP entries directly as listed in the named file.

=item --file

Named file used when verifying a list of NNTP ids.

=item --start --end

Start and end NNTP ids.

=back

=cut

# -------------------------------------
# Library Modules

use Config::IniFiles;
use CPAN::Testers::Common::DBUtils;
use Email::Simple;
use Net::NNTP;
use Getopt::ArgvFile default=>1;
use Getopt::Long;
use IO::File;

# -------------------------------------
# Variables

my (%log,%options,$dbi);

use constant    NNTPSTART   => 872391;

# -------------------------------------
# Program

##### INITIALISE #####

progress("init");

init_options();

$options{start}    ||= NNTPSTART;
$options{end}      ||= get_lastid();

my $nntp;
unless($options{localonly}) {
    $nntp = Net::NNTP->new("nntp.perl.org") || die "Cannot connect to nntp.perl.org";
    $nntp->group("perl.cpan.testers");
}

##### MAIN #####

_clear_log();

_log("FROM: $options{start} - $options{end}") if($options{missing} || $options{check});

#load_log();

progress("start");

check_stats()   if($options{check});
missing_stats() if($options{missing});
verify_stats()  if($options{verify});
search_stats()  if($options{search});

progress("finish");

# -------------------------------------
# Subroutines

=item get_lastid

Returns the last NNTP id recorded in the database.

=cut

sub get_lastid {
    my @rows = $dbi->get_query('array',"SELECT MAX(id) FROM cpanstats");
    return $rows[0]->[0];
}

=item missing_stats

Report on the database entries with missing field values.

=cut

# insert_report($id,$state,$date,$from,$dist,$version,$platform,$perl);

sub missing_stats {
    progress("start - missing stats");

    $" = ",";
    my $count = 0;
    my $iterator = $dbi->iterator('array',"SELECT * FROM cpanstats WHERE id >= $options{start} AND id <= $options{end}");
    while(my $row = $iterator->()) {
        next    if($row->[1] =~ /^na|pass|fail|unknown|cpan$/);
        _log("BADPARSE:") unless($count++);
        _log("@$row");
    }

    $count = 0;
    $iterator = $dbi->iterator('array',"SELECT * FROM cpanstats WHERE state in ('na','pass','fail','unknown') AND id >= $options{start} AND id <= $options{end}");
    while(my $row = $iterator->()) {
        next    if( defined $row->[2] &&
                    defined $row->[3] &&
                    defined $row->[4] &&
                    defined $row->[5] &&
                    defined $row->[6]);
        _log("\nBADREPORTS:") unless($count++);
        _log("@$row");
    }

    $count = 0;
    $iterator = $dbi->iterator('array',"SELECT * FROM cpanstats WHERE state in ('cpan') AND id >= $options{start} AND id <= $options{end}");
    while(my $row = $iterator->()) {
        next    if( defined $row->[2] &&
                    defined $row->[4] &&
                    defined $row->[5]);
        _log("\nBADUPLOADS:") unless($count++);
        _log("@$row");
    }
}

=item check_stats

Report on the database entries which are either missing, or have reported bad
processing.

=cut

sub check_stats {
    progress("start - check stats");

    my $count = $options{start};
    my $iterator = $dbi->iterator('array',"SELECT * FROM cpanstats WHERE id >= $options{start} AND id <= $options{end} ORDER BY id");
    while(my $row = $iterator->()) {
        #progress("$count - $row->[0]");

        # missing entries (ignore human replies)
        while($count < $row->[0]) {
            my ($subj,$from) = get_subject($count);
            _log("$count,$from,missing - $subj")  unless($subj =~ /(?:Re:|Fw:|Ab:|mirror update)/i ||
                                                         $subj =~ /\.(?:readme|pl|cgi|pdf|html?|doc|txt|ppd|asc|yml|jpg|png|gif|rtf|css|pod|sig|diff)$/i);

            # note that in the above two regexes we ignore conversation
            # threads, mirror updates and a whole host of uploads that
            # are nothing to do with a distribution upload. The caveats
            # to this are the entries that are potentially bad uploads
            # (bad archive naming or uploading a .pm file) and any
            # reports that are in error.

            $count++;
        }

        # badly parsed entries
        if($row->[1] =~ /bad/) {
            my ($subj,$from) = get_subject($row->[0]);
            _log("$row->[0],$from,$row->[1] - $subj");
#           print join(",",@$row) . "\n";
        }

        # missing fields
        elsif($row->[1] =~ /(na|pass|fail|unknown)/) {
            unless( defined $row->[2] &&
                    defined $row->[3] &&
                    defined $row->[4] &&
                    defined $row->[5] &&
                    defined $row->[6]) {
                my ($subj,$from) = get_subject($row->[0]);
                _log("$row->[0],$from,$row->[1] - $subj");
            }
        }
        $count++;
    }
}

=item verify_stats

Report on the given database entries highlighting those which are missing.

=cut

sub verify_stats {
    progress("start - verify stats");

    $" = ",";
    my @list = get_list();
    for my $id (@list) {
        my @rows = $dbi->get_query('array',"SELECT * FROM cpanstats WHERE id=$id");
        if(@rows) {
            my $row = $rows[0];
            _log("@$row");
        } else {
            _log("$id,missing");
        }
    }
}

=item search_stats

Report on the given database entries highlighting those which are missing.

=cut

sub search_stats {
    progress("start - search stats");

    $" = ",";
    my @list = get_list();
    for my $id (@list) {
        my ($subj,$from) = get_subject($id);
        _log("$id,$from,$subj");
    }
}

=item get_subject

Access the NNTP server to get the real subject recorded for the article,
unless we can short cut the network by accessing the information from the
project log file.

=cut

sub get_subject {
  my $id = shift;

  # can we short cut?
  find_id($id)                                  unless($log{$id});
  return($log{$id}{subject},$log{$id}{from})    if($log{$id});

  return "" if($options{localonly});

  # talk NNTP
  my $article = join "", @{$nntp->article($id) || []};
  return "" unless($article);

  # parse the resulting headers
  my $mail = Email::Simple->new($article);
  return($mail->header("Subject"),$mail->header("From"));
}

=item get_list

Returns the list of NNTP ids from the named file.

=cut

sub get_list {
    my @list;
    my $file = $options{file} || die "--file not specified";
    die "file [$file] not found"    unless(-f $file);

    my $fh = IO::File->new($file)   or die "Cannot open file [$file]: $!";
    while(<$fh>) {
        chomp;
        my ($num) = (m/^(\d+)/);
        push @list, $num;
    }
    $fh->close;
    return @list;
}

=item load_log

Load log file, the output of cpanstats.pl.

=cut

sub load_log {
    my $fh = IO::File->new($options{log},'r')    or die "Cannot read log file [$options{log}]: $!\n";
    while(<$fh>) {
        next    unless(/^ID \[(\d+)\] \[([^\]]+)\] (.*?)\s*$/);
        next    unless($1 >= $options{start});
        $log{$1} = {from => $2, subject => $3};
    }
}

# note read the whole file in case it has been reparsed

sub find_id {
    my $id = shift || return;

    my $fh = IO::File->new($options{log},'r')    or die "Cannot read log file [$options{log}]: $!\n";
    while(<$fh>) {
        next    unless(/^ID \[$id\] \[([^\]]+)\] (.*?)\s*$/);
        $log{$id} = {from => $1, subject => $2};
    }
    return;
}

sub _clear_log {
    my $fh = IO::File->new($options{out},'w')   or die "Cannot write to file [$options{out}]: $!\n";
    print $fh '';
    $fh->close;
}

sub _log {
    my $msg = shift;

    my $fh = IO::File->new($options{out},'a+')   or die "Cannot write to file [$options{out}]: $!\n";
    print $fh "$msg\n";
    $fh->close;
}

=item progress

Simple audit logging function.

=cut

my $lasttime = time;

sub progress {
    return  unless($options{progress});

    my $msg = shift;
    my $time = time;
    my @localtime = localtime($time);
    my $secs = $time - $lasttime;
    printf STDERR "%02d:%02d:%02d\t%03d\t%s\n", $localtime[2], $localtime[1], $localtime[0], $secs, $msg;
    $lasttime = $time;
}

sub init_options {
    GetOptions( \%options,
        'config=s',
        'database=s',
        'localonly|l',
        'missing|m',
        'check|c',
        'verify|v',
        'progress|p',
        'search|s',
        'start=i',
        'end=i',
        'file=s',
        'log=s',
        'out=s',
        'help|h',
        'version|V'
    );

    help(1) if($options{help});
    help(0) if($options{version});

    help(1,"No cpanstats.log file specified")   unless($options{log} && -f $options{log});
    help(1,"No results output file specified")  unless($options{out});

    help(1,"Must specify the configuration file")              unless($options{config});
    help(1,"Configuration file [$options{config}] not found")   unless(-f $options{config});

    help(1,"Must specify the database code (CPANSTATS or LITESTATS)")
        unless($options{database} && $options{database} =~ /^CPANSTATS|LITESTATS$/);

    # load configuration
    my $cfg = Config::IniFiles->new( -file => $options{config} );

    # configure databases
    my $db = $options{database};
    die "No configuration for $db database\n"   unless($cfg->SectionExists($db));
    my %opts = map {$_ => $cfg->val($db,$_);} qw(driver database dbfile dbhost dbport dbuser dbpass);
    $dbi = CPAN::Testers::Common::DBUtils->new(%opts);
    die "Cannot configure $db database\n" unless($dbi);
}

sub help {
    my ($full,$mess) = @_;

    print "\n$mess\n\n" if($mess);

    if($full) {
        print <<HERE;
Usage: $0 \\
     --config=<file> --database=<dbcode>        \\
     --log=<file> --out=<file>                  \\
     [-c] [-m] [-v] [-s]                        \\
     [--file=<file>] [--start=n] [--end=n]      \\
      [-h] [-V]     \\
     [--localonly] [--progress]

  --config=<file>       - configuration file
  --database=<dbcode>   - CPANSTATS or LITESTATS
  --log                 - log file for shortcut reference for -m and -c
  --out                 - results output file

  -m                    - check for missing entries
  -c                    - check existing entries for bad parsing
  -v                    - provide a verification report
  -s                    - search stats providing id + subject
  --file                - file of IDs to reference
  --start               - start id for -m or -c
  --end                 - end id for -m or -c

  --localonly           - no NNTP lookups, use local logs only
  --progress            - print progress messages to STDOUT

  -h                    - this help screen
  -V                    - program version

HERE

    }

    print "$0 v$VERSION\n";
    exit(0);
}

__END__

=back

=head1 BUGS, PATCHES & FIXES

There are no known bugs at the time of this release. However, if you spot a
bug or are experiencing difficulties, that is not explained within the POD
documentation, please send bug reports and patches to the RT Queue (see below).

Fixes are dependant upon their severity and my availablity. Should a fix not
be forthcoming, please feel free to (politely) remind me.

RT Queue -
http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-Testers-Data-Generator

=head1 SEE ALSO

L<CPAN::WWW::Testers>,
L<CPAN::Testers::WWW::Statistics>

F<http://www.cpantesters.org/>,
F<http://stats.cpantesters.org/>,
F<http://wiki.cpantesters.org/>

=head1 AUTHOR

  Barbie, <barbie@cpan.org>
  for Miss Barbell Productions <http://www.missbarbell.co.uk>.

=head1 COPYRIGHT AND LICENSE

  Copyright (C) 2005-2010 Barbie for Miss Barbell Productions.

  This module is free software; you can redistribute it and/or
  modify it under the same terms as Perl itself.

=cut
