#! /usr/bin/perl
use strict;
use warnings;
$|=1;

# $Id: checksite 557 2006-10-08 09:50:08Z abeltje $
our $VERSION = '0.070';

=head1 NAME

checksite - Check the contents of a website

=head1 SYNOPSIS

    $ checksite [options] -p <name> uri

=head1 OPTIONS

    --prefix|-p <name>   The prefix (dir) of this check [mandatory]
    --dir|-d <dir>       The target directory

    --[no]save           Save validation results
    --load               Load the validation results

    --novalidate         Skip the W3 validation
    --by_xmllint         Validate by using the xmllint program
    --by_uri             Validate by sending the uri to W3
    --by_upload          Validate by uploading the contents to W3

    --disallow <path>    Add Disallow: rules to robots.txt (multiple)

    --nostrictrules      Do not impose /robots.txt on the validator

    --lang|-l <lang>     Set language(s) for Accept-Language: header

    -v                   increase verbosity (multiple)
    --help|-h            This message

See L<WWW::CheckSite::Manual> for more information.

=head1 DESCRIPTION

This program will spider the specified url and check the availability
of the B<links>, B<images> and B<stylesheets> on each page. Pages and
stylesheets are also validated with the validators available at
L<http://validator.w3.org> and L<http://jigsaw.w3.org>.

When all pages are checked, two reports in HTML-format are
generated. The F<full.html> report contains all the information for
all pages and the F<summ.html> report contains only the pages that
have errors, together with those errors.

=head2 Metrics for a spidered page

Each page fetched by the spider will have these metrics:

=over 4

=item * B<status>, B<status_tx>

The HTTP-returncode and a verbal explanation of that code

=item * B<title>

The contents of the C<< <title></title> >> tag.

=item * B<ct>

The MIME type returned by the HTTP-server for the document.

=item * B<valid>

The HTML-validation result.

=item * B<links>

A list of C<< <a href=> >>, C<< <area href=> >> and C<< <frame src=>
>> uri's found on the page with the HTTP-returncode. Each HTML tag is
also checked for the text or ALT/TITLE attribute.

=item * B<link_cnt>, B<links_ok>

The number of links found and the number of links that are ok.
 
=item * B<images>

A list of C<< <img src=> >> and C<< <input type=image> >> uri's found
on the page with the HTTP-returncode and MIME type. Each HTML tag is
also checked for the existence of the ALT attribute.

=item * B<image_cnt>, B<images_ok>

The number of images found and the number of images that are ok.
 
=item * B<styles>

A list of C<< <link rel=stylesheet type=text/css> >> uri's found on
the page with the HTTP-returncode, MIME type and CSS-validation result.

=item * B<style_cnt>, B<styles_ok>

The number of stylesheets found and the number of stylesheets that are ok.
 
=back

=cut

use File::Spec::Functions qw( :DEFAULT rel2abs );
use File::Basename;
my $findbin;
BEGIN { $findbin = rel2abs dirname $0 }
use lib catdir $findbin, 'lib'; # run out of the box

use WWW::CheckSite;
use WWW::CheckSite::Validator;

use Getopt::Long;
# Defaults for the command line options. GetOptions() below stores the
# values from the command line in this same hash, so anything not given
# on the command line keeps the value set here.
my %opt = (
    # validation
    validate    => 1,
    by_uri      => undef,
    by_xmllint  => undef,
    by_upload   => undef,

    # spidering
    strictrules => 1,
    lang        => undef,
    disallow    => [ ],
    uri         => [ ],

    # storage and reporting
    prefix      => undef,
    save        => 1,
    load        => 0,
    tt          => 1,

    # verbosity
    v           => 0,
);

# Getopt::Long spec suffixes used here:
#   =s   mandatory string value      =s@  string value, may be repeated
#   :s   optional string value       !    negatable flag (--foo / --nofoo)
#   +    counter, incremented each time the option is given
unless (
    GetOptions(
        \%opt,
        'uri|u=s@',
        'prefix|p=s',
        'dir|d=s',
        'save!',
        'load!',
        'tt!',
        'strictrules!',
        'ua_class=s',
        'lang|l=s',
        'validate!',
        'disallow=s@',
        'by_uri|by_url:s',
        'by_upload|by_upl:s',
        'by_xmllint:s',
        'by_none',
        'v+',
        'help|h',
    )
) {
    # Unknown option or bad value: show the help text (usage() exits).
    usage();
}
usage() if $opt{help};

# Check the mandatory arguments
# Without --uri, the remaining command line arguments are the uri(s).
@{ $opt{uri} } or $opt{uri} = [ @ARGV ];
# $opt{uri} always holds an array reference (and is thus always defined),
# so test it for content. With --load no uri is handed on, so none is needed.
@{ $opt{uri} } || $opt{load} or usage();
defined $opt{prefix} or usage();

# Set validation method
# --novalidate and --by_none both switch validation off; otherwise the
# method is picked in the order xmllint, upload, uri (the default).
my $validate = !$opt{validate} || $opt{by_none} ? 'by_none'
    : defined $opt{by_xmllint}                  ? 'by_xmllint'
    : defined $opt{by_upload}                   ? 'by_upload'
    :                                             'by_uri';

# The by_* options take an optional value. A true value other than a plain
# 1 replaces the corresponding setting in WWW::CheckSite::Validator.
for my $override (
    [ by_xmllint => \$WWW::CheckSite::Validator::XMLLINT ],
    [ by_upload  => \$WWW::CheckSite::Validator::VALIDATOR_FRM ],
    [ by_uri     => \$WWW::CheckSite::Validator::VALIDATOR_URL ],
) {
    my ( $name, $setting ) = @{ $override };
    my $value = $opt{$name};

    # Empty (option given without value) and "0" are false: nothing to set.
    next unless $value;

    # The alternation is grouped so both anchors apply to both alternatives;
    # an ungrouped /^0|1$/ also matches any value that merely starts with
    # "0" or ends in "1" (e.g. an uri ending in "1").
    next if $value =~ /^(?:0|1)$/;

    ${ $setting } = $value;
}

# Without --load: create a WWW::CheckSite object from the options, run the
# validation and write the report. With --load: let WWW::CheckSite load the
# validation results and only write the report.
if ( !$opt{load} ) {
    my $checker = WWW::CheckSite->new(
        uri      => $opt{uri},
        validate => $validate,
        ( map { ( $_ => $opt{$_} ) } qw( strictrules lang ua_class tt ) ),
        # the --disallow rules are handed over under the name 'myrules'
        myrules  => $opt{disallow},
        ( map { ( $_ => $opt{$_} ) } qw( prefix dir save v ) ),
    );
    $checker->validate;
    $checker->write_report;
}
else {
    my $checker = WWW::CheckSite->load(
        map { ( $_ => $opt{$_} ) } qw( prefix dir tt v )
    );
    $checker->write_report;
}

# usage()
#
# Print the help text, followed by the program version, to STDOUT and
# exit the program. Takes no arguments and never returns.
sub usage {
    print <<EO_HELP;
Usage:

    checksite [options] -p <name> <uri>

Options:

    --prefix|-p <name>   The prefix (dir) of this check [mandatory]
    --dir|-d <dir>       The target directory

    --[no]save           Save validation results
    --load               Load the validation results

    --novalidate         Skip the W3 validation
    --by_xmllint         Validate by using the xmllint program
                         Optional alternative xmllint program
    --by_uri             Validate by sending the uri to W3
                         Optional alternative uri-mask
    --by_upload          Validate by uploading the contents to W3
                         Optional alternative uri

    --nostrictrules      Do not impose /robots.txt on the validator

    --disallow <path>    Add Disallow: rules to robots.txt (multiple)

    --lang|-l <lang>     Set language(s) for Accept-Language: header

    -v                   increase verbosity (multiple)
    --help|-h            This message

checksite v$VERSION
EO_HELP

    exit;
}

=head1 SEE ALSO

=over 4

=item * L<WWW::CheckSite>

=item * L<WWW::CheckSite::Validator>

=item * L<WWW::CheckSite::Spider>

=back

=head1 AUTHOR

Abe Timmerman, C<< <abeltje@cpan.org> >>

=head1 BUGS

Please report any bugs or feature requests to
C<bug-WWW-CheckSite@rt.cpan.org>, or through the web interface at
L<http://rt.cpan.org>.  I will be notified, and then you'll automatically
be notified of progress on your bug as I make changes.

=head1 COPYRIGHT & LICENSE

Copyright MMV Abe Timmerman, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

=cut
