#! /usr/bin/perl
use warnings;
use strict;
$|++;

# $Id: checksite 676 2007-05-28 22:00:59Z abeltje $
our $VERSION = '0.073';

=head1 NAME

checksite - Check the contents of a website

=head1 SYNOPSIS

    $ checksite [options] -p <name> uri

=head1 OPTIONS

=over 2

=item B<Results>

  --prefix|-p <name>        The prefix (dir) of this check [mandatory]
  --dir|-d <dir>            The target directory

=item B<Persistence>

  --[no]save                Save validation results
  --load                    Load the validation results

=item B<(X)HTML validation>

  --nohtml                  Skip (X)HTML validation
  --html_validator <uri>    Base uri for the W3C (X)HTML validator
  --html_upload             Validate (X)HTML by uploading
  --html_uri                Validate (X)HTML by sending the uri
  --xmllint                 Validate by using the xmllint program

=item B<CSS validation>

  --nocss                   Skip CSS validation
  --css_validator <uri>     Base uri for the W3C CSS validator
  --css_upload              Validate CSS by uploading
  --css_uri                 Validate CSS by sending the uri

=item B<Exclusion>

  --disallow <path>         Add Disallow: rules to robots.txt (multiple)

  --nostrictrules           Do not impose /robots.txt on the validator
                            for "local" url's

=item B<General>

  --lang|-l <lang>          Set language(s) for Accept-Language: header

  --ua_class <Module>       Set a new UserAgent class
                            (child of WWW::Mechanize)

  -v                        Increase verbosity (multiple)
  --help|-h                 This message
  --man                     The complete manual page

=begin private

  --nott                    Force the use of HTML::Template

  --exclude <pat>           Replace the default exclusion pattern '[#?].*$'

=end private

=back

See L<WWW::CheckSite::Manual> for more information.

=head1 DESCRIPTION

This program will spider the specified url and check the availability
of the B<links>, B<images> and B<stylesheets> on each page.

B<INCOMPATIBLE CHANGE AS OF 0.020>: Pages and stylesheets are B<NO
LONGER> validated with the validators available at
L<http://validator.w3.org> and L<http://jigsaw.w3.org>. These
validators do not allow robots! The W3C-HTML validator is now widely
available and very installable, so I advise you to run your own. The
W3C-CSS validator is more work, but I have managed to get that
to work as well with Jigsaw.

When all pages are checked two reports in HTML-format are
generated. The F<full.html> report contains all the information for
all pages and the F<summ.html> report contains only the pages with
errors and their errors.

=head2 Metrics for a spidered page

Each page fetched by the spider will have these metrics:

=over 4

=item * B<status>, B<status_tx>

The HTTP-returncode and a verbal explanation of that code

=item * B<title>

The contents of the C<< <title></title> >> tag.

=item * B<ct>

The MIME type returned by the HTTP-server for the document.

=item * B<links>

A list of C<< <a href=> >>, C<< <area href=> >> and C<< <frame src=>
>> uri's found on the page with the HTTP-returncode. Each HTML-code is
also checked for the text or ALT/TITLE attribute.

=item * B<link_cnt>, B<links_ok>

The number of links found and the number of links that are ok.
 
=item * B<images>

A list of C<< <img src=> >> and C<< <input type=image> >> uri's found
on the page with the HTTP-returncode and MIME type. Each HTML tag is
also checked for the existence of the ALT attribute.

=item * B<image_cnt>, B<images_ok>

The number of images found and the number of images that are ok.
 
=item * B<styles>

A list of C<< <link rel=stylesheet type=text/css> >> uri's found on
the page with the HTTP-returncode, MIME type and CSS-validation result.

=item * B<style_cnt>, B<styles_ok>

The number of stylesheets found and the number of stylesheets that are ok.

=item * B<valid>

The HTML-validation result.

=back

=cut

use File::Spec::Functions qw( :DEFAULT rel2abs );
use File::Basename;
# Absolute path of the directory that holds this script. It is computed in
# a BEGIN block so the value is already available when "use lib" runs at
# compile time.
my $findbin;
BEGIN { $findbin = rel2abs dirname $0 }
# Put the "lib" directory next to the script on the module search path.
use lib catdir $findbin, 'lib'; # run out of the box

use WWW::CheckSite;
use WWW::CheckSite::Validator;

# Optional config file support: when Config::Auto can be loaded, ask it
# for a config file and parse that file if it is readable. Both variables
# stay undefined when the module is missing or no usable file is found.
my $dfopt;
my $dfopt_file;
eval { require Config::Auto };
if ( ! $@ ) {
    $dfopt_file = Config::Auto::find_file();
    if ( $dfopt_file && -r $dfopt_file ) {
        $dfopt = Config::Auto::parse( $dfopt_file );
    }
}

use Pod::Usage;
use Getopt::Long;
# Built-in defaults for every setting, grouped by the kind of default.
my %opt = (
    # Switches that are on unless negated (--nohtml, --nocss, ...)
    ( map { ( $_ => 1 ) } qw( html css strictrules tt ) ),

    # Switches and counters that start out off
    ( map { ( $_ => 0 ) } qw( save load v ) ),

    # Settings without a default value
    ( map { ( $_ => undef ) } qw(
        html_upload html_uri xmllint
        css_upload  css_uri
        exclude     lang     prefix
    ) ),

    # Base uris of the validators
    html_validator => 'http://localhost/w3c-validator',
    css_validator  => 'http://localhost/css-validator',

    # List-valued settings start out empty
    disallow       => [ ],
    uri            => [ ],

    # The target directory defaults to the current directory
    dir            => '.',
);

# Let settings from the config file (if one was parsed) override the
# built-in defaults. Only keys that already exist in %opt are taken over.
if ( $dfopt ) {
    # A single "disallow" string may hold several paths, separated by
    # ';', ':' or ','.
    exists $dfopt->{disallow} && ! ref $dfopt->{disallow}
        and $dfopt->{disallow} = [ split /\s*[;:,]\s*/, $dfopt->{disallow} ];

    # If the config file yields a plain string for "uri", wrap it: the
    # rest of the script dereferences $opt{uri} as an array (and the
    # "uri=s@" option pushes onto it), which dies under "strict refs"
    # for a non-reference value.
    exists $dfopt->{uri} && ! ref $dfopt->{uri}
        and $dfopt->{uri} = [ defined $dfopt->{uri} ? $dfopt->{uri} : () ];

    for my $name ( keys %opt ) {
        exists $dfopt->{ $name } and $opt{ $name } = $dfopt->{ $name };
    }
}

# Parse the command line into %opt; anything given here overrides the
# values %opt already holds. Unknown options show the short usage and
# exit with status 42.
GetOptions \%opt => qw(
    uri|u=s@          prefix|p=s          dir|d=s
    save!             load!               tt!
    strictrules!      disallow=s@         exclude=s
    ua_class=s        lang|l=s

    html!             html_validator|html-validator=s
    html_upload!      html_uri!           xmllint:s
    css!              css_validator|css-validator=s
    css_upload!       css_uri!

    v+                help|h              man
) or pod2usage( exitval => 42, verbose => 0 );
$opt{help} and pod2usage( exitval => 0, verbose => 1 );
$opt{ man} and pod2usage( exitval => 0, verbose => 2 );

# Only report a config file when one was actually parsed; $dfopt_file is
# undefined when Config::Auto is not available or found nothing.
$opt{v} && $dfopt and print "Used default settings from '$dfopt_file'\n";

# Check the mandatory arguments
# Without --uri the remaining command line arguments are the uris.
@{ $opt{uri} } or $opt{uri} = [ @ARGV ];
# Test the number of uris: "defined" binds tighter than "||" and $opt{uri}
# always holds an array reference, so a "defined" test can never fail.
# No uri is needed when a saved run is loaded.
@{ $opt{uri} } || $opt{load} or pod2usage( exitval => 42, verbose => 0 );
defined $opt{prefix} or pod2usage( exitval => 42, verbose => 0 );

# Work out the validator mess
# Sets $opt{html_by} / $opt{css_by} to the validation method and makes
# sure the matching *_validator setting has a value.
if ( $opt{html} ) { # We want (x)html validation
    # The "xmllint:s" option stores an empty string for a bare --xmllint,
    # so test for definedness instead of truth; an explicit 0 still
    # means "do not use xmllint".
    my $xmllint = $opt{xmllint};
    if ( defined $xmllint && $xmllint ne '0' ) {
        $opt{html_by} = 'by_xmllint';
        # No program was named (bare switch, or a plain 1 as a boolean):
        # use the default from the validator module.
        $opt{html_validator} = $xmllint eq '' || $xmllint eq '1'
            ? $WWW::CheckSite::Validator::XMLLINT
            : $xmllint;
    } else {
        $opt{html_validator} ||= $WWW::CheckSite::Validator::VALIDATOR_XHTML;
        # Uploading is the default; --html_uri sends the uri instead.
        $opt{html_by} = $opt{html_uri} ? 'by_uri' : 'by_upload';
    }
} else {
    $opt{html_by} = 'by_none';
}

if ( $opt{css} ) {
    $opt{css_validator} ||= $WWW::CheckSite::Validator::VALIDATOR_STYLE;
    # Uploading is the default; --css_uri sends the uri instead.
    $opt{css_by} = $opt{css_uri} ? 'by_uri' : 'by_upload';
} else {
    $opt{css_by} = 'by_none';
}

# With --load only the report is rewritten from a saved run; otherwise
# the site is validated first and then reported.
if ( $opt{load} ) {
    my @restore_keys = qw( prefix dir tt v );
    my $site = WWW::CheckSite->load(
        map { ( $_ => $opt{ $_ } ) } @restore_keys
    );
    $site->write_report;
} else {
    # The disallow rules are handed over under the name "myrules"; every
    # other argument carries the name of its setting.
    my $site = WWW::CheckSite->new(
        ( map { ( $_ => $opt{ $_ } ) } qw(
            uri
            html_by  html_validator
            css_by   css_validator
            strictrules  lang  ua_class  tt
        ) ),
        myrules => $opt{disallow},
        ( map { ( $_ => $opt{ $_ } ) } qw(
            exclude
            prefix  dir  save  v
        ) ),
    );
    $site->validate;
    $site->write_report;
}

=head1 FILES

C<checksite> supports L<Config::Auto>. This means that each of the
following directories is searched for F<checksiteconfig>,
F<checksite.config>, F<checksiterc> and F<.checksiterc>:

=over 4

=item B<current directory>

=item B<bin directory> (where the script is installed)

=item B<$HOME>

=item B</etc/>

=item B</usr/local/etc/>

=back

=head1 SEE ALSO

=over 4

=item * L<WWW::CheckSite>

=item * L<WWW::CheckSite::Validator>

=item * L<WWW::CheckSite::Spider>

=back

=head1 AUTHOR

Abe Timmerman, C<< <abeltje@cpan.org> >>

=head1 BUGS

Please report any bugs or feature requests to
C<bug-WWW-CheckSite@rt.cpan.org>, or through the web interface at
L<http://rt.cpan.org>.  I will be notified, and then you'll automatically
be notified of progress on your bug as I make changes.

=head1 COPYRIGHT & LICENSE

Copyright MMV-MMVII Abe Timmerman, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

=cut
