#!/usr/bin/env perl
package crawler2mysql;
use strict;
use warnings;
# ABSTRACT: command-line script to control WWW::3172::Crawler
our $VERSION = 'v0.002'; # VERSION

use WWW::3172::Crawler;
use Getopt::Long;
use Pod::Usage;
use Data::Printer alias => 'dump';
use DBI;
use Config::General qw(ParseConfig);

my %opts = (
    max => 50,
);
GetOptions( \%opts,
    'help|?',
    'version',
    'max=i',
    'debug+',
);

my $host = shift;
pod2usage(
    -verbose => 2,
) if $opts{help} or !$host;


if (delete $opts{version}) {
    my $this = __PACKAGE__;
    my $this_ver = defined __PACKAGE__->VERSION ? __PACKAGE__->VERSION : 'dev';
    print "$this version $this_ver\n" and exit;
}

print "Crawling $opts{max} pages, starting at $host\n";
print "Dry run -- aborting\n" and exit if $opts{'dry-run'};

my %conf = ParseConfig('db.conf');
my $dbh = DBI->connect(
    "DBI:mysql:database=doherty;host=$conf{host}",
    $conf{username},
    $conf{password},
);
$dbh->{'mysql_enable_utf8'} = 1;
$dbh->{'AutoCommit'}        = 0;
$dbh->trace('1|SQL') if $opts{debug} and $opts{debug} > 1;

my %sql = (
    words_insert    => $dbh->prepare('INSERT IGNORE INTO words (word_str) VALUES (?)'),
    words_select    => $dbh->prepare('SELECT word_id FROM words WHERE word_str = ?'),
    url_insert      => $dbh->prepare('INSERT INTO url (url_str) VALUES (?)'),
    url_words_insert=> $dbh->prepare('INSERT INTO url_words (url_id, word_id, url_word_count) VALUES (?,?,?)'),
    start           => $dbh->prepare('START TRANSACTION'),
    commit          => $dbh->prepare('COMMIT'),
    rollback        => $dbh->prepare('ROLLBACK'),
);

my $crawler = WWW::3172::Crawler->new(
    host    => $host,
    max     => $opts{max},
    debug   => $opts{debug},
    callback=> sub {
        my $uri  = shift;
        my $data = shift;
        print "Got data for $uri\n" if $opts{debug} and $opts{debug} > 1;

        $sql{start}->execute;   # START TRANSACTION

        # INSERT url
        $sql{url_insert}->execute($uri);
        my $url_id = $dbh->last_insert_id(undef, undef, undef, undef);

        # INSERT stemmed words
        while (my ($stem, $count) = each %{ $data->{stems} }) {
            $sql{words_insert}->execute($stem);
            my $word_id = $dbh->last_insert_id(undef, undef, undef, undef);

            unless ($word_id) { # Was a duplicate word
                my $data = $dbh->selectrow_hashref($sql{words_select}, {}, $stem);
                $word_id = $data->{word_id};
            }

            # INSERT url<->word mappings
            $sql{url_words_insert}->execute($url_id, $word_id, $count);
        }

        $sql{commit}->execute;  # COMMIT
    },
);
$crawler->crawl;

$dbh->commit;
$dbh->disconnect;


__END__
=pod

=encoding utf-8

=head1 NAME

crawler2mysql - command-line script to control WWW::3172::Crawler

=head1 VERSION

version v0.002

=head1 SYNOPSIS

    3137-crawler http://example.com
    3137-crawler --max=50 http://example.com

=head1 DESCRIPTION

B<3172-crawler> is a command line script to control a L<crawler|WWW::3172::Crawler>.

=head1 OPTIONS

=over 4

=item B<--max>

Specify the maximum number of pages to crawl. Default is 50.

=item B<--help>, -h, -?

Opens this man page and exits.

=item B<--version>

Prints the version of this program and supporting libraries.

=item B<--debug>

Print debugging information.

=back

=head1 AVAILABILITY

The project homepage is L<http://metacpan.org/release/WWW-3172-Crawler/>.

The latest version of this module is available from the Comprehensive Perl
Archive Network (CPAN). Visit L<http://www.perl.com/CPAN/> to find a CPAN
site near you, or see L<http://search.cpan.org/dist/WWW-3172-Crawler/>.

The development version lives at L<http://github.com/doherty/WWW-3172-Crawler>
and may be cloned from L<git://github.com/doherty/WWW-3172-Crawler.git>.
Instead of sending patches, please fork this project using the standard
git and github infrastructure.

=head1 SOURCE

The development version is on github at L<http://github.com/doherty/WWW-3172-Crawler>
and may be cloned from L<git://github.com/doherty/WWW-3172-Crawler.git>

=head1 BUGS AND LIMITATIONS

No bugs have been reported.

Please report any bugs or feature requests through the web interface at
L<http://rt.cpan.org>.

=head1 AUTHOR

Mike Doherty <doherty@cpan.org>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2011 by Mike Doherty.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut

