#!/usr/bin/env perl
# ABSTRACT: output HTML document as a flat XPath/content list
# PODNAME: xpathify
use strict;
use utf8;
use warnings qw(all);

use feature 'say';
use open qw(:std :utf8);

use Getopt::Long;
use HTML::Linear;
use IO::Interactive qw(is_interactive);
use Pod::Usage;

our $VERSION = '0.014'; # VERSION


# Command-line switches.  Each variable stays undef unless its option is
# given, so later code can tell "not specified" apart from an explicit
# negation such as --nocolor.
my ($help, $color, $html, $encoding, $shrink, $strict, $weight);

GetOptions(
    'help'       => \$help,
    'color!'     => \$color,
    'html!'      => \$html,
    'encoding=s' => \$encoding,
    'shrink!'    => \$shrink,
    'strict!'    => \$strict,
    'weight!'    => \$weight,
) or pod2usage(-verbose => 1);

# Show the usage text when asked for help, or when the number of
# positional arguments is not exactly one (the HTML file to process).
pod2usage(-verbose => 1)
    if $help or @ARGV != 1;

# Fill in defaults for switches that were not given on the command line:
# colour only when STDOUT is an interactive terminal, no weight column.
$color  //= is_interactive(*STDOUT);
$weight //= 0;

# Pick the highlighting scheme.  --html wins over --color: it copies the
# "html" scheme into the XPath wrap table, turns terminal colouring off and
# prints the first element of the Colors html pair (presumably the document
# header; the second element is printed after the listing).
if ($html) {
    $color = 0;
    %HTML::Linear::Path::xpath_wrap
        = %{ $HTML::Linear::Path::Colors::scheme{html} };
    print $HTML::Linear::Path::Colors::html[0];
} elsif ($color) {
    # Normalise an unset/false --html to a plain 0.
    $html = 0;
    %HTML::Linear::Path::xpath_wrap
        = %{ $HTML::Linear::Path::Colors::scheme{terminal} };
}

# Build the HTML::Linear object.  Shrinking defaults to on and strict mode
# to off unless overridden with --[no]shrink / --[no]strict.
my $hl = HTML::Linear->new;

$hl->set_shrink
    if $shrink // 1;

$hl->set_strict
    if $strict // 0;

# Decode the input file while reading.  A user-supplied --encoding is passed
# through unchanged; the default is the validating :encoding(UTF-8) layer
# rather than bare :utf8, because :utf8 does not check byte sequences and
# should not be used for reading input.
my $layer = $encoding ? "encoding($encoding)" : 'encoding(UTF-8)';
open(my $fh, "<:$layer", $ARGV[0])
    or die "Can't open $ARGV[0]: $!";
$hl->parse_file($fh);
close $fh;

# Emit one row per XPath/content pair of every element, with the XPaths of
# each element in sorted order.  Columns are: XPath, optional weight
# (--weight), content.  Rows are HTML table rows under --html and
# tab-separated lines otherwise.
for my $el ($hl->as_list) {
    my $content_of = $el->as_hash;

    for my $xpath (sort keys %{$content_of}) {
        # Only HTML mode wraps the XPath here; the other modes print the
        # key exactly as as_hash returned it.
        my @line = $html
            ? HTML::Linear::Path::Colors::wrap_xpath($xpath)
            : $xpath;

        # Content is wrapped in both highlighted modes; HTML mode passes
        # an extra true flag to wrap_content.
        my $content = $content_of->{$xpath};
        if ($html) {
            $content = HTML::Linear::Path::Colors::wrap_content($content, 1);
        } elsif ($color) {
            $content = HTML::Linear::Path::Colors::wrap_content($content);
        }

        push @line, $el->weight if $weight;
        push @line, $content;

        say $html
            ? '<tr><td>' . join('</td><td>', @line) . '</td></tr>'
            : join("\t", @line);
    }
}

# Close the HTML document opened before the listing.
print $HTML::Linear::Path::Colors::html[1]
    if $html;

__END__
=pod

=encoding utf8

=head1 NAME

xpathify - output HTML document as a flat XPath/content list

=head1 VERSION

version 0.014

=head1 SYNOPSIS

    xpathify [options] HTML

=head1 DESCRIPTION

Represents a typical HTML document in a very verbose two-column mode.
The first column is an XPath which locates each element inside the HTML tree.
The second column is the respective content (if any).

    /html/head/title/text() test 1
    /html/body/h1/text()    test 2
    /html/body/p[1]/text()  Lorem ipsum dolor sit amet, consectetur adipiscing elit.

=head1 OPTIONS

=over 4

=item --help

Show this help message.

=item --encoding=name

Specify the HTML document encoding (C<latin1>, C<utf8>).
UTF-8 is assumed by default.

=item --[no]color

Enable syntax highlighting for XPath.
By default, enabled automatically on interactive terminals.

=item --[no]html

Disables the C<--color> option and highlights using HTML/CSS.

=item --[no]shrink

Shrink the XPath to the minimal unique identifier.
For example:

    /html/body[@id='cpansearch']/form[@class='searchbox']/input[@name='query']

Could be shortened to:

    //input[@name='query']

The shrinking is enabled by default.

=item --[no]strict

Strict mode disables grouping by C<id>, C<class> or C<name> attributes.
The grouping is enabled by default.

=item --[no]weight

Print the XPath weight in the second column.

=back

=head1 AUTHOR

Stanislaw Pusep <stas@sysd.org>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2012 by Stanislaw Pusep.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut

