#!/usr/bin/env perl

use strict;
use warnings;

use FindBin qw($Bin);
use lib "$Bin/../../lib";

package CeisPages;

use Moose;
extends 'DataFlow::Proc::MultiPageURLGenerator';

use DataFlow::Util::HTTPGet;
use HTML::TreeBuilder::XPath;
use URI;

# Overrides the inherited 'produce_last_page' attribute with a code ref that
# discovers the number of the last result page.
#
# The code ref receives the URL of a listing page, downloads it, and reads
# the text of the <p class="paginaAtual"> element. The page count is the run
# of digits that follows "<digit>/" in that text (pattern: /\d\/(\d+)/).
#
# Returns: the captured digits (the last page number).
# Dies:    when the pager element is missing or empty, or when its text does
#          not contain the expected "<digit>/<digits>" pattern.
has '+produce_last_page' => (
    default => sub {
        return sub {
            my $url = shift;

            my $get  = DataFlow::Util::HTTPGet->new;
            my $html = $get->get($url);

            my $tree  = HTML::TreeBuilder::XPath->new_from_content($html);
            my $texto = $tree->findvalue('//p[@class="paginaAtual"]');

            # Only the extracted string is needed from here on; release the
            # parse tree explicitly instead of relying on garbage collection
            # of its parent/child reference cycles.
            $tree->delete;

            die q{Não conseguiu determinar a última página}
              unless $texto;

            return $1 if $texto =~ /\d\/(\d+)/;

            # Without this, a pager text that does not match would fall off
            # the end of the sub and yield a false value as the page count.
            die q{Não conseguiu determinar a última página};
        };
    },
);
# Overrides the inherited 'make_page_url' attribute with a code ref that
# builds the address of one specific result page.
#
# The code ref is called with ( $self, $url, $page ). It parses $url, appends
# a "Pagina" query parameter holding $page after whatever query parameters
# the URL already carries, and returns the resulting URL as a string.
# The first argument is not used.
has '+make_page_url' => (
    default => sub {
        return sub {
            my ( undef, $base_url, $page_number ) = @_;

            my $uri = URI->new($base_url);

            # Keep the existing key/value pairs in order, then add the page.
            my @params = $uri->query_form;
            push @params, Pagina => $page_number;
            $uri->query_form(@params);

            return $uri->as_string;
        };
    },
);

package main;

use DataFlow;
use aliased 'DataFlow::Proc::NOP';
use aliased 'DataFlow::Proc::HTMLFilter';
use aliased 'DataFlow::Proc::URLRetriever';
use aliased 'DataFlow::Proc::MultiPageURLGenerator';
use aliased 'DataFlow::Proc::CSV';
use aliased 'DataFlow::Proc::JSON';
use aliased 'DataFlow::Proc::Encoding';
use aliased 'DataFlow::Proc::SimpleFileOutput';

use Encode;
use Data::Dumper;

# Text clean-up applied to each scraped value: strips leading and trailing
# whitespace, turns runs of CR/LF/TAB into a single space and squeezes any
# remaining run of whitespace down to one space.
my $tidy_whitespace = sub {
    my $text = shift;

    $text =~ s/^\s*//;
    $text =~ s/\s*$//;
    $text =~ s/[\r\n\t]+/ /g;
    $text =~ s/\s\s+/ /g;

    return $text;
};

# Re-encodes a value: decodes it as ISO-8859-1 and encodes the result as UTF-8.
my $latin1_to_utf8 = sub {
    my $raw = shift;
    return encode( "utf8", decode( "iso-8859-1", $raw ) );
};

# The scraping pipeline. Stages are listed in the order they are handed to
# DataFlow: page URL generation, a pass-through stage, page download, two
# XPath filters (table rows, then cell values), the two clean-up subs above,
# a dumping pass-through stage and finally JSON serialisation.
# NOTE(review): a negative first_page presumably counts back from the last
# page -- confirm against DataFlow::Proc::MultiPageURLGenerator.
my $flow = DataFlow->new(
    procs => [
        CeisPages->new(
            first_page => -5,

            #last_page     => 35,
        ),
        NOP->new(
            deref => 1,
            name  => 'nop',
        ),
        URLRetriever->new,
        HTMLFilter->new(
            search_xpath =>
              '//div[@id="listagemEmpresasSancionadas"]/table/tbody/tr',
        ),
        HTMLFilter->new(
            search_xpath => '//td',
            result_type  => 'VALUE',
            ref_result   => 1,
        ),
        $tidy_whitespace,
        $latin1_to_utf8,
        NOP->new(
            name        => 'espiando',
            dump_output => 1,
        ),
        JSON->new(
            name        => 'json',
            direction   => 'TO_JSON',
            json_opts   => { utf8 => 1, pretty => 1, },
            dump_output => 1,
        ),

       #        CSV->new(
       #            name          => 'csv',
       #            direction     => 'TO_CSV',
       #            text_csv_opts => { binary => 1 },
       #            headers       => [
       #                'CNPJ/CPF',   'Nome/Razão Social/Nome Fantasia',
       #                'Tipo',       'Data Inicial',
       #                'Data Final', 'Nome do Órgão/Entidade',
       #                'UF',         'Fonte',
       #                'Data'
       #            ],
       #            dump_output => 1,
       #        ),
       #        SimpleFileOutput->new( file => '> /tmp/ceis.csv', ors => "\n" ),
    ],
);

##############################################################################

# Entry URL of the listing: host and path are joined with a single slash.
my $base = join q{/},
  q{http://www.portaltransparencia.gov.br},
  q{ceis/EmpresasSancionadas.asp?paramEmpresa=0};

# Feed the entry URL to the pipeline as its only input item.
$flow->input($base);

# Drain the pipeline; the collected output is kept in @res.
my @res = $flow->flush;

#print Dumper(\@res);

