#!/usr/bin/env perl
# PODNAME: locale_simple_scraper
# ABSTRACT: Command line tool for finding tokens in code


$|=1;

use strict;
use warnings;
use Getopt::Long;
use File::Find;
use Cwd;
use IO::All;
use Locale::Simple;
use Data::Dumper;

#
# !!! Work in progress !!!
#

                 # Supported filetypes:
my $js_ext = ""; # Javascript
my $pl_ext = ""; # Perl
my $py_ext = ""; # Python
my $tx_ext = ""; # Text::Xslate (Kolon or Metakolon)

my @ignores;
my @only;

my $output = 'po';
my $md5;

GetOptions(
	"js=s" => \$js_ext,
	"pl=s" => \$pl_ext,
	"py=s" => \$py_ext,
	"tx=s" => \$tx_ext,
	"ignore=s" => \@ignores,
	"only=s" => \@only,
	"output=s" => \$output,
	"md5" => \$md5,
);

# could add Getopt::Long here for override

my @js = split(",",$js_ext);
push @js, 'js';

my @pl = split(",",$pl_ext);
push @pl, 'pl', 'pm', 't';

my @tx = split(",",$tx_ext);
push @tx, 'tx';

my @py = split(",",$py_ext);
push @py, 'py';

# extension list
my %e = (
	( map { $_ => 'js' } @js ),
	( map { $_ => 'pl' } @pl ),
	( map { $_ => 'tx' } @tx ),
	( map { $_ => 'py' } @py ),
);

# functions with count of locale simple with function of parameter
#
# 1 = msgid
# 2 = msgid_plural
# 3 = msgctxt
# 4 = domain
#
my %f = (
	l => [1],
	ln => [1,2],
	ld => [4,1],
	lp => [3,1],
	lnp => [3,1,2],
	ldn => [4,1,2],
	ldp => [4,3,1],
	ldnp => [4,3,1,2],
);

my @found;

my $dir = getcwd;
my $re_dir = $dir;
$re_dir =~ s/\./\\./g;

finddepth(sub {
	my $filename = $File::Find::name;
	my $stored_filename = $filename;
	if ($md5) {
		eval {
			require Digest::MD5;
			Digest::MD5->import('md5_hex');
		};
		die "This feature requires Digest::MD5" if $@;
		$stored_filename = md5_hex($filename);
	}
	$filename =~ s/^$dir\///g;
	for (@ignores) {
		return if $filename =~ /$_/;
	}
	if (@only) {
		my $found = 0;
		for (@only) {
			$found = 1 if $filename =~ /$_/;
		}
		return unless $found;
	}
	my @fileparts = split('\.',$File::Find::name);
	my $ext = pop @fileparts;
	if (grep { $ext eq $_ } keys %e) {
		my $file = $File::Find::name;
		my $type = $e{$ext};
		print STDERR $type." => ".$file."\n";
		return if -l $file and not -e readlink($file);
		my @lines = io($file)->slurp;
		my $line = 0;
		for (@lines) {
			$line++;
			my @results = parse_line($_, $type);
			for (@results) {
				push @found, {
					%{$_},
					line => $line,
					file => $stored_filename,
					type => $type,
				}
			}
		}
	}
}, $dir);

if ($output eq 'po') {
	my %files;
	my %token;
	for (@found) {
		my $key .= defined $_->{domain} ? '"'.$_->{domain}.'"' : 'undef';
		$key .= defined $_->{msgctxt} ? '"'.$_->{msgctxt}.'"' : 'undef';
		$key .= defined $_->{msgid} ? '"'.$_->{msgid}.'"' : 'undef';
		$key .= defined $_->{msgid_plural} ? '"'.$_->{msgid_plural}.'"' : 'undef';
		$token{$key} = $_ unless defined $token{$key};
		$files{$key} = [] unless defined $files{$key};
		push @{$files{$key}}, $_->{file}.':'.$_->{line};
	}
	for my $k (sort { $a cmp $b } keys %files) {
		print "\n";
		print "#: ".join(' ',@{$files{$k}})."\n";
		print "#, locale-simple-format";
		print " ".$token{$k}{domain} if defined $token{$k}{domain};
		print "\n";
		for (qw( msgctxt msgid msgid_plural )) {
			print $_.' "'.Locale::Simple::gettext_escape($token{$k}{$_}).'"'."\n" if defined $token{$k}{$_};
		}
		print qq[msgstr ""\n];
	}
} elsif ($output eq 'perl') {
	print Dumper \@found;
} elsif ($output eq 'json') {
	eval {
		require JSON;
		JSON->import;
		print encode_json(\@found);
	} or do {
		die "You require the module JSON for this output";
	};
} elsif ($output eq 'yaml') {
	eval {
		require YAML;
		YAML->import;
		print Dump(\@found);
	} or do {
		die "You require the module YAML for this output";
	};
}

sub parse_line {
	my ( $line, $type, @results ) = @_;
	return if $line =~ /^\s*\#.*/;
	for (keys %f) {
		my @args = @{$f{$_}};
		for ($line =~ /[^\w]${_}\((.*)/) {
			my $argc = scalar @args;
			my ( $remainder, @params ) = parse_params($1, $type, $argc);
			if (scalar @params == $argc) {
				my %result;
				my $pos = 0;
				for (@args) {
					$result{msgid} = $params[$pos] if $_ eq 1;
					$result{msgid_plural} = $params[$pos] if $_ eq 2;
					$result{msgctxt} = $params[$pos] if $_ eq 3;
					$result{domain} = $params[$pos] if $_ eq 4;
					$pos++;
				}
				push @results, \%result, parse_line( $remainder, $type );
			}
		}
	}
	return @results;
}

sub parse_params {
	my ( $params, $type, $argc ) = @_;
	my @chars = split('',$params);
	my @args;
	my $arg = "";
	my $q_state = 0; # 0 = code, 1 = qoute, 2 = double qoute
	my $comma_state = 1;
	while (defined (my $c = shift @chars)) {
		next if $c =~ /\s/ and !$q_state;
		if ($q_state) {
			if ($c eq '\\') {
				my $esc = shift @chars;
				if ($esc eq "'" or $esc eq '"' or $esc eq '\\') {
					$arg .= $c;
				} else {
					warn "Unknown escape char '".$esc."'";
				}
			} elsif ( ( $c eq "'" and $q_state == 1 ) or ( $c eq '"' and $q_state == 2 ) ) {
				$q_state = 0;
				$comma_state = 0;
				push @args, $arg;
				$arg = "";
				last if scalar @args == $argc;
			} else {
				$arg .= $c;
			}
		} else {
			if ($c eq "'" or $c eq '"') {
				die "quote found where comma expected: ".$params unless $comma_state;
				$q_state = $c eq "'" ? 1 : 2;
			} elsif ($c eq ',') {
				die "comma found after comma in code: ".$params if $comma_state;
				$comma_state = 1;
			} elsif ($type eq 'js') {
				last;
			} else {
				last;
			}
		}
	}
	return join('', @chars), @args;
}

__END__
=pod

=head1 NAME

locale_simple_scraper - Command line tool for finding tokens in code

=head1 VERSION

version 0.016

=head1 SYNOPSIS

  # ignoring specific regexps for the filename
  # please be aware that you must escape the . and because we use bash here you
  # must also be aware to escape the \ for escaping the .
  locale_simple_scraper --ignores \\.build --ignores some/dir

  # only use specific regexps for the filename
  locale_simple_scraper --only core --only site/something

  # different output parameter, might require more modules
  locale_simple_scraper --output perl
  locale_simple_scraper --output yaml
  locale_simple_scraper --output json

  # scramble real filename with md5 (for security)
  locale_simple_scraper --md5

  # setting additional extensions for Javascript (default: js)
  locale_simple_scraper --js jjs,ajs

  # setting additional extensions for Perl (default: pl, pm, t)
  locale_simple_scraper --pl ppl,pppl

  # setting additional extensions for Text::Xslate (default: tx)
  locale_simple_scraper --tx ttx,xxx

=head1 DESCRIPTION

This tool parses all Perl, Python, Javascript and Text::Xslate templates in the
current directory and subdirectories to find calls to the L<Locale::Simple>
API.

It gives out on STDERR which files are parsed right now, while it dumps the
resulting data to the screen. By default it generates a B<.po> file, but you can
specify via I<--output> to dump B<json>, B<yaml> or a B<perl> data structure.

Be aware that we add the domain as flag in the B<.po> file. So double token
may appear which only differs through a B<#,> flag.

=head1 SUPPORT

Repository

  http://github.com/Getty/p5-locale-simple
  Pull request and additional contributors are welcome

Issue Tracker

  http://github.com/Getty/p5-locale-simple/issues

=head1 AUTHOR

Torsten Raudssus <torsten@raudss.us>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2012 by DuckDuckGo, Inc. L<http://duckduckgo.com/>, Torsten Raudssus <torsten@raudss.us>.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut

