#!/usr/bin/perl -s

use strict;
use warnings;

# Set by perl's -s switch handling (see the shebang line) when the script is
# invoked with -o=NAME; used further down to override the output file name.
our $o;

# Lingua::FreeLing3 is a soft dependency, so its modules are loaded at run
# time. Each one is required and then imported, in order, and the script
# stops with a readable message as soon as one of them cannot be loaded.
for my $module (qw(FL3 Lingua::FreeLing3::Word)) {
    (my $path = "$module.pm") =~ s{::}{/}g;
    eval { require $path };
    die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@;
    $module->import();
}


use XML::TMX::Reader '0.25';

# Core modules used to derive the default output path from the input path.
use File::Basename qw(basename dirname);
use File::Spec;

my $file = shift or die "You must supply the name of the file to POS-tag\n";

# Stop with a clear message if no reader object comes back, instead of
# failing later with "Can't call method ... on an undefined value".
my $reader = XML::TMX::Reader->new($file)
    or die "Could not open TMX file '$file'\n";

# Default output: the input file name prefixed with "pos_", placed in the
# same directory as the input (prefixing the whole path would turn
# "dir/f.tmx" into "pos_dir/f.tmx"). The -o=NAME switch overrides it; test
# definedness/length rather than truth so that a name such as "0" works.
my $output = File::Spec->catfile(dirname($file), 'pos_' . basename($file));
$output = $o if defined $o && length $o;

# Languages that are tagged. The code has to be the primary subtag of the
# translation-unit key: at the very start, and followed by "-", "_" or the
# end of the key. Without the anchors "ca-ES" would be taken for Spanish.
my $supported = qr/\A(pt|es|it|ru|en)(?:[-_]|\z)/i;

binmode STDOUT, ":utf8";
$reader->for_tu(
                {
                 -output => $output,
                 -verbose => 1,
                 -verbatim => 1,
                },
                sub {
                    # Called once per translation unit; the (possibly
                    # modified) unit is returned to the reader.
                    my $tu = shift;
                    for my $lang (keys %$tu) {
                        my ($code) = $lang =~ $supported
                            or next;

                        # Normalised two-letter code ("EN-US" -> "en"); this,
                        # not the raw key, is what is handed to FreeLing.
                        # NOTE(review): assumes FL3 selects its data by the
                        # bare language code - confirm against FL3 docs.
                        my $ln = lc $code;

                        my $txt = $tu->{$lang}{-seg};
                        next unless defined $txt;

                        # The segment is only split on whitespace, so it is
                        # presumably expected to be tokenized already (TODO
                        # confirm). split ' ' also skips leading whitespace,
                        # which split /\s+/ would turn into an empty token.
                        my @tokens = map { Lingua::FreeLing3::Word->new($_) }
                                     split ' ', $txt;

                        my $sentences = splitter($ln)->split(\@tokens);
                        $sentences = morph($ln)->analyze($sentences);
                        $sentences = hmm($ln)->tag($sentences);

                        # One <s> element per sentence, one token per line.
                        my $seg = "";
                        for my $s (@$sentences) {
                            $seg .= "<s>\n";
                            $seg .= _dump_words($s->words);
                            $seg .= "</s>\n";
                        }

                        # Presumably asks the writer to emit the segment as
                        # CDATA so the markup above survives - TODO confirm.
                        $tu->{$lang}{-iscdata} = 1;
                        $tu->{$lang}{-seg} = $seg;
                    }
                    return $tu;
                 });


# Serialise a list of word objects as text, one token per line.
#
# Arguments: the word objects to dump, as a flat list (may be empty). Each
#            object must answer is_multiword, form, lemma, tag and, for
#            multiword units, get_mw_words.
# Returns:   a string that is always defined; '' when no words are given.
#
# A plain word becomes "form<TAB>lemma<TAB>tag\n". A multiword unit becomes
# an <mwu lema="..." pos="..."> element that wraps the dump of its component
# words, produced by a recursive call.
sub _dump_words {
    my @words = @_;

    # Start from '' rather than undef: callers append the result with .=,
    # so an empty word list must not produce an undefined value (which
    # would raise "uninitialized value" warnings).
    my $seg = '';

    for my $t (@words) {
        if ($t->is_multiword) {
            $seg .= sprintf("<mwu lema=\"%s\" pos=\"%s\">\n", $t->lemma(), $t->tag());
            $seg .= _dump_words($t->get_mw_words);
            $seg .= "</mwu>\n";
        } else {
            $seg .= $t->form() ."\t". $t->lemma() ."\t". $t->tag() ."\n";
        }
    }
    return $seg;
}

=encoding UTF-8

=head1 NAME

tmx-POStagger - part-of-speech tags the translation units of a TMX file.

=head1 SYNOPSIS

   tmx-POStagger file.tmx  # creates pos_file.tmx

   tmx-POStagger -o=out.tmx file.tmx

=head1 DESCRIPTION

Although this script is bundled in C<XML::TMX>, it has a soft
dependency on C<Lingua::FreeLing3>. Soft means that the dependency is
not ensured at install time, and other features of the module can
still be used without C<Lingua::FreeLing3>. Nevertheless, if you want
to use this tool you should install that module.

At the moment the supported languages are the same as supported by
FreeLing3: English, Spanish, Russian, Portuguese and Italian.

If your TMX file includes any other languages, they will be kept
unchanged.  This behavior can change in the future, as a basic
regexp-based POS tagger might be implemented.

=head1 SEE ALSO

XML::TMX, Lingua::FreeLing3

=head1 AUTHOR

Alberto Manuel Brandão Simões, E<lt>ambs@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2012 by Alberto Manuel Brandão Simões

=cut



