#!/usr/bin/perl
#-*-perl-*-
#
# USAGE: sta2moses sta-align-file.xml
#
# simple script to get the aligned sentences from Stockholm Treealigner files
#

use strict;
use FindBin;
use lib $FindBin::Bin.'/../lib';
use Lingua::Align::Corpus::Parallel::STA;
use File::Basename;

# Main script body: iterate over all aligned tree pairs in the given STA
# alignment file and write one lowercased, space-joined line per tree pair
# to <base>.src and <base>.trg in the current working directory.

my $algfile = shift(@ARGV);
defined $algfile
    or die "USAGE: sta2moses sta-align-file.xml\n";

# Direct method call instead of the indirect-object form ("new Class(...)"),
# which is ambiguous to the parser.
my $corpus = Lingua::Align::Corpus::Parallel::STA->new(-alignfile => $algfile);

# Output names are derived from the alignment file's basename with the
# first ".xml" occurrence removed (alignments.xml -> alignments.src/.trg).
my $base = basename($algfile);
$base =~ s/\.xml//;

my $srcfile = $base . '.src';
my $trgfile = $base . '.trg';

# Three-argument open with lexical handles. Note the low-precedence "or":
# with "||" the die would bind to the file name string (always true) and
# a failed open would go unnoticed.
open(my $src_fh, '>', $srcfile)
    or die "cannot open $srcfile: $!\n";
open(my $trg_fh, '>', $trgfile)
    or die "cannot open $trgfile: $!\n";
binmode($src_fh, ":encoding(utf8)");
binmode($trg_fh, ":encoding(utf8)");

my %srctree = ();
my %trgtree = ();
my $links;    # filled in by next_alignment(); not used by this script

# next_alignment() is expected to fill the two tree hashes for each aligned
# pair and to return false when the corpus is exhausted.
while ($corpus->next_alignment(\%srctree, \%trgtree, \$links)) {
    my @src_leafs = $corpus->{SRC}->get_all_leafs(\%srctree);
    my @trg_leafs = $corpus->{TRG}->get_all_leafs(\%trgtree);

    print {$src_fh} lc(join(' ', @src_leafs)), "\n";
    print {$trg_fh} lc(join(' ', @trg_leafs)), "\n";
}

# Buffered write errors (e.g. disk full) only surface at close time.
close($src_fh) or die "cannot close $srcfile: $!\n";
close($trg_fh) or die "cannot close $trgfile: $!\n";



__END__

=head1 NAME

sta2moses - a script that converts an aligned parallel treebank to plain text format (Moses/Giza++ format). Alignment has to be in sta (Stockholm Tree Aligner) format.

=head1 SYNOPSIS

    sta2moses alignments.xml

=head1 DESCRIPTION

This script reads through a parallel treebank using the tree alignment file (alignments.xml) and produces sentence aligned plain text files (to be used with Moses/Giza++). The corpus will be stored in F<alignments.src> and F<alignments.trg>.


=head1 SEE ALSO

L<Lingua::Align::Corpus>
 

=head1 AUTHOR

Joerg Tiedemann, E<lt>jorg.tiedemann@lingfil.uu.seE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2009 by Joerg Tiedemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.8 or,
at your option, any later version of Perl 5 you may have available.


=cut
