#!/usr/bin/perl
## -*-cperl-*-
## Author:  Stephanie Evert
## Purpose: import existing sentence alignment into CWB
##
$| = 1;
use warnings;
use strict;

use CWB;
use CWB::CQP;

use Getopt::Long;
use Pod::Usage;

## configuration variables
our $Verbose      = 0;      # -v ... show progress & status messages
our $Opt_Source   = undef;  # -l1 <id> ... source corpus (overrides alignment file header)
our $Opt_Target   = undef;  # -l2 <id> ... target corpus (overrides alignment file header)
our $Opt_Grid     = undef;  # -s <name> ... alignment grid attribute (usually <s>, overrides header)
our $Opt_Key      = undef;  # -k <pattern> ... key for identifying grid regions in alignment beads (overrides header)
our $Opt_NH       = 0;      # -nh ... alignment file does not have header (-l1, -l2, -s, -k are then required)
our $Opt_Inverse  = 0;      # -i ... encode "inverse" alignment (from target to source language)
our $Opt_Prune    = 0;      # -p ... automatically delete alignment beads if their keys are not found (implies -e)
our $Opt_Empty    = 0;      # -e ... allow 1:0 and 0:1 alignment beads (will be skipped)
our $Opt_Registry = undef;  # -r <dir> ... use registry directory <dir>
our $Opt_Help     = 0;      # -h ... show usage message

our $Opt_Test     = 0;      # -t .. use only first 100000 sentences for testing

OPTIONS_AND_USAGE:
{
  my $ok = GetOptions(
    "v|verbose"    => \$Verbose,
    "l1|source=s"  => \$Opt_Source,
    "l2|target=s"  => \$Opt_Target,
    "s|grid=s"     => \$Opt_Grid,
    "k|key=s"      => \$Opt_Key,
    "nh|no-header" => \$Opt_NH,
    "i|inverse"    => \$Opt_Inverse,
    "p|prune"      => \$Opt_Prune,
    "e|empty"      => \$Opt_Empty,
    "r|registry=s" => \$Opt_Registry,
    "h|help"       => \$Opt_Help,
    "t|test"       => \$Opt_Test,
  );
  pod2usage(-msg => "(Type 'perldoc cwb-align-import' for more information.)",
    -exitval => 0, -verbose => 1) if $Opt_Help;
  pod2usage(-msg => "(Type 'cwb-align-import -h' for more information.)",
    -exitval => 1, -verbose => 0) if $ok and @ARGV == 0;
  pod2usage(-msg => "SYNTAX ERROR.", 
    -exitval => 2, -verbose => 0) 
    unless $ok and @ARGV == 1;
  die "Flags -l1, -l2, -s and -k must be specified if -nh option is used.\n"
    if $Opt_NH and not($Opt_Source and $Opt_Target and $Opt_Grid and $Opt_Key);
  $Opt_Empty = 1 if $Opt_Prune;  # -p implies -e
}

## global variables
our ($C1_id, $C2_id, $C1_lc, $C2_lc, $S_id); # source and target corpus name (with lowercase variant) and alignment grid 
our ($align_file, $FH); # alignment file and file handle
our ($key_pattern);     # pattern used to generate keys that identify regions in the alignment grid
our (%R1, %R2);         # hashes mapping keys to [start, end] regions, in source and target corpus
our @Beads;             # list of alignment beads, with entries [$l1_start, $l1_end, $l2_start, $l2_end, ($annot)]

SETUP:
{
  $align_file = shift @ARGV;
  $FH = CWB::OpenFile $align_file;
  unless ($Opt_NH) {
    my $line = <$FH>;
    chomp $line;
    my @F = split /\t/, $line;
    die "Format error in alignment file header: ``$line''\n" unless @F == 4;
    ($C1_id, $C2_id, $S_id, $key_pattern) = @F;
  }
  $C1_id = $Opt_Source if $Opt_Source;
  $C2_id = $Opt_Target if $Opt_Target;
  $S_id  = $Opt_Grid if $Opt_Grid;
  $key_pattern = $Opt_Key if $Opt_Key;
  if ($Opt_Inverse) {
    ($C1_id, $C2_id) = ($C2_id, $C1_id); # swap source and target language with -i option
  }

  $C1_id = uc($C1_id);
  $C2_id = uc($C2_id);
  $C1_lc = lc($C1_id);
  $C2_lc = lc($C2_id);
}

MAKE_KEYS:
{
  print "Generating keys for grid regions:\n" if $Verbose;
  print "  - $C1_id "             if $Verbose;
  build_region_keys(\%R1, $C1_id, $S_id, $key_pattern);
  print " ok\n"   if $Verbose;
  print "  - $C2_id " if $Verbose;
  build_region_keys(\%R2, $C2_id, $S_id, $key_pattern);
  print " ok\n" if $Verbose;
}

READ_ALIGNMENT:
{
  print "Processing " if $Verbose;
  my $beads = 0;
  my $lines = 0;

  LINE:
  while (<$FH>) {
    $lines++;
    print "." if $Verbose and ($lines & 0xFFFF) == 1;  # 16 dots per 1M alignment beads

    chomp;
    my ($l1_keys, $l2_keys) = split /\t/;        # annotations are ignored so far
    if ($Opt_Inverse) {  
      ($l1_keys, $l2_keys) = ($l2_keys, $l1_keys); # swap source and target language with -i option
    }
    my @l1_keys = split " ", $l1_keys;
    my @l2_keys = split " ", $l2_keys;
    next LINE if $Opt_Empty and (@l1_keys == 0 xor @l2_keys == 0);    # skip 1:0 and 0:1 beads (only with -e or -p)
    die "ERROR: Syntax error on line #$.: ``$_''\n" unless @l1_keys and @l2_keys;

    ## need option to drop alignment beads with missing keys
    my @missing_l1 = grep { not exists $R1{$_} } @l1_keys;
    my @missing_l2 = grep { not exists $R2{$_} } @l2_keys;
    next LINE if $Opt_Prune and (@missing_l1 or @missing_l2);     # -p => prune alignment beads with missing keys
    die "ERROR: grid key(s) @missing_l1 not found in corpus $C1_id\n" if @missing_l1;
    die "ERROR: grid key(s) @missing_l2 not found in corpus $C2_id\n" if @missing_l2;

    ## extract regions for bead and check that multiple regions are contiguous
    my @l1_regions = sort { $a->[0] <=> $b->[0] } @R1{@l1_keys}; # ensure that regions are sorted correctly
    foreach my $i (1 .. $#l1_regions) {
      die "Error: alignment bead #$lines is non-contiguous in $C1_id\n\t(keys: @l1_keys)\n"
        unless $l1_regions[$i - 1][1] + 1 == $l1_regions[$i][0];
    }
    my @l2_regions = sort { $a->[0] <=> $b->[0] } @R2{@l2_keys};
    foreach my $i (1 .. $#l2_regions) {
      die "Error: alignment bead #$lines is non-contiguous in $C2_id\n\t(keys: @l2_keys)\n"
        unless $l2_regions[$i - 1][1] + 1 == $l2_regions[$i][0];
    }

    push @Beads, [$l1_regions[0][0], $l1_regions[-1][1], $l2_regions[0][0], $l2_regions[-1][1]];
    $beads++;
  }
  print " $beads / $lines alignment beads\n" if $Verbose;
}

our $n_beads = @Beads;
SORT_AND_VALIDATE:
{
  my $ok = 1;
  foreach my $i (1 .. $n_beads - 1) {
    if ($Beads[$i][0] <= $Beads[$i - 1][1]) {
      $ok = 0;  # overlapping or unordered source alignment regions => need to sort @Beads
      last;
    }
  }
  last SORT_AND_VALIDATE if $ok;    # regions are sorted properly and non-overlapping: skip rest of this block

  print "Sorting alignment beads ... " if $Verbose;
  @Beads = sort { $a->[0] <=> $b->[0] } @Beads;  # sort by ascending start position in source corpus

  print "checking ... " if $Verbose;
  foreach my $i (1 .. $n_beads - 1) {
    if ($Beads[$i][0] <= $Beads[$i - 1][1]) {
      die "Error: overlapping alignment beads in $C1_id\n",
        "\t(first overlap: " . $Beads[$i - 1][0] . ".." . $Beads[$i - 1][1] . " with " . $Beads[$i][0] . ".." . $Beads[$i][1] . ")\n";
    }
  }
  print "ok\n" if $Verbose;
}

EDIT_REGISTRY:
{
  my $rf_name = ($Opt_Registry) ? "$Opt_Registry/$C1_lc" : $C1_id;
  my $rf = new CWB::RegistryFile $rf_name;
  my $has_alignment = $rf->attribute($C2_lc);
  if (defined $has_alignment) {
    die "Error: attribute $C1_id.$C2_lc exists, but is not an alignment attribute. Can't continue.\n"
      unless $has_alignment eq "a";
    ## alignment attribute is already defined, nothing to do here
  }
  else {
    print "Registering alignment attribute $C1_id.$C2_lc ... " if $Verbose;
    $rf->add_attribute($C2_lc, "a");
    $rf->write;
    print "successful\n";
  }
}

ENCODE_ALIGNMENT:
{
  print "Writing alignment file ... " if $Verbose;
  my $temp = new CWB::TempFile "imported_alignment.gz"; # temporary input file for cwb-align-encode
  $temp->write(join("\t", $C1_id, $S_id, $C2_id, $S_id), "\n");
  foreach my $bead (@Beads) {
    $temp->write(join("\t", @$bead), "\n");
  }
  $temp->finish;
  
  print "encoding ... " if $Verbose;
  my @encode_cmd = ($CWB::AlignEncode, "-D");
  push @encode_cmd, "-r", $Opt_Registry if $Opt_Registry;
  push @encode_cmd, $temp->name;
  CWB::Shell::Cmd(\@encode_cmd);
  $temp->close;
  
  print "done\n" if $Verbose;
}

print "Alignment $C1_id => $C2_id has been created with $n_beads non-empty beads.\n";
exit 0;


sub build_region_keys {
  my ($hashref, $corpus, $grid_att, $key_pattern) = @_;
  $corpus = uc($corpus);
  my $cqp = new CWB::CQP;           # use CQP to build region keys
  $cqp->set_error_handler('die');
  $cqp->exec("set Registry ".$cqp->quote($Opt_Registry))
    if $Opt_Registry;
  $cqp->exec($corpus);              # activate corpus
  my %s_attribute =                 # names of all s-attributes
    map { $_->[1] => 1 }
    grep { $_->[0] eq "s-Att" }
    $cqp->exec_rows("show cd");
  die "Error: <$grid_att> is not an s-attribute in corpus $corpus\n"
    unless $s_attribute{$grid_att};
  $cqp->exec("Grid = <$grid_att> [] expand to $grid_att");  # get vector of grid regions
  print "." if $Verbose;
  my @regions = $cqp->exec_rows("tabulate Grid match, matchend");
  print ".." if $Verbose;
  my @keys = ("") x @regions; # initialise vector of keys with empty strings
  while ($key_pattern =~ s/^( [^{}]* ) \{ ( [^}]+ ) \}//x) {
    my ($literal, $name) = ($1, $2);
    if ($s_attribute{"${grid_att}_${name}"}) {
      $name = "${grid_att}_${name}";
    }
    elsif (not $s_attribute{$name}) {
      die "Error: neither <$name> nor <${grid_att}_${name}> is an s-attribute in corpus $corpus\n";
    }
    my @data = $cqp->exec("tabulate Grid match $name");
    print "." if $Verbose;
    foreach my $idx (0 .. $#keys) {
      $keys[$idx] .= $literal.$data[$idx];
    }
    print "." if $Verbose;
  }
  if ($key_pattern ne "") {
    foreach my $idx (0 .. $#keys) {
      $keys[$idx] .= $key_pattern;
    }
    print "." if $Verbose;
  }
  foreach my $idx (0 .. $#keys) {
    my $k = $keys[$idx];
    die "ERROR: duplicate key '$k' found; please revise your key specification.\n"
      if exists $hashref->{$k};
    $hashref->{$k} = $regions[$idx];
  }
  undef $cqp; # close CQP session
  return $hashref;
}


__END__

=head1 NAME

cwb-align-import - Import existing sentence alignment into a CWB corpus

=head1 SYNOPSIS

  cwb-align-import [options] <alignment_beads.txt>

Options:

  -r <dir>, --registry=<dir>    use registry directory <dir>
  -i, --inverse                 encode inverse alignment (target -> source)
  -p, --prune                   ignore alignment beads with ID errors
  -e, --empty                   allow 1:0 and 0:1 alignments (not encoded)
  -v, --verbose                 show progress messages during processing
  -h, --help                    display short help page

  -nh, --no-header              alignment file without header; must specify:
  -l1 <name>, --source=<name>   CWB name of source corpus
  -l2 <name>, --target=<name>   CWB name of target corpus
  -s <att>,   --grid=<att>      alignment grid (s-attribute, usually sentences)
  -k <spec>,  --key=<spec>      pattern for constructing unique sentence IDs

=head1 DESCRIPTION

This script is used to import an existing sentence-level alignment between two corpora into the IMS Open Corpus Workbench.  Alignments at other granularities (such as paragraph or clause) are also supported, but note that discontinuous regions are not allowed in alignment beads and there can only be I<one> alignment between any two corpora.  For simplicity, this manpage talks about sentence alignment only; adjust the instructions accordingly to import paragraph alignment etc.

First, the two corpora to be aligned must be CWB-encoded, making sure that all sentence regions (usually marked by an s-attribute C<s>) carry unique IDs.  For efficiency and convenience, IDs can be constructed from multiple annotations at different levels, e.g. document ID, paragraph number (within document) and sentence number (within paragraph).

The input file for B<cwb-align-import> contains one alignment bead per row, specifying first the ID(s) of the source language sentence(s) and then the ID(s) of the target language sentence(s).  The CWB names of the source and target corpus, the s-attribute containing sentence regions, and a pattern for constructing unique IDs are either listed in the header of the input file, or can be specified with command-line arguments.  See L</"INPUT FILE FORMAT"> below for details on the alignment file format.

An example illustrating a typical use case can be found in the I<CWB Corpus Encoding Tutorial>.


=head1 OPTIONS

=over 4

=item --help, -h

Show usage and options summary.

=item --verbose, -v

Verbose mode (shows progress messages during processing).

=item --registry=I<dir>, -r I<dir>

Locate corpora in CWB registry directory I<dir>, overriding the default directory and the environment variable C<CORPUS_REGISTRY>.

=item --inverse, -i

Encode inverse alignment (from B<target> language to B<source> language).

=item --prune, -p

Automatically ignore alignment beads if sentence IDs are not found, either in the source or the target corpus.  Without C<-p>, B<cwb-align-import> will abort with an error message in this case.  Note that the C<-p> option implies C<-e> (see below).

=item --empty, -e

Allow 1:0 and 0:1 alignment beads, which will be silently ignored (without C<-e>, they cause a fatal error).

=item --no-header, -nh

Alignment file does not contain a header line.  In this case, the header information must be provided on the command line with the C<-l1>, C<-l2>, C<-s> and C<-k> flags (documented below).

=item --source=I<ID>, -l1 I<ID>

CWB corpus I<ID> of the source language corpus.  Overrides information in alignment file header, if present.

=item --target=I<ID>, -l2 I<ID>

CWB corpus I<ID> of the target language corpus.  Overrides information in alignment file header, if present.

=item --grid=I<attribute>, -s I<attribute>

CWB I<attribute> used as alignment grid (each alignment bead links I<n> consecutive grid regions in the source language to I<m> consecutive grid regions in the target language).  For the most common case of sentence alignment, the grid I<attribute> will usually be C<s>.  Note that the same attribute name is used for both source and target language corpus.

=item --key=I<pattern>, -k I<pattern>

Pattern used to construct unique sentence IDs (must be the same in source and target language).  If sentences are directly annotated with IDs, say in the s-attribute C<s_id>, you can simply specify C<-k '{s_id}'> or C<-k '{id}'> for short (the name of the grid attribute is automatically prepended).  Note the curly braces around the attribute name!

In more complex situations, I<pattern> is an arbitrary character string that interpolates s-attributes enclosed in curly braces.  For example, if paragraphs and sentences are numbered (s-attributes C<p_num> and C<s_num>), you can construct IDs of the form C<id_p4_s2> (second sentence in fourth paragraph) with the key C<-k 'id_p{p_num}_s{s_num}'>.

=back


=head1 INPUT FILE FORMAT

The alignment file starts with an optional header line. Use the C<< -nh >> (C<< --no-header >>) flag if your file does not include a header.  In this case you need to specify the necessary information on the command line, using the  C<< -l1 >>, C<< -l2 >>, C<< -s >> and C<< -k >> flags.

The header specifies the corpora to be aligned, which must already have been CWB-encoded; the name of the s-attribute containing sentence regions (or other regions used as an alignment grid, such as paragraphs); and finally a key pattern to be used for constructing unique sentence IDs (see below).  The four items must be separated by TAB characters.

In the simplest case, the ID key pattern is a string that includes the name of the s-attribute containing sentence IDs enclosed in curly braces. For instance, if your corpus includes unique sentence numbers

  <s num="1">

encoded in the standard way (i.e. with C<< -S s+num >>), and if sentence IDs are given in the form C<id_1> in the alignment beads file, you have to specify C<< id_{s_num} >> (or C<< id_{num} >> for short) as key pattern.  The entire header line might thus look like this:

   SOURCE_CORPUS     TARGET_CORPUS      s      id_{num}

Each of the remaining lines in the input file corresponds to a single alignment bead.  It consists of the ID of a sentence in the source corpus (or multiple IDs separated by blanks), followed by a TAB character and the ID of the aligned sentence in the target corpus (or multiple IDs separated by blanks).

As an example, the line

    id_1 id_2 id_3  <TAB>  id_2 id_3  [ <TAB> ... ]

indicates that the first three sentences of the source corpus are aligned to the second and third sentence of the target corpus.  Further TAB-delimited fields (e.g. specifying the confidence in an alignment bead) are silently ignored.

Unique sentence IDs can also be constructed from multiple attributes, which is more efficient than storing the unique IDs in a single CWB attribute.  The individual components are usually XML attributes of the C<< <s> >> tags, but may also include XML attributes of different regions such as text IDs or paragraph numbers.  In the latter case, the full name of the corresponding s-attribute has to be specified.  For instance, C<< {text_id}:{num} >> would construct unique IDs from a text ID and sentence number, separated by a C<:> character.


=head1 COPYRIGHT

Copyright (C) 2007-2022 Stephanie Evert [https://purl.org/stephanie.evert]

This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.

=cut
