#!/usr/bin/perl
## -*-cperl-*-
## Author:  Stephanie Evert
## Purpose: export existing sentence alignment from CWB
##
$| = 1;
use warnings;
use strict;

use CWB;
use CWB::CQP;

use Getopt::Long;
use Pod::Usage;

BEGIN {
  eval "use CWB::CL::Strict";
  die "You must install the CWB-CL package in order to use this script.\n"
    if $@;
}
#### TODO: rewrite program to use cwb-align-decode in CWB 3.5 ####

our $Verbose      = 0;      # -v ... show progress & status messages
our $Opt_Force    = 0;      # -f ... ignore errors (alignment beads are skipped)
our $Opt_Registry = undef;  # -r <dir> ... use registry directory <dir>
our $Opt_File     = "-";    # -o <file> ... write output to file (default: stdout)
our $Opt_NH       = 0;      # -nh ... write alignment file without header
our $Opt_Help     = 0;      # -h ... show usage message

OPTIONS_AND_USAGE:
{
  my $ok = GetOptions(
                      "v|verbose"    => \$Verbose,
                      "f|force"      => \$Opt_Force,
                      "r|registry=s" => \$Opt_Registry,
                      "o|output=s"   => \$Opt_File,
                      "nh|no-header" => \$Opt_NH,
                      "h|help"       => \$Opt_Help,
                     );
  pod2usage(-msg => "(Type 'perldoc cwb-align-export' for more information.)",
    -exitval => 0, -verbose => 1) if $Opt_Help;
  pod2usage(-msg => "(Type 'cwb-align-export -h' for more information.)",
    -exitval => 1, -verbose => 0) if $ok and @ARGV == 0;
  pod2usage(-msg => "SYNTAX ERROR.",
    -exitval => 2, -verbose => 0)
    unless $ok and @ARGV == 4;
}

## global variables
our ($C1_id, $C2_id, $S_id, $key_pattern) = @ARGV; # source/target corpus ID, grid region, pattern for computing keys
($C1_id, $C2_id) = map {uc} ($C1_id, $C2_id); # ensure that corpus IDs are in uppercase
our (@R1, @R2); # list of grid regions in source and target corpus (values are the computed keys)
our (%LBound1, %RBound1, %LBound2, %RBound2); # lookup hashes for start and end points of grid regions (map corpus position to region number = index in @R1/@R2)
our ($OutFH);                                 # file handle for output file
our ($C1_lc, $C2_lc) = map {lc} ($C1_id, $C2_id); # corpus names in lowercase
our ($Corpus, $Align);                        # CWB::CL handles for source corpus and alignment attribute
our ($CQP);                                   # CQP backend

SETUP:
{
  $CWB::CL::Registry = $Opt_Registry
    if $Opt_Registry;
  $Corpus = new CWB::CL::Corpus $C1_id;
  $Align = $Corpus->attribute($C2_lc, "a");

  $CQP = new CWB::CQP (($Opt_Registry) ? "-r $Opt_Registry" : "");
  $CQP->set_error_handler('die');
}

MAKE_KEYS:
{
  print STDERR "Generating keys for grid regions:\n" if $Verbose;
  print STDERR "  - $C1_id "             if $Verbose;
  build_region_keys(\@R1, \%LBound1, \%RBound1, $CQP, $C1_id, $S_id, $key_pattern);
  print STDERR " ok\n"   if $Verbose;
  print STDERR "  - $C2_id " if $Verbose;
  build_region_keys(\@R2, \%LBound2, \%RBound2, $CQP, $C2_id, $S_id, $key_pattern);
  print STDERR " ok\n" if $Verbose;
  undef $CQP; # we no longer need CQP
}

SETUP2:
{
  $OutFH = CWB::OpenFile "> $Opt_File";
  print $OutFH join("\t", $C1_id, $C2_id, $S_id, $key_pattern), "\n"
    unless $Opt_NH;
}

EXPORT_ALIGNMENT:
{
  my $n_exported = 0;
  print STDERR "Exporting " if $Verbose;
  my $n_beads = $Align->max_alg;
  foreach my $bead (0 ..  $n_beads-1) {
    my ($s1, $e1, $s2, $e2) = $Align->alg2cpos($bead);
    my $from1 = $LBound1{$s1};
    if (not defined $from1) {
      print STDERR "\nError: Start of alignment bead #$bead in corpus $C1_id (cpos = $s1) not aligned with <$S_id> grid -- ";
      die "Aborted.\n" unless $Opt_Force;
      print STDERR "skipped\n";
      next;
    }
    my $to1 = $RBound1{$e1};
    if (not defined $to1) {
      print STDERR "\nError: End of alignment bead #$bead in corpus $C1_id (cpos = $e1) not aligned with <$S_id> grid -- ";
      die "Aborted.\n" unless $Opt_Force;
      print STDERR "skipped\n";
      next;
    }
    my $from2 = $LBound2{$s2};
    if (not defined $from2) {
      print STDERR "\nError: Start of alignment bead #$bead in corpus $C2_id (cpos = $s2) not aligned with <$S_id> grid. -- ";
      die "Aborted.\n" unless $Opt_Force;
      print STDERR "skipped\n";
      next;
    }
    my $to2 = $RBound2{$e2};
    if (not defined $to2) {
      print STDERR "\nError: End of alignment bead #$bead in corpus $C2_id (cpos = $e2) not aligned with <$S_id> grid. -- ";
      die "Aborted.\n" unless $Opt_Force;
      print STDERR "skipped\n";
      next;
    }

    print $OutFH join(" ", @R1[$from1 .. $to1]), "\t", join(" ", @R2[$from2 .. $to2]), "\n";
    $n_exported++;
    print STDERR "." if $Verbose and ($bead & 0xFFFF) == 1;  # 16 dots per 1M alignment beads
  }
  print STDERR " $n_exported / $n_beads alignment beads exported\n" if $Verbose;
}

close $OutFH;


sub build_region_keys {
  my ($Keys, $Start, $End, $cqp, $corpus, $grid_att, $key_pattern) = @_;
  $cqp->exec($corpus);              # activate corpus
  my %s_attribute =                 # names of all s-attributes
    map { $_->[1] => 1 }
    grep { $_->[0] eq "s-Att" }
    $cqp->exec_rows("show cd");
  die "Error: <$grid_att> is not an s-attribute in corpus $corpus\n"
    unless $s_attribute{$grid_att};
  $cqp->exec("Grid = <$grid_att> [] expand to $grid_att");  # get vector of grid regions
  print STDERR "." if $Verbose;
  my ($n_regions) = $cqp->exec("size Grid");
  $cqp->run("tabulate Grid match, matchend");
  my $idx = 0;
  while (my @pair = $cqp->getrow) {
    $Start->{$pair[0]} = $idx;
    $End->{$pair[1]} = $idx;
    $idx++;
  }
  die "Internal Error"
    unless $idx == $n_regions;
  print STDERR ".." if $Verbose;
  @$Keys = ("") x $n_regions;    # initialise vector of keys with empty strings
  my %seen = ();                 # check that keys are unique
  while ($key_pattern =~ s/^( [^{}]* ) \{ ( [^}]+ ) \}//x) {
    my ($literal, $name) = ($1, $2);
    if ($s_attribute{"${grid_att}_${name}"}) {
      $name = "${grid_att}_${name}";
    }
    elsif (not $s_attribute{$name}) {
      die "Error: neither <$name> nor <${grid_att}_${name}> is an s-attribute in corpus $corpus\n";
    }
    my @data = $cqp->exec("tabulate Grid match $name");
    print STDERR "." if $Verbose;
    foreach $idx (0 .. $n_regions-1) {
      $Keys->[$idx] .= $literal.$data[$idx];
    }
    print STDERR "." if $Verbose;
  }
  if ($key_pattern ne "") {
    foreach $idx (0 .. $n_regions-1) {
      $Keys->[$idx] .= $key_pattern;
    }
    print STDERR "." if $Verbose;
  }
  foreach my $idx (0 .. $n_regions-1) {
    my $k = $Keys->[$idx];
    die "ERROR: duplicate key '$k' found; please revise your key specification.\n"
      if exists $seen{$k};
    $seen{$k} = 1;
  }
}



__END__

=head1 NAME

cwb-align-export - Export existing sentence alignment from a CWB corpus

=head1 SYNOPSIS

  cwb-align-export [options] <SOURCE> <TARGET> <grid> <keyspec>

  <SOURCE>    CWB name of source corpus
  <TARGET>    CWB name of target corpus
  <grid>      s-attribute containing the alignment grid (usually "s")
  <keyspec>   pattern used to construct unique IDs for grid regions

Options:

  -r <dir>, --registry=<dir>    use registry directory <dir>
  -o <file>, --output=<file>    write output to <file>
  -nh, --no-header              write alignment file without header
  -f, --force                   skip alignment beads with errors rather than stopping
  -v, --verbose                 show progress messages during processing
  -h, --help                    display short help page


=head1 DESCRIPTION

This script exports an encoded sentence-level alignment between two CWB corpora (I<SOURCE> and I<TARGET>) as a text file compatible with B<cwb-align-import>.  In the output, alignment beads are specified by (sets of) unique sentence IDs in the source and target corpus.  Unique IDs are computed from one or more s-attributes according to the pattern I<keyspec>.  Alignments at other granularities (such as paragraph or clause) are also supported; the corresponding s-attribute is specified by the command-line argument I<grid>.

It is recommended to read the L<cwb-align-import> manpage first, in order to get a better understanding of the export file format and its correspondence to a CWB alignment attribute.  An illustrative example can be found in the I<CWB Corpus Encoding Tutorial>.


=head1 ARGUMENTS

=over 4

=item I<SOURCE>

CWB corpus ID of the source language corpus.

=item I<TARGET>

CWB corpus ID of the target language corpus.

=item I<grid>

CWB attribute representing the alignment grid, i.e. each alignment bead links I<n> consecutive grid regions in the source language to I<m> consecutive grid regions in the target language.  It is an error if the start or end of an alignment bead region doesn't match a corresponding grid boundary.

For the most common case of sentence alignment, I<grid> will usually be set to C<s>.  Note that the same grid attribute is used for both source and target language corpus.

=item I<keyspec>

Pattern used to construct unique sentence IDs (both in the source and target corpus).  If sentences are directly annotated with IDs, say in the s-attribute C<s_id>, you can simply specify C<{s_id}> or C<{id}> for short (the name of the grid attribute is automatically prepended).  Note the curly braces around the attribute name!

In more complex situations, I<keyspec> is an arbitrary character string that interpolates s-attributes enclosed in curly braces.  For example, if paragraphs and sentences are numbered (s-attributes C<p_num> and C<s_num>), you can construct IDs of the form C<id_p4_s2> (second sentence in fourth paragraph) with the pattern C<id_p{p_num}_s{s_num}>.

=back


=head1 OPTIONS

=over 4

=item --registry=I<dir>, -r I<dir>

Locate corpora in CWB registry directory I<dir>, overriding the default directory and the environment variable C<CORPUS_REGISTRY>.

=item --output=I<file>, -o I<file>

Write export data to I<file> rather than standard output.  Files with extension C<.gz> or C<.bz2> are compressed automatically.

=item --no-header, -nh

Write alignment file without the optional header line (see L<"EXPORT FILE FORMAT"> below).

=item --force, -f

Ignore errors and continue exporting.  If the start or end point of an alignment bead doesn't match grid boundaries, the bead will be skipped with an error message.

=item --verbose, -v

Verbose mode (shows progress messages during processing).

=item --help, -h

Show usage and options summary.

=back


=head1 EXPORT FILE FORMAT

The exported alignment file starts with an optional header line specifying the CWB names of source and target corpus, the s-attribute containing sentence regions (or other regions used as an alignment grid), and the key pattern for constructing unique sentence IDs.  The four items are separated by TAB characters.  Specify C<--no-header>) to omit the header line from the export file.

Each of the remaining lines in the export file corresponds to a single alignment bead.  It consists of the ID of a sentence in the source corpus (or multiple IDs separated by blanks), followed by a TAB character and the ID of the aligned sentence in the target corpus (or multiple IDs separated by blanks).

See the L<cwb-align-import> manpage for a more detailed description of the file format and the specification of unique IDs.


=head1 COPYRIGHT

Copyright (C) 2007-2022 Stephanie Evert [https://purl.org/stephanie.evert]

This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.

=cut
