#!/usr/bin/perl -s
use strict;
use CL;
use warnings;

# ---------------------------------------------------------------------------
# Main script.
#
# Reads one regular expression per input line (files named after the corpus
# argument, or STDIN), looks up every corpus position of the source corpus
# whose "word" value matches it, and prints the aligned source/target
# segment pair for each of those positions.
#
# Switches are handled by perl's `-s` (see the shebang line): `-name=value`
# or bare `-name` arguments placed before the corpus name are stored in the
# package variables declared below.
# ---------------------------------------------------------------------------
our ($tag, $registry, $l1, $l2, $debug);    ## options
$tag      ||= "p";
$registry ||= '/corpora/c1/registry';
$l1       ||= "Ling1";
$l2       ||= "Ling2";
# NOTE(review): $tag, $l1 and $l2 receive defaults here but are not read
# anywhere else in this script as it stands.

# The corpus name is lower-cased before it is used as a registry file name.
my $corpus = shift;
$corpus = defined $corpus ? lc $corpus : '';
$corpus
    or die("usage: cqpalign2tmx [-l1=EN] [-l2=PT] [-debug] corpus > out.tmx");

# Find the aligned (target) corpus declared in the registry entry.  The file
# is read directly rather than through a shell `grep`, so neither $registry
# nor the corpus name is ever interpreted by a shell, and an unreadable
# registry entry is reported instead of silently yielding an empty name.
my $registry_file = "$registry/$corpus";
open my $registry_fh, '<', $registry_file
    or die "cqpalign2tmx: cannot read registry entry '$registry_file': $!\n";
my $paral;    ## name of the aligned corpus
while (my $entry_line = <$registry_fh>) {
    # Only the first ALIGNED declaration is used.
    if ($entry_line =~ /^\s*ALIGNED\s+(\S+)/) {
        $paral = $1;
        last;
    }
}
close $registry_fh;
defined $paral
    or die "cqpalign2tmx: no ALIGNED declaration in '$registry_file'"
         . " (is '$corpus' a parallel corpus?)\n";

# Open both corpora, their "word" attributes, and the alignment attribute
# that links the source corpus to the aligned one.
my $corpus1 = CL::Corpus->new($corpus)
    or die "cqpalign2tmx: cannot open corpus '$corpus'\n";
my $w1 = $corpus1->attribute("word", 'p')
    or die "cqpalign2tmx: cannot access attribute 'word' of corpus '$corpus'\n";
my $corpus2 = CL::Corpus->new($paral)
    or die "cqpalign2tmx: cannot open aligned corpus '$paral'\n";
my $w2 = $corpus2->attribute("word", 'p')
    or die "cqpalign2tmx: cannot access attribute 'word' of corpus '$paral'\n";
my $f = $corpus1->attribute($paral, 'a')
    or die "cqpalign2tmx: corpus '$corpus' has no alignment attribute '$paral'\n";

while (my $query = <>) {
    chomp $query;
    # s/(\S+)/[word="$1"] /g;

    # Trace lines are written only when -debug is given, so that a normal
    # run leaves nothing but segment pairs in the redirected output.
    print "DEBUG: $query\n" if $debug;

    my @word_ids = $w1->regex2id($query);
    # Do not hand an empty id list to idlist2cpos.
    my @cpl = @word_ids ? $w1->idlist2cpos(@word_ids) : ();
    print "DEBUG: (", join(",", @cpl), ")\n" if $debug;

    for my $cp (@cpl) {
        print "DEBUG cpos: $cp\n" if $debug;

        # Positions without an alignment are skipped, as the DESCRIPTION
        # in the POD promises.
        # NOTE(review): assumes cpos2alg reports "no alignment" as undef
        # or a negative number -- confirm against the CL module in use.
        my $al = $f->cpos2alg($cp);
        next unless defined $al && $al >= 0;
        print "DEBUG align: $al\n" if $debug;

        my ($src_start, $src_end, $tar_start, $tar_end) = $f->alg2cpos($al);
        # Skip when alg2cpos did not return a complete pair of ranges.
        next unless defined $src_start && defined $tar_end;

        # Named lexicals instead of $a/$b, which would shadow the
        # variables perl reserves for sort blocks.
        my $source_segment = getsent($w1, $src_start, $src_end);
        my $target_segment = getsent($w2, $tar_start, $tar_end);

        print "\n$source_segment\n$target_segment\n";
    }
}
# xmlprotect($text)
#
# Returns a copy of $text in which the three characters that are markup in
# XML character data are replaced by their predefined entities:
#   &  ->  &amp;      <  ->  &lt;      >  ->  &gt;
# An undefined argument is returned unchanged.
sub xmlprotect {
    my $f = shift;
    return $f unless defined $f;

    # '&' has to be handled first; otherwise the ampersands introduced by
    # the '&lt;' and '&gt;' replacements would themselves be escaped again.
    $f =~ s/&/&amp;/g;
    $f =~ s/</&lt;/g;
    $f =~ s/>/&gt;/g;

    return $f;
}
# getsent($attribute, $first, $last)
#
# Asks $attribute (any object with a cpos2str method) for the strings at
# every corpus position from $first to $last inclusive and returns them
# joined by single spaces.  When $first is greater than $last the range is
# empty and cpos2str is called without positions.
sub getsent {
    my ($attribute, $first, $last) = @_;

    my @tokens = $attribute->cpos2str($first .. $last);

    return join(" ", @tokens);
}
# tmxinit()
#
# Prints a single newline to the currently selected output handle and
# returns the result of that print.
# NOTE(review): the name suggests this was meant to emit the opening part
# of the TMX document, but only a newline is present here, and nothing in
# the visible script calls this sub -- confirm the intended content.
sub tmxinit {
    return print "\n";
}
__END__
=head1 NAME

cqpalign2tmx - generates TMX from an aligned CWB corpus

=head1 SYNOPSIS

 cqpalign2tmx [-l1=EN] [-l2=PT] [-tag=p] corpusId > out.tmx

=head1 DESCRIPTION

Converts a parallel corpus in the CWB format to the TMX (translation
memory exchange) format.

Segments with "no align found" are not transferred to the TMX file.

=head1 Options

-l1=..., -l2=... to define the language identification written in the
language attribute of the C<tuv> elements (defaults: Ling1 and Ling2)

-tag=... (default: p)

-debug to write more information in the TMX file

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).
CWB
CL
CQP
TMX
cwb-utils

=cut