#!/usr/bin/perl -s

# cqpalign2tmx - dump the alignment beads of a parallel CWB corpus as TMX.
#
# Command-line switches are parsed by perl's -s option, which stores
# "-name=value" in the package variable $name (see the "our" list below).

use strict;
use warnings;
use CWB::CL;

our ($tag, $registry, $l1, $l2, $debug, $local);   ## options (set by perl -s)

# $tag is accepted for backward compatibility; nothing in this script reads it.
$tag ||= "p";
$l1  ||= "Ling1";
$l2  ||= "Ling2";

# Registry directory: an explicit -registry wins, then -local, then the
# directory reported by cwb-config.
unless ($registry) {
    if ($local) {
        $registry = "$ENV{HOME}/registry";
    }
    else {
        my $default_registry = `cwb-config -r`;
        die "cqpalign2tmx: cannot run 'cwb-config -r' (use -registry=DIR)\n"
            unless defined $default_registry && length $default_registry;
        chomp $default_registry;
        $registry = $default_registry;
    }
}

my $corpus = shift @ARGV
    or die("usage: cqpalign2tmx [-l1=EN] [-l2=PT] [-debug] corpus > out.tmx");
$corpus = lc $corpus;

## is it a parallel corpus?
my $paral = aligned_corpus("$registry/$corpus");
die "cqpalign2tmx: no ALIGNED declaration in $registry/$corpus\n"
    unless defined $paral;

# NOTE(review): $registry is only used to locate the registry file above;
# the corpora themselves are opened with whatever registry CWB::CL uses by
# default - confirm this is what -registry / -local are meant to do.
my $corpus1 = CWB::CL::Corpus->new($corpus)
    or die "cqpalign2tmx: cannot open corpus '$corpus'\n";
my $w1 = $corpus1->attribute("word", 'p')
    or die "cqpalign2tmx: corpus '$corpus' has no 'word' attribute\n";

my $corpus2 = CWB::CL::Corpus->new($paral)
    or die "cqpalign2tmx: cannot open aligned corpus '$paral'\n";
my $w2 = $corpus2->attribute("word", 'p')
    or die "cqpalign2tmx: corpus '$paral' has no 'word' attribute\n";

my $alignment = $corpus1->attribute($paral, 'a')
    or die "cqpalign2tmx: corpus '$corpus' has no alignment attribute '$paral'\n";

my $lang1 = xmlattr($l1);
my $lang2 = xmlattr($l2);

tmxinit();

my $nr_of_alignments = $alignment->max_alg;
for my $alg (0 .. $nr_of_alignments - 1) {
    my ($src_start, $src_end, $tar_start, $tar_end) = $alignment->alg2cpos($alg);

    # Defensive: skip a bead if alg2cpos did not hand back all four positions.
    next unless defined $tar_end;

    my $src_seg = xmlprotect(getsent($w1, $src_start, $src_end));
    my $tar_seg = xmlprotect(getsent($w2, $tar_start, $tar_end));

    # With -debug, the bead number and source span are added as attributes.
    my $dbg = ($debug ? " algnum='$alg' st='$src_start' end='$src_end'" : "");

    # NOTE(review): the element names below are a reconstruction of the
    # standard TMX translation-unit layout (tu / tuv / seg).
    print qq{<tu$dbg>\n},
          qq{<tuv xml:lang="$lang1"><seg>$src_seg</seg></tuv>\n},
          qq{<tuv xml:lang="$lang2"><seg>$tar_seg</seg></tuv>\n},
          qq{</tu>\n};
}

print qq{\n</body>\n</tmx>\n};

# aligned_corpus($registry_file)
# Returns the corpus name given on the first "ALIGNED <name>" line of the
# registry file, or nothing when the file has no such line.
# Dies if the file cannot be read.
sub aligned_corpus {
    my ($registry_file) = @_;
    open my $fh, '<', $registry_file
        or die "cqpalign2tmx: cannot read registry file '$registry_file': $!\n";
    my $target;
    while (my $line = <$fh>) {
        if ($line =~ /^\s*ALIGNED\s+(\S+)/) {
            $target = $1;
            last;
        }
    }
    close $fh;
    return $target;
}

# xmlprotect($text)
# Returns $text with the XML special characters &, < and > replaced by
# their predefined entities, so it is safe as element content.
sub xmlprotect {
    my $f = shift;
    $f =~ s/\&/\&amp;/g;    # must run first, or the entities below get re-escaped
    $f =~ s/\</\&lt;/g;
    $f =~ s/\>/\&gt;/g;
    return $f;
}

# xmlattr($text)
# Like xmlprotect, and additionally escapes double quotes so the result can
# be placed inside a double-quoted attribute value.
sub xmlattr {
    my $v = xmlprotect(shift);
    $v =~ s/"/\&quot;/g;
    return $v;
}

# getsent($attribute, $start, $end)
# Returns the tokens at corpus positions $start..$end (inclusive) of the
# given positional attribute, joined by single spaces.
sub getsent {
    my ($c, $s, $e) = @_;
    return join(" ", $c->cpos2str($s .. $e));
}

# tmxinit()
# Prints the XML declaration, the <tmx> root, the <header> and the opening
# <body> tag.
# NOTE(review): no encoding is declared, so XML readers will assume UTF-8;
# segment text is printed exactly as cpos2str returns it - confirm this
# matches the corpus charset. Header attribute values (segtype, o-tmf,
# datatype) are reasonable defaults, not taken from the corpus.
sub tmxinit {
    my $srclang = xmlattr($l1);
    print qq{<?xml version="1.0"?>\n},
          qq{<tmx version="1.4">\n},
          qq{<header creationtool="cqpalign2tmx" creationtoolversion="1.0" },
          qq{segtype="sentence" o-tmf="CWB" adminlang="en" },
          qq{srclang="$srclang" datatype="plaintext"/>\n},
          qq{<body>\n};
}

__END__

=head1 NAME

cqpalign2tmx - generates TMX from an aligned CWB corpus

=head1 SYNOPSIS

 cqpalign2tmx [-l1=EN] [-l2=PT] [-tag=p] corpusId > out.tmx

=head1 DESCRIPTION

Converts a parallel corpus in the CWB format to the TMX (Translation
Memory eXchange) format.

Only aligned segment pairs are written: segments for which no alignment
was found are not transferred to the TMX file.

=head1 OPTIONS

=over 4

=item -l1=... -l2=...

Define the language identification written in the C<xml:lang> attribute
of the C<tuv> elements (default C<Ling1> and C<Ling2>).

=item -tag=...

Accepted for compatibility (default C<p>); currently unused.

=item -registry=DIR

Directory holding the corpus registry file (default: the directory
reported by C<cwb-config -r>).

=item -local

Use C<$HOME/registry> as the registry directory, unless C<-registry>
is given.

=item -debug

Write more information in the TMX file (alignment number and source
corpus positions on each C<tu> element).

=back

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

CWB

CL

CQP

TMX

cwb-utils

=cut