#!/usr/bin/perl -s
#
# cqpalign2tmx - export aligned segments of a parallel CWB corpus as TMX.
#
# For every pattern read from the input (one per line), the script looks the
# pattern up in the "word" attribute of the source corpus, finds the alignment
# block of every matching corpus position, and prints the source/target text
# of that block as one TMX translation unit.
#
# NOTE(review): this file had been collapsed onto the "#!" line (so perl parsed
# the whole program as a comment) and its XML markup had been stripped.  The
# TMX markup emitted below is a reconstruction -- confirm it against the layout
# the consumers of this tool expect.

use strict;
use warnings;
use CL;

## options (filled in by the -s switch on the "#!" line)
our ($tag, $registry, $l1, $l2, $debug);

$tag      ||= "p";    # NOTE(review): accepted for compatibility, not used below
$registry ||= '/corpora/c1/registry';
$l1       ||= "Ling1";
$l2       ||= "Ling2";

my $corpus = shift @ARGV;
die("usage: cqpalign2tmx [-l1=EN] [-l2=PT] [-debug] corpus > out.tmx")
    unless defined $corpus && length $corpus;
$corpus = lc $corpus;

## is it a parallel corpus?  The registry entry must name the aligned corpus.
my $registry_file = "$registry/$corpus";
my $paral         = aligned_corpus($registry_file);
die("corpus '$corpus' has no ALIGNED declaration in $registry_file\n")
    unless defined $paral;

my $corpus1 = CL::Corpus->new($corpus)
    or die("cannot open corpus '$corpus'\n");
my $w1 = $corpus1->attribute("word", 'p')
    or die("corpus '$corpus' has no 'word' attribute\n");
my $corpus2 = CL::Corpus->new($paral)
    or die("cannot open aligned corpus '$paral'\n");
my $w2 = $corpus2->attribute("word", 'p')
    or die("corpus '$paral' has no 'word' attribute\n");
my $f = $corpus1->attribute($paral, 'a')
    or die("corpus '$corpus' has no alignment attribute '$paral'\n");

tmxinit();

while (my $pattern = <>) {
    chomp $pattern;
    next unless length $pattern;    # an empty line carries no pattern

    # Disabled in the original as well:
    # s/(\S+)/[word="$1"] /g;
    debugmsg("DEBUG: $pattern");

    # Going by the CL method names: pattern -> lexicon ids -> corpus positions.
    my @ids = $w1->regex2id($pattern);
    next unless @ids;
    my @cpl = $w1->idlist2cpos(@ids);
    debugmsg("DEBUG: (" . join(",", @cpl) . ")");

    for my $cp (@cpl) {
        debugmsg("DEBUG cpos: $cp");

        my $al = $f->cpos2alg($cp);

        # Positions outside any alignment block are skipped, as documented in
        # the POD.  NOTE(review): both "undef" and a negative number are
        # treated as "no alignment" -- confirm which one CL actually returns.
        next if !defined $al || $al < 0;
        debugmsg("DEBUG align: $al");

        my ($src_start, $src_end, $tar_start, $tar_end) = $f->alg2cpos($al);

        # Segment text must be escaped before it is embedded in XML.
        my $src_text = xmlprotect(getsent($w1, $src_start, $src_end));
        my $tar_text = xmlprotect(getsent($w2, $tar_start, $tar_end));
        my $src_lang = xmlprotect($l1);
        my $tar_lang = xmlprotect($l2);

        # With -debug the alignment number and source span travel with the
        # unit, as <prop> elements so that the file stays valid TMX.
        my $props = $debug
            ? qq{<prop type="x-algnum">$al</prop>}
            . qq{<prop type="x-st">$src_start</prop>}
            . qq{<prop type="x-end">$src_end</prop>\n}
            : "";

        print qq{<tu>\n}
            . $props
            . qq{<tuv xml:lang="$src_lang"><seg>$src_text</seg></tuv>\n}
            . qq{<tuv xml:lang="$tar_lang"><seg>$tar_text</seg></tuv>\n}
            . qq{</tu>\n};
    }
}

# Close the elements opened by tmxinit().
print qq{</body>\n</tmx>\n};

# aligned_corpus($registry_file)
# Returns the corpus named by the first "ALIGNED <name>" line of the registry
# file, or undef when the file declares no alignment.  Dies when the file
# cannot be read.  Reading the file directly (instead of shelling out to grep)
# keeps corpus names away from the shell.
sub aligned_corpus {
    my ($file) = @_;

    open(my $fh, '<', $file)
        or die("cannot read registry file '$file': $!\n");

    my $aligned;
    while (my $line = <$fh>) {
        if ($line =~ /^\s*ALIGNED\s+(\S+)/) {
            $aligned = $1;
            last;
        }
    }
    close($fh);

    return $aligned;
}

# debugmsg($text)
# Prints $text on STDERR when -debug is given.  Diagnostics are kept off
# STDOUT so they cannot end up inside the TMX document.
sub debugmsg {
    my ($text) = @_;
    print STDERR "$text\n" if $debug;
    return;
}

# xmlprotect($text)
# Returns $text with the XML special characters replaced by entities.
# "&" is handled first so the entities produced afterwards are not re-escaped.
sub xmlprotect {
    my $f = shift;
    $f = '' unless defined $f;
    $f =~ s/\&/\&amp;/g;
    $f =~ s/\</\&lt;/g;
    $f =~ s/\>/\&gt;/g;
    $f =~ s/"/\&quot;/g;    # also safe inside double-quoted attribute values
    return $f;
}

# getsent($attribute, $start, $end)
# Returns the tokens at corpus positions $start..$end of the given attribute,
# joined by single spaces.
sub getsent {
    my ($c, $s, $e) = @_;
    return join(" ", $c->cpos2str($s .. $e));
}

# tmxinit()
# Prints the XML declaration, the TMX header and the opening <body> tag.
# NOTE(review): no encoding is declared because the corpus encoding is not
# known here; add one if the corpus data is not UTF-8.
sub tmxinit {
    my $src_lang = xmlprotect($l1);
    print qq{<?xml version="1.0"?>
<tmx version="1.4">
<header creationtool="cqpalign2tmx" creationtoolversion="0.1" segtype="sentence" o-tmf="CWB" adminlang="en" srclang="$src_lang" datatype="plaintext"/>
<body>
};
    return;
}

__END__

=head1 NAME

cqpalign2tmx - generates TMX from an aligned CWB corpus

=head1 SYNOPSIS

 cqpalign2tmx [-l1=EN] [-l2=PT] [-tag=p] [-registry=DIR] [-debug] corpusId > out.tmx

=head1 DESCRIPTION

Converts a parallel corpus in the CWB format to the TMX (Translation Memory
eXchange) format.

Patterns are read from the files named after C<corpusId>, or from standard
input, one regular expression per line. For every corpus position whose word
matches a pattern, the alignment block containing that position is written
as one translation unit.

Segments for which no alignment is found are not transferred to the TMX file.

=head1 OPTIONS

=over 4

=item -l1=... -l2=...

Define the language identification written in the C<xml:lang> attribute of
the C<tuv> elements (defaults: C<Ling1> and C<Ling2>).

=item -tag=...

Structural tag (default: C<p>). Accepted, but currently not used.

=item -registry=...

Directory holding the CWB registry files
(default: F</corpora/c1/registry>).

=item -debug

Write more information in the TMX file (alignment number and source span of
every unit) and print progress messages on standard error.

=back

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

CWB

CL

CQP

TMX

cwb-utils

=cut