#!/usr/bin/perl -w -s $| = 1; use CWB; use CWB::CL; use strict; # my $corpdir ="/corpora"; our ($st, $l1, $l2, $debug,$local); ## options chomp(my $regis = `cwb-config -r`); $regis="$ENV{HOME}/registry" if $local; $CWB::CL::Registry .= ":$regis"; my ($c1,$c2)=(0,1); my ($header,$S_corpus,$T_corpus,$S_perc,$T_perc);; $st ||=0; my $word = "word"; # operates on "word" attribute by default, can be made user-configurable die "Usage: $0 \n" unless @ARGV == 2; my ($infile, $outfile) = @ARGV; my $IF = CWB::OpenFile $infile; chomp($header = <$IF>); ($S_corpus, undef, $T_corpus, undef) = split /\t/, $header; my $S_C = new CWB::CL::Corpus $S_corpus or die "Can't access corpus $S_corpus -- aborted.\n"; my $S_W = $S_C->attribute($word, "p") or die "Can't find attribute $S_corpus.$word -- aborted\n"; my $S_size = $S_W->max_cpos; my $T_C = new CWB::CL::Corpus $T_corpus or die "Can't access corpus $T_corpus -- aborted.\n"; my $T_W = $T_C->attribute($word, "p") or die "Can't find attribute $T_corpus.$word -- aborted\n"; my $T_size = $T_W->max_cpos; $l1 ||= $S_corpus; $l2 ||= $T_corpus; ## my $OF = CWB::OpenFile(">", $outfile); open(OF,">", $outfile) or die("cant open $outfile\n"); ## open(OF,">:utf8", $outfile) or die("cant open $outfile\n"); put0(); while (<$IF>) { chomp; my ($s1, $s2, $t1, $t2, $v1, $v2) = split /\t/; if ($v2 == 0) { $c1++ ; next unless $debug;} # skip if quality == 0 put2( put1($l1,join(" ", $S_W->cpos2str($s1 .. $s2))), put1($l2,join(" ", $T_W->cpos2str($t1 .. $t2))), $v1,$v2 ); if ($debug){ if (($. % 100) == 0) { $S_perc = 100 * $s1 / $S_size; $T_perc = 100 * $t1 / $T_size; printf "Progress: %5.2f%s / %5.2f%s\r", $S_perc, '%', $T_perc, '%'; } } } put3(); $IF->close; close(OF); print("\n\n"); if($debug){ printf "Progress: %5.2f%s / %5.2f%s \n", 100, '%', 100, '%';} sub put0{ if($st==0){ print OF tmxinit()} } sub put1{ if($st==0){" ". xmlprotect($_[1]). ""} elsif($st==1){ "$_[0]:\t$_[1]\n"; } } sub put2{ my $r = ($_[0] =~ m/222 333 444 555 666 777/) ? "----------------------------" : ""; if($r){ print OF "\n\n"; print OF "\n\n"; $c1=$c2=0; } else { if($_[2] =~ /(\d):(\d)/){ $c1++ unless($1 == 1); $c1++ unless($2 == 1); $c2++}; if($st==0){print OF "\n\n$_[0]\n$_[1]\n\n";} elsif($st==1){print OF "$_[0]$_[1]\n";} } } sub put3{ if($st==0){ print OF "\n\n\n";} } sub tmxinit{ qq{
}; } sub xmlprotect{ my $f=shift; $f =~ s/\&/\&/g; $f =~ s/\/\>/g; $f =~ s/\x{15}/#/g; $f } __END__ =head1 NAME cqpalign2tmx - generates TMX from a align CWB corpora =head1 SYNOPSIS align2tmx [-l1=EN] [-l2=PT] [-debug] alignfile > out.tmx =head1 DESCRIPTION Converts a parallel corpus in the CWB format to the TMX (translation memory exange) format. Segments with "no align found" or not transfered to the TMX file. =head1 Options -l2=... -l1=... to define the language identification tag in atribut C of the C elements (defaut lang1 and lang2) -debug to write more information in the TMX file =head1 AUTHOR J.Joao Almeida, jj@di.uminho.pt Stefan Evert =head1 SEE ALSO perl(1). CWB CWB::CL CWB::CQP TMX cwb-utils =cut