#!/usr/bin/perl -s

# mkterminum - drives a six-step pipeline that turns a directory of files
# into an aligned TMX:
#
#   1. dir     -> .paths    (find)
#   2. .paths  -> .blocks   (Corpora::ParGuesser::list2blocks)
#   3. .blocks -> ._pairs   (Corpora::ParGuesser::blocks2pairs)
#   4. ._pairs -> .pairs    (bi-lang_cld_check, or plain copy)
#   5. .pairs  -> .tmxdir   (filealigner, one TMX per pair)
#   6. .tmxdir -> one TMX   (fasttmxcat)
#
# The suffix of the input argument selects the first step to run and the
# suffix of the optional output argument selects the last one.  Options are
# received through perl's -s switch, which is why they are package variables.

use strict;
#use Guesser;

our ($raw,$q0,$debug,$id,$h,$l1,$l2,$html,$patt4,$max,$pdf,$skipch,$isutf8,
     $tdef,$txt,$txtll,$aligned,$wl,$sync,$syncatt,$getalt,$negpatt4,$addtag,$local);

use Corpora::ParGuesser;

my $dir    = shift or $h = 1;     # no input argument => show usage
my $output = shift;
my ($nom1,$nom2,$nom3,$nom4,$nom5,$nom6);   # $nomN true => step N is skipped

# Turn the boolean/valued switches into the option strings that are later
# interpolated in the filealigner command line.
my $enc = "";
$enc  = "-isutf8"     if $isutf8;
$raw  = "-rawpdf"     if $raw;
$tdef = "-tdef=$tdef" if $tdef;

if    ($debug) { $debug = "-q0 -debug"; }
elsif ($q0)    { $debug = "-q0"; }

$getalt = "-getalt" if $getalt;

if ($addtag) {
  # -addtag=lang=tag1,tag2...   (not named $b: that is sort's variable)
  my ($lang_tag, @tags) = split(/[=,]/, $addtag);
  die("invaid -addtag value\n") unless $lang_tag;
  Corpora::ParGuesser::jjaddtag($lang_tag, @tags);
}

my $t;    # default "-t=type" option, computed in m5 from -html/-pdf/-txt...

#if($sync)      { $sync    eq '1'? $sync="-sync"    : $sync="-sync=$sync"; }
#elsif($syncatt){ $syncatt eq '1'? $sync="-syncatt" : $sync="-syncatt=$syncatt"; }

# -syncatt takes precedence over -sync; a bare switch (value 1) means
# "use the aligner's default tag".
if    ($syncatt && $syncatt eq 1) { $sync = "-syncatt" }
elsif ($syncatt)                  { $sync = "-syncatt=$syncatt" }
elsif ($sync && $sync eq 1)       { $sync = "-sync" }
elsif ($sync)                     { $sync = "-sync=$sync" }

my $corpdir = "$ENV{HOME}/corpora";
mkdir $corpdir unless -d $corpdir;

chomp(my $regis = `cwb-config -r`);
$regis = "$ENV{HOME}/registry" if $local;

#my $nom5=1;

if ($h) {
  die("usage mkterminum [-id=name] [-l1=pt] [-l2=fr] dir [output]\n"
    . "      mkterminum [-l1=pt] [-l2=fr] f.paths [output]\n"
    . "      mkterminum [-l1=pt] [-l2=fr] f.blocks [output]\n"
    . "      mkterminum f.en.pt._pairs [output]\n\n");
}

# The input suffix decides which leading steps are already done.  For the
# suffixes that carry languages, $l1/$l2/$id are taken from the file name
# (overriding -l1/-l2/-id).
if    ($dir =~ /(.+)\.(\w+)\.(\w+)\.pairs$/) {
  $nom1=$nom2=$nom3=$nom4=1;        $l1=$2; $l2=$3; $id=$1 }
elsif ($dir =~ /(.+)\.(\w+)\.(\w+)\.tmxdir$/) {
  $nom1=$nom2=$nom3=$nom4=$nom5=1;  $l1=$2; $l2=$3; $id=$1 }
elsif ($dir =~ /(.+)\.(\w+)\.(\w+)\._pairs$/) {
  $nom1=$nom2=$nom3=1;              $l1=$2; $l2=$3; $id=$1 }
elsif ($dir =~ /(.+)\.(\w+)\.(\w+)\.blocks$/) {
  $nom1=$nom2=1;                    $l1=$2; $l2=$3; $id=$1 }
#elsif ($dir =~ /(.+)\.blocks$/ ){
#  $nom1=$nom2=1;                   $id=$1 }
elsif ($dir =~ /(.+)\.paths$/) {
  $nom1=1;                          $id=$1 }
elsif (-d $dir) { }                 # plain directory: run everything
else { die("what shall I do?\n") }

# The output suffix decides which trailing steps are skipped.  Values found
# here only fill in what the options/input did not already define.
if    ($output =~ /(.+)\.(\w+)\.(\w+)\.blocks$/) {
  $nom3=$nom4=$nom5=$nom6=1;        $l1||=$2; $l2||=$3; $id||=$1 }
elsif ($output =~ /(.+)\.(\w+)\.(\w+)\._pairs$/) {
  $nom4=$nom5=$nom6=1;              $l1||=$2; $l2||=$3; $id||=$1 }
elsif ($output =~ /(.+)\.(\w+)\.(\w+)\.pairs$/) {
  $nom5=$nom6=1;                    $l1||=$2; $l2||=$3; $id||=$1 }
#elsif ($output =~ /(.+)\.blocks$/ ){
#  $nom3=$nom4=$nom5=$nom6=1;       $id||=$1}
elsif ($output =~ /(.+)\.paths$/) {
  $nom2=$nom3=$nom4=$nom5=$nom6=1;  $id||=$1 }
else { }

$l1 ||= "pt";
$l2 ||= "en";
$id ||= "_corpus_name_$l1-$l2";

$html  = " -html "               if $html;
$max   = "| head -n $max "       if $max;
# Both filters may be given together: the negative one is appended to the
# pipeline instead of replacing the positive one.
$patt4 = "| grep -E '$patt4' "   if $patt4;
$patt4 .= "| grep -E -v '$negpatt4' " if $negpatt4;

# Step 1: list every regular file under $dir into "$id.paths".
sub m1{
  print STDERR "**1**** Calculo da lista de ficheiros\n";
  executa("find $dir -type f > $id.paths");
}

# Step 2: group the paths of "$id.paths" into blocks (done by ParGuesser).
sub m2{
  print STDERR "**2**** Calculo de blocos\n";
  Corpora::ParGuesser::list2blocks({lang=>[$l1,$l2]},"$id.paths");
}

# Step 3: write one "file1<TAB>file2" line per candidate pair that
# ParGuesser extracts from "$id.$l1.$l2.blocks".
sub m3{
  print STDERR "**3**** Blocos para pares $l1/$l2\n";
  my $pairs_file = "$id.$l1.$l2._pairs";
  open(my $pairs_fh, ">", $pairs_file)
    or die("cant create $pairs_file: $!\n");
  for my $pair (Corpora::ParGuesser::blocks2pairs(
                  Corpora::ParGuesser::jjnames($l1),
                  Corpora::ParGuesser::jjnames($l2),
                  "$id.$l1.$l2.blocks")){
    print $pairs_fh "$pair->[0]\t$pair->[1]\n";
  }
  close($pairs_fh) or die("cant write $pairs_file: $!\n");
}

# Step 4: filter the candidate pairs into "$id.$l1.$l2.pairs".  With -skipch
# or -aligned the candidates are only passed through -patt4/-negpatt4/-max;
# otherwise they also go through bi-lang_cld_check.
sub m4{
  print STDERR "**4**** Verificar correspondencias de línguas $id+($l1 $l2)\n";
#  system("time bitextcheck -lang1=$l1 -lang2=$l1 -f $id._pairs > $id.pairs");
  if($skipch or $aligned){
    executa("time cat $id.$l1.$l2._pairs $patt4 $max > $id.$l1.$l2.pairs" );
  }
  else {
##  executa("time cat $id.$l1.$l2._pairs $patt4 $max | bi-langcheck -debug -l1=$l1 -l2=$l2 $html > $id.$l1.$l2.pairs" );
    executa("time cat $id.$l1.$l2._pairs $patt4 $max | bi-lang_cld_check -debug -l1=$l1 -l2=$l2 $html > $id.$l1.$l2.pairs" );
  }
}

# Step 5: run filealigner on every pair of "$id.$l1.$l2.pairs", writing one
# numbered TMX per pair in "$id.$l1.$l2.tmxdir" and one line per input line
# in its LOG file (marked BAD when a .BADALIGNMENT file shows up).
# A pairs line is "file1<TAB>file2" or "file1<TAB>file2<TAB>type"; text
# after '#' is ignored.
sub m5{
  print STDERR "**5**** alinhamento... $id+($l1 $l2)\n";
  my $n    = 1;
  my $_dir = "$id.$l1.$l2.tmxdir";
  my $wlo  = ($wl ? "-wl=$wl" : "");
  is_wd($corpdir, $regis);
  mkdir ($_dir);

  if($html)       {$t="-t=html"}
  elsif($pdf)     {$t="-t=pdf" }
  elsif($txt)     {$t="-t=txt" }
  elsif($txtll)   {$t="-t=txtll" }    ## txt long lines
  elsif($aligned) {$t="-t=aligned"}   ## pre aligned line to line
  else { }                            ## filealigner will try to guess the type...
  my $default_type = defined $t ? $t : "";

  # Logging is best effort: a failure to open LOG only produces a warning.
  my $log_ok = open(my $log, ">", "$_dir/LOG");
  warn("cant log\n") unless $log_ok;

  for my $p (`cat $id.$l1.$l2.pairs $patt4 $max`){
    chomp($p);
    my ($valid,$f1,$f2,$lt) = (0);
    $p =~ s/\s*#.*//;
    # The type given in a third column applies to that pair only; pairs
    # without one use the command-line default (it must not leak from the
    # previous line).
    if   ($p =~ /(.+?)\t(\S+)\t(\S+)/){ ($valid,$f1,$f2,$lt)=(1,$1,$2,"-t=$3"); }
    elsif($p =~ /(.+?)\t(\S+)/       ){ ($valid,$f1,$f2,$lt)=(1,$1,$2,$default_type); }

    my $nn = sprintf("%05d",$n);
    if ($valid){
      executa(qq{filealigner $tdef $getalt $raw -num=mkt_$$ $debug $wlo $lt $enc $sync -l1=$l1 -l2=$l2 -o=$_dir/$id.$nn.$l1.$l2.tmx "$f1" "$f2"});
      executa(qq{rm -rf $corpdir/*mktc-$$-pml}) unless ($aligned);
      executa(qq{rm $regis/*mktc-$$-pml})       unless ($aligned);
    }
    if ($log_ok){
      if(-f "$_dir/$id.$nn.$l1.$l2.tmx.BADALIGNMENT") {
           print $log "$nn\t$f1\t$f2\tBAD\n"; }
      else {
           print $log "$nn\t$f1\t$f2\n"; }
    }
    $n++;    # numbering follows input lines, valid or not
  }
  close($log) if $log_ok;
}

# Step 6: concatenate the per-pair TMX files into "$id.$l1.$l2.tmx".
sub m6{
  print STDERR "**6**** tmx concat... $id+($l1 $l2)\n";
  my $_dir="$id.$l1.$l2.tmxdir";
  fasttmxcat($_dir,$id,$l1,$l2);
##  executa(qq{tmx2tmx -cat $_dir/$id.[0-9]*.tmx > $_dir/$id.$l1.$l2.tmx});
}

# fasttmxcat($_dir,$id,$l1,$l2)
# Line-oriented concatenation of "$_dir/$id.[0-9]*.tmx" into
# "$_dir/$id.$l1.$l2.tmx": the header of the first file is copied up to and
# including its <body> line, then from every file only the lines holding a
# <tu.../</tu... tag and the blank lines are kept, and the document is closed
# at the end.  Dies if the target or one of the parts cannot be opened.
# NOTE(review): the reads from the part file, the <body> sentinel and the
# closing tags were missing from the code as found (the loops could not
# terminate); they are reconstructed here - confirm against the TMX layout
# that filealigner writes.
sub fasttmxcat{
  my ($_dir,$id,$l1,$l2) = @_;
  my $target = "$_dir/$id.$l1.$l2.tmx";
  my $first  = 1;
  open(my $out, ">", $target) or die("cant create $target: $!\n");
  for my $part (glob("$_dir/$id.[0-9]*.tmx")){
    open(my $in, "<", $part) or die("cant read $part: $!\n");
    if($first){
      $first = 0;
      while(my $line = <$in>){       # header of the first part only
        print $out $line;
        last if $line =~ /<body\b/;
      }
      print $out "\n";
    }
    print $out "\n";
    while(my $line = <$in>){         # translation units of every part
      print $out $line if $line =~ /<\/?tu/;
      print $out $line if $line !~ /\S/;
    }
    close($in);
  }
  print $out "\n</body>\n</tmx>\n";
  close($out) or die("cant write $target: $!\n");
}

m1() unless $nom1;
m2() unless $nom2;
m3() unless $nom3;
m4() unless $nom4;
m5() unless $nom5;
m6() unless $nom6;

# executa($cmd): echo a shell command on STDERR and run it; a failure is
# reported with a warning and the pipeline goes on.
sub executa {
  my $cmd = shift;
  print STDERR "$cmd\n";
  system ($cmd) == 0
     or warn "** ERROR ************ system $cmd failed: $!$?\n";
}

# is_wd(@dirs): die unless every argument is an existing writable directory;
# returns 1 otherwise.
sub is_wd{
  for my $candidate (@_){
    next if -d $candidate && -w $candidate;
    die("** FATAL ERROR ******** $candidate must be a Writable directory\n");
  }
  return 1;
}

__END__

=head1 NAME

mkterminum - makes text alignment, and builds TMX for parallel corpora

=head1 SYNOPSIS

 mkterminum [-id=name] [-l1=pt] [-l2=fr] dir [output]
 mkterminum [-l1=pt] [-l2=fr] f.paths [output]
 mkterminum [-l1=pt] [-l2=fr] f.blocks [output]
 mkterminum f.en.pt._pairs [output]

=head1 DESCRIPTION

Depending on the arguments (input / output) some of the following steps are done

 dir      directory -> paths
 .paths   list of files -> blocks
 .blocks  list of blocks -> _pairs
 ._pairs  list of bitext candidate pairs -> pairs
 .pairs   list of bitext -> tmx
 .tmxdir  directory with the TMXs

If we want just to calculate the tmx for a set of bitexts, we create
a file name "name.pt.en.pairs" with the bitexts filename pairs and run:

  mkterminum name.pt.en.pairs

And the only steps performed are step 5 (alignment) and step 6 (TMX
concatenation).

=head2 Step1 : dir -> paths

Given a directory, builds a file "name.paths" with the names of all the files

=head2 Step2 : paths -> blocks

Given a file "name.paths" extract the set of (sets of filenames) that are
equal if we delete the Language-Names parts and builds
a "name.l1.l2.blocks" file

=head2 Step3 : blocks -> _pairs

Given a set of blocks and a pair of languages builds a list of bitexts
candidate pairs "name._pairs"

=head2 Step4 : _pairs -> pairs

Given a list of bitext candidate pairs, rejects those who have the wrong
languages or that have very different sizes and build a "name.pairs" file

=head2 Step5 : pairs -> TMXs -> TMX

Given a list of bitexts, makes segmentation, align and naive analysis of the
result.

It builds a directory "name.tmxdir", and a TMX file is calculated for each
bitext. If the result is considered bad, the TMX is renamed to
"name.tmx.BADALIGNMENT".

In the end the good alignments are concatenated with the command

  tmx2tmx -cat *.tmx

See also "name.tmxdir/LOG".

=head1 Options

=head2 C<-t> to force file type

  -t=html or -html
  -t=pdf  or -pdf
  -t=txt  or -txt

=head2 C<-tdef>

C<-tdef=...> to force filetype if extension is unknown

  -tdef=html    (filetype is html if unknown extension)

=head2 C<-patt4> to restrict the filenames

  -patt4='pdf'     -- just accept files matching this pattern
  -negpatt4='pdf'  -- just accept files not matching this pattern

=head2 C<-skipch> to skip bitext checking

  -skipch

=head2 C<-max> to restrict the maximum number of files to process

  -max=20    -- just process the first 20 file pairs

=head2 C<-addtag>

  -addtag=pt=po   -- use "po" as a portuguese tag candidate

=head2 C<-isutf8>

use C<-isutf8> to force UTF-8 encoding in text files

=head2 C<-h>

help

=head2 C<-id>

=head2 C<-q0>

use C<-q0> to keep 0:1 or 1:0 connections

=head2 C<-debug>

keep 0:1, 1:0 connections and also temporary files

=head2 C<-l1> C<-l2>

  -l1=PT -l2=FR

=head2 C<-wl>

Use C<-wl=file> to provide a file with pairs of words L1-L2 to help in the
alignment process.

=head2 C<-sync=synctag>

C<-sync=synctag>. (synctag defaults to "sync" if no synctag is provided)

use C<< <synctag/> >> as synchronization tag;
(the number of synchronization tags should be equal in both files).

=head2 C<-syncatt=synctag>

C<-syncatt=synctag>. (synctag defaults to "sync" if no synctag is provided)

use C<< <synctag id="..."/> >> as synchronization tag for equal "id" values;

=head1 Dependencies

 CWB -- Stuttgart corpus workbench
 cwb-utils
 XML::TMX
 Lingua::PT::PLN
 Corpora::ParGuesser.pm (project Natura)

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut

__END__