#!/usr/bin/perl -w -s use CWB; use Lingua::PT::PLNbase; use Data::Dumper; use strict; our ($quebra,$l1,$l2, $cutmaxlen,$local); my $use="usage tmx2cqp tmx lang1 lang2\n tmx2cqp -l1=pt -l2=en tmx1 tmx2 ...\n"; #my $use="usage tmx2cqp lang1 lang2\n"; chomp(my $regis = `cwb-config -r`); $regis="$ENV{HOME}/registry" if $local; my $corpdir="/corpora"; my $tmx = shift or die($use); $l1 ||= shift or die($use); $l2 ||= shift or die($use); my @outros = @ARGV; my $corpus1=lc("$tmx-$l1"); my $corpus2=lc("$tmx-$l2"); $corpus1 =~ s/\./-/g; $corpus2 =~ s/\./-/g; my $corpusname1=$corpus1; my $corpusname2=$corpus2; $corpusname1 =~ s/.*\///; $corpusname2 =~ s/.*\///; if(younger("$tmx-$l1",$tmx)){print "$tmx already splited\n"} else{ system("tmxsplit", ($cutmaxlen?("-cutmaxlen=$cutmaxlen"):() ), "-utf8", $tmx, @outros )==0 or die("Erro em tmxsplit\n"); print "fim de tmxsplit\n"; } if(younger("$regis/$corpusname1","$tmx-$l1")){ print "CWB corpus $corpus1 already created\n"} else{ xml2cqp($corpus1,"$tmx-$l1"); print "fim de xml2cqp\n"; } if(younger("$regis/$corpusname2","$tmx-$l2")){ print "CWB corpus $corpus2 already created\n"} else{ xml2cqp($corpus2,"$tmx-$l2"); print "fim de xml2cqp\n"; } if(younger("$tmx-$l1-$l2.align","$tmx-$l1")){ print "$corpus1 $corpus2 already aligned\n"} else{ open(R,">>$regis/$corpusname1") or die($!); print R "ALIGNED $corpus2\n"; close R; system("cwb-align -v -o $tmx-$l1-$l2.align -V tu $corpus1 $corpus2 tu"); system("cwb-align-encode -D $tmx-$l1-$l2.align "); print "fim de align1\n"; } if(younger("$tmx-$l2-$l1.align","$tmx-$l1")){ print "$corpus2 $corpus1 already aligned\n"} else{ open(R,">>$regis/$corpusname2") or die($!); print R "ALIGNED $corpus1\n"; close R; system("cwb-align -v -o $tmx-$l2-$l1.align -V tu $corpus2 $corpus1 tu"); system("cwb-align-encode -D $tmx-$l2-$l1.align "); print "fim de align2\nTHE END\n"; } sub xml2cqp{ my $corpname=shift; my @files=@_; open (X , ">:utf8", "$corpname.corpus"); select(X); my %t=(); my $n; my $tags=undef; for $n (@files){ $n = "quebraxmlsent $quebra < $n|" if $quebra ; $tags = cqptokens({enc=>":utf8",outenc=>":utf8"},$n); %{$t{s}} = ( %{$t{s}||{}}, %{$tags->{s}||{}} ); %{$t{v}} = ( %{$t{v}||{}}, %{$tags->{v}||{}} ); } select(STDOUT); close(X); my ($r,$o)= tags2options($corpname,%t); mkdir("$corpdir/$corpname"); mkdir("$corpdir/c1"); mkdir($regis); unlink(<$corpdir/$corpname/*>); open(R,">$regis/$corpname") or die("cant create corpus registry for $corpname:$!\n(mkdir -p $regis;chmod a+rwx $regis)\n"); print R $r; close R; !system ("$CWB::Encode -d $corpdir/$corpname -0 corpus -s $o < $corpname.corpus")|| die ("Erro Encode -d $corpdir/$corpname -s $o < $corpname.corpus ($?)$@$!\n"); !system ("$CWB::Makeall $corpname") || die ("Erro Makeall ($?)$@$!\n"); } sub tags2options{ my ($name,%t)=@_; my @v = keys(%{$t{v}}); my @s = keys(%{$t{s}}); my $options = ""; my $registry= qq{ NAME "$name" ID $name HOME $corpdir/$name ATTRIBUTE word }; for (@v){ $registry .= "STRUCTURE $_\n"; } for my $s (@s){ $registry .= "STRUCTURE $s\n" unless grep(/^$s$/,@v); } for (@v){ $options.=" -V $_";} for my $s (@s){ $options.=" -S $s" unless grep(/^$s$/,@v);} ($registry,$options) } sub younger{ ## f1 exists and is younger then f2 (for make f1) my ($f1,$f2)=@_; ## printf('%s %s -e = %d -Mf2 = %f >= -Mf1 %f',$f1, $f2, (-e $f1), (-M $f2), (-M $f1)), "\n"; (-e $f1 && -M $f2 >= -M $f1) } __END__