#!/usr/bin/perl -s
#
# Driver script that chains several external tools to turn a directory
# (or an intermediate file list) into an aligned PT/EN corpus and a TMX file.
#
# Usage (see -h):  mkbitextra -id=name File.paths File.blocks File.pairs
#
# Switches are parsed by perl's -s option, so "-id=name" sets $id and "-h"
# sets $h before the script body runs.
#
# The single positional argument selects where the pipeline starts:
#   * something ending in ".pairs"  -> skip straight to HTML->PML conversion;
#                                      $id is taken from the file's basename
#   * something ending in ".blocks" -> start at the blocks->pairs step
#   * something ending in ".paths"  -> start at the list->blocks step
#   * anything else                 -> treated as a directory to scan with find
#
# NOTE(review): only the ".pairs" form derives $id from the argument.  For
# ".paths" / ".blocks" the later steps read "$id.paths" / "$id.blocks", so the
# -id switch presumably has to match the basename of the file given -- confirm.

use Guesser;
use strict;
use warnings;

our ($id, $h);
$id ||= "_corpus_name_";
my $dir = shift || "./";

if ($h) {
    die("usage mkbitextra -id=name File.paths File.blocks File.pairs\n");
}

if ($dir =~ m!([^/]+)\.pairs$!) {
    # A ready-made pairs file: its basename (minus ".pairs") becomes the id.
    $id = $1;
}
else {
    unless ($dir =~ /\.blocks$/) {
        unless ($dir =~ /\.paths$/) {
            print STDERR "\n****** Calculo da lista de ficheiros\n";
            # find may legitimately exit non-zero (e.g. unreadable
            # sub-directories), so a failure here is reported but not fatal.
            run_or_warn("find $dir > $id.paths");
        }
        print STDERR "****** Calculo de blocos\n";
        # Presumably produces "$id.blocks" from the path list -- defined in
        # the Guesser module, not visible here.
        Guesser::list2blocks("$id.paths");
    }
    print STDERR "****** Blocos para pares PT/EN\n";
    # Presumably produces "$id.blocks.pairs", which the next command reads.
    Guesser::blocks2pairs("portuguese", "english", "$id.blocks");

    print STDERR "****** Verificar correspondencias\n";
    run_or_warn("time bitextcheck -lang1=pt -lang2=en -f $id.blocks.pairs > $id.pairs");
}

print STDERR "****** Converter HTML para PML\n";
run_or_die("html2pml -name=$id -listofpairs -noimg $id.pairs");

print STDERR "****** Alinhar frases em CWB\n";
run_or_die("xmlalign2cqp $id.A.out $id.B.out");

print STDERR "****** CWB para TMX\n";
my $corpusname = id2corpusname("$id.A.out");
run_or_die("cqpalign2tmx $corpusname > $id.tmx");

# The two steps below only announce themselves: their commands are disabled.
print STDERR "****** Conversão TMX para input do twente aligner\n";
# system("tmxsplit -twente $id.tmx ");

print STDERR "****** Alinhamento à palavra (usando Natools)\n";
# system("nat-these $id.tmx-Ling1 $id.tmx-Ling2");

# Turn a file name into the corpus name passed to cqpalign2tmx:
# every "." becomes "-" and ASCII upper-case letters are lower-cased.
#   id2corpusname("My.A.out") -> "my-a-out"
sub id2corpusname {
    my ($name) = @_;
    $name =~ tr/./-/;
    $name =~ tr/A-Z/a-z/;    # ASCII only, same range the original regex used
    return $name;
}

# Describe why the most recent system() call failed, based on $?.
# $! is only meaningful when the command could not be started ($? == -1);
# otherwise the child's signal / exit code is encoded in $?.
sub command_failure {
    my ($cmd) = @_;
    my $status = $?;
    my $reason
        = $status == -1 ? "failed to execute: $!"
        : $status & 127 ? sprintf("died with signal %d", $status & 127)
        :                 sprintf("exited with value %d", $status >> 8);
    return "command '$cmd' $reason";
}

# Run a shell command; abort the whole pipeline if it does not succeed.
sub run_or_die {
    my ($cmd) = @_;
    system($cmd) == 0 or die command_failure($cmd);
    return;
}

# Run a shell command; report a failure on STDERR but keep going.
sub run_or_warn {
    my ($cmd) = @_;
    system($cmd) == 0 or warn command_failure($cmd);
    return;
}