#!/usr/bin/perl -w -s use CWB; use Lingua::PT::PLNbase; use Data::Dumper; use strict; #$wordlist = "-C:1 -W:30:$wl -S:50:0.4 -3:3 -4:4" sub mkconfpar{ my ($conf,$wl)=@_; return "" unless $conf || $wl; my %op = qw(-C 1 -W 30 -S 50:0.4 -3 3 -4 4); %op = (%op, ($conf =~ m/(-\w):(\S+)/g )) if $conf; $op{-W} = ($op{-W} || 30) . ":$wl" if $wl; join(" ",map {"$_:$op{$_}"} keys %op ) } our ($local, $quebra , $wl, $corpusdir, $sync, $syncatt , $cwbconfig); my $encoding= "-c utf8" ; ### ALLWAYS! ; chomp(my $regis = `cwb-config -r`); $regis="$ENV{HOME}/registry" if $local; my $regisopt="-r $regis"; my $corpdir = $corpusdir || "$ENV{HOME}/corpora"; my $V =""; my $S =""; if ($syncatt && $syncatt eq 1){ $V = "-V sync" } elsif($syncatt) { $V = "-V $syncatt" } elsif($sync && $sync eq 1) { $S = "-S sync" } elsif($sync) { $S = "-S $sync" } my $tags; my $wordlist = mkconfpar($cwbconfig, $wl); #$wordlist = "-C:1 -W:30:$wl -S:50:0.4 -3:3 -4:4" if $wl; ##my $SET=""; ##$SET = "POSIXLY_CORRECT=yes " if $wl; my $f1 = shift or die "usage: xmlalign2cqp f1 f2\n"; my $f2 = shift or die "usage: xmlalign2cqp f1 f2\n"; my ($corpus1, $corpus2) = (lc($f1), lc($f2)); for ($corpus1, $corpus2) { s/\//=/g; s/_/c-/g; s/\./-/g; } xml2cqp($corpus1, $f1); print STDERR "fim de xml2cqp $f1 ($corpus1)\n"; xml2cqp($corpus2, $f2); print STDERR "fim de xml2cqp $f2 ($corpus2)\n"; open R, ">>$regis/$corpus1" or die "Error appending to $regis/$corpus1: $!"; print R "ALIGNED $corpus2\n"; close R; executa("cwb-align $regisopt -v -o '$f1-$f2.align' $S $V $corpus1 $corpus2 p $wordlist "); executa("cwb-align-encode $regisopt -D '$f1-$f2.align' "); print STDERR "fim de align1\n"; open R, ">>$regis/$corpus2" or die "Error appending to $regis/$corpus2: $!"; print R "ALIGNED $corpus1\n"; close R; my $wordlistr = mkconfpar($cwbconfig, $wl); ## $wordlistr .= "-C:1 -W:30:". revwordlist($wl) ." -S:50:0.4 -3:3 -4:4" if $wl; executa("cwb-align $regisopt -v -o '$f2-$f1.align' $S $V $corpus2 $corpus1 p $wordlistr"); executa("cwb-align-encode $regisopt -D '$f2-$f1.align' "); print STDERR "fim de align2\n"; print STDERR "THE END\n"; sub revwordlist{ my $wl = shift; if($wl =~ /(.*)\.rev$/){return $1} elsif( -e "$wl.rev " ){return "$wl.rev" } open F1, $wl or die "can't open Wordlist $wl: $!"; open F2, ">", "$wl.rev" or die "can't create reverse Wordlist $wl.rev: $!"; while () { chomp; my @a = split(/ /,$_); print F2 "$a[1] $a[0]\n"; } close F2; close F1; return "$wl.rev"; } sub xml2cqp { my $corpname = shift; my @files = @_; my $n; mkdir "$corpdir/$corpname"; mkdir $regis; open X , ">:utf8", "$corpname.corpus" or die "can't create '$corpname.corpus': $!"; my $OLDSTDOUT = select X; my %t = (); for $n (@files) { $n = "quebraxmlsent $quebra < $n|" if $quebra; $tags = cqptokens({enc=>":utf8",outenc=>":utf8"},$n); %{$t{s}} = ( %{$t{s}||+{}}, %{$tags->{s}} ) if $tags->{s} ; %{$t{v}} = ( %{$t{v}||+{}}, %{$tags->{v}} ) if $tags->{v} ; } select $OLDSTDOUT; close X; my ($r,$o) = tags2options($corpname, %t); open R, ">", "$regis/$corpname" or die "can't create '$regis/$corpname': $!"; print R $r; close R; unlink(<$corpdir/$corpname/*>); !system ("$CWB::Encode $encoding -d $corpdir/$corpname -0 corpus -s $o < $corpname.corpus") || die ("Erro Encode -d $corpdir/$corpname -s $o < $corpname.corpus ($?)$@$!\n"); !system ("$CWB::Makeall $regisopt $corpname") || die ("Erro Makeall ($?)$@$!\n"); ## LINDO! } sub tags2options{ my ($name, %t) = @_; my @v = keys(%{$t{v}}); my @s = keys(%{$t{s}}); my $options = ""; my $registry = qq{ NAME "$name" ID $name HOME $corpdir/$name ATTRIBUTE word }; for (@v) { next unless /^[a-zA-Z]/; $registry .= "STRUCTURE $_\n"; $options.=" -V $_"; } for my $s (@s) { next unless $s =~ /^[a-zA-Z]/; unless (grep(/^$s$/, @v)) { $registry .= "STRUCTURE $s\n"; $options.=" -S $s"; } } return ($registry,$options) } sub executa { my $cmd = shift; print STDERR "$cmd\n"; system ($cmd) == 0 or warn "system $cmd failed: $?\n"; } __END__ =head1 NAME xmlalign2cqp - aligns XML (or similar) files =head1 SYNOPSIS xmlalign2cqp [-quebra] [-wl=file] f1 f2 -syncatt=chap (use ... for sync.) or -sync=file (use ... for syncronization) =head1 DESCRIPTION f1 and f2 shoud have tag C (file) to make sync. f1 and f2 are align by C

tags Both alignment are calculated =head2 Options C<-sync=synctag>. (syncronization tag defaults "sync") use C<> as syncronization tag; (the number of syncronization tag should de equal. C<-syncatt=synctag>. (syncronization tag defaults "sync") use C<> as syncronization tag for equal "id" values; Defining a pair list to help in align process... -wl=file C should contains a list of translations pair separated by a space. =head1 AUTHOR J.Joao Almeida, jj@di.uminho.pt =head1 SEE ALSO perl(1). =cut