#!/usr/bin/perl -s use strict ; use NAT::Client; use Data::Dumper; $Data::Dumper::Indent=1; $Data::Dumper::Terse=1; my $Gn1={}; my $Gn2={}; my $Geq=0; my $Geq2=0; my $t; my $perfeitas =0; my $nonumber =0; my $pessimas =0; my $delnumbers=0; my $desiguaist=0; # size(bigger) > 2(size(shorter)) my $iguais=0; my $tmx = shift or die; use XML::TMX::Reader; my $t = XML::TMX::Reader->new( $tmx ); my $tun = 0; print "\n================($tmx)========\n"; $t->for_tu( \&mycmp ); print "\n comon numbers = ",MScardinter($Gn1,$Gn2)*2, "\n eq-same-TU numbers = ", $Geq*2 , "\n total numbers = ", $t= MScard($Gn1)+MScard($Gn2), "\n TU = $tun", "\n good TU (Num>0) = $perfeitas", "\n good TU (Num=0) = $nonumber", "\n very bad TU = $pessimas ($delnumbers)", "\n equal TU = $iguais", "\n size too diff TU = $desiguaist", (($Geq*2.4 < $t )? "\n PROBLEMS - numbers too diff." : ""), (($pessimas / $tun > 0.2 )? "\n PROBLEMS - too many bad TU" : ""), "\n After del. bad TU:". $Geq2*2, "/" , $t-$delnumbers,"=", ($Geq2*2)/( $t-$delnumbers), "\n\n"; sub mycmp{ $tun++; my $tu = shift; if(length($tu->{en}) > 30 && (length($tu->{en}) > 2 * length($tu->{pt}) || length($tu->{pt}) > 2 * length($tu->{en}))){ print "remove $tun = size is too different\n" ; $desiguaist ++ ; } if(length($tu->{en}) > 30 && $tu->{en} eq $tu->{pt}){ print "remove $tun = equal units\n" ; $iguais ++ ; } my $n1 = getn($tu->{en}); my $n2 = getn($tu->{pt}); my $n3 = MScardinter($n1,$n2); $Geq += $n3; $Geq2 += $n3; my $n4 = MScard($n1)+MScard($n2) ; $perfeitas ++ if $n4 == $n3*2 && $n4 > 0; $nonumber ++ if $n4 == $n3*2 && $n4 == 0; if((3+ $n3) * 2.4 < 3+$n4 ) { $delnumbers += $n4; $Geq2 -= $n3; $pessimas ++; print "remove $tun\n***EN\t$tu->{en}\n***PT\t$tu->{pt}\n" } MSaddto($Gn1,$n1); MSaddto($Gn2,$n2); } sub MSaddto{ my ($m1,$m2)=@_; for(keys %$m2){$m1->{$_} += $m2->{$_}} } sub MScardinter{ my ($m1,$m2)=@_; my $com = 0; for(keys %$m2){ $com += $m1->{$_} > $m2->{$_} ? $m2->{$_} : $m1->{$_}} $com } sub MScard{ my ($m1)=@_; my $c = 0; for(values %$m1){ $c += $_ } $c } sub getn{ my $s=shift; my %a=(); for( $s =~ m/(\d+)/g){$a{$_}++} \%a } __END__ our ($en); my @l = (); if ($en) {@l = (direction => "<~") } while(<>){ chomp; my $a = $client->ptd( { @l }, $_); print Dumper $a; } =head1 NAME jj-44 - example for (ptd) consult =head1 SYNOPSIS $a = $client->ptd( { options }, $word); print Dumper $a =head1 DESCRIPTION (Previously: jj-1 -> to create the corpus) Print the words and number of occurrences. Example of output: que [ 289, { 'que' => '0.96745', 'cual' => '0.01458' }, 'que' ] =head1 AUTHOR J.Joao Almeida, jj@di.uminho.pt =head1 SEE ALSO perl(1). =cut