#!/usr/bin/perl -s use warnings; use strict; use utf8; our ($full, $html, $stats, $h, $kl); if ($kl) { require Math::KullbackLeibler::Discrete; } use CGI qw/:standard/; use locale; if ($h) { print STDERR "nat-compareDicts: used to compare two PTDs in Perl dumper format.\n\n"; print STDERR "\tnat-compareDicts [-html] [-full] \n\n"; print STDERR "For more help, please run 'perldoc nat-compareDicts'\n"; exit 1; } my $f1 = shift; my $f2 = shift; print STDERR "Loading [$f1]..."; my $D1 = do($f1); print STDERR "...done\n"; print STDERR "Loading [$f2]..."; my $D2 = do($f2); print STDERR "...done\n"; if ($full) { my @keys = U(keys %$D1, keys %$D2); for my $key (sort @keys) { printf "%15s | %5d | %5d\n", $key, ($D1->{$key}{count} || 0), ($D2->{$key}{count} || 0); } } elsif ($html) { my $res; my ($count, $average) = (0,0); for my $e (keys %$D1) { next if $e eq "(null)" || $e eq "(none)"; next unless exists $D2->{$e}; next unless $D1->{$e}{count}; my @trans = (keys %{$D1->{$e}{trans}}, keys %{$D2->{$e}{trans}}); my %x; @x{@trans}=@trans;@trans=keys %x; my $diff = 0; if ($kl) { $diff = Math::KullbackLeibler::Discrete::kl($D1->{$e}{trans}, $D2->{$e}{trans}); $count++; $average+=$diff; } else { for (@trans) { my $x = 100 * exists($D1->{$e}{trans}{$_})?$D1->{$e}{trans}{$_}:0; my $y = 100 * exists($D2->{$e}{trans}{$_})?$D2->{$e}{trans}{$_}:0; $diff += abs($x-$y); } $diff *= log( 1 + 0.5 * $D1->{$e}{count} + 0.5 * $D2->{$e}{count}); } my $best1 = (sort {$D1->{$e}{trans}{$a} <=> $D1->{$e}{trans}{$b}} keys %{$D1->{$e}{trans}})[-1]; $best1||=""; my $sbest1 = (sort {$D1->{$e}{trans}{$a} <=> $D1->{$e}{trans}{$b}} keys %{$D1->{$e}{trans}})[-2]; $sbest1||=""; my $prob1 = $D1->{$e}{trans}{$best1} || 0; $prob1*=100; my $sprob1 = $D1->{$e}{trans}{$sbest1} || 0; $sprob1*=100; my $best2 = (sort {$D2->{$e}{trans}{$a} <=> $D2->{$e}{trans}{$b}} keys %{$D2->{$e}{trans}})[-1]; $best2||=""; my $sbest2 = (sort {$D2->{$e}{trans}{$a} <=> $D2->{$e}{trans}{$b}} keys %{$D2->{$e}{trans}})[-2]; $sbest2||=""; my $prob2 = $D2->{$e}{trans}{$best2} || 0; $prob2*=100; my $sprob2 = $D2->{$e}{trans}{$sbest2} || 0; $sprob2*=100; my $count= $D1->{$e}{count}; next if $count < 10; next if $diff < 0.1; $res->{$e} = [$diff, $count, $best1, $prob1, $best2, $prob2, $sbest1, $sprob1, $sbest2, $sprob2]; } binmode STDOUT, ":utf8"; print "\n"; if ($kl) { print h1("average KL ", ($average/$count)); } print Tr(th([qw/Word Measure Occurrences/,"Best Trans 1","Best % 1","Best Trans 2","Best % 2"])); for (sort { $res->{$b}[0] <=> $res->{$a}[0]} keys %$res) { print Tr( td({class=>"ll"},$_), td(sprintf("%.5f", $res->{$_}[0])), td({class=>"r"},$res->{$_}[1]), td({class=>"l"}, $res->{$_}[2]), td({class=>"r"},sprintf("%.4f%%",$res->{$_}[3])), td({class=>"l"}, $res->{$_}[4]), td({class=>"r"},sprintf("%.4f%%",$res->{$_}[5])) ); print Tr( td({class=>"ll"}," "), td(""), td({class=>"r"}," "), td({class=>"l"}, $res->{$_}[6]), td({class=>"r"},sprintf("%.4f%%",$res->{$_}[7])), td({class=>"l"}, $res->{$_}[8]), td({class=>"r"},sprintf("%.4f%%",$res->{$_}[9])) ); } print "
\n"; } else { my $dir = "->"; $| = 1; while(1) { print "$dir # "; chomp(my $word = <>); exit if $word eq ""; if ($word =~ m!^/(.*)/$!) { my $regexp = $1; my $key; while ($key = each %$D1) { myDumpSome($key,$D1,$D2) if $key =~ m!$regexp!; } } else { myDumpSome($word,$D1,$D2); } } } sub load { my $file = shift; return (load_("$file/source-target.dmp"), load_("$file/target-source.dmp")); } sub load_ { my $file = shift; if (-e "$file.filtered") { return do "$file.filtered" } else { return do $file; } } sub myDump { my ($word,$structure,$indent) = @_; $indent = $indent ? " " x 28 : ""; print "** Word: $word\n"; print "** OccurrenceCount: $structure->{count}\n"; for (sort {$structure->{trans}{$b} <=> $structure->{trans}{$a}} keys %{$structure->{trans}}) { printf "$indent %15s: %7.4f %%\n", $_, $structure->{trans}{$_}*100; } print "\n"; } sub myDump2 { my ($word,$A,$B) = @_; print "** Word: $word\n"; print "** OccurrenceCount: $A->{count}, $B->{count}\n\n"; my @A = sort {$A->{trans}{$b} <=> $A->{trans}{$a}} keys %{$A->{trans}}; my @B = sort {$B->{trans}{$b} <=> $B->{trans}{$a}} keys %{$B->{trans}}; while(@A || @B) { my $a = shift @A || undef; my $b = shift @B || undef; if ($a && $b) { printf " %15s: %7.4f %% %15s: %7.4f %%\n", $a, $A->{trans}{$a}*100, $b, $B->{trans}{$b}*100; } elsif ($a) { printf " %15s: %7.4f %%\n", $a, $A->{trans}{$a}*100; } else { printf " %15s: %15s: %7.4f %%\n","",$b, $B->{trans}{$b}*100; } } print "\n"; } sub myDumpSome { my ($word,$A, $B) = @_; if (exists($A->{$word}) && exists($B->{$word})) { print "\n"; myDump2($word,$A->{$word},$B->{$word}); } elsif (exists($A->{$word})) { print "\n"; myDump($word,$A->{$word}); } elsif (exists($B->{$word})) { print "\n"; myDump($word,$B->{$word},1); } else { print "** Word '$word' not found in any dictionry\n"; } } sub U { my %U; $U{$_}++for@_; keys %U; } =encoding UTF-8 =head1 NAME nat-compareDicts - used to compare two PTDs in Perl dumper format. =head1 SYNOPSIS print "\tnat-compareDicts [-stats] [-html] [-full] \n\n"; =head1 DESCRIPTION This program compares two Probabilistic Translation Dictionaries in Perl Dumper format. It can be used in four different ways: =over 4 =item * With no switches, a shell will be loaded. In this shell the user can entry words, and those words entries in the dictionary willl be shown. =item * With the C<-html> switch, an HTML table will be printed to STDOUT. This table tries to show entries will less or more differences between the two dictionaries. =item * The C<-full> switch prints to STDOUT all entries from the dictionaries together with some comparison values. =back =head1 SEE ALSO perl(1) =head1 AUTHOR Alberto Manuel Brandão Simões, Eambs@cpan.orgE =head1 COPYRIGHT AND LICENSE Copyright (C) 2006-2012 by Alberto Manuel Brandão Simões =cut