#!/usr/bin/perl -s -i use Data::Dumper; use warnings; use strict; use Data::Dumper; $Data::Dumper::Indent=0; $Data::Dumper::Terse=1; our ($h); our ($minocc, $minprob, $nonumbers, $nosymbols, $nonone, $i); sub usage { print STDERR "jj-PTDfilter: filter a pair of PTDs\n\n"; print STDERR "\tjj-PTDfilter [-minocc=2] [-minprob=0.05] [-nonone=0]\n"; print STDERR "\t [-nonumbers=1] [-nosymbols=1] \n\n"; print STDERR "\tjj-PTDfilter [-minocc=2] [-minprob=0.05] [-nonone=0]\n"; print STDERR "\t [-nonumbers=1] [-nosymbols=1] \n\n"; print STDERR "For more help, please run 'perldoc jj-PTDfilter'\n"; exit; } usage() if ($h); my $nat_object = shift @ARGV; usage() unless $nat_object; $minocc = defined($minocc)?$minocc:0; $minprob = defined($minprob)?$minprob:.0005; $nonumbers = defined($nonumbers)?$nonumbers:1; $nosymbols = defined($nosymbols)?$nosymbols:1; if (-d $nat_object) { my ($dic1, $dic2) = ("$nat_object/source-target.dmp", "$nat_object/target-source.dmp"); filter($dic1); print "\n"; filter($dic2); } else { filter($nat_object); } sub filter { my $dicname = shift; print STDERR "Filtering [$dicname]\n"; print STDERR " loading dictionary...."; my $dic = do $dicname; die ("###Error $!$@\n") if $! || $@; print STDERR "...done\n"; my ($total, $notremoved, $occrem, $numrem, $symrem, $trarem) = (0) x 6; print STDERR " performing main loop...."; for my $w (keys %$dic) { $total++; if ($dic->{$w}{count} <= $minocc) { $occrem++; delete $dic->{$w}; next } if ($nonumbers) { if ($w =~ m!\d!) { $numrem++; delete $dic->{$w}; next } for my $t (keys %{$dic->{$w}{trans}}) { if ($t =~ m!\d!) { delete $dic->{$w}{trans}{$t}; $trarem++; } } } if ($nosymbols) { if ($w ne "(none)" && $w =~ m/[#+\$_=%<>&\/\\.,;:\!?\(\)\{\}\[\]]|^[\\+*._'"-]+$/) { $symrem++; delete $dic->{$w}; next } for my $t (keys %{$dic->{$w}{trans}}) { next if $t eq "(none)" && ! $nonone; if ($t =~ m/[#+\$_=%<>&\/\\.,;:\!?\(\)\{\}\[\]]|^[\\+*._'"-]+$/) { delete $dic->{$w}{trans}{$t}; $trarem++; } } } $notremoved++; for my $t (keys %{$dic->{$w}{trans}}) { if ($dic->{$w}{trans}{$t} <= $minprob) { delete $dic->{$w}{trans}{$t}; $trarem++; } } } print STDERR "...done\n"; my $removed_empty_entries = 0; print STDERR " cleaning remaining empty entries...."; for my $k (keys %$dic) { unless (scalar(keys %{$dic->{$k}{trans}}) > 0) { delete $dic->{$k}; $notremoved--; $removed_empty_entries++; } } print STDERR "...done\n"; my $outname = "$dicname-f.dmp"; $outname =~ s/\.dmp-f\.dmp/-f.dmp/; print STDERR " writing [$outname]\n"; open D, ">$outname" or die("can create $outname\n"); my $old = select D; ptd_dumper($dic); select $old; close D; print STDERR "\n[$dicname]\n"; print STDERR " Removed $occrem entries with occurrence count <= $minocc.\n"; print STDERR " Removed $trarem translations with probability <= $minprob.\n"; print STDERR " Removed $numrem entries with numbers.\n" if $nonumbers; print STDERR " Removed $symrem entries with symbols.\n" if $nosymbols; print STDERR " Removed $removed_empty_entries empty entries.\n"; print STDERR " Preserved $notremoved entries from a total of $total entries.\n\n"; if ($i){ rename("$dicname","$dicname.old"); rename("$outname","$dicname"); } } sub ptd_dumper { my $d = shift; print "{\n"; for my $k (sort keys %$d){ print Dumper($k), " => {\n"; print " 'count' => $d->{$k}{count},\n"; # print " 'massa' => $d->{$k}{massa},\n" if exists($d->{$k}{massa}); # print " 'nrTrans' => $d->{$k}{nrTrans},\n" if exists($d->{$k}{nrTrans}); print " 'trans' => {\n"; for my $t (sort {$d->{$k}{trans}{$b} <=> $d->{$k}{trans}{$a} } keys %{$d->{$k}{trans}}){ print " ", sprintf("%14s => %.6f,\n",Dumper($t),$d->{$k}{trans}{$t}); } print " },\n"; print "},\n"; } print "};\n"; } =head1 NAME jj-PTDfilter - a simples PTD filter =head1 SYNOPSIS jj-PTDfilter [-minocc=5] [-nonone=0] [-minprob=0.05] [-nonumbers=1] [-nosymbols=1] jj-PTDfilter [-minocc=5] [-nonone=0] [-minprob=0.05] [-nonumbers=1] [-nosymbols=1] =head1 DESCRIPTION This simple filter loads the dumped dictionaries from the specified Nat-object, and filters them accordingly with the following options: =over 4 =item i Filtered dictionaries replace the originals; original version is renamed to C; =item minocc the minimum number of occurrences for the entries preserved. Defaults to 0. =item minprob the minimum translation probability for the entries preserved. Defaults to 0.005. =item nonumbers remove entries which words include numbers. Defaults to true. =item nosymbols remove entries which words include strange symbols. Defaults to true. =item nonone remove entries "(none)". Defaults to false. =back =head1 SEE ALSO perl(1) =head1 AUTHOR J.Joao Almeida Alberto Manuel Brandão Simões, Eambs@cpan.orgE =head1 COPYRIGHT AND LICENSE Copyright (C) 2006 by Alberto Manuel Brandão Simões =cut