#!/usr/bin/perl -s use strict; use warnings; use Lingua::NATools::Lexicon; use Lingua::NATools::Dict; my $MAXENTRY = 8; our ($h); sub usage { print "nat-addDict: adds a dictionary in Perl Dumper format into a NATools corpus\n\n"; print "\tnat-addDict \n\n"; print "\tnat-addDict \n\n"; print "For more help, please run 'perldoc nat-addDict'\n"; exit; } usage() if ($h); my $selfdir = shift @ARGV; my $dumper = shift @ARGV; my $dumper2 = shift @ARGV || undef; usage() unless -d $selfdir && -f $dumper; print STDERR "step 1: load Data::Dumper file\n"; our ($DIC1, $DIC2); if ($dumper2) { $DIC1 = do $dumper; $DIC2 = do $dumper2; } else { do $dumper; } my $s1 = keys %$DIC1; my $s2 = keys %$DIC2; printf STDERR " size 1: $s1\n"; printf STDERR " size 2: $s2\n"; print STDERR "step 2: associate IDs to source Dumper entries\n"; die "Can't find source.lex file" unless -f "$selfdir/source.lex"; my $source = Lingua::NATools::Lexicon->new("$selfdir/source.lex"); my $new_source_dic_size = $source->size; my $Oc1source = $source->occurrences; my $Oc2source = 0; $source->enlarge($s1); my $i = 0; my $new = 0; for my $w (keys %$DIC1) { ++$i; my $wid = ($w eq "(null)" || $w eq "(none)")?1:$source->id_from_word($w); # this increments the occurrence number to 1 unless ($wid) { $wid = $source->add_word($w); $new++; } $Oc2source+=$DIC1->{$w}{count}; $source->set_id_count($wid, $source->id_count($wid) + $DIC1->{$w}{count}); $DIC1->{$w}{id} = $wid; } printf STDERR " %d ids reused\n", $i - $new; printf STDERR " %d new ids\n", $new; $s1 = $source->size(); $source->save("$selfdir/source.lex"); $source->close(); $new_source_dic_size+= $new; print STDERR "step 3: associate IDs to target Dumper entries\n"; die "Can't find target.lex file" unless -f "$selfdir/target.lex"; my $target = Lingua::NATools::Lexicon->new("$selfdir/target.lex"); my $new_target_dic_size = $target->size; my $Oc1target = $target->occurrences; my $Oc2target = 0; $target->enlarge($s2); $i = 0; $new = 0; for my $w (keys %$DIC2) { ++$i; my $wid = ($w eq "(null)" || $w eq "(none)")?1:$target->id_from_word($w); # this increments the occurrence number to 1 unless ($wid) { $wid = $target->add_word($w); $new++; } $Oc2target+=$DIC2->{$w}{count}; $target->set_id_count($wid, $target->id_count($wid) + $DIC2->{$w}{count}); $DIC2->{$w}{id} = $wid; } printf STDERR " %d ids reused\n", $i - $new; printf STDERR " %d new ids\n", $new; $s2 = $target->size(); $target->save("$selfdir/target.lex"); $target->close(); $new_target_dic_size+= $new; ### $Oc2source P(D1,wa,wb) wa1Occ + $oc1source P(D2,wa,wb) wa2occ ### ------------------------------------------------------------------------ ### wa1Occ $oc2source + wa2occ $oc1source print STDERR "step 4: add source/target dictionary\n"; my $stDic = Lingua::NATools::Dict->open("$selfdir/source-target.bin"); die unless $stDic; $stDic->enlarge($new_source_dic_size); $target = Lingua::NATools::Lexicon->new("$selfdir/target.lex"); for my $w (keys %$DIC1) { my $wid = $DIC1->{$w}{id}; my $wa1Occ = $stDic->occ($wid); my $wa2Occ = $DIC1->{$w}{count}; my $ovals = $stDic->vals($wid); my %ovals = @$ovals; my @keys = keys %ovals; push @keys, grep { !exists($ovals{$_}) } map {$DIC2->{$_}{id}} keys %{$DIC1->{$w}{trans}}; my %dic = (); for my $k (@keys) { my $P1 = $ovals{$k} || 0; my $tw = $target->word_from_id($k); my $P2 = $DIC1->{$w}{trans}{$tw} || 0; if ($wa1Occ + $wa2Occ == 0.0000000) { $dic{$k} = ($Oc2source * $P1 * ($Oc1source/100000) + $Oc1source * $P2 * ($Oc2source/100000)) / (($Oc1source/100000) * $Oc2source + ($Oc2source/100000) * $Oc1source); } else { $dic{$k} = ($Oc2source * $P1 * $wa1Occ + $Oc1source * $P2 * $wa2Occ) / ($wa1Occ * $Oc2source + $wa2Occ * $Oc1source); } } my $index = 0; for my $k (sort {$dic{$b} <=> $dic{$a}} keys %dic) { last if $index >= $MAXENTRY; $stDic->set_val($wid, $index, $k, $dic{$k}); $index++; } $stDic->set_occ($wid, $wa1Occ+$wa2Occ); } $stDic->save("$selfdir/source-target.bin"); $stDic->close(); $target->close(); $source = Lingua::NATools::Lexicon->new("$selfdir/source.lex"); print STDERR "step 5: add target/source dictionary\n"; my $tsDic = Lingua::NATools::Dict->open("$selfdir/target-source.bin"); die unless $tsDic; $tsDic->enlarge($new_target_dic_size); for my $w (keys %$DIC2) { my $wid = $DIC2->{$w}{id}; my $wa1Occ = $tsDic->occ($wid); my $wa2Occ = $DIC2->{$w}{count}; my $ovals = $tsDic->vals($wid); my %ovals = @$ovals; my @keys = keys %ovals; push @keys, grep { !exists($ovals{$_}) } map {$DIC1->{$_}{id}} keys %{$DIC2->{$w}{trans}}; my %dic = (); for my $k (@keys) { my $P1 = $ovals{$k} || 0; my $tw = $source->word_from_id($k); my $P2 = $DIC2->{$w}{trans}{$tw} || 0; if ($wa1Occ + $wa2Occ == 0.0000000) { $dic{$k} = ($Oc2target * $P1 * ($Oc1target/100000) + $Oc1target * $P2 * ($Oc2target/100000)) / (($Oc1target/100000) * $Oc2target + ($Oc2target/100000) * $Oc1target); } else { $dic{$k} = ($Oc2target * $P1 * $wa1Occ + $Oc1target * $P2 * $wa2Occ) / ($wa1Occ * $Oc2target + $wa2Occ * $Oc1target); } } my $index = 0; for my $k (sort {$dic{$b} <=> $dic{$a}} keys %dic) { last if $index >= $MAXENTRY; $tsDic->set_val($wid, $index, $k, $dic{$k}); $index++; } $tsDic->set_occ($wid, $wa1Occ+$wa2Occ); } $tsDic->save("$selfdir/target-source.bin"); $tsDic->close(); $source->close(); print STDERR "** DONE **\n"; =encoding UTF-8 =head1 NAME nat-addDict: adds a dictionary in Perl Dumper format into a NATools corpus. =head1 SYNOPSIS nat-addDict nat-addDict =head1 DESCRIPTION This command is used to add an external dictionary (in Perl Dumper format) to a NATools corpus. =head1 SEE ALSO NATools documentation, perl(1) =head1 AUTHOR Alberto Manuel Brandão Simões, Eambs@cpan.orgE =head1 COPYRIGHT AND LICENSE Copyright (C) 2006-2009 by Alberto Manuel Brandão Simões =cut