#!/usr/bin/perl -s use strict; use warnings; use Lingua::NATools::Lexicon; use Lingua::NATools::Dict; our ($h); sub usage { print "nat-substDict: substitutes the dictionary in a NATools corpus by a Dumper PTD\n\n"; print "\tnat-substDict \n\n"; print "\tnat-substDict \n\n"; print "For more help, please run 'perldoc nat-substDict'\n"; exit; } usage() if ($h); my $selfdir = shift @ARGV; my $dumper = shift @ARGV; my $dumper2 = shift @ARGV || undef; usage() unless -d $selfdir && -f $dumper; print STDERR "step 1: load Data::Dumper file\n"; our ($DIC1, $DIC2); if ($dumper2) { $DIC1 = do $dumper; $DIC2 = do $dumper2; } else { do $dumper; } my $s1 = keys %$DIC1; my $s2 = keys %$DIC2; printf STDERR " size 1: $s1\n"; printf STDERR " size 2: $s2\n"; print STDERR "step 2: associate IDs to source Dumper entries\n"; die "Can't find source.lex file" unless -f "$selfdir/source.lex"; my $source = Lingua::NATools::Lexicon->new("$selfdir/source.lex"); $source->enlarge($s1); my $i = 0; my $new = 0; for my $w (keys %$DIC1) { ++$i; my $wid = ($w eq "(null)" || $w eq "(none)")?1:$source->id_from_word($w); # this increments the occurrence number to 1 unless ($wid) { $wid = $source->add_word($w); $new++; } $source->set_id_count($wid, $DIC1->{$w}{count}); $DIC1->{$w}{id} = $wid; } printf STDERR " %d ids reused\n", $i - $new; printf STDERR " %d new ids\n", $new; $s1 = $source->size(); $source->save("$selfdir/source.lex"); $source->close(); print STDERR "step 3: associate IDs to target Dumper entries\n"; die "Can't find target.lex file" unless -f "$selfdir/target.lex"; my $target = Lingua::NATools::Lexicon->new("$selfdir/target.lex"); $target->enlarge($s2); $i = 0; $new = 0; for my $w (keys %$DIC2) { ++$i; my $wid = ($w eq "(null)" || $w eq "(none)")?1:$target->id_from_word($w); # this increments the occurrence number to 1 unless ($wid) { $wid = $target->add_word($w); $new++; } $target->set_id_count($wid, $DIC2->{$w}{count}); $DIC2->{$w}{id} = $wid; } printf STDERR " %d ids reused\n", $i - $new; printf STDERR " %d new ids\n", $new; $s2 = $target->size(); $target->save("$selfdir/target.lex"); $target->close(); print STDERR "step 4: change source/target dictionary\n"; unlink "$selfdir/source-target.bin" if -f "$selfdir/source-target.bin"; my $stDic = Lingua::NATools::Dict->new("$selfdir/source-target.bin", $s1); die unless $stDic; for my $w (keys %$DIC1) { my $wid = $DIC1->{$w}{id}; $stDic->set_occ($wid, $DIC1->{$w}{count}); my $index = 0; for my $k (keys %{$DIC1->{$w}{trans}}) { die "$wid > $s1" if $wid > $s1; $stDic->set_val($wid, $index, $DIC2->{$k}{id}, $DIC1->{$w}{trans}{$k}); ++$index; } } $stDic->save(); $stDic->close(); print STDERR "step 5: change target/source dictionary\n"; unlink "$selfdir/target-source.bin" if -f "$selfdir/target-source.bin"; my $tsDic = Lingua::NATools::Dict->new("$selfdir/target-source.bin", $s2); die unless $tsDic; for my $w (keys %$DIC2) { my $wid = $DIC2->{$w}{id}; $tsDic->set_occ($wid, $DIC2->{$w}{count}); my $index = 0; for my $k (keys %{$DIC2->{$w}{trans}}) { die "$wid > $s2" if $wid > $s2; $tsDic->set_val($wid, $index, $DIC1->{$k}{id}, $DIC2->{$w}{trans}{$k}); ++$index; } } $tsDic->save(); $tsDic->close(); =encoding UTF-8 =head1 NAME nat-substDict: substitutes a dictionary in a NATools corpus by a Perl Dumper format PTD. =head1 SYNOPSIS nat-substDict nat-substDict =head1 DESCRIPTION This tool is used to substitute a dictionary on a NATools corpus. Unless you know what you are doing, this can be a bad option making the corpus unusable. To use it just pass as first parameter a NATools corpus directory and as second argument a Perl Data::Dumper file with the two dictionaries (or a pair of Data::Dumper files, one with each dictionary). =head1 SEE ALSO NATools documentation, perl(1) =head1 AUTHOR Alberto Manuel Brandão Simões, Eambs@cpan.orgE =head1 COPYRIGHT AND LICENSE Copyright (C) 2006-2009 by Alberto Manuel Brandão Simões =cut