#!/usr/bin/perl -s

# nat-mkRealDict: build a PTD-like dictionary from a word-aligned corpus.
#
# Input  : lines of the form "source<TAB>target", read from the files named
#          on the command line or from STDIN when none are given.
# Output : a Data::Dumper dump, on STDOUT, of a hash reference shaped as
#            { source => { count => N, trans => { target => probability } } }

use warnings;
use strict;

use Data::Dumper;
use List::Util qw(sum);

# Set by the perl -s switch when the script is invoked with -h.
our ($h);

# Hash ordering is randomised by perl; sorting the keys makes the dump
# reproducible from one run to the next.
$Data::Dumper::Sortkeys = 1;

# Print a short help text on STDOUT and terminate the program.
sub usage {
    print "nat-mkRealDict: used to create a dictionary similar to a PTD based on a\n";
    print " word aligned corpus.\n\n";
    print "\tnat-mkRealDict \n\n";
    print "For more help, please run 'perldoc nat-mkRealDict'\n";
    exit;
}

# Split one chomped input line into its (source, target) pair.
#
# Returns the empty list when the line must be ignored: either it is empty,
# or its target column contains whitespace.
# A missing column is returned as the empty string, which is the hash key
# perl would end up using for an undefined value anyway; defaulting it here
# avoids the "uninitialized value" warnings on such lines.
sub parse_alignment {
    my ($line) = @_;

    return if $line =~ m!^$!;

    my ($source, $target) = split /\t/, $line;
    $source = '' unless defined $source;
    $target = '' unless defined $target;

    return if $target =~ m!\s!;
    return ($source, $target);
}

# Given a hash reference mapping a key to its occurrence count, return a new
# hash reference mapping each key to count / (sum of all counts).
sub relative_frequencies {
    my ($count_of) = @_;

    my $total = sum(values %$count_of);

    my %frequency_of;
    for my $key (keys %$count_of) {
        $frequency_of{$key} = $count_of->{$key} / $total;
    }
    return \%frequency_of;
}

usage() if $h;

# Pass 1: count how often each source word occurs and how often it is
# aligned with each target word.
my $data;
while (my $line = <>) {
    chomp $line;

    my ($source, $target) = parse_alignment($line) or next;

    $data->{$source}{count}++;
    $data->{$source}{trans}{$target}++;
}

# Pass 2: turn the per-target counts into relative frequencies.
for my $source (keys %$data) {
    $data->{$source}{trans} = relative_frequencies($data->{$source}{trans});
}

print Dumper($data);

=encoding UTF-8

=head1 NAME

nat-mkRealDict - used to create a dictionary similar to a PTD based on a
word aligned corpus.

=head1 SYNOPSIS

  nat-mkRealDict [-h] [file ...]

=head1 DESCRIPTION

While not fully functional, this script reads a word-aligned corpus (two
columns separated by a tab character) from the supplied file, or from
standard input when no file is given. Lines whose second column contains
whitespace are ignored.

The result, printed to STDOUT, is a dictionary (similar to a PTD in
structure) with the real alignments used in the corpus.

=head1 SEE ALSO

NATools documentation, perl(1)

=head1 AUTHOR

Alberto Manuel Brandão Simões, E<lt>ambs@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006-2009 by Alberto Manuel Brandão Simões

=cut