#!/usr/bin/perl -w -s

# freqnormpt - evaluate word and lemma frequencies.
#
# Command-line switches are handled by perl's own -s flag (see the shebang
# line): every "-name" or "-name=value" switch given before the file names
# is stored in the package variable of the same name declared with "our"
# below.  Full user documentation is in the POD after __END__.

use strict;

use Lingua::Jspell;
use Lingua::PT::PLN;
use Data::Dumper;
use Storable;
use utf8::all;
use File::Temp qw(tempfile);

our ($simple, $oco, $complex, $raw, $lemma, $alemma, $steps, $log, $h, $help);

if ($h || $help) {
    print <<'END_USAGE';
Usage freqnormpt [-simple] [-oco] file+
  -simple  just give the words in inverse order
  -oco     input is already in (number word)* format
  -complex a more complex information about form oco.
  -raw     complex information on raw(perl) mode.
  -lemma   input is already in (number word)* format for lemma forms
  -alemma  input is already in (number word (conf=30%))* format
  -steps=l1:l2:l3:..:ln  limits
  -log=k   uses k for maximum
END_USAGE
    exit;
}

my $dict = Lingua::Jspell->new("port");
my $max  = 0;       # largest single occurrence count seen in the input
my @inputs;         # files holding "number word ..." lines

if ($oco || $lemma || $alemma) {
    # The input files already contain occurrence counts.
    @inputs = @ARGV;
}
else {
    # Raw text: let oco() (Lingua::PT::PLN) count the words into a private
    # temporary file, removed automatically when the program exits.
    my ($tmp_fh, $tmp_name) = tempfile('freqnormpt_XXXXXX', TMPDIR => 1, UNLINK => 1);
    close $tmp_fh;
    oco({ num => 1, output => $tmp_name, encoding => "utf8" }, @ARGV);
    @inputs = ($tmp_name);
}

$complex = 1 if $raw;

# %oco is keyed by lemma (or by word, when the word is unknown).  Its values
# depend on the mode:
#   -lemma    : plain number (total occurrences)
#   -complex  : array ref of [count, form] or [count, form, [all lemmas]]
#   otherwise : hash ref { gar => certain count, duv => doubtful count,
#                          desc => true when the word is unknown }
my %oco   = ();
my %gara  = ();     # -complex: occurrences attributed unambiguously
my %ngara = ();     # -complex: occurrences shared with other lemmas

for my $file (@inputs) {
    my $in;
    if ($file eq '-') {
        # The former two-argument open() treated "-" as standard input.
        $in = \*STDIN;
    }
    elsif (!open($in, '<', $file)) {
        warn "freqnormpt: cannot open '$file': $!\n";
        next;
    }

    while (my $line = <$in>) {
        chomp $line;
        my ($numoco, $pal, $resto) = split(' ', $line, 3);
        next unless defined $pal;       # blank line or line without a word
        next if $pal =~ /[0-9]/;
        $max = $numoco if $numoco > $max;

        if ($lemma) {
            $oco{$pal} += $numoco;
        }
        elsif ($alemma) {
            if ($resto && $resto =~ /\(\s*conf.*?\)\s*1/) {
                $oco{$pal}{desc} = 1;
            }
            my $confia = ($resto && $resto =~ /conf=(\d+)/) ? $1 / 100 : 1;

            # Split this line's count into its certain and doubtful shares.
            # The doubtful share is computed from this line alone, so that a
            # lemma seen more than once still adds up to the right total.
            my $certain = $numoco * $confia;
            $oco{$pal}{gar} += $certain;
            $oco{$pal}{duv} += $numoco - $certain;
        }
        else {
            my @w      = $dict->rad($pal);
            my $duvida = (scalar(@w) > 1);  # more than one candidate lemma

            if ($complex) {
                for my $rad (@w) {
                    if (!$duvida) {
                        $gara{$rad} += $numoco;
                        push @{ $oco{$rad} }, [ $numoco, $pal ];
                    }
                    else {
                        $ngara{$rad} += $numoco;
                        push @{ $oco{$rad} }, [ $numoco, $pal, [@w] ];
                    }
                }
            }
            elsif (@w) {
                for my $rad (@w) {
                    $oco{$rad}{desc} = 0;
                    if ($duvida) {
                        $oco{$rad}{duv} += $numoco;
                        $oco{$rad}{gar} += 0;
                    }
                    else {
                        $oco{$rad}{gar} += $numoco;
                        $oco{$rad}{duv} += 0;
                    }
                }
            }
            else {
                # The dictionary returned nothing: keep the form itself and
                # flag it as unknown.
                $oco{$pal}{duv} += $numoco;
                $oco{$pal}{gar} += 0;
                $oco{$pal}{desc} = 1;
            }
        }
    }
    close $in unless $file eq '-';
}

Log::setmax($max);
Log::setlogmax($log <= 1 ? 12 : $log) if $log;

if ($raw) {
    #store("_.store",\%oco);
    print Dumper(\%oco);
}
elsif ($complex) {
    for my $key (sort keys %oco) {
        my $sure   = defined $gara{$key}  ? $gara{$key}  : '';
        my $unsure = defined $ngara{$key} ? $ngara{$key} : '';
        print "$key ($sure:$unsure)\n";
        for my $form (@{ $oco{$key} }) {
            # Unambiguous forms carry no list of alternative lemmas.
            my $others = $form->[2] || [];
            print "\t$form->[1]($form->[0]) ", join("/", @$others), "\n";
        }
    }
}
elsif ($steps) {
    print STDERR "not yet ready :(\n\n";
}
elsif ($log) {
    for my $key (sort keys %oco) {
        my $entry = $oco{$key};
        if (ref($entry)) {
            my ($tot, $certain) = entry_counts($entry);
            my $t1 = int(Log::logit($tot));
            my $t2 = int(Log::logit($certain));
            if    ($entry->{desc}) { print("$key ($t1)\n") }
            elsif ($t1 == $t2)     { print("$key $t1\n") }
            else                   { print("$key ($t1,$t2)\n") }
        }
        else {
            printf("%s %d\n", $key, Log::logit($entry));
        }
    }
}
else {
    my (%total_of, %certain_of);
    for my $key (keys %oco) {
        ($total_of{$key}, $certain_of{$key}) = entry_counts($oco{$key});
    }

    # Most frequent first; ties are broken by word so the order is stable.
    my @by_total = sort { $total_of{$b} <=> $total_of{$a} or $a cmp $b } keys %oco;

    for my $key (@by_total) {
        if ($simple) {
            print "$key\n";
            next;
        }
        my $tot    = $total_of{$key};
        my $confia = $tot ? int($certain_of{$key} / $tot * 100) : 0;
        my $desc   = (ref($oco{$key}) && defined $oco{$key}{desc})
                   ? $oco{$key}{desc}
                   : '';
        print sprintf('%10d %20s (conf=%d%%) %s', $tot, $key, $confia, $desc), "\n";
    }
}

# entry_counts($entry) -> ($total, $certain)
# Return the total and the certain occurrence counts of one %oco value.
# A plain number (as stored by -lemma) counts as entirely certain; a hash
# ref contributes its "gar" (certain) and "duv" (doubtful) fields.
sub entry_counts {
    my ($entry) = @_;
    return ($entry, $entry) unless ref($entry);
    my $certain  = $entry->{gar} || 0;
    my $doubtful = $entry->{duv} || 0;
    return ($certain + $doubtful, $certain);
}

package Log;

# Logarithmic rescaling: logit() maps a count n to log(n) * $magicF, where
# $magicF is chosen so that the count $lmax maps to $maxlog.
my ($lmax, $maxlog, $magicF);

# Initialised at compile time because the main program above calls these
# subs before run-time execution reaches this point of the file.
BEGIN {
    $lmax   = 1000000;
    $maxlog = 13.815;                   ## = log(1000000)
    $magicF = $maxlog / log($lmax);
}

# setlogmax($k): set the value the largest count is mapped to.
sub setlogmax {
    $maxlog = shift;
    $magicF = $maxlog / log($lmax);
    return;
}

# setmax($n): set the largest count; values <= 1 are replaced by 1.1 so
# that log() of it is never zero.
sub setmax {
    $lmax = shift;
    if ($lmax <= 1) { $lmax = 1.1 }
    $magicF = $maxlog / log($lmax);
    return;
}

# logit($n): return $n on the logarithmic scale, or 0 when $n is missing,
# zero or negative (log() is undefined there).
sub logit {
    my $n = shift;
    return 0 unless $n && $n > 0;
    return log($n) * $magicF;
}

1;

__END__

=encoding utf8

=head1 NAME

freqnormpt - evaluate word and lemma frequencies

=head1 SYNOPSIS

 freqnormpt [-simple] [-oco] file+
   -simple   just give the words in inverse order
   -oco      input is already in (number word)* format
   -complex  a more complex information about form oco.
   -raw      complex information on raw (perl Data::Dumper) mode.
   -lemma    input is already in (number word)* format for lemma forms
   -alemma   input in (number word (conf=number%) 1?)* format for lemma forms
   -steps=l1:l2:l3:..:ln limits (not yet ready to work)
   -log=k    uses logarithmic scale with k for maximum

=head1 DESCRIPTION

C<freqnormpt> reads a text or a list of (integer, word) pairs and builds
lemma frequencies in several possible output formats.

By default, the format is

      10411                   ir (conf=33%)
       9464                  ter (conf=82%)
       9140                  por (conf=100%)
       8403                 pelo (conf=11%)
       7739                  não (conf=100%)
          9                 nnão (conf=0%) 1

Each value has the number of occurrences and a confidence factor (many
words may have more than one possible lemma).

When the word is unknown to the dictionary, a "1" appears at the end
(see nnão).

=head2 Option C<-simple>

With the option C<-simple>, the output is just a list of words sorted by
the number of occurrences (most frequent first):

 o
 de
 do
 a
 e
 que

=head2 Option C<-complex>

With the option C<-complex>, the output is a list of wordinfo in the form:

 lemma --> (int x int) x (word --> (int x OtherPossibleLemmas))

Example:

 ajuda (:229)
        ajuda(176) ajuda/ajudar
        ajudas(53) ajuda/ajudar
 ajudar (175:229)
        ajuda(176) ajuda/ajudar
        ajudar(140)
        ajudas(53) ajuda/ajudar
        ajudam(15)
        ajudaram(10)
        ajude(10)

=head2 Option C<-lemma>

The input is already in (number word)* format for lemma forms.

=head2 Option C<-alemma>

Accepts input produced by the default options of freqnormpt.

=head2 Option C<-log>

Option C<-log> makes output in logarithmic scale. Default range is 0 .. 12

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

Alberto Simões, albie@alfarrabio.di.uminho.pt

=head1 SEE ALSO

freqcomp(1).

perl(1).

=cut