#!/usr/bin/perl -w -s
# freqoco - count word occurrences, or character n-grams weighted by word
# occurrences, in one or more text files.
#
# Command-line switches (-i, -log, -o=..., ...) are parsed by perl's own -s
# flag, which stores them in the package variables declared with "our" below.

use Lingua::PT::PLN;
use utf8::all;
use strict;
# use locale;

my $filename = shift
    or die("usage: freqoco [-log] [-i] [-bigrams] [-trigrams] [-o=output.txt] file(s)\n");

# $utf8 is declared so that -utf8 is an accepted switch; UTF-8 is already the
# encoding used whenever -latin1 is not given.
our ($oco, $o, $i, $log, $utf8, $latin1, $trigrams, $bigrams, $unigrams);
$log ||= 0;
$i   ||= 0;
my $enc = ($latin1 ? "cp1252" : "utf8");

# N-gram size requested; 0 means plain word counting.  When several n-gram
# switches are given, the largest one wins.
my $n = $trigrams ? 3
      : $bigrams  ? 2
      : $unigrams ? 1
      :             0;

# Default output name is derived from the first input file.
$o ||= "$filename.$n.oco" if $n;
$o ||= "$filename.oco";

if ($n) {
    # Word => count table, either loaded from existing "count word" files
    # (-oco) or computed from raw text by oco() (presumably exported by
    # Lingua::PT::PLN).
    my %oco;
    if ($oco) {
        for my $f ($filename, @ARGV) {
            my $in;
            unless (open($in, "<", $f)) {
                # Report the problem but keep going with the remaining files.
                warn("freqoco: cannot open '$f': $!\n");
                next;
            }
            while (my $line = <$in>) {
                next unless $line =~ /(\d+)\s*(\S+)/;
                my ($count, $word) = ($1, $2);
                $oco{ $i ? lc($word) : $word } += $count;
            }
            close $in;
        }
    }
    else {
        %oco = oco({ ignorecase => $i, log => $log, encoding => $enc }, $filename, @ARGV);
    }

    # Slide a window of $n characters over every word; each n-gram receives
    # the count of the word it was found in.  The lookahead keeps the window
    # advancing one character at a time, so n-grams overlap.
    my %ocog;
    my $tail  = $n - 1;
    my $ngram = qr{(.)(?=(.{$tail}))};
    while (my ($word, $count) = each %oco) {
        while ($word =~ m{$ngram}g) {
            $ocog{"$1$2"} += $count;
        }
    }

    # The output name is embedded in a shell command, so escape any single
    # quotes it contains before wrapping it in single quotes.
    (my $quoted = $o) =~ s/'/'\\''/g;
    open(my $out, "|-", "sort -nr > '$quoted'")
        or die("freqoco: cannot run sort: $!\n");
    while (my ($gram, $count) = each %ocog) {
        print {$out} "$count $gram\n";
    }
    # close() on a pipe also reports a non-zero exit status of the command.
    close($out)
        or die("freqoco: sort failed while writing '$o' (status $?)\n");
}
else {
    # Plain word frequencies: oco() is asked to write the output file itself.
    oco({ log => $log, output => $o, num => 1, ignorecase => $i, encoding => $enc },
        $filename, @ARGV);
}

__END__

=head1 NAME

freqoco - Word frequency calculator

=head1 SYNOPSIS

  freqoco [-o=output] [-i] file(s)

=head1 DESCRIPTION

Example of the output produced

  123 the
  100 one
  23 cat
  3 Option

=head1 Options

  -utf8      input encoding is UTF8 (the default)
  -latin1    input encoding is cp1252
  -o=out     output is sent to "out" (def: file.oco)
  -log       output is in logarithmic scale (log(oco / million))
  -i         ignore case (output is all in lower case)
  -trigrams
  -bigrams
  -unigrams
  -oco       input is in "oco" format (useful for ngrams)

  example: freqoco -bigrams -oco corpus.oco

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

Lingua::PT::PLN

perl(1).

=cut