#!/usr/bin/perl -w -s
use utf8::all;
use strict;

# 123gram - count word frequencies and character uni/bi/trigrams.
#
# Reads one or more text files (or, with -oco, existing "count word"
# frequency lists), tallies words, derives character 1-, 2- and 3-gram
# counts weighted by word frequency, and writes four files sorted by
# descending count: BASE.oco, BASE.1.oco, BASE.2.oco and BASE.3.oco.

# Command-line switches. They are set by perl's -s option (see the #! line)
# before the script body runs:
#   -f     wrap every word in "=" so n-grams record word boundaries
#   -oco   input files are "count word" lists instead of running text
#   -o=X   base name of the output files (default: first input file)
#   -i     fold words to lower case
#   -log   accepted, but not read anywhere in the code below
our ($f, $oco, $o, $i, $log);
$log ||= 0;
$i   ||= 0;

my $filename = shift or die("usage: $0 [-o=output.txt] file(s)\n");
my @inputs   = ($filename, @ARGV);
my $base     = $o || $filename;

# Word => occurrence count, either re-read from "oco" lists or counted
# from the raw text.
my $hoco = $oco
    ? read_oco({ ignorecase => $i }, @inputs)
    : utf8oco({ ignorecase => $i }, @inputs);

my (%oco1, %oco2, %oco3);
while (my ($word, $count) = each %$hoco) {
    $word = "=$word=" if $f;    ## = for word frontiers
    next if $word =~ /\d/;      # words containing digits add no n-grams
    add_ngrams(\%oco3, $word, 3, $count);
    add_ngrams(\%oco2, $word, 2, $count);
    add_ngrams(\%oco1, $word, 1, $count);
}

write_sorted("$base.1.oco", \%oco1);
write_sorted("$base.2.oco", \%oco2);
write_sorted("$base.3.oco", \%oco3);
write_sorted("$base.oco",   $hoco);

# Add $count to the tally of every length-$n substring of $word.
# Words shorter than $n contribute nothing (the position range is empty).
sub add_ngrams {
    my ($tally, $word, $n, $count) = @_;
    for my $pos (0 .. length($word) - $n) {
        $tally->{ substr($word, $pos, $n) } += $count;
    }
    return;
}

# Write the entries of the hash referenced by $count_of to $path, one
# "count key" line each, ordered by the external "sort -nr".
# Dies if the sort process cannot be started or reports a failure.
sub write_sorted {
    my ($path, $count_of) = @_;

    # List-form pipe open: no shell is involved, so quotes or spaces in
    # $path cannot break the command line or inject into it.
    open(my $out, "|-", "sort", "-nr", "-o", $path)
        or die "cannot run sort for '$path': $!\n";
    while (my ($key, $count) = each %$count_of) {
        print {$out} "$count $key\n";
    }

    # Closing a pipe waits for the child; a false return with $! unset
    # means the child exited with a non-zero status (left in $?).
    close($out)
        or die $! ? "error closing pipe to sort for '$path': $!\n"
                  : "sort exited with status $? while writing '$path'\n";
    return;
}

# Read "count word" lines from the given files and sum the counts per word.
#   $opt   - hash reference; a true {ignorecase} lower-cases every word
#   @files - paths of the frequency lists
# Lines that do not contain "digits, blanks, non-blanks" are ignored.
# Files that cannot be opened are reported with a warning and skipped.
# Returns a reference to the word => count hash.
sub read_oco {
    my ($opt, @files) = @_;
    my %oco;
    for my $file (@files) {
        open(my $in, "<", $file)
            or do { warn "cannot open '$file': $!\n"; next };
        while (my $line = <$in>) {
            next unless $line =~ /(\d+)[ \t]+(\S+)/;
            my ($count, $word) = ($1, $2);
            $word = lc($word) if $opt->{ignorecase};
            $oco{$word} += $count;
        }
        close $in;
    }
    return \%oco;
}

# Count word occurrences in the given text files.
#   utf8oco({ ignorecase => 1 }, @files)   or   utf8oco(@files)
# A word is two or more word characters, where inner characters may be
# joined by "-", "'" or '"' (e.g. "don't", "well-known").
# Files that cannot be opened are reported with a warning and skipped.
# Returns a reference to the word => count hash.
sub utf8oco {
    my %opt = (ignorecase => 0);
    if (ref($_[0]) eq "HASH") { %opt = (%opt, %{ shift(@_) }) }
    my @files = @_;

    my %oco;
    for my $file (@files) {
        open(my $in, "<", $file)
            or do { warn "cannot open '$file': $!\n"; next };
        while (my $line = <$in>) {
            for my $w ($line =~ m/(\w(?:[-'"]\w|\w)+)/g) {
                $oco{ $opt{ignorecase} ? lc($w) : $w }++;
            }
        }
        close $in;
    }
    return \%oco;
}

__END__

=head1 NAME

123gram - uni/bi/trigrams and word frequency calculator

=head1 SYNOPSIS

 123gram [-o=output] [-i] file(s)

=head1 DESCRIPTION

C<123gram file> creates:

  file.oco   (word freq)
  file.1.oco (unigrams)
  file.2.oco (bigrams)
  file.3.oco (trigrams)

Example of the output produced

  123 the
  100 one
  23 cat
  3 Option

=head1 Options

 -utf8    input encoding is UTF8
 -o=out   output is out.oco out.1.oco out.2.oco out.3.oco (def: file.oco)
 -log     output is in logarithmic scale (log(oco / million));
          the switch is accepted but the current code does not use it
 -i       ignore case (output is all in lower case)
 -f       use "=" for word boundaries in ngrams (start/end of the word)
 -oco     input is in "oco" format (useful for ngrams)
          example: 123gram -oco corpus.oco

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut