#!/usr/bin/perl -w -s

use Lingua::PT::PLN;
use utf8::all;
use strict;
# use locale;

# ----------------------------------------------------------------------
# freqoco main program.
#
# Command-line switches are handled by perl's -s flag (see the #! line):
# each "-name" or "-name=value" argument that precedes the first file name
# sets the package variable $name before this code runs and is removed from
# @ARGV.  What remains in @ARGV are the input files.
# ----------------------------------------------------------------------

# First input file; it also supplies the base name of the default output
# file.  Test for defined/non-empty instead of truth so that a file that is
# literally named "0" is accepted.
my $filename = shift;
die("usage: freqoco [-log] [-i] [-latin1] [-oco] [-unigrams] [-bigrams] [-trigrams] [-o=output.txt] file(s)\n")
    unless defined $filename and length $filename;

# Switch variables populated by "perl -s".  $utf8 is accepted but never
# consulted: UTF-8 is the encoding used unless -latin1 is given.
our($oco,$o,$i,$log,$utf8,$latin1,$trigrams,$bigrams,$unigrams);

# Normalise the boolean switches to 0 when they were not given.
$log ||= 0;
$i   ||= 0;

# Input encoding name handed to oco().
my $enc = $latin1 ? "cp1252" : "utf8";

# Width of the requested character n-gram (0 = plain counting via oco()).
# When several switches are given, the widest one wins.
my $ngram = $trigrams ? 3
          : $bigrams  ? 2
          : $unigrams ? 1
          :             0;

# Default output name: file.N.oco for n-grams, file.oco otherwise.
# An explicit -o=name always takes precedence.
$o ||= $ngram ? "$filename.$ngram.oco" : "$filename.oco";

if ($ngram) {
    # Step 1: obtain a token => count table.
    my %oco;
    if ($oco) {
        # -oco: the inputs are already "count token" listings.  The first
        # "<digits> <non-blank run>" found on each line is accumulated;
        # lines without such a pair are ignored.
        for my $file ($filename, @ARGV) {
            open(my $in, "<", $file)
                or die "freqoco: cannot open '$file': $!\n";
            while (my $line = <$in>) {
                next unless $line =~ /(\d+)\s*(\S+)/;
                my ($count, $token) = ($1, $2);
                $token = lc $token if $i;    # -i folds case
                $oco{$token} += $count;
            }
            close $in;
        }
    }
    else {
        # oco() is provided by Lingua::PT::PLN; it is used here as
        # returning a token => count list (presumably; confirm in the
        # module documentation).
        %oco = oco({ignorecase=>$i, log=> $log, encoding => $enc },
                   $filename, @ARGV);
    }

    # Step 2: spread each token's count over its overlapping character
    # n-grams.  One character is consumed per match and the remaining
    # (n-1) characters are captured through a lookahead, so consecutive
    # n-grams overlap.
    my %ocog;
    my $tail     = $ngram - 1;
    my $ngram_re = qr{(.)(?=(.{$tail}))};
    while (my ($token, $count) = each %oco) {
        while ($token =~ /$ngram_re/g) {
            $ocog{"$1$2"} += $count;
        }
    }

    # Step 3: write "count<TAB>ngram" lines through sort(1), highest
    # count first.  The list form of the pipe open bypasses the shell, so
    # an output name containing quotes or other metacharacters is passed
    # to sort verbatim as the argument of -o.
    open(my $sorter, "|-", "sort", "-nr", "-o", $o)
        or die "freqoco: cannot run sort: $!\n";
    while (my ($gram, $total) = each %ocog) {
        print {$sorter} "$total\t$gram\n";
    }
    # close() on a pipe waits for the child; a false return means either
    # an I/O error ($! set) or a non-zero exit status (in $?).
    close($sorter)
        or die($! ? "freqoco: error writing to sort: $!\n"
                  : "freqoco: sort exited with status $?\n");
}
else {
    # Plain counting: oco() is handed the output file name and does the
    # writing itself.
    oco({ log=> $log,
          output => $o,
          num => 1, ignorecase=>$i, encoding => $enc }, $filename, @ARGV);
}

__END__

=head1 NAME

freqoco - Word frequency calculator

=head1 SYNOPSIS

 freqoco [-log] [-i] [-latin1] [-oco] [-unigrams|-bigrams|-trigrams] [-o=output] file(s)

=head1 DESCRIPTION

Example of the output produced

 123   the
 100   one
 23    cat
 3     Option

=head1 Options

 -utf8      input encoding is UTF-8 (this is the default)
 -latin1    input encoding is cp1252 (Latin-1)
 -o=out     output is sent to "out" (def: file.oco)
 -log       output is in logarithmic scale (log(oco / million))
 -i         ignore case (output is all in lower case)
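 -trigrams  count character trigrams within words (def: file.3.oco)
 -bigrams   count character bigrams within words (def: file.2.oco)
 -unigrams  count single characters within words (def: file.1.oco)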
 -oco       input is in "oco" format (useful for ngrams)
       example:  freqoco -bigrams -oco corpus.oco

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

Lingua::PT::PLN

perl(1).

=cut