#!/usr/bin/perl -w -s

use utf8::all;
use strict;

# Command-line switches, filled in by perl's -s option before the script runs:
#   $f    -f        wrap every word in "=" before extracting n-grams
#   $oco  -oco      the inputs are already "count word" tables
#   $o    -o=NAME   base name for the output files
#   $i    -i        fold words to lower case
#   $log  -log      declared and defaulted here, not consulted anywhere below
our($f,$oco,$o,$i,$log);

# First input file; any further ones stay in @ARGV.  Test for presence rather
# than truth so that a file literally named "0" is accepted.
my $filename = shift;
die ("usage: $0 [-o=output.txt] file(s)\n")
  unless defined $filename && length $filename;

$log||=0;

$i||=0;

# Shared scratch pair for the each() loops further down.
my ($k,$v);

# Output files are named "$base.oco", "$base.1.oco", ...; -o overrides the
# default of naming them after the first input file.
my $base = (defined $o && length $o) ? $o : $filename;

# $hoco: reference to the word => occurrences table built from all inputs.
# It starts out as an empty table.
my $hoco = {};
  if($oco){
    # -oco: pick "<count> <word>" out of every input line; counts for the
    # same word are summed across lines and files.
    for my $file ($filename, @ARGV){
      my $in;
      unless(open($in,"<",$file)){
        # Report the unreadable input and carry on with the remaining files.
        warn "$0: cannot open '$file': $!\n";
        next;
      }
      while(my $line = <$in>){
        next unless $line =~ /(\d+)[ \t]+(\S+)/;
        my ($count, $word) = ($1, $2);
        $word = lc($word) if $i;
        $hoco->{$word} += $count;
      }
      close $in;
    }
  }
  else{
     # Plain text input: let utf8oco tokenise the files and count the words.
     $hoco = utf8oco({ignorecase=>$i}, $filename, @ARGV);
  }

  # Character n-gram tables (1, 2 and 3 characters long).  Every n-gram is
  # credited with the occurrence count of each word it appears in.
  my (%oco1, %oco2, %oco3);
  for my $word (keys %$hoco) {
       my $weight = $hoco->{$word};
       my $chars  = $f ? "=$word=" : $word;   ## = for word frontiers
       next if $chars =~ /\d/;                ## words containing digits give no n-grams

       # The look-ahead keeps the match position advancing one character at
       # a time, so overlapping n-grams are all seen.
       $oco3{$1 . $2} += $weight while $chars =~ m{(.)(?=(..))}g;
       $oco2{$1 . $2} += $weight while $chars =~ m{(.)(?=(.))}g;
       $oco1{$1}      += $weight while $chars =~ m{(.)}g;
  }
  # Write the four frequency tables, one "count<TAB>item" line per entry,
  # ordered by sort(1) -nr (numerically, largest count first).
  # sort is started through the list form of open, so no shell is involved
  # and $base may contain quotes, spaces or other shell metacharacters;
  # sort's -o option names the output file in place of a shell redirection.
  for my $table ( [ "$base.1.oco", \%oco1 ],
                  [ "$base.2.oco", \%oco2 ],
                  [ "$base.3.oco", \%oco3 ],
                  [ "$base.oco",   $hoco  ] ) {
    my ($path, $counts) = @$table;

    open(my $sorter, "|-", "sort", "-nr", "-o", $path)
      or die "$0: cannot run sort for '$path': $!\n";

    while (my ($item, $n) = each %$counts) { print {$sorter} "$n\t$item\n" }

    # close() on a pipe is false when sort exits with a non-zero status.
    close($sorter)
      or die "$0: sort failed while writing '$path'"
           . ($! ? ": $!" : " (exit status " . ($? >> 8) . ")") . "\n";
  }


# utf8oco([\%options,] @files) -> \%occurrences
#
# Counts word occurrences across the given files and returns a reference to
# a hash mapping each word to the number of times it was seen.
#
# Options (optional leading hash reference):
#   ignorecase => true   fold every word to lower case before counting
#
# A word is a run of at least two \w characters in which a single "-" or "'"
# may join two word characters ("well-known", "don't"); one-character tokens
# are therefore not counted.
#
# A file that cannot be opened is reported with a warning and skipped.
sub utf8oco{
 my %opt =(ignorecase => 0);
 if(ref($_[0]) eq "HASH") {%opt = (%opt , %{shift(@_)}) } ;
 my %oco;
 for my $file (@_){
    my $in;
    unless(open($in,"<",$file)){
        warn "$0: cannot open '$file': $!\n";
        next;
    }
    while(my $line = <$in>){
        # Only "-" and "'" are accepted between word characters.
        for my $w ($line =~ m/(\w(?:[-']\w|\w)+)/g){
           if($opt{ignorecase}){ $oco{lc($w)}++ }
           else                { $oco{$w    }++ }
        }
    }
    close $in;
 }
 return \%oco;
}

__END__

=head1 NAME

123gram - uni/bi/trigram and word frequency calculator

=head1 SYNOPSIS

 123gram [-o=output] [-i] file(s)

=head1 DESCRIPTION

 123gram file 

creates:

 file.oco     (word freq)
 file.1.oco   (unigrams)
 file.2.oco   (bigrams)
 file.3.oco   (trigrams)

Example of the output produced

 123   the
 100   one
 23    cat
 3     Option

=head1 Options

 -utf8      input encoding is UTF-8 (the switch itself is not read by the script; utf8::all is always loaded)
 -o=out     output is out.oco out.1.oco out.2.oco out.3.oco (def: file.oco)
 -log       output is in logarithmic scale (log(oco / million)); accepted but not implemented yet
 -i         ignore case (output is all in lower case)
 -f         use "=" to mark word boundaries in the n-grams (start/end of the word)
 -oco       input is in "oco" format (useful for ngrams)
       example:  123gram -oco corpus.oco

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO


perl(1).

=cut