#!/usr/bin/perl -s

#use File::Spec;
#use File::Spec::Functions;
use utf8::all;
use Cwd 'abs_path';
use Text::Hunspell;
use strict;
our($minratio,$l, $i, $pcase, $pacc, $txt, $min, $max, $lat1, $latin1, $q, $br, $oco, $debug,$c,$std,$log,$ptall);
my $pp="";

my %L=(pt => "cetemGordo.txt",
       en => "bnc.txt");

$l //= "pt";

my $lll=$l;
$lll = "pt_PT" if $l eq "pt";
$lll = "en_US" if $l eq "en";
my $s;

if($c){
  $s= Text::Hunspell->new(	
     "/usr/share/hunspell/$lll.aff",
     "/usr/share/hunspell/$lll.dic",);
#  $s->set_option('lang',$lll);
}

my $dir="/usr/local/share/oco";

my $op = " -sub ";
   $op.= " -minratio=$minratio " if $minratio;
   $op.= " -pcase " if $pcase;
   $op.= " -pacc " if $pacc;

if($br) { $std //= "$dir/tokens.todos_br" } 
if($ptall) { $std //= "$dir/tokens.todos_ptall" } 
if($q)  { $std //= "$dir/cetemMagro.txr" } 
if($l)  { $std //= "$dir/".$L{$l} } 
if($i)  { $op .= " -ignorecase " }
if($oco){ $op .= " -oco " }         else { $op .= " -istxt1 " }
if($txt){ $op .= " -istxt1 " }
if($min){ $op .= " -minoco=$min " } else { $op .= " -minoco=30 " }

if($max){ $pp .= " |head -$max "}
if($lat1){ $pp .= " | iconv -t CP1252 -t utf8  "}

my $tmp="/tmp/__$$";
my $x= shift or die("wrong usage; try perldoc susprise\n");
my $charset= `file -i $x`;
if(not $lat1 and $charset =~ m!text/\S+\s*charset=(\S+)!){
  if   ($1 eq "utf-8")     {linkit($x, $tmp)}
  elsif($1 eq "utf-16")    {syst("iconv -c -f utf16  -t utf8   $x > $tmp")}
  elsif($1 eq "unknown")   {syst("iconv -c -f cp1252 -t utf8   $x > $tmp")}
  elsif($1 eq "iso-8859-1"){syst("iconv -c -t utf8   -f latin1 $x > $tmp")}
  else                     {linkit($x, $tmp)}
}
else                       {linkit($x, $tmp)}

my $aux="/tmp/_surprise$$";

if($log){syst("freqcomp -log $op $tmp $std |sort -nr $pp | cut -c '1-7,12-' > $aux"); }
else { syst("freqcomp $op $tmp $std |sort -r $pp | cut -c '1-7,12-' > $aux"); }

open(F,"<",$aux);
while(<F>){
  if($c){
    if(/(\S+)\s+(\S{3,})(.*)/){
      if ($s->check($2)){ print "+"}
      else              { print "-"}
    }
  }
  print $_;
}
close(F);

unlink("__$$",$aux);

sub linkit{
  my ($a,$tmp)=@_;
  my $abs_path = abs_path($a);
  syst("ln -s $abs_path $tmp");
}
 
sub syst{
  my $com = shift;
  print "Debug $com\n" if $debug;
  system($com);
  if ($!) { print STDERR "Erro ($com): $_\n"; }
}

__END__

=encoding utf8

=head1 NAME

surprise - extracting unexpected words from text

=head1 SYNOPSIS

 surprise [options] file.oco
 surprise -txt [options] file.txt

=head1 DESCRIPTION

Compare the number of ocurences of the words in a text and the words in
a big reference corpus. Output is sorted by "what is new in this text"

=head2 Options

 -i        ignore case
 -pcase    preferencial case
 -pacc     preferencial accents
 -l=EN     set language (def: PT)
 -txt      input is a text (def: list of (word ocurrences)
 -min=10   minimum of occurences/million of words to be analysed (def: 30)
 -max=     Maximum number of words in output
 -lat1     input is encoded in latin1 (def: utf8)
 -oco      input is in oco format (see freqoco command)
 -std=file.oco   user provided  word-oco database
 -q        quick (compare text with smaller word-oco database)
 -br       compare agains pt_br tokens (instead of pt_pt tokens)
 -ptall    compare agains (pt_br+pt_pt) tokens
 -loq      use log compare/sort strategies
 -minratio minimum logaritmic difference
           eg(minratio=2.30  (=log(10)) -- 10 times bigger)

=head1 AUTHOR

J.Joao Almeida,  jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

freqcomp

=cut