#!/usr/bin/perl -w -s

use strict;

use POSIX qw(locale_h);

use utf8;
setlocale(LC_CTYPE, "pt_PT");
use locale;
use Data::Dumper;
use NAT::Matrix;
use NAT::Client;

our($juststep2, $local, $crp, $rules, $offset, $length,$out);
my $NATOOLS= $ENV{NATOOLS} || "$ENV{HOME}/svn/main/NATools/";

$out ||= "__extracted__";

my ($f1, $f2);

unless($juststep2){
  ($f1,$f2) = @ARGV;
  die ("usage: $0 [options] f1 f2\n  option: -local=... -crp=... -rules=...\n")
    unless -f $f2;
  step1({ 
    f1=>$f1, f2=>$f2, 
    ($out ? (  output=>$out):()),
    ($length ? (  length=>$length):()),
    ($offset ? (  offset=>$offset):()),
    ($rules  ? (  rules=>$rules):()), 
    ($crp    ? (  crp=>$crp):())  ,
    ($local  ? (  local=>$local ):()) ,
  });  
}

step2();

sub step1 {
 my %opt =(length => 10_000_000, 
           rules  => "$NATOOLS/rules.test-case", 
           output => "__extracted__",
          );
 if(ref($_[0]) eq "HASH") {%opt = (%opt , %{shift(@_)}) } ;

 my $offset = $opt{offset} || "";
 die("Cant file Rules ($opt{rules})\n") unless -f $opt{rules};

 my $clength = 0;

 local $/ = "\n\$\n";

 open F1, "$f1";
 open F2, "$f2";
 my ($s1,$s2);

 print STDERR "Parsing rules file\n";
 my $rules = NAT::PatternRules->parseFile($opt{rules}) if $opt{rules};

 my $server;
 if ($opt{local}) {
   print STDERR "Loading corpus from '$opt{local}'\n";
   $server = NAT::Client->new(local => $opt{local}); } 
 elsif ($opt{crp}) {
   $server = NAT::Client->new;
   $server->set_corpus($opt{crp}); }
 else { 
   die("I need a corpus: please provide option '-local=... or -crp=...\n") }

 if(!$offset && -f "__number"){ $offset = `cat __number` + 0 }

 if($offset){ open( EXT,">>$opt{output}");}
 else       { open( EXT,">$opt{output}");}

 print STDERR "Recovering: Skipping to $offset...\n" if $offset;
 print STDERR "length= $opt{length}\n";

 while (defined($s1 = <F1>) && defined($s2 = <F2>) && $clength < $opt{length}){

    $clength++;
    if ($offset) { $offset--; next }
    `echo $clength > __number_`;

    print STDERR "$clength\n" unless ($clength % 10);

    chomp $s1;
    chomp $s2;

    my @x = split /\s+/, $s1;
    my @y = split /\s+/, $s2;

    next unless $s1 =~ /[a-zA-Z]{2}/; #JJ
    next if @x > 100;
    next if @y > 100;
    next if $s1 eq $s2;  #JJ
    next if @x > 6 && @y > 2 * @x ;  #JJ
    next if @y > 6 && @x > 2 * @y ;  #JJ
    next if @y > 4 * @x ;  #JJ
    next if @x > 4 * @y ;  #JJ

    my $matrix = NAT::Matrix->new($server,$rules,$s1,$s2);
    $matrix->findDiagonal;

    # print Dumper($matrix);

    for my $b (@{$matrix->{patterns}}) {
      my $x = $matrix->dump_block($b);
      print EXT $x->[0], " =!",$b->{id},"!= ", $x->[1], "\n";
    }

    my $blocks = $matrix->grep_blocks;
    my $bs = $matrix->combine_blocks($blocks, 1);

    for my $b (@$bs) {
      my $x = $matrix->dump_block($b);
      print EXT $x->[0], " === ", $x->[1], "\n";
    }
    print EXT "\n";
    `cp  __number_ __number`;
  }
  close EXT;
}


sub step2{
 open (EXT, "grep '=!' $out |LANG=C sort |LANG=C uniq -c | sort -nr |");
 open (TERM, ">_ext1");

 local $/ = "\n";

 while(<EXT>){
  next if m{ [)(%.?"',]
             | \d\s.*\d
             | \b( 
                   or | também | and | e | não | not | the | que | of |um | as
                   | como | with
                 )\b 
           }x;
  print TERM $_;
 }

 close EXT;
 close TERM;
}

unlink ("__number_", "__number");

__END__

=head1 NAME

term_extract - extract terminology and examples from paralell corpora

=head1 SYNOPSIS

 term_extract [options] file1 file2
 options
    -local=corpusDir 
    -crp=corpusName  (def -local )
    -rules=file      (def $NATOOLS/... )
    -offset=number   (def 0 or last number )
    -length=number   (10 000 000 )
    -out=file        (output file; default = __extrated__ )
    -juststep2       (just calculates _ext1 from __extracted__)

=head1 DESCRIPTION

 term_extract creates the following files:
   __extrated__ (or -out=file)  -- examples 
   _ext1                        -- bilingual terminology

=head1 AUTHOR

Alberto Simoes

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut