#!/usr/bin/perl -s
use Data::Dumper;
$Data::Dumper::Indent=1;
$Data::Dumper::Terse=1;

use utf8;

our ($c,$pt2en);

use POSIX qw(locale_h);
setlocale(&POSIX::LC_ALL, "pt_PT");
use locale;

my @baden=qw{
is
a
are 
on
was
were
};

my @badpt=qw{
é são
um uma
sobre
por pela
para
na no nos nas
a o
};

my %baden;
my %badpt;
@baden{@baden}=@baden;
@badpt{@badpt}=@badpt;

if($pt2en){ example2pten()}
else      { example2enpt()}

sub bad{
 my ($pt,$en,$r)=@_;
 for(@$en){
   return(1) if $baden{$_}
 }
 for(@$pt){
   return(1) if $badpt{$_}
 }
 return 0;
}

sub example2pten{
while(<>){
  chomp;
  next if /[_,.:;]/;

  my ($q,$l1,$rule,$l2)=split(/\s*!?=!?\s*/,$_);

  next if ("$l1 $l2" =~ /[^[:alpha:]_ -]/);

  my @l1=split(/\s+/,$l1); 
  my @l2=split(/\s+/,$l2); 
  next if bad(\@l1,\@l2,$rule);
  if($rule eq "ABBA"){
     $dic{$l1[0]}{$l2[1]}{oco}+=$q;
     $dic{$l1[0]}{$l2[1]}{cause}{"sub=sub"}+=$q;
     $dic{$l1[1]}{$l2[0]}{oco}+=$q;
     $dic{$l1[1]}{$l2[0]}{cause}{"adj=adj"}+=$q;
     $dic{$l1}{$l2}{oco}+=$q;
     $dic{$l1}{$l2}{cause}{"sub_adj=adj_sub"}+=$q;
  }
  elsif($rule eq "A"){
     $dic{$l1[2]}{$l2[0]}{oco}+=$q;
     $dic{$l1[2]}{$l2[0]}{cause}{"de sub=adj"}+=$q;
     $dic{$l1[0]}{$l2[1]}{oco}+=$q;
     $dic{$l1[0]}{$l2[1]}{cause}{"sub=sub"}+=$q;
     $dic{$l1}{$l2}{oco}+=$q;
     $dic{$l1}{$l2}{cause}{"sub_de_sub=adj_sub"}+=$q;
  }
  elsif($rule eq "ABCCBA"){
     $dic{$l1[2]}{$l2[0]}{oco}+=$q;
     $dic{$l1[2]}{$l2[0]}{cause}{"adj=adj"}+=$q;

     $dic{$l1[1]}{$l2[1]}{oco}+=$q;
     $dic{$l1[1]}{$l2[1]}{cause}{"adj=adj"}+=$q;

     $dic{$l1[0]}{$l2[2]}{cause}{oco}+=$q;
     $dic{$l1[0]}{$l2[2]}{cause}{"sub=sub"}+=$q;

     $dic{"$l1[0] $l1[1]"}{"$l2[1] $l2[2]"}{oco}+=$q;
     $dic{"$l1[0] $l1[1]"}{"$l2[1] $l2[2]"}{cause}{"sub_adj=adj_sub"}+=$q;

     $dic{$l1}{$l2}{oco}+=$q;
     $dic{$l1}{$l2}{cause}{"sub_adj_adj=adj_adj_sub"}+=$q;
  }
}

geradic();
}

sub example2enpt{

  while(<>){
    chomp;

    next if /[_,.:;]/;
  
    my ($q,$l2,$rule,$l1)=split(/\s*!?=!?\s*/,$_);

    next if ("$l1 $l2" =~ /[^[:alpha:]_ -]/);
  
    my @l1=split(/\s+/,$l1); 
    my @l2=split(/\s+/,$l2); 
  
    next if bad(\@l2,\@l1,$rule);
  
    if($rule eq "ABBA"){
       $dic{$l1[0]}{$l2[1]}{oco}+=$q;
       $dic{$l1[0]}{$l2[1]}{cause}{"adj=adj"}+=$q;
       $dic{$l1[1]}{$l2[0]}{oco}+=$q;
       $dic{$l1[1]}{$l2[0]}{cause}{"sub=sub"}+=$q;
       $dic{$l1}{$l2}{oco}+=$q;
       $dic{$l1}{$l2}{cause}{"adj_sub=sub_adj"}+=$q;
    }
    elsif($rule eq "A"){
       $dic{$l1[0]}{$l2[2]}{oco}+=$q;
       $dic{$l1[0]}{$l2[2]}{cause}{"adj=de sub"}+=$q;
       $dic{$l1[1]}{$l2[0]}{oco}+=$q;
       $dic{$l1[1]}{$l2[0]}{cause}{"sub=sub"}+=$q;
       $dic{$l1}{$l2}{oco}+=$q;
       $dic{$l1}{$l2}{cause}{"adj_sub=sub_de_sub"}+=$q;
    }
    elsif($rule eq "ABCCBA"){
       $dic{$l1[0]}{$l2[2]}{oco}+=$q;
       $dic{$l1[0]}{$l2[2]}{cause}{"adj=adj"}+=$q;
  
       $dic{$l1[1]}{$l2[1]}{oco}+=$q;
       $dic{$l1[1]}{$l2[1]}{cause}{"adj=adj"}+=$q;
  
       $dic{$l1[2]}{$l2[0]}{cause}{oco}+=$q;
       $dic{$l1[2]}{$l2[0]}{cause}{"sub=sub"}+=$q;
  
       $dic{"$l1[1] $l1[2]"}{"$l2[0] $l2[1]"}{oco}+=$q;
       $dic{"$l1[1] $l1[2]"}{"$l2[0] $l2[1]"}{cause}{"adj_sub=sub_adj"}+=$q;
  
       $dic{$l1}{$l2}{oco}+=$q;
       $dic{$l1}{$l2}{cause}{"adj_adj_sub=sub_adj_adj"}+=$q;
    }
  }

  geradic();
}

sub geradic{
 for (sort keys %dic){
   geralinha( $_, $dic{$_});
 }
}

sub geralinha{
 my ($w,$inf)=@_;
 @l = (sort { $inf->{$b}{oco} <=> $inf->{$a}{oco} } keys %$inf);
 my $nam = (scalar(@l) == 1) || ($w =~ m/ /);
 if($nam){
   my $p = $l[0];
   if ($p =~ / /){ $p =~ s/ /+/g; $p = "{$p}";}
   print "$w=$p";
   my $c = maxcause($inf->{$l[0]}{cause});
   print "#A"  if ($c eq "adj=adj");
   print "#S1" if ($c eq "adj=de sub");
   print "#S2" if ($c eq "sub=sub");
   print "#S3" if ($c =~ /=sub_/);
   print "\n";
 }
 else{
   for (@l){
     print "$w=$_";
     my $c = maxcause($inf->{$_}{cause});
     print "#A"  if ($c eq "adj=adj");
     print "#S1" if ($c eq "adj=de sub");
     print "#S2" if ($c eq "sub=sub");
     print "#S3" if ($c =~ /=sub_/);

     print "=$inf->{$_}{oco}";
     print "\n";
   }
 }
}

sub maxcause{
 my ($inf)=@_;
 for (sort { $inf->{$b} <=> $inf->{$a} } keys %$inf){
   return $_;  ## retorna a causa com máximas ocorrências
 }
}

__END__
print Dumper \%dic;


__END__
  39214 = comunidades europeias =!ABBA!= european communities
  32850 = jornal oficial =!ABBA!= official journal
  32832 = parlamento europeu =!ABBA!= european parliament
  32730 = união europeia =!ABBA!= european union
  31650 = comunidade europeia =!ABBA!= european community
  15602 = países terceiros =!ABBA!= third countries
  12441 = pergunta escrita =!ABBA!= written question
  10783 = reino unido =!ABBA!= united kingdom
  10745 = texto original =!ABBA!= original text
  10321 = autoridades competentes =!ABBA!= competent authorities
   9108 = tratado ce =!ABBA!= ec treaty
   8870 = autoridade competente =!ABBA!= competent authority
   8211 = comunidade económica europeia =!ABCCBA!= european economic community
   7670 = autoridades _genti_ =!ABBA!= _genti_ authorities
   7214 = mercado interno =!ABBA!= internal market
   6987 = votação do pe =!A!= ep vote
   6918 = comité de gestão =!A!= management committee
   6831 = medidas de execução =!A!= implementing measures
   5765 = produtos agrícolas =!ABBA!= agricultural products
   5706 = seu artigo _ord_ =!ABCBCA!= article _int_ thereof

   1212 = pauta aduaneira comum =!ABCCBA!= common customs tariff
