#!/usr/bin/perl -s

use strict;
#use Guesser;

our ($raw,$q0,$debug,$id,$h,$l1,$l2,$html,$patt4,$max,$pdf,$skipch,$isutf8,
     $tdef,$txt,$txtll,$aligned,$wl,$sync,$syncatt,$getalt,$negpatt4,$addtag,$local);

use Corpora::ParGuesser;

# ---------------------------------------------------------------------------
# Command-line handling.
# Switches (-l1=..., -debug, ...) were already consumed by perl's -s option
# and live in the `our` variables declared above; only the positional
# arguments are left in @ARGV.
# ---------------------------------------------------------------------------
my $dir    = shift or $h = 1;   # input: a directory, or a *.paths / *.blocks / *._pairs / *.pairs / *.tmxdir name
my $output = shift;             # optional: its suffix selects the last step to run

# Show the usage text first, before anything with side effects (directory
# creation, external commands) is attempted.
if($h){ die("usage mkterminum [-id=name] [-l1=pt] [-l2=fr] dir [output]
      mkterminum [-l1=pt] [-l2=fr] f.paths [output]
      mkterminum [-l1=pt] [-l2=fr] f.blocks [output]
      mkterminum f.en.pt._pairs [output]\n\n");}

# Step N of the pipeline is skipped when $nomN is true.
my ($nom1,$nom2,$nom3,$nom4,$nom5,$nom6);

# Turn the boolean/valued switches into the option strings that are later
# interpolated into the filealigner command line.
my $enc = "";
$enc  = "-isutf8"     if $isutf8;
$raw  = "-rawpdf"     if $raw;
$tdef = "-tdef=$tdef" if $tdef;
if    ($debug) { $debug = "-q0 -debug"; }
elsif ($q0)    { $debug = "-q0";        }
$getalt = "-getalt" if $getalt;

# -addtag=LANG=tag[,tag...] registers extra language tag candidates.
if ($addtag) {
    my ($lang, @tags) = split(/[=,]/, $addtag);
    die("invalid -addtag value\n") unless $lang;
    Corpora::ParGuesser::jjaddtag($lang, @tags);
}

# File type option for filealigner; set in step 5.
my $t;

# -syncatt takes precedence over -sync.  A switch given without a value is
# set to 1 by perl -s, in which case the bare option is passed on.
if   ($syncatt && $syncatt eq 1){ $sync = "-syncatt"          }
elsif($syncatt)                 { $sync = "-syncatt=$syncatt" }
elsif($sync    && $sync eq 1)   { $sync = "-sync"             }
elsif($sync)                    { $sync = "-sync=$sync"       }

# Directories that must be writable in step 5 (checked there with is_wd).
my $corpdir = "$ENV{HOME}/corpora";
mkdir $corpdir unless -d $corpdir;

# Registry directory: the one reported by cwb-config, unless -local asks for
# the per-user registry (cwb-config is then not needed at all).
my $regis;
if ($local) { $regis = "$ENV{HOME}/registry"; }
else        { chomp($regis = `cwb-config -r`); }

# Input dispatch: the suffix of the first argument tells which artefact
# already exists, hence how many leading steps are skipped.  Names of the
# form "id.l1.l2.suffix" also set the languages; every match sets the id.
{
    my @skip_flag   = (\$nom1, \$nom2, \$nom3, \$nom4, \$nom5, \$nom6);
    my @input_kinds = (
        # pattern                              leading steps to skip
        [ qr/(.+)\.(\w+)\.(\w+)\.pairs$/,      4 ],
        [ qr/(.+)\.(\w+)\.(\w+)\.tmxdir$/,     5 ],
        [ qr/(.+)\.(\w+)\.(\w+)\._pairs$/,     3 ],
        [ qr/(.+)\.(\w+)\.(\w+)\.blocks$/,     2 ],
        [ qr/(.+)\.paths$/,                    1 ],
    );

    my $recognised = 0;
    for my $kind (@input_kinds) {
        my ($pattern, $done) = @$kind;
        my @part = ($dir =~ $pattern);
        next unless @part;

        ${ $skip_flag[$_] } = 1 for 0 .. $done - 1;
        ($l1, $l2) = @part[1, 2] if @part == 3;
        $id = $part[0];
        $recognised = 1;
        last;
    }

    # A plain directory is the only other acceptable input (nothing skipped).
    die("what shall I do?\n") unless $recognised or -d $dir;
}

# Output dispatch: the suffix of the optional second argument tells where to
# stop, i.e. from which step onwards everything is skipped.  Languages and id
# found in the name are only used when not already known.
{
    my @skip_flag    = (\$nom1, \$nom2, \$nom3, \$nom4, \$nom5, \$nom6);
    my @output_kinds = (
        # pattern                              first skipped step
        [ qr/(.+)\.(\w+)\.(\w+)\.blocks$/,     3 ],
        [ qr/(.+)\.(\w+)\.(\w+)\._pairs$/,     4 ],
        [ qr/(.+)\.(\w+)\.(\w+)\.pairs$/,      5 ],
        [ qr/(.+)\.paths$/,                    2 ],
    );

    for my $kind (@output_kinds) {
        my ($pattern, $first_skipped) = @$kind;
        my @part = ($output =~ $pattern);
        next unless @part;

        ${ $skip_flag[$_ - 1] } = 1 for $first_skipped .. 6;
        if (@part == 3) {
            $l1 ||= $part[1];
            $l2 ||= $part[2];
        }
        $id ||= $part[0];
        last;
    }
    # Any other (or missing) output name: run the pipeline to the end.
}

# Defaults for whatever neither the switches nor the file names provided.
$l1 ||= "pt";
$l2 ||= "en";
$id ||= "_corpus_name_$l1-$l2";

# Shell fragments interpolated into the command lines of steps 4 and 5.
$html  = " -html "           if $html;
$max   = "| head -$max "     if $max;
$patt4 = "| egrep '$patt4' " if $patt4;
# Append (not assign) so that -patt4 and -negpatt4 can be combined: the
# pair list is first restricted by -patt4, then filtered by -negpatt4.
$patt4 .= "| egrep -v '$negpatt4' " if $negpatt4;


# Step 1 (dir -> paths): lists every regular file below $dir into "$id.paths".
sub m1 {
    print STDERR "**1**** Calculo da lista de ficheiros\n";
    my $find_cmd = "find $dir -type f > $id.paths";
    return executa($find_cmd);
}

# Step 2 (paths -> blocks): hands "$id.paths" and the language pair to
# Corpora::ParGuesser::list2blocks.
sub m2 {
    print STDERR "**2**** Calculo de blocos\n";
    my %guess_opt = (lang => [$l1, $l2]);
    return Corpora::ParGuesser::list2blocks(\%guess_opt, "$id.paths");
}

# Step 3 (blocks -> _pairs): writes one "file1<TAB>file2" line per candidate
# pair returned by Corpora::ParGuesser::blocks2pairs for "$id.$l1.$l2.blocks"
# into "$id.$l1.$l2._pairs".
# Dies when the output file cannot be created or completely written.
sub m3 {
    # print, not printf: the language codes are data, not a format string.
    print STDERR "**3**** Blocos para pares $l1/$l2\n";

    my $pairs_file = "$id.$l1.$l2._pairs";
    open(my $out, '>', $pairs_file) or die("cant create $pairs_file: $!");

    for my $pair (Corpora::ParGuesser::blocks2pairs(
                      Corpora::ParGuesser::jjnames($l1),
                      Corpora::ParGuesser::jjnames($l2),
                      "$id.$l1.$l2.blocks")) {
        print {$out} "$pair->[0]\t$pair->[1]\n";
    }

    # Buffered write errors only surface at close time.
    close($out) or die("cant write $pairs_file: $!");
}

# Step 4 (_pairs -> pairs): filters the candidate pairs ($patt4, $max) and,
# unless -skipch or -aligned was given, pipes them through bi-lang_cld_check
# (earlier versions used bitextcheck / bi-langcheck) before writing
# "$id.$l1.$l2.pairs".
sub m4 {
    printf STDERR ("**4**** Verificar correspondencias de línguas $id+($l1 $l2)\n");

    my $candidates = "time cat $id.$l1.$l2._pairs $patt4 $max";
    my $lang_check = ($skipch or $aligned)
                   ? ""
                   : " | bi-lang_cld_check -debug -l1=$l1 -l2=$l2 $html";
    my $result     = " > $id.$l1.$l2.pairs";

    return executa($candidates . $lang_check . $result);
}

# Step 5 (pairs -> TMXs): runs filealigner on every pair listed in
# "$id.$l1.$l2.pairs", producing "$id.NNNNN.$l1.$l2.tmx" files inside
# "$id.$l1.$l2.tmxdir", and records each pair in "<tmxdir>/LOG" (with a
# trailing "BAD" column when a ".BADALIGNMENT" file exists for it).
#
# Each line of the pairs file is "file1<TAB>file2[<TAB>type]"; anything from
# a '#' on is a comment.  Dies (through is_wd) when the corpora or registry
# directory is not a writable directory.
sub m5 {
    # print, not printf: the id and language codes are data, not a format.
    print STDERR "**5**** alinhamento... $id+($l1 $l2)\n";

    my $n    = 1;                                # running pair number
    my $_dir = "$id.$l1.$l2.tmxdir";
    my $wlo  = ($wl ? "-wl=$wl" : "");

    is_wd($corpdir, $regis);
    mkdir($_dir);

    # Default file type for all pairs, taken from the command-line switches.
    if   ($html)    { $t = "-t=html"    }
    elsif($pdf)     { $t = "-t=pdf"     }
    elsif($txt)     { $t = "-t=txt"     }
    elsif($txtll)   { $t = "-t=txtll"   }   ## txt long lines
    elsif($aligned) { $t = "-t=aligned" }   ## pre aligned line to line
    else            {                   }   ## filealigner will try to guess the type...

    # Logging is best effort: without a log file the alignment still runs.
    my $log;
    unless (open($log, '>', "$_dir/LOG")) {
        warn("cant log\n");
        undef $log;
    }

    for my $p (`cat $id.$l1.$l2.pairs $patt4 $max`) {
        chomp($p);
        $p =~ s/\s*#.*//;

        my ($valid, $f1, $f2) = (0);
        # Type used for THIS pair only.  A third column overrides the default
        # for its own line and must not leak into the following pairs.
        my $lt = $t;
        if    ($p =~ /(.+?)\t(\S+)\t(\S+)/) { ($valid, $f1, $f2, $lt) = (1, $1, $2, "-t=$3"); }
        elsif ($p =~ /(.+?)\t(\S+)/)        { ($valid, $f1, $f2)      = (1, $1, $2);          }

        my $nn = sprintf("%05d", $n);
        if ($valid) {
            executa(qq{filealigner $tdef $getalt $raw -num=mkt_$$ $debug $wlo $lt $enc $sync -l1=$l1 -l2=$l2 -o=$_dir/$id.$nn.$l1.$l2.tmx "$f1" "$f2"});
            # Remove the temporary corpus data matching this process id.
            executa(qq{rm -rf $corpdir/*mktc-$$-pml}) unless ($aligned);
            executa(qq{rm $regis/*mktc-$$-pml})       unless ($aligned);
        }

        # Every input line gets a number and a log entry, valid or not.
        if ($log) {
            if (-f "$_dir/$id.$nn.$l1.$l2.tmx.BADALIGNMENT")
                 { print {$log} "$nn\t$f1\t$f2\tBAD\n"; }
            else { print {$log} "$nn\t$f1\t$f2\n"; }
        }
        $n++;
    }
    close($log) if $log;
}


# Step 6 (TMXs -> TMX): merges the per-pair TMX files of the tmxdir into one
# file with fasttmxcat (used in place of the former "tmx2tmx -cat" command).
sub m6 {
    printf STDERR ("**6**** tmx concat... $id+($l1 $l2)\n");
    my $tmx_dir = "$id.$l1.$l2.tmxdir";
    return fasttmxcat($tmx_dir, $id, $l1, $l2);
}

# Concatenates the numbered TMX files "$_dir/$id.[0-9]*.tmx" into
# "$_dir/$id.$l1.$l2.tmx" without parsing them.
#
#   $_dir     directory holding the numbered TMX files (and the result)
#   $id       corpus name prefix
#   $l1, $l2  language codes used in the result file name
#
# The header (everything up to and including the line with <body>) is copied
# from the first file only.  From every file, only lines containing a <tu...
# or </tu... tag and whitespace-only lines are kept, each file being preceded
# by an XML comment holding its name.  Returns 1; dies when a file cannot be
# opened or the result cannot be written.
sub fasttmxcat {
    my ($_dir, $id, $l1, $l2) = @_;

    my $target = "$_dir/$id.$l1.$l2.tmx";
    open(my $out, '>', $target) or die("cant create $target: $!\n");

    my $first = 1;
    # Named loop variables: reading lines into $_ here would overwrite the
    # file name that is needed for the comment marker below.
    for my $part (glob("$_dir/$id.[0-9]*.tmx")) {
        open(my $in, '<', $part) or die("cant read $part: $!\n");

        if ($first) {
            $first = 0;
            while (my $line = <$in>) {
                print {$out} $line;
                last if $line =~ /<body>/;
            }
            print {$out} "\n";
        }

        print {$out} "<!-- $part -->\n";
        while (my $line = <$in>) {
            print {$out} $line if $line =~ /<\/?tu/ or $line !~ /\S/;
        }
        close($in);
    }

    print {$out} "\n</body>\n</tmx>\n";
    # Buffered write errors only surface at close time.
    close($out) or die("cant write $target: $!\n");
    return 1;
}

# Run the pipeline: each step is executed unless its skip flag was set while
# inspecting the input and output arguments.
$nom1 or m1();   # dir    -> paths
$nom2 or m2();   # paths  -> blocks
$nom3 or m3();   # blocks -> _pairs
$nom4 or m4();   # _pairs -> pairs
$nom5 or m5();   # pairs  -> TMXs
$nom6 or m6();   # TMXs   -> TMX

# Echoes a shell command on STDERR and runs it with system().
#
#   $cmd  the command line, passed to system() as a single string
#
# A failing command is reported with a warning that says why it failed
# (could not be started, killed by a signal, or non-zero exit status); it is
# never fatal.  Always returns a true value.
sub executa {
    my $cmd = shift;
    print STDERR "$cmd\n";
    return 1 if system($cmd) == 0;

    # $! is only meaningful when the command could not be started ($? == -1);
    # otherwise $? holds the wait status: signal in the low 7 bits, exit
    # status in the high byte.
    my $reason;
    if    ($? == -1) { $reason = "could not be started: $!";               }
    elsif ($? & 127) { $reason = sprintf("died with signal %d", $? & 127); }
    else             { $reason = sprintf("exit status %d", $? >> 8);       }

    warn "** ERROR ************ system $cmd failed: $reason\n";
    return 1;
}

# Checks that every argument names an existing, writable directory.
# Returns 1 when all of them do (also for an empty list); dies on the first
# one that does not.
sub is_wd {
    foreach my $candidate (@_) {
        next if -d $candidate && -w $candidate;
        die("** FATAL ERROR ******** $candidate must be a Writable directory\n");
    }
    return 1;
}

__END__

=head1 NAME

mkterminum - aligns texts and builds TMX files for parallel corpora

=head1 SYNOPSIS

 mkterminum [-id=name] [-l1=pt] [-l2=fr] dir [output]
 mkterminum [-l1=pt] [-l2=fr] f.paths [output]
 mkterminum [-l1=pt] [-l2=fr] f.blocks [output]
 mkterminum f.en.pt._pairs [output]

=head1 DESCRIPTION

Depending on the arguments (input / output) some of the following steps are
done

  dir                                 directory
     -> paths        .paths           list of files
     -> blocks       .blocks          list of blocks
     -> _pairs       ._pairs          list of bitext candidate pairs
     -> pairs        .pairs           list of bitext
     -> tmx          .tmxdir          directory with the TMXs

If we just want to calculate the TMX for a set of bitexts, we create a
file named "name.pt.en.pairs" containing the bitext filename pairs and run:

  mkterminum name.pt.en.pairs

And the only step performed is step5.

=head2   Step1 : dir -> paths

Given a directory, builds a file "name.paths" with the names of all the files.

=head2   Step2 : paths -> blocks

Given a file "name.paths", extracts the sets of filenames that
become equal once the language-name parts are deleted,
and builds a "name.l1.l2.blocks" file.

=head2   Step3 : blocks -> _pairs

Given a set of blocks and a pair of languages, builds a list of bitext
candidate pairs, "name._pairs".

=head2   Step4 : _pairs -> pairs

Given a list of bitext candidate pairs, rejects those that have the wrong
languages or very different sizes, and builds a "name.pairs" file.

=head2   Step5 : pairs -> TMXs -> TMX

Given a list of bitexts, segments and aligns each one and makes a naive
analysis of the result. It builds a directory "name.tmxdir", and a TMX file
is calculated for each bitext. If the result is considered bad, the TMX is
renamed to "name.tmx.BADALIGNMENT".
In the end the good alignments are concatenated with the command

  tmx2tmx -cat *.tmx

See also "name.tmxdir/LOG".

=head1 Options

=head2 C<-t> to force file type

 -t=html  or  -html
 -t=pdf   or  -pdf
 -t=txt   or  -txt

=head2 C<-tdef> 

C<-tdef=...> to force filetype if extension is unknown

 -tdef=html  (filetype is html if the extension is unknown)

=head2 C<-patt4> to restrict the filenames

 -patt4='pdf'     -- just accept files matching this pattern
 -negpatt4='pdf'  -- just accept files not matching this pattern

=head2 C<-skipch> to skip bitext checking

 -skipch    

=head2 C<-max> to restrict the maximum number of files to process

 -max=20  -- just process the first 20 file pairs

=head2 C<-addtag> 

 -addtag=pt=po  -- use "po" as a portuguese tag candidate

=head2 C<-isutf8> 

use C<-isutf8> to force UTF-8 encoding in text files

=head2 C<-h> help

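=head2 C<-id>

C<-id=name> sets the corpus name used as prefix of the generated files
(default: C<_corpus_name_L1-L2>). A name found in the input file name
takes precedence.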


=head2 C<-q0>

use C<-q0> to keep 0:1 or 1:0 connections

=head2 C<-debug>

keep 0:1, 1:0 connections and also temporary files

=head2 C<-l1> C<-l2> 

 -l1=PT -l2=FR 

=head2 C<-wl>

Use C<-wl=file> to provide a file with pairs of words L1-L2 to help in the
alignment process.

=head2 C<-sync=synctag>

C<-sync=synctag>.  (synctag defaults to "sync" if no synctag is provided)

use C<< <synctag> >> as synchronization tag (the number of synchronization
tags should be equal).

=head2 C<-syncatt=synctag>

C<-syncatt=synctag>. (synctag defaults to "sync" if no synctag is provided)

use C<< <synctag id="value"> >> as synchronization tag for equal "id" values.

=head1 Dependencies

 CWB -- Stuttgart Corpus Workbench

 cwb-utils

 XML::TMX

 Lingua::PT::PLN

 Corpora::ParGuesser.pm (project Natura) 

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut      

__END__