package corporaUtils;

require 5.005_62;
use strict;
use warnings;
use Lingua::PT::PLN;

require Exporter;
our @ISA = qw(Exporter);
our %EXPORT_TAGS = ( 'all' => [ qw( ) ] );
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
our @EXPORT = qw( &listofpairs2pml &htmls2pml &tmxsplit );
our $VERSION = '0.01';

my (@breakby, @removtag, @remov, $rawbreakby);
my ($patremovtag, $patremov, $patsep);

# init( [\%options] )
#
# (Re)builds the three pattern strings used when converting HTML:
#   $patremovtag - matches the opening/closing tags listed in @removtag
#                  (the tag is dropped, its content is kept)
#   $patremov    - matches the elements listed in @remov together with
#                  their content
#   $patsep      - matches runs of the tags listed in @breakby (and,
#                  optionally, empty lines); used as segment separator
#
# Options (hash ref, optional):
#   img              => true: "img" tags are stripped (default 1)
#   breakbyemptyline => true: empty lines also separate segments
#   txt              => same effect as breakbyemptyline
#   breakby          => array ref (or a single tag name) of extra tags to
#                       add to the separator list
#
# The tag lists are module-wide state, so additions made through
# `breakby` accumulate across calls, and empty-line breaking stays on once
# it has been requested.
sub init{
  my %opt=(img=>1, breakbyemptyline=>0, txt=>0, breakby=>[]);
  if(ref($_[0]) eq "HASH") {%opt = (%opt , %{shift(@_)}) } ;

  # "img" must follow the option on every call: drop any earlier entry
  # first, otherwise a previous call with the default (img => 1) would
  # make img => 0 impossible to request, and repeated calls would pile up
  # duplicates.
  @removtag = grep { lc($_) ne "img" } @removtag;
  if($opt{img})   { push (@removtag, "img"); }

  # Kept in single quotes: the backslash escapes are meant for the regex
  # engine, not for Perl string interpolation.
  if($opt{breakbyemptyline} || $opt{txt}){ $rawbreakby = '|\n[ \t]*\n'; }

  # Accept either a list of tags or one bare tag name, and skip tags that
  # are already present in the separator list.
  my $extra = $opt{breakby};
  my @extra = ref($extra) eq "ARRAY"              ? @{$extra}
            : (defined($extra) && length($extra)) ? ($extra)
            :                                       ();
  my %known = map { ($_ => 1) } @breakby;
  push (@breakby, grep { !$known{$_}++ } @extra);

  $patremovtag = q{</?(?:}. 
                 join('|', @removtag) .
                 q{)\b(?:=\"[^"]{1,80}\"|=\'[^']{1,80}\'|[^>])*>};
  $patremov = '<('. join('|', @remov) .')\b[^>]*>(.|\n)*?</\1>';
  $patsep = '\s*(?:</?(?:'
            .          join('|', @breakby) 
            .        ')\b[^>]*>\s*'
            .        $rawbreakby
            . ')+';
}

# Compile-time defaults: fill the module-wide tag lists and build the
# initial patterns, so the conversion routines work without an explicit
# call to init().
BEGIN{
  # Tags whose occurrence separates two segments.
  @breakby = (
    qw(table tr td th),
    qw(p br),
    qw(h1 h2 h3 h4 h5 h6),
    qw(li ul ol dl dt dd),
    qw(span div blockquote hr address center form input),
  );

  # Tags that are stripped while their content is kept.
  @removtag = (
    qw(sup sub body html),
    qw(em font a b i u tt small strong),
  );

  # Elements that are removed together with their content.
  @remov = qw(frameset head meta script);

  # No extra raw separator until init() is asked for empty-line breaking.
  $rawbreakby = "";

  corporaUtils::init();
}

# listofpairs2pml( $listfile [, $outputA [, $outputB ]] )
#
# Reads $listfile, whose lines each hold two file names separated by a
# tab, converts both files with html2p and appends the results to
# $outputA and $outputB, each wrapped in <f id='N' name='FILE'> ... </f>.
# N counts the converted pairs, so matching entries share the same id.
# Pairs in which either name ends in ".pdf" are skipped.
#
# $outputA / $outputB default to "$listfile.A.out" / "$listfile.B.out".
# Each pair is echoed to STDERR as it is read.
# Dies when a file cannot be opened or a line has no tab.
sub listofpairs2pml{
   my $name = shift; 
   my $corpus1 =  shift || "$name.A.out";
   my $corpus2 =  shift || "$name.B.out";
   my $id = 0;

   # Lexical handles are essential here: html2p is called inside the read
   # loop, and a package-level bareword handle shared with it would be
   # reopened and closed under our feet, ending the loop after one pair.
   # The list is opened first so the outputs are not truncated when it is
   # unreadable.
   open(my $list,  '<', $name)    or die ("cant read $name\n");
   open(my $out_a, '>', $corpus1) or die ("cant write $corpus1: $!\n");
   open(my $out_b, '>', $corpus2) or die ("cant write $corpus2: $!\n");

   while(defined(my $line = <$list>)){
     # The list assignment yields the number of captures, i.e. 0 (false)
     # when the line contains no tab.
     my ($file_a, $file_b) = $line =~ m!(.*?)\t(.*)! or die("invalid lines");
     print STDERR "($file_a)($file_b)\n";
     next if ($file_a =~ /\.pdf$/ or $file_b =~ /\.pdf$/ );
     $id ++;
     print {$out_a} "<f id='$id' name='$file_a'>\n", html2p($file_a),"</f>\n";
     print {$out_b} "<f id='$id' name='$file_b'>\n", html2p($file_b),"</f>\n";
   }

   close($list);
   # Buffered write errors only surface when the handle is closed.
   close($out_a) or die ("cant close $corpus1: $!\n");
   close($out_b) or die ("cant close $corpus2: $!\n");
}

# htmls2pml( @files )
#
# Converts every named file with html2p and prints the result to the
# currently selected output handle, wrapped in <f name='FILE'> ... </f>.
sub htmls2pml{
   my @files = @_;
   foreach my $file (@files){
      my $open_tag = "<f name='$file'>\n";
      print $open_tag, html2p($file), "</f>\n";
   }
}
 
# html2p( [\%options], $file )
#
# Reads the HTML file $file, removes comments, the DOCTYPE and XML
# declarations, the tags matched by $patremovtag and the elements matched
# by $patremov, splits what is left on $patsep and passes every segment to
# xmlsentences.  Returns the concatenated results, one per line.
#
# Options (hash ref, optional first argument):
#   tag    => value passed to xmlsentences as `st` (default "p")
#   latin1 => true: the file is read as is; false: it is read through
#             `recode -f html..latin1` (default 1)
#   comm   => true: every segment is preceded by an XML comment holding
#             the separator text that introduced it (default 0)
#
# Dies when the file (or the recode pipe) cannot be opened.
sub html2p{
 my %opt =(tag => "p", latin1 => 1, comm => 0 );
 if(ref($_[0]) eq "HASH") {%opt = (%opt , %{shift(@_)}) } ;

  my $f = shift;
  my $r = "";

  # A lexical handle and a lexical buffer keep this routine from
  # clobbering a caller's package-level file handle or its $_.
  my $in;
  if($opt{latin1}){
    open($in, '<', $f) or die("cant open $f\n");
  }
  else {
    # The name goes through the shell inside '...': escape embedded single
    # quotes so such names neither break nor extend the command.
    (my $quoted = $f) =~ s/'/'\\''/g;
    open($in, '-|', "recode -f html..latin1 < '$quoted'")
      or die("cant open $f\n");
  }
  local $/;                               # slurp mode until we return
  my $html = <$in>;
  close $in;
  $html = "" unless defined $html;

  $html =~ s#<!--.*?-->##sg;
  $html =~ s#<!DOCTYPE.*?>##isg;
  $html =~ s#<\?xml.*?>##isg;
  $html =~ s#\xA0# | #g;                  # byte A0 becomes a spaced "|"
  $html =~ s#$patremovtag##ig;
  $html =~ s#$patremov##ig;

  my @segments = split(/$patsep/i, $html);
  # Separator number n-1 is the one standing in front of segment n.  They
  # are collected explicitly because $& does not reliably hold the text
  # split() matched on.
  my @separators = $opt{comm} ? ($html =~ /$patsep/gi) : ();

  for my $n (0 .. $#segments){
     my $seg = $segments[$n];
     if($opt{comm}){
        my $sep = $n > 0 ? $separators[$n-1] : "";
        $sep = "" unless defined $sep;
        $sep =~ s/--+/-/g;                # "--" is not allowed in a comment
        $r .=  "<!-- $sep -->\n";
     }
     $seg =~ s/\r/ /g;
     $seg =~ s/\s*\n\s*/ /g;
     # NOTE(review): the package name here is lower-case "pln" while the
     # module loaded at the top of the file is Lingua::PT::PLN - confirm
     # that this is the namespace the module really defines.
     $r .=  Lingua::PT::pln::xmlsentences({st=>$opt{tag}},$seg)."\n";
  }
  $r
}

# tmxsplit( [\%options], @tmxfiles )
#
# Splits the translation units of the given TMX files by language: the
# text of every <tuv> is written, through myprint, to the file
# "<first input file>-<language>".  Units are numbered consecutively
# across all input files and that number is handed to myprint.
#
# Options (hash ref, optional first argument):
#   twente    => passed on to myprint (selects its plain-text format);
#                also selects ISO-8859-1 output
#   latin1    => selects ISO-8859-1 output
#   cutmaxlen => texts longer than this many characters are truncated and
#                marked with a trailing "||"
#   q         => true: do not print file names and progress dots
#
# Chunks that fail to parse are reported with warn and skipped.
# Dies when an input or output file cannot be opened.
sub tmxsplit{
 my %opt =(type => "tu");
 if(ref($_[0]) eq "HASH") {%opt = (%opt , %{shift(@_)}) } ;
 my @fileArgv = @_;

 use XML::DT;
 my $i = 0;
 my $filename = $fileArgv[0];
 my %files;
 my $data;
 my %h = (
 #     '-outputenc' => "ISO-8859-1",
      'seg' => sub{ for ($c){ s/\s\s+|^\s+|\s+$/ /g; }; 
                    $c},
      'ut'  => sub{" "},
      'tu'  => sub{$c},
      'tuv' => sub{$c =~ s/^[\s\n]*//; $c =~ s/[\s\n]*$//;
                   $data->{$v{lang}||$v{"xml:lang"}} = 
                       $opt{cutmaxlen} && length($c) > $opt{cutmaxlen} 
                       ? substr($c,0,$opt{cutmaxlen})."||" : $c},
     );

 $h{-outputenc} = "ISO-8859-1" if $opt{twente} || $opt{latin1};

 # Both variables are global: localise them so the caller gets its own
 # record separator and autoflush setting back when we return.
 local $| = 1;
 local $/ = "\n";

 for my $f (@fileArgv){
   print "\n$f" unless $opt{q};
   open(my $in, '<', $f) or die "cannot open file $f";

   # Header: read line by line up to (and including) the line that opens
   # <body>, looking for a latin1 encoding declaration on the way.
   $/ = "\n";
   while (defined(my $line = <$in>)) {
     last if $line =~ /<body\b/;
     if($line =~ /encoding=.ISO-8859-1./){$h{-outputenc}=$h{-inputenc}="ISO-8859-1";}
   }

   # Body: one read per translation unit.
   $/ = "</tu>";
   while (defined(my $chunk = <$in>)) {
     $i++;
     last if $chunk =~ /<\/body>/;
     print "." if (!$opt{q} && $i%500==0);
     $chunk =~ s/\>\s+/>/;
     undef($data);
     eval {dtstring($chunk, %h)} ; ## dont die in invalid XML
     if($@){ warn($@); next; }

     for my $k (keys %$data) {
       my $outname = "$filename-$k";
       my $out = $files{$outname};
       unless ($out) {
         open($out, '>', $outname) or die("cant >$outname\n");
         $files{$outname} = $out;
       }
       myprint(\%opt, $out, $data->{$k}, $i);
     }
   }
   close $in;
 }

 # The output files are shared by all inputs, so they may only be closed
 # once every input has been processed.
 for my $outname (keys %files){
   close($files{$outname}) or warn("cant close $outname: $!\n");
 }
 return;
}

# myprint( [\%options], $handle, $text, $id )
#
# Writes one translation unit to $handle.
#   default format : <tu id="$id">$text</tu>
#   twente => true : tags, "|" and "$" are replaced by blanks, punctuation
#                    is separated from adjacent word characters, runs of
#                    whitespace are reduced, and the text is followed by a
#                    line holding a single "$"
# Returns the result of the print call.
sub myprint{
 my %opt = ref($_[0]) eq "HASH" ? %{ shift(@_) } : ();
 my ($out, $text, $id) = @_;
 my $printed;

 if (!$opt{twente}) {
    $printed = print {$out} "<tu id=\"$id\">$text</tu>\n";
    return $printed;
 }

 $text =~ s/<.*?>/ /gs;
 $text =~ s/[\|\$]/ /gs;
 $text =~ s/(\w)([.;,!:?«»"])/$1 $2/g;
 $text =~ s/([.;,!:?«»"])(\w)/$1 $2/g;
 $text =~ s/\s\s+|^\s+|\s+$/ /g;

 $printed = print {$out} "$text\n\$\n";
 return $printed;
}

1;
__END__

=head1 NAME

corporaUtils - utilities for preparing corpora for text alignment (HTML to PML conversion, TMX splitting)

html2pml - html to list of C<P>

=head1 SYNOPSIS

  html2pml [-tag=...] [-com] [-noimg] [-nolatin1] file
  html2pml -listofpairs [-tag=...] [-com] [-noimg] [-nolatin1] file

=head1 DESCRIPTION

C<html2pml> transforms HTML into PML ("<p>" markup language - the only tag
used is P), splitting the text into independent segments and dividing those
into sentences.

It was designed to help in the process of aligning texts.

The command C<recode> must be installed for the conversion to latin1
to be possible.

=head2 With C<-listofpairs> option

With the C<-listofpairs> option, it accepts a file whose lines each hold 2
filenames separated by a tab, converts those files to PML and writes 2 output
files (by default the name of the list file followed by C<.A.out> and
C<.B.out>) with the PMLs. Each file is tagged with

 <f id='linenumber' name='filename'> .... </f>

in order to help in the process of aligning texts.

=head1 Options

C<-nolatin1> - by default Html is converted to latin1; use this option to avoid
this

C<-com> - with this option an XML comment is inserted with the
removed/translated tags

C<-tag=T> - use tag name T (instead of default - C<p>)

C<-noimg> - remove IMG tags (default keep them)

C<-breakby=tag> - use C<tag> as a sentence separator

C<-txt> or C<-breakbyemptyline> - use empty lines as paragraph separators 

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut      

__END__

=head1 NAME

tmxsplit - splits a TMX file into several files, one for each language

=head1 SYNOPSIS

 tmxsplit f.tmx f2.tmx ...
 tmxsplit -twente f.tmx
 

=head1 DESCRIPTION

splits a TMX file into several files (one per language) and puts
a tag C<tu id=...> around each translation unit.

The names of the output files are taken from the first tmx file (its name
followed by C<-> and the language).

=head1 Options

 -twente  -- makes a format compatible with twente-aligner 

 -latin1  -- make latin1-encoded output

 -q       -- don't print filenames and "."

 -cutmaxlen=n -- cut translations longer than n characters (the cut is marked with "||")

=head1 AUTHOR

Alberto Simões, albie@di.uminho.pt

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

tmx2cqp(1)

=cut      

