# corporaUtils -- helper routines for preparing texts for alignment.
#
# Exported by default (@EXPORT): listofpairs2pml, htmls2pml, tmxsplit.
#
# NOTE(review): this copy of the file looks damaged by a text-extraction
# step and should be checked against a pristine copy before any code change:
#   * the original line breaks are gone, so every "#" comment now swallows
#     the remainder of its physical line (including the code after it);
#   * sequences shaped like <...> appear to have been stripped. Visible
#     symptoms: "while(){", "$_= ;", "defined($_ = )", "$_ !~ /\"; while() {",
#     "s###sg", "s###isg", "$patremovtag = q{])*>}", "'\s*(?:]*>\s*'", and a
#     $patremov that has no closing-tag part;
#   * the print statements around html2p() and in myprint() emit only "\n"
#     around the text -- presumably wrapper markup was lost there too
#     (myprint receives $i but never uses it in the visible text). TODO confirm.
#
# Overview of what the visible code does:
#
#   init([\%opt])
#       Options: img (default 1), breakbyemptyline (0), txt (0), breakby ([]).
#       When img is true, "img" is pushed onto @removtag; when
#       breakbyemptyline or txt is true, $rawbreakby becomes an
#       "empty line" alternative; the breakby list is pushed onto @breakby.
#       It then (re)builds the pattern strings $patremovtag, $patremov and
#       $patsep. The lists are only appended to, never reset, so repeated
#       calls accumulate entries. Called once from BEGIN after the default
#       tag lists are assigned.
#
#   listofpairs2pml($name [, $corpus1 [, $corpus2]])
#       Reads $name; each line must hold two fields separated by a tab
#       (dies with "invalid lines" otherwise). Output files default to
#       "$name.A.out" and "$name.B.out". Each pair is echoed to STDERR;
#       pairs where either name ends in ".pdf" are skipped; for the others
#       $id is incremented and html2p() of each file is printed to A and B.
#
#   htmls2pml(@files)
#       Prints html2p($f) for every argument.
#
#   html2p([\%opt,] $file)
#       Options: tag ("p"), latin1 (1), comm (0). Opens $file directly when
#       latin1 is true, otherwise reads it through
#       "recode -f html..latin1 < '$f'" (the file name is interpolated into
#       a shell command -- NOTE(review): unsafe for untrusted names).
#       Slurps the input, applies the removal substitutions, splits on
#       $patsep, flattens whitespace in each piece and appends the result of
#       Lingua::PT::pln::xmlsentences({st => $opt{tag}}, $_) plus "\n".
#       Returns the accumulated string.
#       NOTE(review): the module loaded is Lingua::PT::PLN but the call uses
#       the lower-case package Lingua::PT::pln -- confirm this is intended.
#
#   tmxsplit([\%opt,] @files)
#       Uses XML::DT (dtstring with handler hash %h; $c and %v are
#       presumably the XML::DT handler globals -- verify). The "tuv" handler
#       trims the content and stores it in $data under the lang / xml:lang
#       attribute, truncating to $opt{cutmaxlen} characters and appending
#       "||" when that option is set and exceeded. -outputenc is set to
#       ISO-8859-1 when the twente or latin1 option is true, or when the
#       header lines match /encoding=.ISO-8859-1./ (which also sets
#       -inputenc). For every language key an output file "$filename-$k"
#       is opened on first use, where $filename is the FIRST file argument;
#       the handles are cached in %files and closed after each input file.
#       XML errors inside a record are downgraded to warnings via eval.
#
#   myprint(\%opt, $fh, $text, $i)
#       With the twente option: replaces <...> runs, "|" and "$" by spaces,
#       inserts a space between word characters and punctuation, collapses
#       whitespace, then prints "$tu\n\$\n". Otherwise prints "$tu\n".
#
package corporaUtils; require 5.005_62; use strict; use warnings; use Lingua::PT::PLN; require Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw( ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our @EXPORT = qw( &listofpairs2pml &htmls2pml &tmxsplit ); our $VERSION = '0.01'; my (@breakby, @removtag, @remov, $rawbreakby); my ($patremovtag, $patremov, $patsep); sub init{ my %opt=(img=>1, breakbyemptyline=>0, txt=>0, breakby=>[]); if(ref($_[0]) eq "HASH") {%opt = (%opt , %{shift(@_)}) } ; if($opt{img}) { push (@removtag, "img"); } if($opt{breakbyemptyline} || $opt{txt}){ $rawbreakby = '|\n[ ]*\n'; } if($opt{breakby}) { push (@breakby, @{$opt{breakby}}); } $patremovtag = q{])*>}; $patremov = '<('. join('|', @remov) .')\b[^>]*>(.|\n)*?'; $patsep = '\s*(?:]*>\s*' . $rawbreakby . ')+'; } BEGIN{ @breakby=qw(table tr td th p br h1 h2 h3 h4 h5 h6 li ul ol dl dt dd span div blockquote hr address center form input); @removtag=qw( sup sub body html em font a b i u tt small strong); @remov=qw(frameset head meta script); $rawbreakby=""; corporaUtils::init(); } # listofpair( listoffiles, [outputA, outputB]) sub listofpairs2pml{ my $name = shift; my $corpus1 = shift || "$name.A.out"; my $corpus2 = shift || "$name.B.out"; my $id; open(A,">$corpus1") or die; open(B,">$corpus2") or die; open(F,$name) or die ("cant read $name\n"); while(){ my ($a,$b) = m!(.*?)\t(.*)! 
or die("invalid lines"); print STDERR "($a)($b)\n"; next if ($a =~ /\.pdf$/ or $b =~ /\.pdf$/ ); $id ++; print A "\n", html2p($a),"\n"; print B "\n", html2p($b),"\n"; } close A; close B; close F; } sub htmls2pml{ for my $f (@_){ print "\n", html2p($f), "\n"; } } sub html2p{ my %opt =(tag => "p", latin1 => 1, comm => 0 ); if(ref($_[0]) eq "HASH") {%opt = (%opt , %{shift(@_)}) } ; my $f = shift; my $r = ""; if($opt{latin1}){open(F,$f) or die("cant open $f\n"); } else {open(F,"recode -f html..latin1 < '$f'|") or die("cant open $f\n")} local $/; undef $/; $_= ; close F; s###sg; s###isg; s#<\?xml.*?>##isg; s#\xA0# | #g; #A0 - strange character similar to "|" s#$patremovtag##ig; s#$patremov##ig; for(split(/$patsep/i,$_)){ $r .= "\n" if $opt{comm}; s/\r/ /g; s/\s*\n\s*/ /g; $r .= Lingua::PT::pln::xmlsentences({st=>$opt{tag}},$_)."\n"; } $r } sub tmxsplit{ #### (type => "[tu]|twente",encoding="latin1|...",cutmaxlen=inf) my %opt =(type => "tu"); if(ref($_[0]) eq "HASH") {%opt = (%opt , %{shift(@_)}) } ; my @fileArgv = @_; use XML::DT; my $q; my $i = 0; my $f; my $filename = $fileArgv[0]; my %files; my $data; my %h = ( # '-outputenc' => "ISO-8859-1", 'seg' => sub{ for ($c){ s/\s\s+|^\s+|\s+$/ /g; }; $c}, 'ut' => sub{" "}, 'tu' => sub{$c}, 'tuv' => sub{$c =~ s/^[\s\n]*//; $c =~ s/[\s\n]*$//; $data->{$v{lang}||$v{"xml:lang"}} = $opt{cutmaxlen} && length($c) > $opt{cutmaxlen} ? substr($c,0,$opt{cutmaxlen})."||" : $c}, ); $h{-outputenc} = "ISO-8859-1" if $opt{twente} || $opt{latin1}; $| = 1; for $f (@fileArgv){ # print "\n$f" unless $q; print "\n$f"; $/ = "\n"; open X, $f or die "cannot open file $f"; do { if(/encoding=.ISO-8859-1./){$h{-outputenc}=$h{-inputenc}="ISO-8859-1";} } while (defined($_ = ) and $_ !~ /"; while() { $i++; last if /<\/body>/; #print "." if (!$q && $i%500==0); print "." 
if ($i%500==0); s/\>\s+/>/; undef($data); eval {dtstring($_, %h)} ; ## dont die in invalid XML if($@){warn($@)} else{ for my $k (keys %$data) { if (exists($files{"$filename-$k"})) { myprint(\%opt,$files{"$filename-$k"}, $data->{$k},$i);} else { my $x; open $x, ">$filename-$k" or die("cant >$filename-$k\n"); myprint(\%opt,$x, $data->{$k},$i); $files{"$filename-$k"} = $x; } } } } close X; for (keys %files){ close $files{$_}} } } sub myprint{ my %opt =(); if(ref($_[0]) eq "HASH") {%opt = (%opt , %{shift(@_)}) } ; my($f,$tu,$i)=@_; if ($opt{twente}){ for ($tu){ s/<.*?>/ /gs; s/[\|\$]/ /gs; s/(\w)([.;,!:?Ťť"])/$1 $2/g; s/([.;,!:?Ťť"])(\w)/$1 $2/g; s/\s\s+|^\s+|\s+$/ /g; } print {$f} "$tu\n\$\n"; } else { print {$f} "$tu\n"; } } 1; __END__ =head1 NAME corporaUtils - Perl extension for blah blah blah html2pml - html to list of C

=head1 SYNOPSIS

  html2pml [-tag=...] [-com] [-noimg] [-nolatin1] file

  html2pml -listofpairs [-tag=...] [-com] [-noimg] [-nolatin1] file

=head1 DESCRIPTION

C<html2pml> transforms HTML into PML ("E<lt>pE<gt>" markup language - only
uses P tags) with the independent segments, after dividing them into
sentences.

It was designed to help in the process of aligning texts.

The command C<recode> should be installed in order to make the conversion
to latin1 possible.

=head2 With C<-listofpairs> option

With the C<-listofpairs> option, it accepts a file whose lines hold 2
filenames separated by a tab, converts them to PML and makes 2 output
files (_Aout and _Bout) with the PMLs.

Each file is tagged with .... in order to help in the process of aligning
texts.

=head1 Options

C<-nolatin1> - by default HTML is converted to latin1; use this option to avoid this

C<-com> - with this option an XML comment is inserted with the removed/translated tags

C<-tag=T> - use tag name T (instead of the default - C<p>)

C<-noimg> - remove IMG tags (default keep them)

C<-breakby=tag> - use C<tag> as a sentence separator

C<-txt> or C<-breakbyemptyline> - use empty lines as paragraph separators

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut

__END__

=head1 NAME

tmxsplit - splits a TMX file into several files, one for each language

=head1 SYNOPSIS

  tmxsplit f.tmx f2.tmx ...

  tmxsplit -twente f.tmx

=head1 DESCRIPTION

Splits a TMX file into several files (one per language) and puts a tag in
each translation unit.

The names of the output files are taken from the first tmx file.

=head1 Options

  -twente       -- makes a format compatible with twente-aligner

  -latin1       -- makes latin1-encoded output

  -q            -- don't print filenames and "."

  -cutmaxlen=n  -- cut translations at the n-th character

=head1 AUTHOR

Alberto Simões, albie@di.uminho.pt

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

tmx2cqp(1)

=cut