#!/usr/bin/perl -s

use strict;
use warnings;

use Lingua::PT::pln;

# Command-line switches, filled in by perl's -s option (see the #! line):
# -comm, -img and -latin1 are flags; -tag=NAME sets $tag.
our ($comm, $img, $tag, $latin1);

# Tags whose occurrence separates two independent text segments.
my @breakby = qw(table tr td p br h1 h2 h3 h4 h5 h6 li ul ol dl dt dd div
                 blockquote hr address center);

# Tags that are deleted while the text they enclose is kept.
my @removtag = qw(body html font a b i u tt small);

# Elements that are deleted together with everything they enclose.
my @remov = qw(frameset head meta);

$tag = "p" if not $tag;
push @removtag, "img" unless $img;

my $removtag_alt = join '|', @removtag;
my $remov_alt    = join '|', @remov;
my $breakby_alt  = join '|', @breakby;

# Opening or closing form of any tag listed in @removtag.
my $patremovtag = qr{</?(?:$removtag_alt)\b[^>]*>}i;

# A whole element listed in @remov, from its opening tag to the matching
# closing tag.
# NOTE(review): the closing-tag part was missing from the damaged source and
# is reconstructed here; an element without a closing tag (a lone <meta>) is
# not matched -- confirm against the upstream script.
my $patremov = qr{<($remov_alt)\b[^>]*>(?:.|\n)*?</\1\s*>}i;

# A run of one or more segment-breaking tags with the whitespace around them.
# Only non-capturing groups are used so that split() returns just the text.
my $patsep = qr{\s*(?:</?(?:$breakby_alt)\b[^>]*>\s*)+}i;

open my $out_a, '>', '_Aout' or die "cannot create _Aout: $!\n";
open my $out_b, '>', '_Bout' or die "cannot create _Bout: $!\n";

# NOTE(review): $id is counted but never written. The strings printed around
# html2p() below probably carried markup that was lost from the source;
# restore it from the upstream script if available.
my $id = 0;

# Each input line names two HTML files separated by a tab. The greedy first
# group means the split happens at the last tab on the line.
while (my $line = <>) {
    next unless $line =~ /\S/;
    $line =~ s/\r?\n\z//;

    my ($file_a, $file_b) = $line =~ /^(.*)\t(.*)/;
    defined $file_b
        or die "line $.: expected two tab-separated file names\n";

    print STDERR "($file_a)($file_b)\n";
    $id++;
    print {$out_a} "\n", html2p($file_a), "\n";
    print {$out_b} "\n", html2p($file_b), "\n";
}

close $out_a or die "error writing _Aout: $!\n";
close $out_b or die "error writing _Bout: $!\n";

# html2p($file)
#
# Reads the HTML file $file, removes the markup described by $patremovtag
# and $patremov, splits what is left at the tags listed in @breakby and
# passes every segment to Lingua::PT::pln::xmlsentences with the option
# { st => $tag }.
#
# With -latin1 the file is first piped through "recode -f html..latin1".
# The command is started without a shell, so the file name is never
# interpreted as shell syntax.
#
# Returns the concatenated xmlsentences results, one per line.
# Dies when the file cannot be opened or the recode process cannot be started.
sub html2p {
    my ($file) = @_;

    open my $src, '<', $file or die("cant open $file\n");

    my $in = $src;
    if ($latin1) {
        # Fork with the child's STDOUT connected to $pipe; the child reads
        # the already opened file on STDIN and becomes recode.
        my $pid = open my $pipe, '-|';
        defined $pid or die "cannot fork for recode: $!\n";
        if ($pid == 0) {
            open STDIN, '<&', $src or die "cannot redirect $file: $!\n";
            exec 'recode', '-f', 'html..latin1'
                or die "cannot run recode: $!\n";
        }
        close $src;
        $in = $pipe;
    }

    my $html = do { local $/; <$in> };
    $html = "" unless defined $html;
    close $in or warn "problem while reading $file (status $?)\n";

    # NOTE(review): the pattern of this substitution was lost from the source
    # (only its s/g flags survived); it presumably removed HTML comments --
    # confirm against the upstream script.
    $html =~ s{<!--.*?-->}{}sg;
    $html =~ s{$patremovtag}{}g;
    $html =~ s{$patremov}{}g;

    my $result = "";
    for my $segment (split $patsep, $html) {
        # NOTE(review): with -comm only a newline is visible here; a marker
        # string may have been lost together with the other markup.
        $result .= "\n" if $comm;
        $segment =~ s/\s*\n\s*/ /g;
        $result .= Lingua::PT::pln::xmlsentences({ st => $tag }, $segment) . "\n";
    }
    return $result;
}

__END__

=head1 NAME

htmlpairlist2p - convert a list of HTML file pairs into a single pair of files made of C<< <p> >> segments

=head1 SYNOPSIS

 htmlpairlist2p [-comm] [-img] [-latin1] [-tag=name] file

=head1 DESCRIPTION

C<htmlpairlist2p> makes an HTML page with C<< <p> >> elements holding the
independent text segments, after dividing them into sentences. It was
designed to help in the process of aligning texts.

Each line of the input must contain two HTML file names separated by a tab.
The text extracted from the first file of every pair is appended to F<_Aout>
and the text from the second file to F<_Bout>, both created in the current
directory.

The command C<recode> should be installed in order to be possible to use the
C<-latin1> option.

=head1 OPTIONS

=over 4

=item B<-comm>

Emit an extra separator line before every text segment.

=item B<-img>

Keep C<< <img> >> tags; by default they are removed.

=item B<-latin1>

Filter every input file through C<recode -f html..latin1> before processing.

=item B<-tag=name>

Tag name passed to C<xmlsentences> as its C<st> option (default C<p>).

=back

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut