#!/usr/bin/perl -w -s
#
# Tally the block-level HTML tags and the number/punctuation tokens found in
# an HTML document, and print them as "'token' => count" pairs sorted by key.
#
# Command-line switches are parsed by perl's own -s option:
#   -noimg     also strip <img> tags before counting
#   -latin1    pipe the first input file through "recode -f html..latin1"
#   -tag=NAME  defaults to "p"; the value is not read anywhere in this script
#
# NOTE(review): the POD at the end of this file describes segmenting a page
# into paragraphs, while the code below only prints occurrence counts --
# confirm which of the two is the intended behaviour.

use strict;

use Lingua::PT::PLN;
use Data::Dumper;
use vars qw{$noimg $tag $latin1};

# Tags that separate independent text segments; every occurrence is counted.
my @breakby = qw(table tr td p br h1 h2 h3 h4 h5 h6 li ul ol dl dt dd div
                 blockquote hr address);

# Tags whose markup is dropped while the text they enclose is kept.
my @removtag = qw(body html font a b i tt small);

# Elements that are dropped together with their content.
my @remov = qw(head meta);

# Occurrence counter: separator tag text or token => number of times seen.
my %oco = ();

if (not $tag) { $tag = "p" }
if ($noimg)   { push(@removtag, "img"); }

if ($latin1) {
    @ARGV or die "usage: $0 [-noimg] [-latin1] file\n";

    # The magic <> below opens $ARGV[0] with two-argument open, so a string
    # ending in "|" is run as a shell command.  Single-quote the file name so
    # that spaces and shell metacharacters in it cannot break (or hijack) the
    # command line.
    (my $quoted = $ARGV[0]) =~ s/'/'\\''/g;
    $ARGV[0] = "recode -f html..latin1 < '$quoted' |";
}

my $removtag_alt = join('|', @removtag);
my $remov_alt    = join('|', @remov);
my $breakby_alt  = join('|', @breakby);

# Opening or closing form of any tag listed in @removtag.
my $patremovtag = qr{</?(?:$removtag_alt)\b[^>]*>}i;

# An element listed in @remov, up to its matching closing tag when there is
# one; otherwise (e.g. the void <meta>) only the tag itself is removed.
my $patremov = qr{<($remov_alt)\b[^>]*>(?:.*?</\1\s*>)?}is;

# Opening or closing form of any tag listed in @breakby.
my $patsep = qr{</?(?:$breakby_alt)\b[^>]*>}i;

# Slurp the input; $/ is only undefined inside the do-block.
my $html = do { local $/; <> };
$html = '' unless defined $html;

$html =~ s/$patremovtag//g;
$html =~ s/$patremov//g;

# split() with a capturing group returns the separators as well as the text
# between them, so a piece is either exactly one separator tag or plain text.
for my $piece (split /($patsep)/, $html) {
    if ($piece =~ /\A$patsep\z/) {
        $oco{$piece}++;
    }
    else {
        # Digit runs, single punctuation/operator characters, and dot runs.
        my @tokens = $piece =~ m{([0-9]+|[\@:;!?\%=+*\\\/]|\.+)}g;
        $oco{$_}++ for @tokens;
    }
}

print join(",\n", map { "'$_' => $oco{$_}" } sort keys %oco);

__END__

=head1 NAME

html2p - html to list of C<< <p> >>

=head1 SYNOPSIS

  html2p [-noimg] [-latin1] file

=head1 DESCRIPTION

C<html2p> makes an HTML page in which the independent text segments,
obtained after dividing the text into sentences, are marked with
C<< <p> >>.

It was designed to help in the process of aligning texts.

The C<recode> command must be installed in order to use the C<-latin1>
option.

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut