#!/usr/bin/perl -w -s
#
# Strip presentational markup from an HTML document, split what is left on
# block-level tags, and print how often each separator tag and each
# number / punctuation token occurs, as sorted "'token' => count" lines.
#
# Switches (handled by perl's -s):
#   -noimg   also strip <img> tags
#   -latin1  filter the input through "recode -f html..latin1" first
#   -tag=X   accepted and defaulted to "p"; the counting below does not use it

use strict;
use warnings;

use Lingua::PT::PLN;
use Data::Dumper;

# Set by the -s switch, so these must be package variables, not lexicals.
our ($noimg, $tag, $latin1);

# Block-level tags: the text is split wherever one of these occurs.
my @breakby = qw(table tr td p br h1 h2 h3 h4 h5 h6 li ul ol dl dt dd div
                 blockquote hr address);

# Tags whose markup is dropped while their content is kept.
my @removtag = qw(body html font a b i tt small);

# Elements dropped together with everything between open and close tag.
my @remov = qw(head meta);

# Occurrence count per separator tag / token.
my %oco;

$tag = 'p' unless $tag;
push @removtag, 'img' if $noimg;

my $removtag_alt = join '|', @removtag;
my $remov_alt    = join '|', @remov;
my $breakby_alt  = join '|', @breakby;

# Every pattern starts with a literal "<" (optionally "</"); a pattern that
# starts with a bare "?" is a fatal "Quantifier follows nothing" error.
my $patremovtag = qr{</?(?:$removtag_alt)\b[^>]*>}i;
my $patremov    = qr{<($remov_alt)\b[^>]*>.*?</\1>}is;

# Alternative separator that also swallows surrounding whitespace and runs
# of adjacent tags:
#   qr{\s*(?:</?(?:$breakby_alt)\b[^>]*>\s*)+}i
my $patsep = qr{</?(?:$breakby_alt)\b[^>]*>}i;

# Slurp the whole input. $/ is only changed locally.
my $html;
if ($latin1) {
    # Feed the file to recode through STDIN and use the list form of the
    # pipe open, so the file name never passes through a shell.
    if (@ARGV) {
        my $file = shift @ARGV;
        open STDIN, '<', $file
            or die "$0: cannot open '$file': $!\n";
    }
    open my $recode, '-|', 'recode', '-f', 'html..latin1'
        or die "$0: cannot run recode: $!\n";
    $html = do { local $/; <$recode> };
    close $recode
        or warn "$0: recode did not exit cleanly (status $?)\n";
}
else {
    $html = do { local $/; <> };
}
$html = '' unless defined $html;

$html =~ s/$patremovtag//g;
$html =~ s/$patremov//g;

# The capturing split returns the separators as list elements too.
for my $piece (split /($patsep)/, $html) {
    if ($piece =~ /\A$patsep\z/) {
        # Same case-insensitive pattern as the split, so upper-case tags
        # are counted as tags rather than scanned as text.
        $oco{$piece}++;
    }
    else {
        # Numbers, single punctuation/operator characters, runs of dots.
        $oco{$_}++ for $piece =~ m{([0-9]+|[\@:;!?\%=+*\\\/]|\.+)}g;
    }
}

print join(",\n", map { "'$_' => $oco{$_}" } sort keys %oco);

__END__

=head1 NAME

html2p - html to list of C<< <p> >>

=head1 SYNOPSIS

 html2p [-noimg] [-latin1] file

=head1 DESCRIPTION

C<html2p> turns an HTML file into a list of "E<lt>pE<gt>" elements with the
independent text segments, after dividing it into sentences.

It was designed to help in the process of aligning texts.

The command C