#!/usr/bin/perl -s
#
# html2p - split an HTML document into independent text segments and mark up
# the sentences of each segment with Lingua::PT::PLN::xmlsentences.
#
# Switches (parsed by perl's -s, so they must precede the file name):
#   -nocom      do not print the blank separator line before each segment
#   -noimg      also strip <img> tags
#   -latin1     pipe the input through "recode -f html..latin1" first
#   -tag=NAME   value passed to xmlsentences as its "st" option (default "p")

use strict;
use warnings;

use Lingua::PT::PLN;

# Populated by the -s switch processing; must be package variables.
our ($nocom, $noimg, $tag, $latin1);

# Tags that end a text segment (the document is split on runs of these).
my @breakby = qw(table tr td p br h1 h2 h3 h4 h5 h6 li ul ol dl dt dd
                 div blockquote hr address);

# Tags that are deleted while their content is kept.
my @removtag = qw(body html font a b i tt small);

# Elements that are deleted together with their content.
my @remov = qw(head meta);

$tag = 'p' unless $tag;
push @removtag, 'img' if $noimg;

my $removtag_alt = join '|', @removtag;
my $remov_alt    = join '|', @remov;
my $breakby_alt  = join '|', @breakby;

# Opening or closing inline tag, attributes included.
my $patremovtag = qr{</?(?:$removtag_alt)\b[^>]*>}i;

# Whole element, from the opening tag to the matching closing tag.
my $patremov = qr{<($remov_alt)\b[^>]*>(?:.|\n)*?</\1>}i;

# One or more consecutive block-level tags with surrounding whitespace.
my $patsep = qr{\s*(?:</?(?:$breakby_alt)\b[^>]*>\s*)+}i;

my $html = read_input($ARGV[0], $latin1);

# Inline tags are stripped first, then the elements dropped with content.
$html =~ s/$patremovtag//g;
$html =~ s/$patremov//g;

# NOTE(review): the strings printed below contain only newlines. If they were
# meant to carry markup (a wrapping element, a comment separator), restore it
# from the upstream source -- it cannot be recovered from this file.
print "\n\n";
for my $segment (split /$patsep/, $html) {
    print "\n" unless $nocom;
    $segment =~ s/\s*\n\s*/ /g;    # join each segment onto a single line
    print Lingua::PT::PLN::xmlsentences({ st => $tag }, $segment), "\n";
}
print "\n\n";

# Slurp the input document and return it as one string ('' when empty).
#   $file   - path of the HTML file; undef or '-' means standard input
#   $recode - true to filter the input through "recode -f html..latin1"
# Dies when the file cannot be opened or when recode cannot be run or fails.
sub read_input {
    my ($file, $recode) = @_;

    # Reopening STDIN keeps file descriptor 0, so the recode child inherits
    # the file as its standard input without going through a shell.
    if (defined $file and $file ne '-') {
        open STDIN, '<', $file
            or die "html2p: cannot open '$file': $!\n";
    }

    local $/;    # slurp mode, restored on return

    my $content;
    if ($recode) {
        open my $pipe, '-|', 'recode', '-f', 'html..latin1'
            or die "html2p: cannot run recode: $!\n";
        $content = <$pipe>;
        close $pipe
            or die "html2p: recode failed\n";
    }
    else {
        $content = <STDIN>;
    }

    return defined $content ? $content : '';
}

__END__

=head1 NAME

html2p - html to list of C<< <p> >>


=head1 SYNOPSIS

 html2p [-nocom] [-noimg] [-latin1] [-tag=name] file


=head1 DESCRIPTION

C<html2p> converts an HTML file into a list of C<< <p> >> elements with the
independent text segments, after dividing them into sentences.
It was designed to help in the process of aligning texts.
The command C