#!/usr/bin/perl -s
use locale;


# Text spans that must never be split into several tokens.  The main loop
# compiles this text with the /x modifier, so the layout and the "#"
# remarks inside the literal are ignored by the regex engine.
$proteger = q{
     [\w_.-]+ \@ [\w_.-]+                      # emails
  |  <[^>]*>                                   # marcup XML SGML
  |  ((https?|ftp|gopher)://|www)[\w_./~-]+    # urls
};

# Alternation of abbreviation patterns.  The main loop matches it
# case-insensitively against a whole token that is followed by a period.
$abreviatura = join(
    '|',
    qw(
        srt?a? dra? [A-Z] etc exa? jr profs? arq av estr?
        tv lgo pr Oliv ig mrs? min rep
    )
);


# Read the input in records that end at ">", so a record can hold at most
# one complete markup tag, and that tag sits at the very end of the record.
local $/ = ">";

# Main loop: rewrite every record as one token per line and print it.
while (<>) {
#        $n=0;
#	s/(.*>)\n*/ / and print $1;

    # Replace e-mails, tags and URLs by opaque placeholders so that the
    # rules below cannot break them apart; loadit() restores them.
    s/($proteger)/savit($1)/xge;

    # Detach closing/opening brackets and guillemets, and double quotes,
    # from the neighbouring text.
    s#([»\]])#$1 #g;
    s#([«\[])# $1#g;
    s#\"# \" #g;

    # Put a line break at every word boundary and whitespace run.
    s/(\s*\b\s*|\s+)/\n/g;

    # Glue a stand-alone hyphen back between its neighbours, then squeeze
    # runs of line breaks.
    s/(.)\n-\n/$1-/g;
    s/\n+/\n/g;

    # Re-attach ordinal indicators (optionally preceded by a period) to the
    # preceding token.
    s/\n(\.?[ºª])\b/$1/g;

    # Re-join digit groups split at "," or ".".  The substitution is
    # repeated because /g cannot reuse the digits consumed by the previous
    # match (e.g. the middle group of 1.000.000).
    while ( s#\b([0-9]+)\n([\,.])\n([0-9]+\n)#$1$2$3#g ) { }

#	s/\b([a-z]+)\n-\n/$1-/g;

    # Re-attach the period to a known abbreviation.  The trailing line break
    # is only looked at, not consumed: the next abbreviation needs it as its
    # own leading line break, otherwise only the first of "Sr. Dr." would be
    # joined.
    s#\n($abreviatura)\n\.(?=\n)#\n$1.#ig;

#	s/(\w)[-\xad]\n/$1/g if $junta;

    # Make sure the first "<" still present starts on its own line.
    s/\n?</\n</;

    # Put the protected fragments back and emit the record.
    $_ = loadit($_);
    print;
}

# Bookkeeping for the placeholder mechanism: how many fragments have been
# stored so far, and the fragments themselves keyed by that number.
my $savit_n = 0;
my %savit_p = ();

# savit($fragment)
#
# Stores $fragment under the next free number and returns the placeholder
# " __MARCA__<number> " (padded with one blank on each side) that stands in
# for it.
sub savit {
    my ($fragment) = @_;

    $savit_n++;
    $savit_p{$savit_n} = $fragment;

    return " __MARCA__$savit_n ";
}

# loadit($text)
#
# Returns $text with every " __MARCA__<number> " placeholder (the blanks are
# optional) replaced by the fragment stored under that number, then resets
# the placeholder counter and empties the fragment table.
#
# A placeholder-looking token with no stored fragment is left exactly as it
# is instead of being deleted, and emptying the table keeps fragments from
# an earlier call from being substituted for it.
sub loadit {
    my $text = shift;

    $text =~ s{( ?__MARCA__(\d+) ?)}
              {exists $savit_p{$2} ? $savit_p{$2} : $1}ge;

    %savit_p = ();
    $savit_n = 0;

    return $text;
}

=head1 NAME

cqptokens - encodes a text for CQP (one token per line)

=head1 SYNOPSIS

  cqptokens file*  >  out

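=head1 DESCRIPTION

Reads the files named on the command line (or standard input) and prints
the text with one token per line. E-mail addresses, SGML/XML tags and URLs
are kept as single tokens; numbers with "," or "." separators, hyphenated
words and a fixed list of abbreviations followed by a period are also kept
together.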

=head1 AUTHOR

Paulo Rocha, paulo.rocha@alfa.di.uminho.pt

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

cqp(1)

=cut      

__END__
