package pln;
use locale;

# Scratch storage shared by savit() and loadit(): fragments that must
# survive the rewriting passes untouched are parked in %savit_p under a
# running number, and $savit_n holds the last number handed out.
my %savit_p;
my $savit_n = 0;

# What closes a sentence: a run of . ? ! ; : characters, or one of the
# tags <p>, <P>, <br>.
my $terminador = '([.?!;:]+|<[pP]>|<br>)';

# Alternatives that must never be broken apart.  The string is meant to be
# interpolated into a pattern compiled with /x, which is why it may carry
# layout whitespace and trailing comments.
my $protect = '
       [\w_.-]+ \@ [\w_.-]+                      # emails
    |  <[^>]*>                                   # marcup XML SGML
    |  \d+\.\d+                                  # numbers
    |  \d+\:\d+                                  # the time
    |  ((https?|ftp|gopher)://|www)[\w_./~-]+    # urls
';

# Abbreviations (written without their final dot) as one alternation.
# Every entry is itself a small regex fragment.
my $abrev = join(
    '|',
    qw(
        srt?a? dra? [A-Z] etc exa? jr profs? arq av estr?
        et al vol eng tv lgo pr Oliv ig mrs? min rep
    )
);

# separaFrases($text)
#
# Splits a piece of text into sentences.  Returns a new string in which the
# text starts with "<s>" and every sentence terminator is followed by
# "</s>\n<s>"; a trailing "<s>" with nothing but whitespace after it is
# dropped.  The argument itself is not modified.
#
# NOTE(review): text after the last terminator stays inside an unclosed
# "<s>" -- confirm that callers expect this.
# NOTE(review): $protect already hides every <...> tag before the
# terminator pass runs, so the <p>/<br> alternatives of $terminador look
# unreachable here -- confirm intent.
sub separaFrases {
    my ($text) = @_;

    # Park e-mails, tags, numbers, times and URLs behind markers so that
    # the dots and colons inside them are not taken for sentence ends.
    $text =~ s/($protect)/savit($1)/xge;

    # Same treatment for abbreviations together with their final dot.
    $text =~ s#\b(($abrev)\.)#savit($1)#ige;

    # Close the sentence after each terminator and open the next one.
    $text =~ s#($terminador)#$1</s>\n<s>#g;

    # Put the parked fragments back.
    $text = loadit($text);

    # Remove an empty sentence opener left at the very end.
    $text =~ s#<s>\s*$##s;

    return "<s>$text";
}
  
# cqptokens()
#
# Reads the files named in @ARGV (or standard input) through the <> operator
# and prints the text with one token per line.  Input is consumed in records
# that end at a ">" character.
#
# Each record is read into a lexical variable rather than into the global
# $_: "while (<>)" does not localize $_, so the previous form overwrote
# whatever the caller had in $_.
#
# NOTE(review): the character classes holding guillemets and ordinal
# indicators presumably assume a single-byte source encoding (the file has
# no "use utf8") -- confirm.
sub cqptokens {
    local $/ = ">";

    while ( defined( my $rec = <> ) ) {

        # Hide e-mails, tags, numbers, times and URLs behind markers so the
        # passes below cannot split them.
        $rec =~ s/($protect)/savit($1)/xge;

        # Detach closing and opening quotes/brackets from their neighbours.
        $rec =~ s#([»\]])#$1 #g;
        $rec =~ s#([«\[])# $1#g;
        $rec =~ s#\"# \" #g;

        # Break at every word boundary and at every run of whitespace.
        $rec =~ s/(\s*\b\s*|\s+)/\n/g;

        # A hyphen alone on a line glues its two neighbours back together.
        $rec =~ s/(.)\n-\n/$1-/g;

        # Collapse runs of line breaks.
        $rec =~ s/\n+/\n/g;

        # Ordinal indicators (optionally preceded by a dot) belong to the
        # preceding token.
        $rec =~ s/\n(\.?[ºª])\b/$1/g;

        # Re-join digit groups that were split around a comma or a dot;
        # repeated until nothing changes because matches share digit groups.
        1 while $rec =~ s#\b([0-9]+)\n([\,.])\n([0-9]+\n)#$1$2$3#g;

        # Re-attach the dot of a known abbreviation.
        $rec =~ s#\n($abrev)\n\.\n#\n$1\.\n#ig;

        # Make the first "<" still present in the record start a line.
        $rec =~ s/\n?</\n</;

        # Restore the hidden fragments and emit the record.
        $rec = loadit($rec);
        print $rec;
    }
}

# savit($fragment)
#
# Stores $fragment in %savit_p under the next free number and returns the
# placeholder that stands in for it: "__MARCA__<number>" with one space on
# each side.
sub savit {
    my ($fragment) = @_;

    $savit_n += 1;
    $savit_p{$savit_n} = $fragment;

    return " __MARCA__$savit_n ";
}

# loadit($text)
#
# Returns a copy of $text in which every "__MARCA__<number>" placeholder
# (together with at most one space on each side) is replaced by the
# fragment stored under that number in %savit_p.  The placeholder counter
# is reset to 0 afterwards; the stored fragments themselves are left in
# the table.
sub loadit {
    my ($text) = @_;

    $text =~ s/ ?__MARCA__(\d+) ?/$savit_p{$1}/g;
    $savit_n = 0;

    return $text;
}

1;

=head1 NAME

cqptokens - encodes a text for CQP (one token per line)

=head1 SYNOPSIS

  cqptokens file*  >  out

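=head1 DESCRIPTION

C<cqptokens> reads the files given on the command line (or standard input)
and prints the text with one token per line. E-mail addresses, markup tags,
decimal numbers, times and URLs are kept as single tokens.

The package also provides C<separaFrases>, which marks the sentence
boundaries of a text with C<< <s> >> and C<< </s> >> tags.
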
=head1 AUTHOR

Paulo Rocha, paulo.rocha@alfa.di.uminho.pt

J. Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

cqp(1)

=cut      

__END__





# NOTE: this text comes after the first __END__ marker of the file, so perl
# never compiles it.  It repeats the savit() defined before that marker:
# store the argument in %savit_p under the next number and return the
# " __MARCA__<number> " placeholder.
sub savit{
  my $a=shift; 
  $savit_p{++$savit_n}=$a ;
  " __MARCA__$savit_n " }

# NOTE: this text comes after the first __END__ marker of the file, so perl
# never compiles it.  It repeats the loadit() defined before that marker:
# replace each "__MARCA__<number>" placeholder (and at most one space on
# each side) by the fragment stored in %savit_p, reset the counter, and
# return the resulting string.
sub loadit{
  my $a = shift; 
  $a =~ s/ ?__MARCA__(\d+) ?/$savit_p{$1}/g;
  $savit_n = 0;
  $a;
}

1;

=head1 NAME

cqptokens - encodes a text for CQP (one token per line)

=head1 SYNOPSIS

  cqptokens file*  >  out

=head1 DESCRIPTION


=head1 AUTHOR

J. Joao Almeida, jj@di.uminho.pt

Paulo Rocha, paulo.rocha@di.uminho.pt

=head1 SEE ALSO

perl(1).

cqp(1)

=cut      

__END__

# NOTE: these assignments come after an __END__ marker, so perl never
# executes them.  Each one holds a bracketed character class as a string.
$lm='[a-záéíóúâêôàãõçüöñ]';                      # lowercase letter
$lM='[A-ZÁÉÍÓÚÂÊÔÀÃÕÇÜÖÑ]';                      # uppercase letter
$l1='[A-ZÁÉÍÓÚÂÊÔÀÃÕÇÜÖÑa-záéíóúâêôàãõçüöñ0-9]'; # letter or digit
$c1='[^»a-záéíóúâêà,;?!)]';                      # negated class: none of the listed characters
