#!/usr/bin/perl -s
use strict;
use warnings;
use File::Copy;
# ems-tag: top-level driver for the EMS tagger.
#
# Reads text from the file named on the command line (or from standard
# input), spaces out punctuation, writes one normalised record per input
# record to a temporary file, runs ems-tagger on it and prints the tagger's
# output (to "<inputfile>.out" when an input file was given, otherwise to
# standard output).

my $prefix = '__DIR__'; # This gets substituted by autoconf/automake

# Command-line switches; they are filled in by perl's -s option (see the
# shebang line), which is why they must be package variables.
our (
    $p, # html like paragraphs
    $l, # each line is paragraph
    $s, # sentence separator
    $n, # Natools like file
);

my $f = shift;          # optional input file name
my $t = "/tmp/_${$}_";  # prefix of the temporary files (contains the pid)
# NOTE(review): the temporary names are predictable; consider File::Temp.

my $have_file = defined $f && length $f;

# Remove whatever temporary files exist, also when the script dies early.
END {
    if (defined $t) {
        unlink grep { -f $_ } "$t.unt", "$t.int", "$t.out", $t;
    }
}

# Input record separator.  The default is paragraph mode; when several
# switches are given the last assignment below wins.
# TODO: maybe we must check if there are more than one of these options.
# NOTE(review): as written, -p selects the same separator as -l and -s the
# same as the default; the documentation suggests markup-based separators
# were intended -- confirm against the upstream sources.
my $separator = '';
$separator = "\n"     if $p;
$separator = "\n"     if $l;
$separator = ''       if $s;
$separator = "\n\$\n" if $n;   # a line holding only a dollar sign

# Open the input.  A named file is piped through sed to strip trailing
# blanks from every line; the list form of open keeps the file name away
# from the shell.
my $in;
if ($have_file) {
    die "Can't find $f!\n" unless -f $f;
    open $in, '-|', 'sed', '-e', 's/ *$//', '--', $f
        or die "Can't open pipe with sed!\n";
}
else {
    open $in, '<&=', fileno(STDIN)
        or die "Can't read from standard input: $!\n";
}

# Normalise every record and store it for the tagger.
open my $untagged, '>', "$t.unt"
    or die "Error can't write to tmp file ($t.unt): $!\n";
{
    local $/ = $separator;
    while (my $chunk = <$in>) {
        chomp $chunk;
        $chunk =~ s/\s*([,.;!?()«»])\s*/ $1 /g;  # isolate punctuation marks
        $chunk =~ s/\s+/ /g;                     # collapse all white space
        $chunk =~ s/\. \. \./.../g;              # re-join ellipses
        $chunk =~ s/ $/\n\n/;
        print {$untagged} "$chunk\n\n";
    }
}
close $in
    or warn "Problem closing input" . ($! ? ": $!" : " (exit status $?)") . "\n";
close $untagged
    or die "Error can't write to tmp file ($t.unt): $!\n";

# Run the tagger.  The shell is needed for the output redirection; every
# word of the command comes from this script, not from the user.
my $tagger_cmd = "$prefix/bin/ems-tagger $prefix/share/ems/lexico $t.unt $prefix/share/ems/bigramas $prefix/share/ems/regras-lexicais $prefix/share/ems/regras-contextuais -i $t.int > $t.out";
my $status = system($tagger_cmd);
die "Can't run ems-tagger: $!\n" if $status == -1;
warn "ems-tagger exited with status " . ($status >> 8) . "\n" if $status != 0;

# Copy the tagger output to its destination, line by line.
open my $tagged, '<', "$t.out"
    or die "Cant open $t.out for reading: $!\n";
my $out;
if ($have_file) {
    open $out, '>', "$f.out"
        or die "Cant open $f.out for writing: $!\n";
}
else {
    $out = \*STDOUT;
}
{
    # Every output line is terminated with the separator used for the input.
    # NOTE(review): with the default (paragraph mode) separator this is the
    # empty string, so the lines are written back to back -- confirm that
    # this is intended.
    local $\ = $separator;
    while (my $line = <$tagged>) {
        chomp $line;
        print {$out} $line;
    }
}
if ($have_file) {
    close $out or die "Error writing $f.out: $!\n";
}
close $tagged;

1;
__END__
=head1 NAME
ems-tag - Top level interface to EMS
=head1 SYNOPSIS
ems-tag [-p|-l|-s|-n] [inputfile]
=head1 DESCRIPTION
This script is the top level interface to EMS. If you use a filename
in the command line, it is used as the input text. If not, the
standard input will be read.
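Use at most one of the following switches (combinations are not checked):
=over 4
=item B<-p>
Input file uses HTML-like paragraphs;
=item B<-l>
Input file uses each line as a different paragraph;
=item B<-s>
Input file uses a sentence separator;
=item B<-n>
Input file is NATools-like, with records separated by a line containing only C<$>;
=back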
=head1 SEE ALSO
ems
=head1 COPYRIGHT
Copyright (C)2002-2003 Jose Joao Almeida
GNU GENERAL PUBLIC LICENSE (LGPL) Version 2 (June 1991)
=cut