#!/usr/bin/perl -s

# ems-tag - top level driver for the EMS tagger.
#
# Reads text from a file (or standard input), splits it into records
# according to the -p/-l/-s/-n switch, spaces out punctuation, writes the
# normalised records to a scratch file, runs ems-tagger on it and copies the
# tagger output to "<inputfile>.out" (or standard output when no input file
# was given), terminating every output line with the record separator that
# was used for reading.

use strict;
use warnings;

use File::Copy;
use File::Temp qw(tempdir);

my $prefix = '__DIR__'; # This gets substituted by autoconf/automake

# Set from the command line by perl's -s switch processing.
our (
    $p,    # html like paragraphs
    $l,    # each line is paragraph
    $s,    # sentence separator
    $n,    # Natools like file
);

my $f = shift;
my $have_file = defined $f && length $f;    # a file called "0" is a file too

# Private scratch directory instead of a predictable /tmp/_PID_ name; it is
# removed (with everything in it) when the script exits, even after a die.
my $tmpdir = tempdir('ems-tag-XXXXXX', TMPDIR => 1, CLEANUP => 1);
my $t      = "$tmpdir/ems";                 # base name of the scratch files

# --- open the input -------------------------------------------------------
my $in;
if ($have_file) {
    die "Can't find $f!\n" unless -f $f;

    # sed strips trailing blanks from every line before perl sees it.
    # List-form pipe open: the file name never passes through a shell.
    open $in, '-|', 'sed', '-e', 's/ *$//', '--', $f
        or die "Can't open pipe with sed!\n";
}
else {
    open $in, '<&=', fileno(STDIN)
        or die "Can't read standard input: $!\n";
}

# --- choose the record separator -----------------------------------------
# Maybe we must check if there are more than one of these options.
# As written, a later test overrides an earlier one (-p < -l < -s < -n).
my $separator = '';                 # default: paragraph mode
# NOTE(review): -p is documented as "HTML like paragraphs" but the literal
# seen here is two newlines; a markup literal may have been lost - confirm.
$separator = "\n\n"    if $p;
$separator = "\n"      if $l;
# NOTE(review): -s sets the same value as the default, so it is currently a
# no-op; a sentence-marker literal may have been lost here - confirm.
$separator = ''        if $s;
$separator = "\n\$\n"  if $n;       # a line holding only "$"

# --- normalise the input into the scratch file ---------------------------
open my $unt, '>', "$t.unt"
    or die "Error can't write to tmp file ($t.unt): $!\n";

{
    local $/ = $separator;
    while (my $record = <$in>) {
        chomp $record;
        $record =~ s/\s*([,.;!?()«»])\s*/ $1 /g;    # isolate punctuation
        $record =~ s/(\s|\n)+/ /g;                  # collapse whitespace
        $record =~ s/\. \. \./.../g;                # re-join ellipses
        $record =~ s/ $/\n\n/;
        print {$unt} "$record\n\n";
    }
}

close $in;    # status deliberately ignored, as in the original
close $unt
    or die "Error can't write to tmp file ($t.unt): $!\n";

# --- run the tagger --------------------------------------------------------
# List form: no shell involved, so odd characters in paths are harmless.
# "$t.int" is handed to the tagger with -i; presumably a scratch file of
# the tagger (it lives in the scratch directory and is removed with it).
my @tagger_cmd = (
    "$prefix/bin/ems-tagger",
    "$prefix/share/ems/lexico",
    "$t.unt",
    "$prefix/share/ems/bigramas",
    "$prefix/share/ems/regras-lexicais",
    "$prefix/share/ems/regras-contextuais",
    '-i', "$t.int",
);

open my $tagged, '-|', @tagger_cmd
    or die "Can't run $tagger_cmd[0]: $!\n";

# --- copy the tagger output to its destination ---------------------------
my $out;
if ($have_file) {
    open $out, '>', "$f.out"
        or die "Cant open $f.out for writing: $!";
}
else {
    $out = \*STDOUT;
}

# The tagger output is read line by line ($/ is back to "\n" here) and each
# line is re-terminated with the record separator used for the input.
# NOTE(review): with the default (and -s) separator this is the empty
# string, so output lines are written back-to-back - confirm this is wanted.
while (my $line = <$tagged>) {
    chomp $line;
    print {$out} $line . $separator;
}

if ($have_file) {
    close $out
        or die "Error writing $f.out: $!\n";
}

unless (close $tagged) {
    # Only warn: the original script ignored the tagger's exit status.
    warn $!
        ? "Error closing pipe from ems-tagger: $!\n"
        : "ems-tagger exited with status " . ($? >> 8) . "\n";
}

1;

__END__

=head1 NAME

ems-tag - Top level interface to EMS

=head1 SYNOPSIS

  ems-tag [-p|-l|-s|-n] [inputfile]

=head1 DESCRIPTION

This script is the top level interface to EMS. If you use a filename
in the command line, it is used as the input text and the result is
written to F<inputfile.out>. If not, the standard input will be read
and the result is written to the standard output.

The switches are meant to be used independently. If more than one is
given, the last one in the order B<-p>, B<-l>, B<-s>, B<-n> takes
effect:

=over 4

=item B<-p>

Input file uses HTML like paragraphs;

=item B<-l>

Input file uses each line as a different paragraph;

=item B<-s>

Input file uses sentence separator;

=item B<-n>

Input file is a NATools like file: records are separated by a line
holding a single C<$>;

=back

=head1 SEE ALSO

ems

=head1 COPYRIGHT

Copyright (C)2002-2003 Jose Joao Almeida

GNU GENERAL PUBLIC LICENSE (LGPL)
Version 2 (June 1991)

=cut