#!/usr/bin/perl -s

# sentences - command line front end to Lingua::PT::PLNbase::fsentences.
#
# Command line switches are parsed by perl itself (the -s flag on the
# shebang line): every leading "-name" or "-name=value" argument is removed
# from @ARGV and stored in the package variable $name. A bare "-name" sets
# the variable to 1.

use strict;
use warnings;

#use POSIX qw(locale_h);
#setlocale(&POSIX::LC_ALL, "pt_PT");
#use locale;

use utf8::all;
use Lingua::PT::PLNbase;

# Switch variables filled in by "perl -s"; they must be package variables
# (hence "our") to be visible under "use strict".
our ($h, $help, $nat, $tokenize, $o);

if ($h || $help) {
    print_usage();
    exit;
}

# Options handed to fsentences as a hash reference.
my %opt;

$opt{o_format} = 'NATools' if $nat;

if ($tokenize) {
    # "-tokenize=cqp" selects the cqp mode; any other true value just
    # turns tokenization on.
    $opt{tokenize} = $tokenize eq 'cqp' ? 'cqp' : 1;
}

# Test for a non-empty value rather than plain truth, so that an output
# file name that happens to be false in Perl (e.g. "-o=0") is not dropped.
$opt{output} = $o if defined $o && length $o;

# Whatever is left in @ARGV after switch parsing is passed on unchanged
# (presumably the input file names -- see Lingua::PT::PLNbase).
fsentences(\%opt, @ARGV);

# print_usage()
#
# Prints the command line help to the currently selected output handle
# (STDOUT unless changed). Takes no arguments and returns nothing.
sub print_usage {
    print
        "sentences -h/-help -- This help screen\n",
        "sentences [-tokenize[=cqp]] [-nat] [-o=output] file...\n",
        "\t-tokenize: tokenize sentences\n",
        "\t-nat: output format suitable for NATools\n",
        "\t-o=: output to a specific file\n";
    return;
}

__END__

=encoding UTF-8

=head1 NAME

sentences - Command line tool for text segmentation, tokenization and annotation

=head1 SYNOPSIS

  sentences [-tokenize[=cqp]] [-nat] [-o=output] file...

=head1 DESCRIPTION

C<sentences> is a command line tool for text segmentation and
annotation. It uses the C<fsentences> function from
C<Lingua::PT::PLNbase>.

Its main behaviour is the detection of sentences and paragraphs, and
their annotation with XML-like tags: E<lt>sE<gt> for sentences,
E<lt>pE<gt> for paragraphs, and E<lt>textE<gt> for different files.

If the flag C<-tokenize> is used, then words are detected and
separated from each other by a space. If C<-tokenize=cqp> is used,
then each token is placed in a line by itself.

The C<-nat> flag can be used to force a non-XML output, used for
NATools alignment tools.

It is also possible to use the C<-o> flag to send the output to a
specific file.

=head1 SEE ALSO

Lingua::PT::PLNbase (3)

=head1 AUTHOR

Alberto Manuel Brandão Simões, E<lt>ambs@cpan.orgE<gt>

José João Almeida, E<lt>jj@di.uminho.ptE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2007-2008 by Projecto Natura

=cut