#!/usr/bin/perl -s

use Lingua::PT::PLNbase;
use Lingua::NATools::ConfigData;

use warnings;
use strict;

# Command-line switches. They are populated by perl's own -s switch
# processing (see the shebang line), e.g. "-tmx" sets $tmx and
# "-d=.EOS" sets $d to ".EOS".
our ($h);
our ($tmx, $single, $raw, $d, $D, $utf8);

# Print a short usage summary and exit.
sub usage {
    print "nat-sentence-align: simple interface for Vanilla aligner\n\n";
    print "\tnat-sentence-align [-tmx] [-single] [-d=.EOS] [-D=.EOP] <file1> <file2>\n\n";
    print "For more help, please run 'perldoc nat-sentence-align'\n";
    exit;
}

# Tokenize $file into "$file.tok".
#
# The input is read one blank-line-separated paragraph at a time.  Each
# paragraph is split into sentences and each sentence into tokens (both
# via Lingua::PT::PLNbase).  Tokens are written one per line, every
# sentence is followed by ".EOS" and every paragraph by ".EOP".
# Dies if either file cannot be opened or the output cannot be flushed.
sub tokenize_file {
    my ($file) = @_;

    my $read_mode  = $utf8 ? "<:utf8" : "<";
    my $write_mode = $utf8 ? ">:utf8" : ">";

    open my $in, $read_mode, $file
        or die "Cannot open file '$file': $!\n";
    open my $out, $write_mode, "$file.tok"
        or die "Cannot open file '$file.tok': $!\n";

    # Paragraph-sized records; localised so that later line-oriented
    # reads (e.g. the language prompts) are not affected.
    local $/ = "\n\n";

    while (my $paragraph = <$in>) {
        # Skip records that contain nothing but blank lines.
        next if $paragraph =~ m!^(\s\n)*$!;

        # Join the paragraph into a single line with single spaces.
        $paragraph =~ s/\n/ /g;
        $paragraph =~ s/\ +/ /g;

        for my $sentence (sentences($paragraph)) {
            my @tokens = atomiza($sentence);
            print {$out} join("\n", @tokens), "\n.EOS\n";
        }
        print {$out} ".EOP\n";
    }

    # Buffered write errors only surface at close time.
    close $out or die "Cannot write file '$file.tok': $!\n";
    close $in;
    return;
}

# Ask on STDOUT for the language code of $file and return the line the
# user typed on STDIN (without the trailing newline).  Dies if STDIN is
# exhausted or the answer is empty.
sub ask_language {
    my ($file) = @_;

    print STDOUT "Please enter language code for '$file': ";

    local $/ = "\n";
    my $code = <STDIN>;
    die "No language code given for '$file'\n" unless defined $code;
    chomp $code;
    die "No language code given for '$file'\n" unless length $code;

    return $code;
}

usage() if ($h);

my $BINPREFIX = Lingua::NATools::ConfigData->config('bindir');

my $file1 = shift or die "I need two files to align\n";
my $file2 = shift or die "I need two files to align\n";

# -tmx and -single are mutually exclusive; -tmx wins.
if ($tmx && $single) { undef $single }

# Delimiter options and the two files actually handed to the aligner.
my @delimiters;
my ($input1, $input2);

if ($raw) {
    # Raw mode: the files are assumed to be tokenized already, so the
    # caller has to tell us which delimiters they use.
    die "Using 'raw', you must supply -d and -D" if (!$d || !$D);
    @delimiters = ('-d', $d, '-D', $D);
    ($input1, $input2) = ($file1, $file2);
}
else {
    for my $file ($file1, $file2) {
        tokenize_file($file);
    }
    @delimiters = ('-d', '.EOS', '-D', '.EOP');
    ($input1, $input2) = ("$file1.tok", "$file2.tok");
}

my @aligner_args = (@delimiters, $input1, $input2);
unshift @aligner_args, '-s' if ($single);

# List-form system(): arguments go straight to the program, so file
# names or delimiters containing shell metacharacters are harmless.
my $aligner = "$BINPREFIX/nat-sentalign";
system($aligner, @aligner_args);

# system() reports failures through $?, not $@.
if ($? == -1) {
    die "Cannot run '$aligner': $!\n";
}
elsif ($? & 127) {
    die "'$aligner' was killed by signal " . ($? & 127) . "\n";
}
elsif ($? != 0) {
    die "'$aligner' failed with exit status " . ($? >> 8) . "\n";
}

if ($tmx) {
    # NOTE(review): the aligner is assumed to write "<input>.al" next to
    # each input file (the original code read "$file1.tok.al" and
    # "$file2.tok.al" after aligning the ".tok" files) -- confirm that
    # this also holds for raw mode.
    my $aligned1 = "$input1.al";
    my $aligned2 = "$input2.al";

    my $lang1 = ask_language($file1);
    my $lang2 = ask_language($file2);

    my $tmx_file = "$file1-$file2.tmx";
    print STDOUT "Producing TMX '$tmx_file'...\n";

    open my $tmx_out, '>', $tmx_file
        or die "Cannot open file '$tmx_file': $!\n";

    # List-form pipe open: run nat-pair2tmx without a shell and copy its
    # standard output into the TMX file.
    open my $pair2tmx, '-|',
        'nat-pair2tmx', $aligned1, $lang1, $aligned2, $lang2
        or die "Cannot run nat-pair2tmx: $!\n";

    while (my $chunk = <$pair2tmx>) {
        print {$tmx_out} $chunk;
    }

    close $pair2tmx
        or die "nat-pair2tmx failed" . ($! ? ": $!" : " (wait status $?)") . "\n";
    close $tmx_out
        or die "Cannot write file '$tmx_file': $!\n";
}

=encoding UTF-8

=head1 NAME

nat-sentence-align - simple interface for Vanilla aligner.

=head1 SYNOPSIS

  nat-sentence-align [-tmx] [-single] [-d=.EOS] [-D=.EOP] <file1> <file2>

=head1 DESCRIPTION

This command is a frontend for the Vanilla aligner supplied with
NATools. To use it you must supply two parallel files with soft and
hard delimiters for synchronization. By default these delimiters are
'.EOS' and '.EOP'. You can change them using the C<-d> and C<-D>
switches.

At the end it creates a pair of sentence-aligned files. You can supply
a C<-tmx> option to force the creation of a TMX, or the C<-single>
option to force the creation of a single file with the two languages
(mainly used for debugging).

=head1 SEE ALSO

NATools Documentation, perl(1)

=head1 AUTHOR

Alberto Manuel Brandão Simões, E<lt>ambs@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006-2009 by Alberto Manuel Brandão Simões

=cut