#!/usr/bin/perl -s

# nat-mkMakefile: prints on standard output a pmakefile built from the
# template stored after __DATA__, optionally extended with the terminology
# (examples) rules and the n-gram rules, with the <=NAME=> placeholders
# replaced by values taken from the command line.
#
# NOTE(review): the layout of the template text (line breaks, tab-indented
# command lines, one-line "i <- sub{...}" generators) was reconstructed;
# it is assumed that pmakefile command lines are tab-indented as in make.
# Confirm against the Makefile::Parallel documentation.

use strict;
use warnings;

use File::Spec::Functions;
use Lingua::NATools::ConfigData;

no warnings 'once';

my $prefix = Lingua::NATools::ConfigData->config('libdir');
# Hope they maintain this...
# NOTE(review): $rules_prefix is not referenced anywhere in the visible script.
my $rules_prefix = catfile($prefix, "NATools");

# Command-line switches. They are filled in by perl's -s switch (see the
# shebang line), which is why they have to be package variables.
our ($id, $h, $tokenize, $full, $i, $ngrams, $terminology, $utf8, $csize, $langs);

# -full enables both optional sections.
$terminology = $ngrams = 1 if $full;

# usage([$status])
#
# Prints the help text and terminates the program with exit code $status
# (default 0). A non-zero status sends the text to STDERR, so that it does
# not end up in the pmakefile when standard output is redirected.
sub usage {
    my ($status) = @_;
    $status = 0 unless defined $status;
    my $out = $status ? \*STDERR : \*STDOUT;

    print {$out} "nat-mkMakefile: generates a pmakefile for Makefile::Parallel use.\n\n";
    print {$out} "\tnat-mkMakefile -i -utf8 -tokenize -csize=70000 -id=<name> <file.tmx> > pmakefile\n\n";
    print {$out} "\tnat-mkMakefile -i -utf8 -tokenize -csize=70000 -id=<name> <file1> <file2> > pmakefile\n\n";
    print {$out} "For more help, please run 'perldoc nat-mkMakefile'\n";
    exit $status;
}

usage() if $h;

# -utf8 implies -i.
$i = 1 if $utf8;

my $home = defined $ENV{HOME} ? $ENV{HOME} : "";
$id ||= "ALIGNED";

# Either a single file whose name ends in "tmx"/"tmxt", or exactly two files.
my $source = "";
if (@ARGV == 1 && $ARGV[0] =~ m!tmxt?$!) {
    $source = "-tmx $ARGV[0]";
}
elsif (@ARGV == 2) {
    $source = join(" ", @ARGV);
}
else {
    usage(1);
}
$source = "-langs=$langs $source" if $langs;

# Base template, plus the optional sections.
my $makefile = do { local $/; <DATA> };
$makefile = "" unless defined $makefile;
$makefile .= "\n" . _examples() if $terminology;
$makefile .= "\n" . _ngrams()   if $ngrams;
$makefile .= _code()            if $ngrams;

# Turn the boolean switches into the literal flags placed in the template.
$tokenize = $tokenize ? "-tokenize"     : "";
$i        = $i        ? "-i"            : "";
$utf8     = $utf8     ? "-utf8"         : "";
$csize    = $csize    ? "-csize=$csize" : "";

for ($makefile) {
    s/<=ID=>/$id/g;
    s/<=SOURCE=>/$source/g;
    s/<=HOME=>/$home/g;
    s/<=TOKENIZE=>/$tokenize/g;
    s/<=I=>/$i/g;
    s/<=UTF8=>/$utf8/g;
    s/<=CSIZE=>/$csize/g;
}

print $makefile;

# Returns the template text with the examples extraction and clean-up rules.
# The heredoc is single-quoted: $i, $ID and @i are left for the pmakefile.
sub _examples {
    return <<'EO';
examples$i: dicA dicB (20:00:00)
	<=HOME=>/NATools/NAT/scripts/nat-examplesExtractor -chunk=$i -local=$ID -langs=pt..en

cleanExamples: examples$i (20:00:00)
	rm -f $ID/patterns.txt $ID/examples.txt
	for a in @i; do cat $ID/examples.${a}.txt | grep '=!' >> $ID/patterns.txt; done
	for a in @i; do cat $ID/examples.${a}.txt | grep -v '=!' >> $ID/examples.txt; done
	for a in @i; do rm -f $ID/examples.${a}.txt; done
EO
}

# Returns the template text with the n-gram rules: per-chunk counting,
# joining, and the rules that call create_sqlite (provided by _code).
sub _ngrams {
    return <<'EO';
ngramS$i.$j: codify (20:00:00)
	nat-ngrams -n $j $ID/source.$i.crp $ID/ngramS.$i.$j.db

ngramT$i.$j: codify (20:00:00)
	nat-ngrams -n $j $ID/target.$i.crp $ID/ngramT.$i.$j.db

joinGramS$j: ngramS$i.$j (20:00:00)
	nat-ngrams -j $ID/ngramS$j $ID/ngramS*.$j.db
	rm -f $ID/ngramS*.$j.db

joinGramT$j: ngramT$i.$j (20:00:00)
	nat-ngrams -j $ID/ngramT$j $ID/ngramT*.$j.db
	rm -f $ID/ngramT*.$j.db

gramSQLiteS$j: joinGramS$j (20:00:00)
	sub{ create_sqlite('S', $j); }

gramSQLiteT$j: joinGramT$j (20:00:00)
	sub{ create_sqlite('T', $j); }
EO
}

# Returns the trailing "%%" section of the pmakefile: the Perl source of
# create_sqlite, referenced by the gramSQLite rules emitted by _ngrams.
sub _code {
    return <<'EO';
%%

sub create_sqlite {
    my ($t,$n) = @_;
    my $lex = ($t eq 'S')?"source":"target";

    `nat-ngrams -o 2 -d $ID/ngram$t$n $ID/$lex.lex > $ID/ngram$t$n.txt`;
    `rm -f $ID/ngram$t$n`;
    `sort -n -r $ID/ngram$t$n.txt > $ID/_$t$n`;
    `rm -f $ID/ngram$t$n.txt`;

    open R, "$ID/_$t$n";
    open W, ">$ID/__$t$n";
    while(<R>) {
        chomp;
        my @F = split /\s/, $_;
        push @F, shift @F;
        print W "@F\n";
    }
    close W;
    close R;
    `rm -f $ID/_$t$n`;

    my @v=(undef, undef, qw/bigrams trigrams tetragrams/);
    open SQL, "|sqlite3 $ID/$t.$n.sqlite";
    my $fields = join(",",map{"w$_"}(1..$n));
    print SQL "CREATE TABLE $v[$n] ($fields,occs);\n";
    print SQL ".separator ' '\n";
    print SQL ".import $ID/__$t$n $v[$n]\n";
    for my $i (1..$n) {
        print SQL "CREATE INDEX idx${t}${n}w${i} ON $v[$n] (w$i);"
    }
    close SQL;
    `rm -f $ID/__$t$n`;
}
EO
}

=encoding UTF-8

=head1 NAME

nat-mkMakefile - generates a pmakefile to be used by Makefile::Parallel

=head1 SYNOPSIS

  nat-mkMakefile -id=<name> <file.tmx> > pmakefile

  nat-mkMakefile -id=<name> <file1> <file2> > pmakefile

=head1 DESCRIPTION

This script generates a parallel makefile to be used by
Makefile::Parallel to align and extract examples using a PBS based
cluster.

The C<-id> switch should contain the identifier of the corpus to be
created. When it is omitted, the identifier C<ALIGNED> is used.

The corpus is given either as a single file whose name ends in C<tmx>,
or as two files. Any other number of arguments prints the usage
message on standard error and exits with a non-zero status.

=head1 OPTIONS

=over 4

=item C<-id=NAME>

Identifier of the corpus to be created. Defaults to C<ALIGNED>.

=item C<-full>

Creates the full makefile, including n-grams and terminology
extraction.

=item C<-ngrams>

Creates the makefile including n-grams computation.

=item C<-terminology>

Creates the makefile including terminology extraction.

=item C<-tokenize>

Adds C<-tokenize> to the C<nat-codify> command line.

=item C<-i>

Adds C<-i> to the C<nat-codify> command line.

=item C<-utf8>

Adds C<-utf8> to the C<nat-codify> and C<nat-dumpDicts> command
lines. Implies C<-i>.

=item C<-csize=N>

Adds C<-csize=N> to the C<nat-codify> command line.

=item C<-langs=LANGS>

Adds C<-langs=LANGS> to the C<nat-codify> command line, before the
input files.

=item C<-h>

Prints a short usage message and exits.

=back

=head1 SEE ALSO

NATools documentation, Makefile::Parallel, perl(1)

=head1 AUTHOR

Alberto Manuel Brandão Simões, E<lt>ambs@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006-2012 by Alberto Manuel Brandão Simões

=cut

__DATA__
# -*- makefile -*-

ID=<=ID=>

codify: (20:00:00)
	nat-codify -v <=UTF8=> <=I=> -id=$ID <=CSIZE=> <=TOKENIZE=> <=SOURCE=>

i <- sub{ $nr = `cat <=ID=>/nat.cnf |grep nr-chunks|cut -f 2 -d "="`; printf("%03d\n",$_) for (1..$nr); }

j <- sub{ printf("%d\n", $_) for (2..4) }

initmat$i: codify (20:00:00)
	nat-initmat $ID/source.$i.crp $ID/target.$i.crp $ID/mat.$i.in

ipfp$i: initmat$i (20:00:00)
	nat-ipfp 5 $ID/source.$i.crp $ID/target.$i.crp $ID/mat.$i.in $ID/mat.$i.out
	rm -f $ID/mat.$i.in

postipfp$i: ipfp$i (20:00:00)
	nat-mat2dic $ID/mat.$i.out $ID/dict.$i
	rm -f $ID/mat.$i.out

postbin$i: postipfp$i (20:00:00)
	nat-postbin $ID/dict.$i $ID/source.$i.crp.partials $ID/target.$i.crp.partials $ID/source.lex $ID/target.lex $ID/source-target.$i.bin $ID/target-source.$i.bin
	rm -f $ID/dict.$i

dicA: postbin$i (20:00:00)
	for a in @i; do nat-dict add $ID/source-target.bin $ID/source-target.${a}.bin; done
	for a in @i; do rm -f $ID/source-target.${a}.bin; done

dicB: postbin$i (20:00:00)
	for a in @i; do nat-dict add $ID/target-source.bin $ID/target-source.${a}.bin; done
	for a in @i; do rm -f $ID/target-source.${a}.bin; done

dump: dicA dicB (20:00:00)
	nat-dumpDicts <=UTF8=> -self $ID