#!/usr/bin/perl -w -s use strict; use POSIX qw(locale_h); use utf8; setlocale(LC_CTYPE, "pt_PT"); use locale; use Data::Dumper; use NAT::Matrix; use NAT::Client; our($juststep2, $local, $crp, $rules, $offset, $length,$out); my $NATOOLS= $ENV{NATOOLS} || "$ENV{HOME}/svn/main/NATools/"; $out ||= "__extracted__"; my ($f1, $f2); unless($juststep2){ ($f1,$f2) = @ARGV; die ("usage: $0 [options] f1 f2\n option: -local=... -crp=... -rules=...\n") unless -f $f2; step1({ f1=>$f1, f2=>$f2, ($out ? ( output=>$out):()), ($length ? ( length=>$length):()), ($offset ? ( offset=>$offset):()), ($rules ? ( rules=>$rules):()), ($crp ? ( crp=>$crp):()) , ($local ? ( local=>$local ):()) , }); } step2(); sub step1 { my %opt =(length => 10_000_000, rules => "$NATOOLS/rules.test-case", output => "__extracted__", ); if(ref($_[0]) eq "HASH") {%opt = (%opt , %{shift(@_)}) } ; my $offset = $opt{offset} || ""; die("Cant file Rules ($opt{rules})\n") unless -f $opt{rules}; my $clength = 0; local $/ = "\n\$\n"; open F1, "$f1"; open F2, "$f2"; my ($s1,$s2); print STDERR "Parsing rules file\n"; my $rules = NAT::PatternRules->parseFile($opt{rules}) if $opt{rules}; my $server; if ($opt{local}) { print STDERR "Loading corpus from '$opt{local}'\n"; $server = NAT::Client->new(local => $opt{local}); } elsif ($opt{crp}) { $server = NAT::Client->new; $server->set_corpus($opt{crp}); } else { die("I need a corpus: please provide option '-local=... or -crp=...\n") } if(!$offset && -f "__number"){ $offset = `cat __number` + 0 } if($offset){ open( EXT,">>$opt{output}");} else { open( EXT,">$opt{output}");} print STDERR "Recovering: Skipping to $offset...\n" if $offset; print STDERR "length= $opt{length}\n"; while (defined($s1 = ) && defined($s2 = ) && $clength < $opt{length}){ $clength++; if ($offset) { $offset--; next } `echo $clength > __number_`; print STDERR "$clength\n" unless ($clength % 10); chomp $s1; chomp $s2; my @x = split /\s+/, $s1; my @y = split /\s+/, $s2; next unless $s1 =~ /[a-zA-Z]{2}/; #JJ next if @x > 100; next if @y > 100; next if $s1 eq $s2; #JJ next if @x > 6 && @y > 2 * @x ; #JJ next if @y > 6 && @x > 2 * @y ; #JJ next if @y > 4 * @x ; #JJ next if @x > 4 * @y ; #JJ my $matrix = NAT::Matrix->new($server,$rules,$s1,$s2); $matrix->findDiagonal; # print Dumper($matrix); for my $b (@{$matrix->{patterns}}) { my $x = $matrix->dump_block($b); print EXT $x->[0], " =!",$b->{id},"!= ", $x->[1], "\n"; } my $blocks = $matrix->grep_blocks; my $bs = $matrix->combine_blocks($blocks, 1); for my $b (@$bs) { my $x = $matrix->dump_block($b); print EXT $x->[0], " === ", $x->[1], "\n"; } print EXT "\n"; `cp __number_ __number`; } close EXT; } sub step2{ open (EXT, "grep '=!' $out |LANG=C sort |LANG=C uniq -c | sort -nr |"); open (TERM, ">_ext1"); local $/ = "\n"; while(){ next if m{ [)(%.?"',] | \d\s.*\d | \b( or | também | and | e | não | not | the | que | of |um | as | como | with )\b }x; print TERM $_; } close EXT; close TERM; } unlink ("__number_", "__number"); __END__ =head1 NAME term_extract - extract terminology and examples from paralell corpora =head1 SYNOPSIS term_extract [options] file1 file2 options -local=corpusDir -crp=corpusName (def -local ) -rules=file (def $NATOOLS/... ) -offset=number (def 0 or last number ) -length=number (10 000 000 ) -out=file (output file; default = __extrated__ ) -juststep2 (just calculates _ext1 from __extracted__) =head1 DESCRIPTION term_extract creates the following files: __extrated__ (or -out=file) -- examples _ext1 -- bilingual terminology =head1 AUTHOR Alberto Simoes J.Joao Almeida, jj@di.uminho.pt =head1 SEE ALSO perl(1). =cut