#!/usr/bin/perl -w -s use Data::Dumper; use strict; use locale; use Lingua::PT::PLN; use Lingua::PT::PLN::tokenizer; my $tab = $Lingua::PT::PLN::tokenizer::tab; use jspell; our ($nopn); jspell_dict("port"); my %savit_p; my $savit_n; my %reap = (); my %subs = (); my $i=0; my $r; my $wp='\w+(?:-\w+)*'; #pattern for word my %validas; if($nopn){open(F,">output1") or die; while(<>){print F $_} close F; } else {tagPN("output1"); } open( I, "output1") or die; open( X, "|jspell -d port -W 0 -J -g -a -w \"'\"> output2") or die; while() { for my $rule (@$tab) { s{(^|\s|\b|["'«»])($rule->[0])}{ $1 . sav("$2","lex($2,$rule->[1])") }ge } print X tokenize($_); } close I; close X; open(X,"output2") or die; while(){ s{\&\s+(.*?) 0 :}{procDesc($1)}ge; s{\*\s+(.*?) 0 :}{$1 }g; s{#n(\d+)}{$subs{$1} || "???$1"}eg; print; } close X; sub procDesc{ my $p = shift ; my @as = feaWithNewFlags($p); if(@as) { "$p\t".join(",", map {any2lex($_,$_->{rad})} @as); } else { my @an = analisa::analisa($p); if (@an) { "$p\t".join(",", map {any2lex($_,$_->{rad})} @an); } else { "$p\tNOK"; } } } sub tokenize{ my $abrev = join '|', qw( srt?a? dra? [A-Z] etc exa? jr profs? arq av estr? et al vol eng tv lgo pr Oliv ig mrs? min rep ); my $protect = ' \#n\d+ | \w+\'\w+ | \w+-\w+ | [\w_.-]+ \@ [\w_.-]+ # emails | \w+\.[ºª] # ordinals | <[^>]*> # marcup XML SGML | \d+(?:\.\d+)+ # numbers | \d+\:\d+ # the time | ((https?|ftp|gopher)://|www)[\w_./~-]+ # urls '; my $conf = { rs => "\n" }; my $text = shift; if (ref($text) eq "HASH") { $conf = { %$conf, %$text}; $text = shift; } my $result = ""; local $/ = ">"; my %tag=(); my ($a,$b); for ($text) { if(/<(\w+)(.*?)>/) { ($a, $b) = ($1,$2); if ($b =~ /=/ ) { $tag{'v'}{$a}++ } else { $tag{'s'}{$a}++ } } s/<\?xml.*?\?>//s; s/(.)\n-\n/$1-/g; s/($protect)/savit($1)/xge; s/([\»\]])/$1 /g; s#([\«\[])# $1#g; s#\"# \" #g; s/(\s*\b\s*|\s+)/\n/g; s/\n+/\n/g; s/\n(\.?[ºª])\b/$1/g; while ( s#\b([0-9]+)\n([\,.])\n([0-9]+\n)#$1$2$3#g ){}; s#\n($abrev)\n\.\n#\n$1\.\n#ig; s/\n*{rs}/g; $result; } sub savit{ my $a=shift; $savit_p{++$savit_n}=$a ; " __MARCA__$savit_n " } sub loadit{ my $a = shift; $a =~ s/ ?__MARCA__(\d+) ?/$savit_p{$1}/g; $savit_n = 0; $a; } sub tagPN{ my $out = shift || "output1"; forPN( {t => "double", out => $out } , sub {my ($p, $contex)=@_; if($p =~ m/($wp)\b(\s*)(.*)/) { my ($a,$b,$c)=($1,$2,$3); if (vazia($a)){ if($c) { "$a$b". savPN($c, "lex($c, [CAT=np, unknown=guessed])") } else { $p } } else { savPN($p,"lex($p, [CAT=np,unknown=guessed])") } } else { $p } }, sub {my ($p, $contex)=@_; if($p =~ m/($wp)\b(\s*)(.*)/) { my ($a,$b,$c)=($1,$2,$3); if (!possibleNP($a)){ if($c) { "$a$b".savPN( $c ,"lex($c, [CAT=np,unknown=guessed])") } else { $p } } else { savPN($p,"lex($p, [CAT=np,unknown=guessed])") } } else { $p } } ); } sub savPN{ my ($k,$inf)=@_; my $r = "UNDEF"; my @a ; if($k =~ /^($wp)/) { my $x = $1; @a = fea($x); my $fs = +{ onethat({CAT=>"np"},@a) }; if(defined $fs->{CAT}) { delete $fs->{rad}; $fs->{unknown}="guessed" if length($k) != length($x); $r = any2lex( $fs, $k); } else{ $fs = +{ onethat({CAT=>"nc"},@a) }; if(defined $fs->{CAT}) { $fs->{unknown}="guessed"; delete $fs->{rad}; $fs = { CAT=> "np", (defined $fs->{G}) ? (G => $fs->{G}) : (), (defined $fs->{N}) ? (N => $fs->{N}) : ()}; $r=any2lex( $fs, $k) ; } else { $r = any2lex($inf,$k) } } } else { $r=any2lex($inf,$k)} sav( $k,$r) } sub any2lex{ my $x = shift; my $o = shift||" "; my $aux= any2str($x,"compact"); $aux =~ s/\{(.*?)\}/lex($o,[$1])/g; $aux } sub sav { my ($k,$inf)=@_; if (exists($reap{$k})) { "#n$reap{$k}" } else { $subs{$i}="$k\t$inf"; $reap{$k}=$i; $r="#n$i"; $i++; $r } } sub possibleNP{ my $w = shift; !(rad($w)) or (ok({CAT=>"np"},fea($w))) } sub vazia{ my $w = shift; my $CATs= join(" ",map {$_->{CAT}||""} fea($w)); return ( $CATs =~ m!\b(adv|p|art|con|cp)!)?1:0 } sub feaWithNewFlags{ my $w = shift; setmode("+flags"); setmode("+af"); setmode("+nm"); my @fs=fea($w); # print Dumper(\@fs); my @r = map { validaCatFlag($_)? ($_): () } @fs; } sub validaCatFlag{ my $f=shift; my $flag=$f->{flags}; my @fs=fea($f->{rad}); # print Dumper(\@fs); my @comp = map { (defined($_->{CAT}) && $validas{$flag}{$_->{CAT}}) ? (1) : () } @fs ; scalar(@comp) } BEGIN{ %validas=( 'A' => { adj => 1 , a_nc => 1,}, 'd' => { adj => 1 , a_nc => 1,}, 'f' => { adj => 1 , a_nc => 1,}, 'j' => { adj => 1 , a_nc => 1,}, 'm' => { adj => 1 , a_nc => 1,}, 's' => { adj => 1 , a_nc => 1,}, 'U' => { adj => 1 , a_nc => 1,}, 'F' => { adj => 1 , a_nc => 1,}, 'I' => { adj => 1 , a_nc => 1,}, 'T' => { adj => 1 , a_nc => 1,}, 'U' => { adj => 1 , a_nc => 1,}, 'a' => { nc =>1, adj => 1 , a_nc => 1,}, 'h' => { nc =>1, adj => 1 , a_nc => 1,}, 'p' => { nc =>1, adj => 1 , a_nc => 1,}, 'i' => { nc =>1, a_nc => 1,}, 't' => { nc =>1, a_nc => 1,}, 'u' => { nc =>1, a_nc => 1,}, 'w' => { nc =>1, a_nc => 1,}, 'y' => { nc =>1, a_nc => 1,}, 'C' => { v => 1 }, 'c' => { v => 1 }, 'D' => { v => 1 }, 'M' => { v => 1 }, 'n' => { v => 1 }, 'o' => { v => 1 }, 'v' => { v => 1 }, 'L' => { v => 1 }, 'P' => { v => 1 }, 'G' => { nc => 1, adj => 1 , a_nc => 1,}, ); } package analisa; use jspell; sub brasil { my ($brasil, $portuguesa) = @_; my @feas = fea($portuguesa); for (@feas) { $_->{'unknown'} = 'Bras'; } return @feas; } sub analisa { my $word = shift; setmode("-nm"); if (check_word($word)) { return fea($word); } # deve ser portuguesa; elsif ($word=~m!-!) { my @feas = map { my @fs = fea($_); [@fs] } split /-/, $word; my $count = 0; for (@feas) { my @xs = @$_; ++$count if (@xs); } if (scalar(@feas)==$count) { return ({CAT=>'nc', unknown=>'wordlist'}) } else { return () }} else { # uma palavra e' brasileira se... my $original = $word; # 1. antes do ç colocar um c if ($word =~ s/ç/cç/ and check_word($word)) { return brasil($original, $word); } $word = $original; # 2. colocar um c antes de um t if($word =~ s/([aáée])t/$1ct/ and check_word($word)) { return brasil($original, $word); } $word = $original; # 3. substituir um ü num u if($word =~ s/ü/u/ and check_word($word)) { return brasil($original, $word); } $word = $original; # 4. tirar o Antônio para António if($word =~ s/ô([nm])/ó$1/ and check_word($word)) { return brasil($original, $word); } $word = $original; # 5. comitê para comité if ($word =~ s/ê/é/ and check_word($word)) { return brasil($original, $word); } return (); } } sub check_word { my $word = shift; setmode("-nm"); my @list = rad($word); return (@list)?1:0; } 1;