#!/usr/bin/perl
#
# Spelling-modernisation helper for dictionary XML files.
#
# For every XML file named on the command line, the text found directly
# under an element whose context name is 'entrada' is run through the
# `norm` rule set.  Words are first rewritten by the `nn1` rule set (old
# orthography to modern spelling) and then looked up with jspell; the
# outcome of each lookup is appended to the log file "DicNovo".
#
# Log line prefixes written by nm():
#   1: rewritten word accepted as is
#   2: accepted after chi/chí/Chi/Chí became qui/quí/Qui/Quí
#   3: accepted after removing accents
#   4: accepted after ê became é
#   5: accepted after oi became ou
#   7: accepted after dropping a c or p before t or ç
#   8: no accepted form found
#
# NOTE(review): nm() returns the ORIGINAL word (except for one-character
# results) and the string returned by dt() is discarded, so at present the
# only visible product is the DicNovo log.  Confirm whether that is intended.

use POSIX qw(locale_h);

# `use locale` below makes character classes locale dependent, so a failed
# setlocale is worth reporting; it is a warning only, to keep the script
# running exactly as before when the locale is missing.
setlocale(LC_ALL, "pt_PT")
    or warn "Could not set locale pt_PT; word matching may be affected\n";

use locale;
use strict;
use warnings;
use Lingua::Jspell;
use XML::DT;
use Text::RewriteRules;

### still at an early stage of writing...

use utf8;
use Memoize;

# Cache dictionary lookups: the same word is looked up many times.
memoize('fe');

binmode(STDOUT, ":utf8");

my $d = Lingua::Jspell->new("port", "./dicPessoal")
    or die("Cant open dict port");
$d->setmode({ nm => "off" });

# Log of every rewrite decision taken by nm().
my $log_file = "DicNovo";
open(my $log_fh, ">:encoding(UTF-8)", $log_file)
    or die "Cannot open $log_file for writing: $!";

# nn1: old-orthography to modern-orthography rewrite rules.
# One rule per line; no comments or blank lines are placed inside the block
# because the block is consumed by the Text::RewriteRules source filter.
# NOTE(review): in the rule with the pattern ([ea])hi(nm) the group (nm)
# matches the literal two-letter sequence "nm"; a character class [nm] may
# have been intended.  Left unchanged because it alters results - confirm.
RULES nn1
mph==>nf
ph==>f
th==>t
chr==>cr
y==>i
ý==>í
^á$==>à
^Á$==>À
aes$==>ais
\bsc==>c
\bSc==>C
([bcflmnpt])\1==>$1
Ph==>F
Th==>T
Chr==>Cr
([ea])hir$==>$1ir
([ea])hi(nm)==>$1i$2
([ea])hi==>$1í
([eao])h([eao])==>$1$2
quási==>quase
ENDRULES

# norm: walks a text; non-word runs, digits and underscores are copied,
# words accepted by fe() are copied, every other word goes through nm().
RULES/m norm
(\W+|\d+)==>$1
_==>_
(\w+)==>$1!! fe($1)
(\w+)_=e=> nm($1) . "_"
(\w+)=e=> nm($1)
ENDRULES

# fe($word): result of the jspell `fea` lookup for $word (memoized above).
sub fe {
    $d->fea($_[0]);
}

# nm($word): rewrite $word with nn1, then try a fixed sequence of further
# transformations until fe() accepts one, logging the outcome to DicNovo.
# Returns the nn1 result when it is a single character, otherwise returns
# $word unchanged.
sub nm {
    my $w = shift;
    my ($n, $r) = (nn1($w), "");

    return $n if length($n) == 1;

    # Each branch copies $n into $r, applies one transformation, and is
    # taken only when the transformation changed something (or, for unacc,
    # produced a true value) and the result is accepted by fe().
    if (fe($n)) {
        print {$log_fh} "1: $w => $n\n";
    }
    elsif (($r = $n) =~ s/chi/qui/ and fe($r)) {
        print {$log_fh} "2: $w => $r\n";
    }
    elsif (($r = unacc($n)) and fe($r)) {
        print {$log_fh} "3: $w => $r\n";
    }
    elsif (($r = $n) =~ tr/ê/é/ and fe($r)) {
        print {$log_fh} "4: $w => $r\n";
    }
    elsif (($r = $n) =~ s/oi/ou/ and fe($r)) {
        print {$log_fh} "5: $w => $r\n";
    }
    elsif (($r = $n) =~ s/chí/quí/ and fe($r)) {
        print {$log_fh} "2: $w => $r\n";
    }
    elsif (($r = $n) =~ s/[cp]([tç])/$1/ and fe($r)) {
        print {$log_fh} "7:$w => $r\n";
    }
    elsif (($r = $n) =~ s/Chi/Qui/ and fe($r)) {
        print {$log_fh} "2: $w => $r\n";
    }
    elsif (($r = $n) =~ s/Chí/Quí/ and fe($r)) {
        print {$log_fh} "2: $w => $r\n";
    }
    else {
        print {$log_fh} "8: $w =?=> $n\n";
    }

    $w;
}

# unacc($text): copy of $text with the listed accented vowels replaced by
# their unaccented counterparts.
sub unacc {
    my $text = shift;    # not named $a: that is the sort() special variable
    $text =~ tr/áéíóúàèìòùâêîôûÊ/aeiouaeiouaeiouE/;
    $text;
}

# XML::DT handlers.  Only text nodes are processed; $c is the node content
# variable exported by XML::DT.
my %handler = (
#    'dict' => sub{ }, # 1 occurrences;
    -pcdata => sub {
        if (ctxt(1) eq 'entrada' and $c =~ /\w/) {
            norm($c);
        }
        else {
            $c;
        }
    }, # 5088 occurrences;
#    'etim' => sub{ }, # 1945 occurrences; attributes: ori, orig
#    'exemplo' => sub{ }, # 2 occurrences;
#    'morf' => sub{ }, # 5932 occurrences;
#    'termo' => sub{ }, # 5088 occurrences; attributes: num, fon, chaveta, ast
#    'ver' => sub{ }, # 138 occurrences;
);

for my $filename (@ARGV) {
    dt($filename, %handler);
}

# Buffered write errors only surface at close, so check it.
close($log_fh)
    or die "Cannot close $log_file: $!";

__END__