#!/usr/bin/perl use POSIX qw:setlocale:; setlocale(LC_CTYPE, "pt_PT.utf8"); setlocale(LC_COLLATE, "pt_PT.utf8"); use locale; use utf8; use warnings; use strict; use Text::RewriteRules; use XML::Writer::Simple partial => 1, tags => [qw/dic head form orth phon gramGrp def entry sense etym cit usg quote bibl ref term/]; my @Genres = (qr/Fig\. alg\./i, qr/(?:Chul|Gír)\.? (?:lisb\.?|(?:ant\. )?d[aeo] \S+(?: lis\S+)?)/i, qr/Pleb\. do Bras\.?/i, qr/t\. onom/i, "p. us","us. p", qw/Gír Fam Pop Des Fig Vulg Ant Chul Euph Ext Irón Infant Deprec Neol Improp Burl Obsol Poét Restrict Prolóq Pleb Mod/); my @Domains = qw/Agr Anat Anthrop Apicult Arith Artilh Archit Arm Astrol Astron Archeol Bacter Biol Eccles Bot Carp Chapel Chím Chron Cir Constr Cutel Cyn Comm Enol Entom Equit Escol Espir Fin Geneal Geod Geogr Geol For Fort Geom Glot Gram Gymn Heráld Hortic Hydrogr Ichthyol Jur Marit Ind Marcen Marn Mathem Med Mil Mecan Miner Mús Náut Myth Ópt Ornit Pathol Pesc Pharm Rhet Philol Philos Phot Phýs Physiol Veloc Pint Polit Sal Serralh Serr Taur Terat Theol Typ Techn Ven Vers Topogr Veter Vit Vin Zool/; my @Geos = (qr/(?:Açor|Port|Bras)\.? d[aoe]s? (?:S(?:ão|\.) Paulo|Rio Grande|\S+(?: d[aeo] \S+)?)(?: do [NS]| do Rio| e S(?:\.|ão) Paulo)?/i, qr/Bras\.? riograndense\.?/i, qr/Lisbôa/i, qw/sanscrit Mad Afr Bras Açor trasm minh alent Angl Gal Port Caboverd Gal Lisb(?:ôa)? sertanejo lat/); my $Domains = "(?:".join("|", @Domains)."|Econ. polít|Hist. Nat".")"; my $Genres = join("|", @Genres); my $Geos = join("|", @Geos); my $PTR = qr/\b(?:alent|alg|Ribatejo|beir|Beira-Baixa|dur|extrem|baía|minh|trasm|minas|N)\b/i; my $n = qr{(?:\^\d+)}; # ^3 my $fon = qr{(?: *\(.*?\))}; #(...) my $letter = shift; $letter =~ s!xml/(.+).xml!$1!; $letter = substr $letter, 0, 1 unless $letter eq "Geo" or $letter eq "Names"; $letter = uc($letter) unless $letter eq "Geo" or $letter eq "Names"; print STDERR "txt2xml: Originais/$letter.txt => xml/$letter.xml "; open IN, "Originais/$letter.txt" or die $!; open OUT, ">xml/$letter.xml" or die $!; binmode IN, ":utf8"; binmode OUT, ":utf8"; print OUT xml_header; print OUT start_dic; print OUT " ",head($letter),"\n"; $/=""; my $i = 0; while(){ $i++; print STDERR "\rtxt2xml: $letter.txt => xml/$letter.xml [$i entries]" unless $i % 100; # chomp; s/[ \t]+/ /g; s/\n\n+/\n/; # tentei colocar estes no sistema de reescrita, mas nao consegui. # devo estar a dormir :-S s/&/&/g; s/>/>/g; s/ xml/$letter.xml [$i entries]\n"; sub add_xmlid { my $string = shift; $string =~ m!orth>(.*)/) { $string =~ s///; } else { $string =~ s/ 'part'},$c) } return "\n".start_entry({%args}).form("\n", orth($a), $phon, "\n") } RULES/m examp («(?:[^«»]++|(?-1))*+»)=e=>$a = $1; cit({type=>'example'},quote($a)) ENDRULES RULES fixXML \*([^\* ]+)\*==>$1 ==> \n==> (.+)==>\n$1 ()\n==>\n$1 ()\n==>\n$1 \n=e=>" " (.*?)[_;](.*?)=e=>$a="$1 $2";gramGrp($a) \s+==> \s+==> \b(T\.\s+us.\s+por\s+clássicos)\s*=i=e=>$a=$1;"\n".usg({type=>'obs'}, $a) \b(us.\s+ainda\s+na\s+.aía)\s*=i=e=>$a=$1;"\n".usg({type=>'geo'}, $a) \b(Us\.\s+na\s+loc\.\s+adv\.)\s*=i=e=>$a=$1;"\n".usg({type=>'obs'}, $a) (.*?)\s*($Genres)\b\.?\s*(.*?)=i=e=>($a,$b) = ("$1 $3",$2);gramGrp($a)."\n".usg({type=>'style'},"$b.") (.*?)\s*(\bT\. (?:afr|eccles|caboverd|santomense|d[oea]s? .*?))\b\.?=i=e=>($a,$b)=($1,$2);gramGrp($a)."\n".usg({type => 'geo'},$b) (.*?)\s*(\bProv\.? (?:d[aeo] )?)($PTR\.?)\s+e\s+($PTR\.?)(.*?)=i=e=>gramGrp("$1 $2 $3 $2 $4 $5") (.*?)\s*(?:\bProv\.?\s+(?:d[aeo]\s+)?)($PTR\.?)(.*?)=i=e=>gramGrp("$1 Prov. $3")."\n".usg({type => 'geo'}, "$2") Prov\. Prov\.=i=>Prov. (.*?)\s*(\bLoc\. (?:escolar\.?|d[aeo] \S+))\s*(.*)=i=e=>$a="$1 $3 Loc."; $b = $2; gramGrp($a)."\n".usg({type => 'geo'},$b) (.*?)\s*((?:\bT\. )?\b$Domains\b\.?\s+de\s+.oimbra)\.?\s*(.*?)=i=e=>($a,$b) = ("$1 $3",$2);gramGrp($a)."\n".usg({type=>'dom'},"$b.")."\n".usg({type=>'geo'},"Coimbra") (.*?)\s*((?:\bT\. )?\b$Domains\b)\.?\s*(.*?)=i=e=>($a,$b) = ("$1 $3",$2);gramGrp($a)."\n".usg({type=>'dom'},"$b.") (.*?)\s*\b($Geos)\b\.?\s*(.*?)=i=e=>($a,$b)=("$1 $3",$2);gramGrp($a)."\n".usg({type => 'geo'},$b) ($PTR\.?)=i=e=>$a=$1;"".usg({type=>'geo'}, $a) Prov\.\s*(.*?)=i=e=>"$1\n".usg({type=>'style'},"Prov.") inútil\.?(.*?)=i=e=>"$1\n".usg({type=>'obs'}, "inútil") dispensável\.?(.*?)=i=e=>"$1\n".usg({type=>'obs'}, "dispensável") \s+==> \s+\.==> \s*,==> \s+==> ([^<]*)\s*\*\s*([^<]*)=e=>$a="$1 $2"; "ast=\"1\"".gramGrp($a) \nast="1"==> \be\s*==> \s*e\s+==> ==> ==> \n\n==>\n \.?\s*([^<].+)\n=e=>$a=$1;bibl($a)."" ([^_]+)_\s*;\s*_([^_]+)=e=>($a,$b)=($1,$2); term($a)."; ".term($b) ENDRULES RULES/m pentrada & \* \} \*([^*\{\n]+)\*\s*,($n?)($fon?)\n=e=> gterm($1,$2,$3,ast=>1, chaveta=>1, name=>1) & \} \*([^*\{\n]+)\*\s*,($n?)($fon?)\n=e=> gterm($1,$2,$3,chaveta=>1, name=>1) & \* \*([^*\{\n]+)\*\s*,($n?)($fon?)\n=e=> gterm($1,$2,$3,ast=>1, name=>1) & \*([^*\{\n]+)\*\s*,($n?)($fon?)\n=e=> gterm($1,$2,$3,name=>1) @ \* \} \*([^*\{\n]+)\*\s*,($n?)($fon?)\n=e=> gterm($1,$2,$3,ast=>1, chaveta=>1, geo=>1) @ \} \*([^*\{\n]+)\*\s*,($n?)($fon?)\n=e=> gterm($1,$2,$3,chaveta=>1, geo=>1) @ \* \*([^*\{\n]+)\*\s*,($n?)($fon?)\n=e=> gterm($1,$2,$3,ast=>1, geo=>1) @ \*([^*\{\n]+)\*\s*,($n?)($fon?)\n=e=> gterm($1,$2,$3,geo=>1) \* \} \*([^*\{\n]+)\*\s*,($n?)($fon?)\n=e=> gterm($1,$2,$3,ast=>1, chaveta=>1) \* \+ \*([^*\{\n]+)\*\s*,($n?)($fon?)\n=e=> gterm($1,$2,$3,ast=>1, mais=>1) \} \*([^*\{\n]+)\*\s*,($n?)($fon?)\n=e=> gterm($1,$2,$3,chaveta=>1) \* \*([^*\{\n]+)\*\s*,($n?)($fon?)\n=e=> gterm($1,$2,$3,ast=>1) \*([^*\{\n]+)\*\s*,($n?)($fon?)\n=e=> gterm($1,$2,$3) ((\* )?_([^_\n]|_[^_\n]{1,5}_)+_( \*)?)\n=e=> $a=$1; end_def.end_sense.start_sense.gramGrp($a)."\n".start_def \*\n=e=> end_def.end_sense.start_sense.gramGrp("*")."\n".start_def (\(De (.*)n\. p\.\))\n=e=> $a=$1; etym({ori=>"nome"},$a)."\n" (\(De .*\))\n=e=> $a=$1; etym($a)."\n" (\((Fr|Gr|Lat|Ingl|It|Hebr|Al)\. (.*)\))\n=e=> $a=$1; $b=$2; etym({orig=>$b},$a)."\n" (\(([Dd]o (\S+)\. (.*?))\))\.?\n=e=> $a=$3;$b=$1; etym({ori=>"$a"},"$b")."\n" (\(T\. .*\))\n=e=> $a=$1; etym($a)."\n" \(V\. _(.*)_\)=e=> $a=$1; ref("(V. ".term($a).")") (.+)==>$1 \n==>\n ENDRULES __END__