#!/usr/bin/perl -w -s use Encode; use strict; use utf8; our ($aprox,$debug,$decode,$htmlent,$fast,$mm); my %ent = # em utf8... qw( aacute á Aacute Á acirc â Acirc  agrave à Agrave À aring å Aring Å atilde ã Atilde à auml ä Auml Ä aelig æ AElig Æ ccedil ç Ccedil Ç eacute é Eacute É ecirc ê Ecirc Ê egrave è Egrave È euml ë Euml Ë iacute í Iacute Í icirc î Icirc Î igrave ì Igrave Ì iuml ï Iuml Ï ntilde ñ Ntilde Ñ oacute ó Oacute Ó ocirc ô Ocirc Ô ograve ò Ograve Ò oslash ø Oslash Ø otilde õ Otilde Õ ouml ö Ouml Ö szlig ß uacute ú Uacute Ú ucirc û Ucirc Û ugrave ù Ugrave Ù uuml ü Uuml Ü yacute ý Yacute Ý yuml ÿ deg ° ordm º ordf ª copy © quot ' euro € scaron š laquo « raquo » amp & lt < gt > nbsp _SPACE_ ); my $c=0; binmode(STDOUT,":utf8"); my $f; undef $/ if $fast; #$final .= decode("utf-8", $f, Encode::FB_QUIET); while($f=){ binmode(ARGV,":bytes") if $.==1 ; my $final=''; my $aux; while($f){ $final .= decode("utf-8", $f, Encode::FB_QUIET); $final .= pack("U",ord(substr($f,0,1))) if substr($f,0,1); substr($f,0,1)=''; $aux = decode("utf-8", $f, Encode::FB_QUIET); $final .= $aux; } if($htmlent){ $final =~ s#\&(\w+);#$ent{$1} || $& #ge; $final =~ s#\&\#(\d+);# pack("U",$1) #ge; $final =~ s#\&\#x([\dA-Fa-f]+);# pack("U",hex($1)) #ge; } if($decode){ $final =~ s/\\unicode\{(\d+)\}/chr($1)/ge; } print rep($final); } # print STDERR "----- fixed: $c double utf8 chars($file)\n" if $c;; sub rep{ my $a = shift; $a =~ s/ÃÂ/Ã/g if $mm; $a =~ s/([ÃÂÅÄÈÎÏÐÑ])([\x{80}-\x{BF}])/n($1,$2)/ge; $a =~ s/â€/â\x{80}/g if $mm; $a =~ s/([Ã])([‰“š˜‘‡•ƒ‚—])/n4($2)/ge; $a =~ s/([â])([\x{80}-\x{BF}]{2})/n2($1,$2)/ge; $a =~ s/ï¬\x{81}/fi/g if $mm; $a =~ s/fl/fl/g if $mm; $a =~ s/ç/ç/g if $mm; $a =~ s/ã/ã/g if $mm; $a =~ s/õ/õ/g if $mm; $a =~ s/aÌ€/à/g if $mm; $a =~ s/aÌ\x{81}/á/g if $mm; $a =~ s/eÌ\x{81}/é/g if $mm; $a =~ s/oÌ\x{81}/ó/g if $mm; $a =~ s/iÌ\x{81}/í/g if $mm; $a =~ s/uÌ\x{81}/ú/g if $mm; $a =~ s/aÌ‚/â/g if $mm; $a =~ s/eÌ‚/ê/g if $mm; $a =~ s/â\x{80}(.)/n3($1)/ge; $a; } sub n2{ my ($a,$b) = @_; if ($a eq "Â"){return $b } # return chr((ord(substr($b,0,1))-128)*2**6+ord(substr($b,1,1))-128+ # (ord($a)-0b11100000)*2**12) } elsif($a eq "â"){ return chr((ord(substr($b,0,1))-128)*2**6+ord(substr($b,1,1))-128+8192) } elsif("$a$b" eq "ï¬\x81"){ return "fi" } elsif("$a$b" eq "fl" ){ return "fl" } else {return("???")} } sub n{ my ($a,$b) = @_; if ($a eq "Â"){return $b } elsif($a eq "Ã"){return chr(ord(substr($b,0,1))+64) } elsif($a eq "Å"){return chr(ord(substr($b,0,1))+192) } elsif($a eq "Ä"){return chr(ord(substr($b,0,1))+128) } elsif($a eq "È"){return chr(ord(substr($b,0,1))+384) } elsif($a eq "Î"){return chr(ord(substr($b,0,1))+768) } elsif($a eq "Ï"){return chr(ord(substr($b,0,1))+832) } elsif($a eq "Ð"){return chr(ord(substr($b,0,1))+896) } elsif($a eq "Ñ"){return chr(ord(substr($b,0,1))+960) } else {return("???")} } sub n3{ my ($a) = @_; if($a eq "™"){return "’" } elsif($a eq "œ"){return "’"} elsif($a eq "˜"){return "“"} # elsif($a eq "\x{9D}"){return "”" } # elsif($a eq "\x{9C}"){return "‘" } elsif($a eq "”"){return "–"} elsif($a eq "“"){return "–"} # elsif($a eq "¦"){return "…"} elsif($a =~ /[\x{91}-\x{9E}¦]/) { return chr(ord(substr($a,0,1))-128+8192) } else {return("â€$a")} } sub n4{ my ($a) = @_; # $a =~ y/˜‘/??/; $a =~ y/‰“šƒ‡•‚—/ÉÓÚÃÇÕÂ/; $a } __END__ =head1 NAME utf8mm - a restorer for doubled utf8 files with mixed latin1 elements... =head1 SYNOPSIS utf8mm filename > f =head1 DESCRIPTION =head2 Options -debug -- generates a \debug{code} before each latin1 character -decode -- replaces \unicode{3232} with the utf8 char -htmlent -- replaces p ٦ á with utf8 chars & < > are not changed. =head1 AUTHOR J.Joao Almeida, jj@di.uminho.pt =head1 SEE ALSO perl(1). unicode(7). latin1(7). ascii(7) perlunicode perlopentut =cut