#!/usr/bin/perl -w -s use Encode; use strict; use utf8; our ($aprox,$debug,$tolatin1,$mm,$mkmiracles,$decode,$htmlent); $mkmiracles=1 if $mm; if($tolatin1){ binmode(STDOUT,":bytes"); } else { binmode(STDOUT,":utf8"); } my $f; undef $/; my $file = shift; if ($file){ open(F,"<:bytes",$file) or die; $f= ; close F;} else { binmode(STDIN,":bytes"); $f= ; } my $final=''; my $aux; $final .= decode("utf-8", $f, Encode::FB_QUIET); while($f){ # print STDERR sprintf('\unicode{%d}="%s"',ord(substr($f,0,1)),pack("C0U",ord(substr($f,0,1)))),"\n"; if($debug){ $final .= sprintf('\debug{%d}%s',ord(substr($f,0,1)),pack("U",ord(substr($f,0,1)))); } else { $final .= pack("U",ord(substr($f,0,1))); } substr($f,0,1)=''; $aux = decode("utf-8", $f, Encode::FB_QUIET); $final .= $aux; } if($htmlent){ my %ent = # em utf8... qw( aacute á Aacute Á acirc â Acirc  agrave à Agrave À aring å Aring Å atilde ã Atilde à auml ä Auml Ä aelig æ AElig Æ ccedil ç Ccedil Ç eacute é Eacute É ecirc ê Ecirc Ê egrave è Egrave È euml ë Euml Ë iacute í Iacute Í icirc î Icirc Î igrave ì Igrave Ì iuml ï Iuml Ï ntilde ñ Ntilde Ñ oacute ó Oacute Ó ocirc ô Ocirc Ô ograve ò Ograve Ò oslash ø Oslash Ø otilde õ Otilde Õ ouml ö Ouml Ö szlig ß uacute ú Uacute Ú ucirc û Ucirc Û ugrave ù Ugrave Ù uuml ü Uuml Ü yacute ý Yacute Ý yuml ÿ deg ° ordm º ordf ª copy © quot ' euro € scaron š laquo « raquo » amp & lt < gt > nbsp _SPACE_ ); $final =~ s#\&(\w+);#$ent{$1} || $& #ge; $final =~ s#\&\#(\d+);# pack("U",$1) #ge; $final =~ s#\&\#x([\dA-Fa-f]+);# pack("U",hex($1)) #ge; } if($mkmiracles){ $final =~ s/à/à/g; $final =~ s/è/è/g; $final =~ s/ì/ì/g; $final =~ s/ò/ò/g; $final =~ s/ù/ù/g; $final =~ s/á/á/g; $final =~ s/é/é/g; $final =~ s/í/í/g; $final =~ s/ó/ó/g; $final =~ s/ú/ú/g; $final =~ s/ã/ã/g; $final =~ s/õ/õ/g; $final =~ s/â/â/g; $final =~ s/ê/ê/g; $final =~ s/î/î/g; $final =~ s/ô/ô/g; $final =~ s/û/û/g; $final =~ s/ç/ç/g; $final =~ s/ñ/ñ/g; $final =~ s/ü/ü/g; $final =~ s/ö/ö/g; $final =~ s/ë/ë/g; $final =~ s/ï/ï/g; $final =~ s/ä/ä/g; $final =~ s/æ/æ/g; $final =~ s/Ø/Ø/g; $final =~ s/ø/ø/g; $final =~ s/Č/Č/g; $final =~ s/ě/ě/g; $final =~ s/ý/ý/g; $final =~ s/ż/ż/g; $final =~ s/ź/ź/g; $final =~ s/ę/ę/g; $final =~ s/ś/ś/g; $final =~ s/ą/ą/g; $final =~ s/ń/ń/g; $final =~ s/ć/ć/g; $final =~ s/Ł/Ł/g; $final =~ s/ř/ř/g; # $final =~ s/Å£/ţ/g; $final =~ s/ł/ł/g; $final =~ s/Ã¥/å/g; $final =~ s/ß/ß/g; $final =~ s/ă/ă/g; $final =~ s/ş/ş/g; $final =~ s/À/À/g; $final =~ s/È/È/g; $final =~ s/Ì/Ì/g; $final =~ s/Ò/Ò/g; $final =~ s/Ù/Ù/g; $final =~ s/Á/Á/g; $final =~ s/É/É/g; $final =~ s/Í/Í/g; $final =~ s/Ó/Ó/g; $final =~ s/Ú/Ú/g; $final =~ s/Õ/Õ/g; $final =~ s/Ê/Ê/g; $final =~ s/Î/Î/g; $final =~ s/Ô/Ô/g; $final =~ s/Û/Û/g; $final =~ s/Ç/Ç/g; $final =~ s/Ñ/Ñ/g; $final =~ s/Ü/Ü/g; $final =~ s/Ö/Ö/g; $final =~ s/Ë/Ë/g; $final =~ s/Ï/Ï/g; $final =~ s/Ä/Ä/g; $final =~ s/°/°/g; $final =~ s/§/§/g; $final =~ s/»/»/g; $final =~ s/«/«/g; $final =~ s/º/º/g; $final =~ s/ª/ª/g; $final =~ s/©/©/g; $final =~ s/¿/¿/g; $final =~ s/Ø/Ø/g; $final =~ s/À/À/g; $final =~ s/É/É/g; $final =~ s/£/£/g; $final =~ s/Ó/Ó/g; $final =~ s/Ñ/Ñ/g; $final =~ s/Ú/Ú/g; $final =~ s/Ü/Ü/g; $final =~ s/ / /g; $final =~ s/­/­/g; $final =~ s/·/·/g; $final =~ s/µ/µ/g; $final =~ s/±/±/g; $final =~ s/³/³/g; $final =~ s/¡/¡/g; $final =~ s/ß/ß/g; $final =~ s/ÃŽ/Î/g; $final =~ s/´/´/g; $final =~ s/¶/¶/g; $final =~ s/ÿ/ÿ/g; # 3 ¨ 3 ¨ # 3 ¬ 3 ¬ # 9 ¸ 9 ¸ # 4 Â' 4 ' # 2 ¤ 2 ¤ # 7 ¢ 7 ¢ # 3 Â¥ 3 ¥ # 8 ¼ 8 ¼ # 1 ½ 1 ½ $final =~ s/Ã/Ã/g; $final =~ s/Â/Â/g; $final =~ s/â¬/€/g; ## suspeito... $final =~ s/–/–/g; $final =~ s/—/—/g; $final =~ s/“/"/g; $final =~ s/”/"/g; $final =~ s/’/'/g; ## tetum... $final =~ s/\x{2019}/'/g; ## tetum... 2019 /’/ 8217 = (e28099) $final =~ s/\x{2e}\x{80}\x{99}/'/g; ## tetum... 2019 $final =~ s/…/"/g; } if($decode){ $final =~ s/\\unicode\{(\d+)\}/chr($1)/ge; } if($tolatin1){ $final =~ s/([\x{100}-\x{ffff}])/sprintf('\unicode{%d}',ord($1))/ge; if($aprox){ my %utf2lat = ( 8230 => "..." , # … 8364 => " Euros " , # € 8217 => "'" , # ’ 226? 8216 => "'" , # ‘ 8220 => "\"" , # “ 8221 => "\"" , # ” 8208 => "-" , # ‐ 8209 => "-" , # ‑ 8211 => "--" , # – 8212 => "---" , # — ); $final =~ s/(\\unicode\{(\d+)\})/(defined $utf2lat{$2})? $utf2lat{$2}:$1/ge; } } print $final; __END__ =head1 NAME utf8restore - a restorer for utf8 files with mixed latin1 elements... =head1 SYNOPSIS utf8restore filename > f utf8restore -tolatin1 filename > f utf8restore -mkmiracles filename > f utf8restore -debug filename > f =head1 DESCRIPTION =head2 Options -debug -- generates a \debug{code} before each latin1 character -tolatin1 -- generates Latin1 output and \uniode{code} for each char outside latin1 range -aprox -- makes aproximate char in latin1 output (not injective) -mkmiracles -- tries do translate double utf8 in utf8 -decode -- replaces \unicode{3232} with the utf8 char -htmlent -- replaces p ٦ á with utf8 chars & < > are not changed. =head1 AUTHOR J.Joao Almeida, jj@di.uminho.pt =head1 SEE ALSO perl(1). unicode(7). latin1(7). ascii(7) perlunicode perlopentut =cut