#!/usr/bin/perl -s

# ensureutf8 - convert text files to UTF-8, keeping each original as FILE.bak.
# User documentation is in the POD at the end of this file.

use strict;
use warnings;

# Populated by perl's -s switch from the command-line options -f / -force.
# NOTE(review): $f is set here but never consulted anywhere below.
our ($f, $force);
$f = $force if $force;

for my $file (@ARGV) {

    # Never reconvert our own backups.  The dot is escaped so that only a
    # real ".bak" extension is skipped (an unescaped dot also matched names
    # such as "foobak").
    next if $file =~ /\.bak$/;

    my ($tipo, $charset) = detect_type($file);
    my ($action, $command) = conversion_command($tipo, $charset, $file);

    next if $action eq 'skip';

    if ($action eq 'unknown') {
        print "socorro: '$tipo:$charset' '$file'\n";
        next;
    }

    # The conversion command reads FILE.bak and redirects into FILE.  If the
    # backup could not be created, running it would truncate the only copy
    # of the data, so give up on this file instead.
    unless (rename $file, "$file.bak") {
        print STDERR "Cannot rename '$file' to '$file.bak': $!\n";
        next;
    }

    JJsystem($command, $file);
}

# detect_type($file)
#   Runs "file -b -i" on $file and returns the list ($tipo, $charset) as
#   produced by parse_type().  The list form of the pipe open is used so the
#   file name is never interpreted by a shell.
sub detect_type {
    my ($file) = @_;

    my $report = '';
    if (open my $pipe, '-|', 'file', '-b', '-i', '--', $file) {
        local $/;                       # slurp the whole report at once
        my $text = <$pipe>;
        $report = $text if defined $text;
        close $pipe;
    }
    else {
        print STDERR "Cannot run file(1) on '$file': $!\n";
    }

    return parse_type($report);
}

# parse_type($report)
#   Splits a "type; charset=name" report into ($tipo, $charset), with the
#   charset lower-cased.  When the report does not have that shape, the
#   chomped report itself is returned as $tipo and the charset is "?".
sub parse_type {
    my ($report) = @_;

    my $tipo = $report;
    chomp $tipo;
    my $charset = '?';

    if ($tipo =~ /(\S+);\s+charset=(\S+)/) {
        ($tipo, $charset) = ($1, lc $2);
    }

    return ($tipo, $charset);
}

# shell_quote($text)
#   Returns $text wrapped in single quotes for /bin/sh, with embedded single
#   quotes escaped, so that arbitrary file names are passed through intact.
sub shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# conversion_command($tipo, $charset, $file)
#   Decides what to do with a file of the given MIME type and charset.
#   Returns one of:
#     ('skip')               nothing to do (UTF-8, ASCII, binary, PostScript)
#     ('convert', $command)  shell command that reads "$file.bak" and writes
#                            the UTF-8 version to $file
#     ('unknown')            unrecognised type/charset combination
#   This function only builds the command; it touches no files.
sub conversion_command {
    my ($tipo, $charset, $file) = @_;

    return ('skip')
        if $charset eq 'utf-8'
        or $charset eq 'binary'
        or $charset eq 'us-ascii'
        or $tipo eq 'application/postscript';

    my $src = shell_quote("$file.bak");
    my $dst = shell_quote($file);

    my $is_xml   = ($tipo eq 'application/xml'  or $tipo eq 'text/xml');
    my $is_latin = ($charset eq 'iso-8859'      or $charset eq 'iso-8859-1');

    # XML is always re-encoded by xmllint, whatever its (non-UTF-8) charset.
    # An earlier variant, left commented out by the author, first ran
    #   iconv -f CP1252 -t utf-8 'FILE.bak' > 'FILE.aux'
    # for latin-1 XML.
    return (convert => "xmllint --encode utf-8 $src > $dst")
        if $is_xml;

    # Latin-1 TeX: recode and switch the inputenc option to utf8.
    # NOTE(review): the shell reports only sed's exit status for this
    # pipeline, so an iconv failure here is not detected.
    return (convert => "iconv -f CP1252 -t utf-8 $src | "
                     . q{sed 's/latin1]{inputenc}/utf8]{inputenc}/g'}
                     . " > $dst")
        if $tipo eq 'text/x-tex' and $is_latin;

    return (convert => "iconv -f CP1252 -t utf-8 $src > $dst")
        if $charset eq 'non-iso' or $charset eq 'unknown-8bit';

    return (convert => "iconv -f $charset -t utf-8 $src > $dst")
        if $charset eq 'utf-16le';

    return (convert => "iconv -f latin1 -t utf-8 $src > $dst")
        if $is_latin;

    return ('unknown');
}

# JJsystem($com [, $file])
#   Runs the shell command $com.  On failure, reports why on STDERR and
#   restores "$file.bak" over $file.  $file defaults to the current $_ so
#   that one-argument callers keep working.
#   Returns 1 on success and 0 on failure.
sub JJsystem {
    my ($com, $file) = @_;
    $file = $_ unless defined $file;

    my $status = system($com);
    return 1 if $status == 0;

    # $! is only meaningful when the command could not be started at all;
    # otherwise the wait status carries the reason.
    my $why = $status == -1 ? "could not run: $!"
            : $status & 127 ? 'killed by signal ' . ($status & 127)
            :                 'exit status ' . ($status >> 8);

    print STDERR "Error in $com: $why;\nReverting...\n";
    rename("$file.bak", $file)
        or print STDERR "Cannot restore '$file' from '$file.bak': $!\n";

    return 0;
}

# Sample type/charset reports kept as notes:
#
#   dominio/Dominio_10_A.txt:     text/plain; charset=utf-8
#   dominio/Dominio_10_A.txt.bak: text/plain; charset=iso-8859-1
#
#   text/plain; charset=us-ascii
#   application/xml; charset=unknown-8bit
#   application/xml; charset=utf-8
#   application/xml; charset=utf-8
#   application/xml; charset=iso-8859-1

=head1 NAME

ensureutf8 - convert text files to unicode utf8

=head1 SYNOPSIS

 ensureutf8 [-f | -force] file*

 -f, -force   meant to continue when an error is detected; the current
              code always continues with the next file, so the option
              has no effect

=head1 DESCRIPTION

Each file is converted to utf8 (original is saved with extension .bak).
Files whose name already ends in .bak are skipped.

This command uses "file" to obtain the original encoding.

 * file -i -b  ( to get type and charset )
 * for TXT-files: ( iconv ... )
 * for XML-files: ( xmllint --encode utf-8 )

If a conversion command fails, the original file is restored from its
.bak copy.

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut