#!/usr/bin/perl
# wocr - wrapper around the Cuneiform OCR engine.
#
# Converts the input document to one bitmap per page (pdftoppm for PDF,
# ImageMagick "convert" for everything else), runs cuneiform on every
# bitmap, and concatenates the per-page results into one output file.
# User documentation is in the POD after __END__.

use strict;
use warnings;

use File::Glob qw(bsd_glob);
use Getopt::Long qw(:config no_auto_abbrev bundling);
use Pod::Usage;

my ($help, $man, $debug);
my ($lang, $format, $dpi, $out, $nompsep);
my ($r, $rr, $rrr);
my $b_flag;    # -b is accepted for backward compatibility; nothing reads it

GetOptions(
    "help|h|?"   => \$help,
    "man|m"      => \$man,
    "rrr"        => \$rrr,
    "rr"         => \$rr,
    "r"          => \$r,
    "b"          => \$b_flag,
    "n"          => \$nompsep,
    "output|o=s" => \$out,
    "l=s"        => \$lang,
    "d=s"        => \$dpi,
    "debug"      => \$debug,
    "of|f=s"     => \$format,
) or die "Specify the --help (or -?) option for usage information.\n";

pod2usage(2) if $help;
pod2usage(-exitstatus => 0, -verbose => 2) if $man;

$lang   ||= "por";
$format ||= "smarttext";
$dpi    ||= "300";

# Rotation arguments handed to "convert"; when several rotation flags are
# given the largest one wins (same precedence as successive assignments).
my @rotate;
@rotate = ('-rotate', 90)  if $r;
@rotate = ('-rotate', 180) if $rr;
@rotate = ('-rotate', 270) if $rrr;

print STDERR "Lingua: $lang\n"          if $debug;
print STDERR "Output format: $format\n" if $debug;
print STDERR "dpi: $dpi\n"              if $debug;

my $f = shift or pod2usage(2);

# Temporary page images are written to the current directory under a
# PID-based prefix so that they can be found again with a glob.
my $tmp_prefix = "_" . $$ . ".";
my $aux        = $tmp_prefix . ".bmp";

my $base = $f;    # input name without its extension (when it has one)
my $pat  = $f;    # glob prefix of the page images to OCR

if ($f =~ /(.*)\.([^.]+)$/) {
    $base = $1;
    my $ext = lc $2;
    $pat = $tmp_prefix;

    if ($ext eq 'pdf') {
        # pdftoppm has no rotation option, so -r/--rr/--rrr do not apply.
        print STDERR "*...$f --> bmp\n";
        syst('pdftoppm', '-mono', '-r', $dpi, $f, $aux);
    }
    elsif ($ext eq 'ps') {
        print STDERR "*...$f --> bmp\n";
        syst('convert', @rotate, '-compress', 'none', '-density', $dpi,
            '-monochrome', $f, "BMP3:$aux");
    }
    else {
        # Known raster formats and any other extension are treated alike.
        print STDERR "...$f --> bmp\n";
        syst('convert', @rotate, '-compress', 'none',
            '-monochrome', $f, "BMP3:$aux");
    }
}
# NOTE(review): when $f has no extension nothing is converted and the globs
# below pick up pre-existing "$f*.bmp"/"$f*.pbm" files, which are then
# removed after OCR unless --debug is set. Kept as in the original; confirm
# that deleting those inputs is intended.

# Honour -o/--output; otherwise derive the name from the input file.
$out = "${base}_ocr.txt" unless defined $out && length $out;

print STDERR "...$f --> ocr\n";

# bsd_glob does not split the pattern on whitespace, unlike <...>.
my @pages = (bsd_glob("$pat*.bmp"), bsd_glob("$pat*.pbm"));

my @parts;    # per-page OCR result files, in page order
my $n = 1;
for my $page (@pages) {
    print STDERR "$page\n" if $debug;
    my $part = sprintf("%s.txt.%03d", $f, $n);
    syst('cuneiform', '-l', $lang, '-f', $format, '-o', $part, $page);
    # A failed OCR run may leave no file behind; skip such pages.
    push @parts, $part if -e $part;
    $n++;
    unlink $page unless $debug;
}

open(my $out_fh, '>', $out) or die("cant create $out\n");
for my $part (@parts) {
    open(my $in_fh, '<', $part) or die("cant read $part\n");
    my $text = do { local $/; <$in_fh> };    # slurp the whole page
    close $in_fh;
    $text = '' unless defined $text;

    print {$out_fh} $text;
    print {$out_fh} "\n\cL\n" unless $nompsep;
    unlink $part unless $debug;
}
close($out_fh) or die("cant write $out: $!\n");

# syst(@cmd) - run an external command, echoing it to STDERR under --debug.
#
# With several arguments the command is executed directly, without a shell,
# so file names containing quotes or spaces are passed through untouched.
# A single string argument is still accepted and is handled by system() as
# usual (through the shell when it contains shell metacharacters).
#
# Returns 1 on success; on failure emits a warning and returns 0.
sub syst {
    my @cmd   = @_;
    my $shown = join ' ', @cmd;
    print STDERR "$shown\n" if $debug;
    return 1 if system(@cmd) == 0;
    warn "** ERROR ************ system $shown failed: $!$?\n";
    return 0;
}

__END__

=head1 NAME

wocr - Wrapper for Cuneiform OCR

=head1 SYNOPSIS

 wocr [-options] imagefile
    -l eng     # language = English (def:por)
    -f text    # output format (def:smarttext)
    -d 400     # dpi used in translation PDF or PS to BMP (def: 300)
    -o file    # output file (def: input name without extension + _ocr.txt)
    -m         # manpage of wocr
    -h         # usage
    -n         # no page separator (form feed, Ctrl-L)
    --debug    # dont remove temp. files
    --r        # rotate pages 90
    --rr       # rotate pages 180
    --rrr      # rotate pages 270
    -b         # accepted, currently unused

=head1 DESCRIPTION

Convert files to BMP V3, run Cuneiform OCR on all bmp files, concatenate
outputs.

Known extensions and formats: pdf ps png gif jpg tif tiff pbm ppm

=head2 OPTION

    -l eng     # language = English (def:por)
    -f text    # output format (def:smarttext)
    -d 400     # dpi used in translation PDF or PS to BMP (def: 300)
    -o file    # output file (def: input name without extension + _ocr.txt)
    -m         # manpage
    -h         # usage
    -n         # no page separator (form feed, Ctrl-L)
    --debug    # dont remove temp. files
    --r        # rotate pages 90
    --rr       # rotate pages 180
    --rrr      # rotate pages 270
    -b         # accepted, currently unused

=head2 Dependencies

Image Magick (to convert to BMP)

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl, cuneiform, Image Magick

=cut