#!/usr/bin/perl -s
#
# Batch-OCR a list of image files with the external `tesseract` command,
# printing a single-line progress indicator (percentage + ETA) on STDERR.
#
# Usage:  script [-lang=por] [-latin1] file.png ...
#
# Switches are parsed by perl's own -s option (see the shebang line):
#   -h         print usage and exit
#   -lang=XXX  language passed to `tesseract -l` (default: por)
#   -latin1    re-encode each resulting .txt file from UTF-8 to Latin-1
#              using the external `iconv` command
#
# For every input file "name.png" / "name.jpg" the text is written by
# tesseract to "name.txt" (tesseract appends the ".txt" itself).

use 5.012;
use warnings;

use Time::HiRes qw[tv_interval gettimeofday];
use Time::Piece;
use Capture::Tiny 'capture';

# Package globals populated by `perl -s` from the command line.
our $h;
our $lang   ||= 'por';
our $latin1 ||= undef;

if ($h) {
    print STDERR "$0 [-lang=por] [-latin1] file.png ...\n";
    exit;
}

# progress($percent, $eta, $message)
#
# Redraw the progress line on STDERR. The leading "\r" returns to the start
# of the line so each call overwrites the previous one; the message is
# left-justified and padded to 50 columns to erase leftovers of a longer
# earlier message.
sub progress {
    my ($percent, $eta, $message) = @_;
    printf STDERR "\r[%s%%|ETA %s] %- 50s", $percent, $eta, $message;
    return;
}

# format_time($seconds)
#
# Render a duration in seconds as "HH:MM:SS". The value is truncated to an
# integer, parsed as an epoch timestamp and printed as a time of day, so
# durations of 24 hours or more presumably wrap around (NOTE(review):
# confirm if very long runs matter).
sub format_time {
    my ($seconds) = @_;
    my $t = Time::Piece->strptime(int($seconds), "%s");
    return $t->hms;
}

# run_tesseract($image, $out_base, $language)
#
# Run `tesseract $image $out_base -l $language`, capturing its output so it
# does not garble the progress line. The list form of system() is used so
# the arguments never pass through a shell: file names containing spaces or
# shell metacharacters are handled safely.
#
# Returns true on success. On failure a warning (including whatever
# tesseract wrote to STDERR) is printed and false is returned; the caller
# keeps going with the next file.
sub run_tesseract {
    my ($image, $out_base, $language) = @_;

    # capture() returns (stdout, stderr, block-return-values...); the block's
    # return value here is system()'s wait status.
    my (undef, $stderr, $status) = capture {
        system('tesseract', $image, $out_base, '-l', $language);
    };

    return 1 if defined $status && $status == 0;

    my $reason
        = !defined $status ? 'unknown status'
        : $status == -1    ? 'could not be started'
        :                    'exit status ' . ($status >> 8);
    $stderr = '' unless defined $stderr;
    # Leading newline: do not write on top of the progress line.
    warn "\ntesseract failed for $image ($reason)\n$stderr";
    return 0;
}

# convert_to_latin1($txt_file)
#
# Re-encode $txt_file in place from UTF-8 to Latin-1 via `iconv -c` (-c
# drops characters that cannot be converted). The converted text goes to a
# temporary file next to the target, which is then renamed over it, so the
# original is only replaced once the conversion output has been written.
#
# iconv is started through a list-form pipe open (no shell involved).
# Its exit status is deliberately not treated as fatal, matching the
# original best-effort behaviour.
#
# Returns true on success, false (after a warning) otherwise.
sub convert_to_latin1 {
    my ($txt_file) = @_;

    unless (-f $txt_file) {
        warn "\nno OCR output to convert: $txt_file\n";
        return 0;
    }

    my $tmp_file = "$txt_file.$$.tmp";

    open my $iconv, '-|', 'iconv', '-c', '-f', 'utf8', '-t', 'latin1', $txt_file
        or do { warn "\ncannot run iconv on $txt_file: $!\n"; return 0 };
    binmode $iconv;

    my $tmp;
    unless (open $tmp, '>:raw', $tmp_file) {
        warn "\ncannot write $tmp_file: $!\n";
        close $iconv;
        return 0;
    }

    {
        local $/ = \65536;    # copy in fixed-size chunks, not by line
        while (defined(my $chunk = <$iconv>)) {
            print {$tmp} $chunk;
        }
    }
    close $iconv;             # exit status intentionally ignored (see above)

    unless (close $tmp) {
        warn "\nerror writing $tmp_file: $!\n";
        unlink $tmp_file;
        return 0;
    }

    unless (rename $tmp_file, $txt_file) {
        warn "\ncannot replace $txt_file: $!\n";
        unlink $tmp_file;
        return 0;
    }
    return 1;
}

my @filenames = @ARGV;
my $t0        = [gettimeofday];
my $eta       = format_time(1_000_000);    # placeholder until first estimate
my $nr_pages  = @filenames;
my $i         = 1;                         # 1-based index of the current file

printf STDERR "Using Tesseract to OCR %d files.\n", $nr_pages;
print STDERR "...";

# Iterate with `for` rather than `while (shift ...)`, so that a file whose
# name is false in boolean context (e.g. "0") does not end the run early.
for my $file (@filenames) {
    my $percent = sprintf "%6.2f", ($i / $nr_pages) * 100;
    progress($percent, $eta,
        "Performing OCR on file " . substr($file, 0, 30) . "...");

    # Output base name: the input name without a trailing .png / .jpg.
    # The dot is escaped so only a real extension is removed.
    (my $out = $file) =~ s/\.(?:png|jpg)\z//;

    my $ocr_ok = run_tesseract($file, $out, $lang);

    if ($latin1 && $ocr_ok) {
        convert_to_latin1("$out.txt");
    }

    # Refresh the ETA on every odd-numbered file: $i files took $elapsed
    # seconds, so the remaining ($nr_pages - $i) files are estimated
    # proportionally.
    if ($i % 2) {
        my $elapsed = tv_interval($t0);
        $eta = format_time(($elapsed * ($nr_pages - $i)) / $i);
    }
    $i++;
}

$eta = format_time(tv_interval($t0));
progress(100, '00:00:00', "done in [$eta]");
printf STDERR "\n";