#!/usr/bin/perl -s our ($img,$m,$fp,$lp,$obase); ## max lines to be read use utf8::all; use strict; use Data::Dumper; my %f; my %c; my %char; my $name = $ARGV[0] || die("$0 f.pdf or F.xml\n"); my $base = $name =~ s!\.(pdf|xml)$!!r; my $ext=$1; $obase //= "$base.$fp-$lp" if ($fp or $lp); $obase //= $base; if($ext eq "pdf"){ if(not -f "_full_$base.xml"){ system("time pdf2txt.py -M 4 -t xml $base.pdf >_full_$base.xml"); } if(not -f "$obase.xml"){ $fp //= 0; $lp //= 100000000; my $l=""; my $s=""; my $intxt=0; my $skip=0; open(F1,"<","_full_$base.xml") or die("cant open _full_$base.xml\n"); open(F2,">","$obase.xml") or die("cant write $base.xml\n"); while(){ if(m! $lp){ print F2 "\n"; last; } else { $skip =0; } } next if $skip; s/\cX/==/; s/bbox=".*?"//; s!<(line|rect|curve)\b[^]*?/>!! unless $img; if ($intxt and m!(.+)! and $1 eq $l){ print F2 "$2"; } elsif($intxt and m!(.+)!){ $l=$1; print F2 "\n$2"; } elsif($intxt){ $intxt=0; print F2 "\n$_"; } elsif( m!(.+)!){ $intxt=1; $l=$1; print F2 "$2";} else{$intxt=0; print F2 $_; } } close F1; close F2; } } open(F,"<","$obase.xml") or die("cant open '$obase.xml'\n"); my $line=1 ; my $pagec=0; $m //=10000000; while(){ if(/(.+?) $m); } close F; my @F2_DATA=(); # print Dumper \%f; pre_open("proc-$obase") ; open(F1,">","proc-$obase"); pre_open("fontreplacer-$obase"); open(F2,">","fontreplacer-$obase"); my $writeto=0; while(){ if(/__F1__/){$writeto = "F1"; next;} if(/__F2__/){$writeto = "F2"; next;} if($writeto eq "F1"){ print F1 $_; } if($writeto eq "F2"){ print F2 $_; } } print F1 "sub fonttabinit{ \n my \$tab={};\n"; print F1 "##pages = $pagec;\n"; my $i=0; for (sort {$c{$b} <=> $c{$a}} keys %c){ my $code= ('a'..'z')[$i]; my $s=$f{$_}; my $j=1; for my $size (sort {$s->{$b} <=> $s->{$a}} keys %$s){ my $chs=$char{$_}{$size}; my $chars= join("",( sort {$chs->{$b} <=> $chs->{$a} || $a cmp $b } keys %$chs)); my $cj="$code$j"; print F1 "\$tab->{'$_'}{'$size'} = '$cj'; ## $f{$_}{$size}\n"; print F1 "#\t\t\t\t",substr($chars,0,40),"\n"; unshift(@F2_DATA, "#\t". join("",( sort keys %$chs) ) . "\n"); unshift(@F2_DATA, "# $cj => ## {$_}{$size} [$f{$_}{$size}]\n"); unshift(@F2_DATA, "\n"); $j++; } $i++; } print F2 @F2_DATA; print F1 "return \$tab\n}\n"; close F2; close F1; sub pre_open{ my $base=shift; if(-f $base){ my $c=1; while( -f "$base.$c"){ $c++ } rename ($base, "$base.$c"); } } __DATA__ __F1__ #!/usr/bin/perl use XML::DT; use warnings; use strict; my $filename = shift; use utf8::all; my $base=$filename; $base =~ s!.*/!!; $base =~ s!\.xml$!!; my $t=fonttabinit(); genposproc($base); my $font=""; open(LOG,">","debug-$filename"); my $pageno=0; my $figno=0; my $imgno=-1; my %handler=( -type => { textline => "SEQ", textbox => "SEQ", }, -pcdata => sub{ $c=~s/&/&/g; $c=~s//>/g; $c }, 'layout' => sub{"" }, # 1 occurrences; ## 'figure' => sub{logit($q,$c) }, ##### FIXME: removing figures 'figure' => sub{$c}, # 'image' => sub{$imgno++, "{imgno=$imgno,pageno=$pageno,$c}" }, # logit($q,$c) 'line' => sub{logit($q,$c) }, # ## 'line' => sub{"{$c}" }, # 'rect' => sub{logit($q,$c) }, # 'curve' => sub{logit($q,$c) }, # 'page' => sub{$pageno++; $font=""; $c}, # pp($c), # att: id, rotate, bbox 'pages' => sub{ ppp($c)}, # 1 ; # 'text' => sub{ }, # 9476 occ; att: size, font, bbox '//text[@font]' => sub{cmpfnt($t->{$v{font}}{$v{size}})."$c"}, 'text' => sub{"$c" } , 'textbox' => sub{ join("",@$c) }, # att: bbox, id 'textline' => sub{ join("",@$c)}, # att: bbox #'textgroup' => sub{ }, # 4 occurrences; att: bbox ); print pathdt($filename, %handler); sub cmpfnt{my ($tag)=@_; if($font eq $tag) { return "" } else { $font=$tag; return "}<$tag>{"; } } sub ppp{my ($p)=@_; $p =~ tr! !!s; $p =~ s!\n(\s*\n)+!\n!g; #{{{{) $p =~ s!^\s*\}!!; $p =~ s!<(?:a1|n|normal-?\w+)>\{(.*?)\}!$1!gs; #normal fonts: a1 n n-... while($p =~ s!\s*<(remove-?\w*)>\{(.*?)\s*\}\s*!\n!s){logit($1,$2)} # header $p =~ s!(\s+)\}!}$1!g; $p =~ s!\n(\s*\n)+!\n!g; $p =~ s!(\w)-\n(\S+)\s!$1$2\n!g; $p =~ s!\s*$!}\n!; $p } sub __off_pp{my ($p)=@_; #delete fonts : remove... while($p =~ s!\s*<(remove-?\w*)>\{(.*?)\s*\}\s*!\n!s){ logit($1,$2)} ## header "$p "; } sub logit{my($a,$b)=@_; print LOG "Removed $a--{$b}\n"; ## header ""; } sub genposproc{my ($base)=shift; $base //= ""; pre_open("pp$base"); open(F3, ">" , "pp$base") or die(); print F3 q{#!/usr/bin/perl -s undef $/; use utf8::all; my $nl=qr{^}m; # Blocos entre chavetas até nível 4 my $bl0 = qr((?:\\[{}]|[^{}])*); my $bl1 = qr(\{$bl0\}); my $bl2 = qr(\{$bl0(?:$bl1*$bl0)*\}); my $bl3 = qr(\{$bl0(?:$bl2*$bl0)*\}); my $cbb = qr(\{$bl0(?:$bl3*$bl0)*\}); my $blmidle = qr($bl0(?:$cbb*$bl0)*); }; for my $f(keys %$t){ for my $s(keys %{$t->{$f}}){ my $code= $t->{$f}{$s}; print F3 "my \$$code = qr{<$code>\\{(.*?)\\}}s;\n"; } } print F3 q{ ### post proc $cpp, $nl, $tag while(<>){ # s!(?<=^|\n)$b1! !sg; print; } sub tee{my ($file)=@_; open(F ,">", $file); print F $_; close F; } }; close F3; } sub pre_open{ my $base=shift; if(-f $base){ my $c=1; while( -f "$base.$c"){ $c++ } rename ($base, "$base.$c"); } } __F2__ #!/usr/bin/perl use XML::DT; use warnings; use strict; use utf8::all; my $rep = initrep(); while(<>){ print replacer($_,$rep); } sub replacer{ my ($p,$rep)=@_; for my $f (keys %$rep){ $p =~ s!<$f>\{(.*?)\}!replacer2($1,$f,$rep->{$f})!ges; } $p; } sub replacer2{ my($s,$f,$r)=@_; $s =~ s!(&\w+;|.)! $r->{$1} || "($f?$1)" !ge; $s; } sub initrep{ ## load replace information from DATA my $r={}; my $f=""; while(){ chomp; next if /^\s*$/ ; #empty lines next if /^#/ ; #comments if (/^(\w+)\s*=>/){ $f=$1; } elsif(/^(\S+)\s+"(.*)"/){ $r->{$f}{$1}=$2; } elsif(/^(\S+)\s+(\S+)/ ){ $r->{$f}{$1}=$2; } else { die("???$_\n"); } } return $r; } __DATA__ ## fontId × char → string