#!/usr/bin/perl -s use strict; use Data::Dumper; use utf8::all; our ($o,$n,$debug, $mincol,$unzip,$notrim,$skip,$q); $n //= 2; ## number of columns $debug //= 0 ; $mincol //= 20; ## min column lenth $o //="_columnunpaste_"; my $trim = not $notrim; my $sep="===="; $sep ="" if $q; $/="\cL"; my $p=1; my $patt; my @cols; my $linha; my @FZ; if($unzip){ for(1..$n){ open($FZ[$_], ">", "$o$_") or die("cant create _column$_\n");} } ##for (@ARGV){ ## open(F,$_) or die; ## while(){ my $isthefirst = 1; while(<>){ chomp; $patt=""; my @f=(); my $end; @cols = gesscols($_,$n,$mincol); $patt="." x ($cols[0]-1); if( $isthefirst){ $isthefirst = 0;} else { print "\cL"; } print "-------------------------------- Page $p:", join(", ",@cols) unless $q; print "\n"; pop(@cols); $p++; print Dumper(\@cols) if $debug; for(1..@cols-1){ $patt .= sprintf('(.{0,%d})',$cols[$_]-$cols[$_-1] ); } my @col=(); for $linha (split(/\n/, $_)){ next if $skip and $linha =~ /$skip/; @f = ($linha =~ m{^$patt(.*)$}g ) ; for(0..@f-1){ $f[$_] =~ s/\h*$//; $col[$_] .= "$f[$_]\n"; } } if($unzip){ if(@col==$n){ for(1..$n){print {$FZ[$_]} "\cL\n",$col[$_-1],"\n"; }} else { print join("\cL$sep\n",@col ); } } else { print join("\cL$sep\n",@col ); } } ## close F; ##} if($unzip){ for(1..$n){close $FZ[$_] } } sub gesscols{ ## page, col-number, min-col-len (? 20) [global $p] my $pag=shift; my $cn=shift; my $mincol=shift || 20; ## min column lenth my $j=0; ## number of lines in the page my $i; my @p; ## number of non space in charCol my @b; ## number of space in charCol $pag =~ s/[ \r]*\n/\n/g; $pag =~ s/\n *\d+ *\n/\n/g; my $firstS=1000000 ; ## = ∞ my $maxlen=0; for (split("\n",$pag)){ next if $skip and /$skip/; $j++; $i=0; for(split('',$_)){ $i++; $b[$i]++ if /\s/; if ( /\S/ ){ $p[$i]++ ; $firstS = $i if $i < $firstS; $maxlen = $i if $i > $maxlen; } } } my @l=([1,1000000]); ## possible (column-start , confidance) for($i=$mincol; $i < @p ; $i++){ my $v= int((4*$p[$i+1] +10*$p[$i] + 10*$b[$i-1] + 4*$b[$i-2] + $b[$i-3])/$j * 100); if ( $p[$i] == 0 && $p[$i+1] != 0){ push(@l,[$i+1,$v]); } } my @r=(); print Dumper({P => \@p, L => [sort {$b->[1] <=> $a->[1]} @l]}) if $debug; for(sort {$b->[1] <=> $a->[1]} @l){ my $g=1; for my $e ( @r ){ if( abs( $e - $_->[0]) < $mincol) {$g =0 } } push ( @r,$_->[0]) if $g; last if @r == $cn; } @r = sort {$a <=> $b} @r; $r[0]=$firstS if $trim; (@r,$maxlen); } __END__ =head1 NAME columnunpaste - linearise two-column texts =head1 SYNOPSIS columnunpaste [options] file Options: -n=3 3 columns (def: 2) -unzip creates 2 files: one with col1 the other with col2 -debug -mincol minimum column lenth (def: 20) -notrim (by default it removes initial common spaces) -skip=regexp skip all alines that match regexp (for headers, footers) -q quiet (does not print page/column seaparator rules) -o=prefixout (def: _columnunpaste_) =head1 DESCRIPTION for each page (split by control-L) guess the column separation print each column =head1 AUTHOR J.Joao Almeida, jj@di.uminho.pt =head1 SEE ALSO perl(1). =cut