#!/usr/bin/perl  -s

use strict;
use Data::Dumper;
use utf8::all;

our ($o,$n,$debug, $mincol,$unzip,$notrim,$skip,$q);
$n      //= 2;   ## number of columns
$debug  //= 0 ;
$mincol //= 20;         ## min column lenth
$o      //="_columnunpaste_";
my $trim = not $notrim;
my $sep="====";
$sep ="" if $q;

$/="\cL";

my $p=1;

my $patt;
my @cols;
my $linha;
my @FZ;
if($unzip){ 
  for(1..$n){ open($FZ[$_], ">", "$o$_") or die("cant create _column$_\n");}
}

##for (@ARGV){
##  open(F,$_) or die;
##  while(<F>){
my $isthefirst = 1;
  while(<>){
    chomp;
    $patt="";
    my   @f=();
	my $end;
  
    @cols = gesscols($_,$n,$mincol);
    $patt="." x ($cols[0]-1);
    if( $isthefirst){ $isthefirst = 0;}
	else            { print "\cL"; }
    print "-------------------------------- Page $p:", join(", ",@cols) unless $q;
    print "\n";
	pop(@cols);
    $p++;
    print Dumper(\@cols) if $debug;
    for(1..@cols-1){ 
       $patt .= sprintf('(.{0,%d})',$cols[$_]-$cols[$_-1] ); }

    my  @col=();

    for $linha (split(/\n/, $_)){
      next if $skip and $linha =~ /$skip/;
      @f = ($linha =~ m{^$patt(.*)$}g ) ;
      for(0..@f-1){ 
         $f[$_] =~ s/\h*$//;
         $col[$_] .= "$f[$_]\n"; }
    }

    if($unzip){
       if(@col==$n){ for(1..$n){print {$FZ[$_]} "\cL\n",$col[$_-1],"\n"; }}
       else        { print join("\cL$sep\n",@col ); }
    }
    else { print join("\cL$sep\n",@col ); }
  }
  ##  close F;
  ##}
if($unzip){
   for(1..$n){close $FZ[$_] }
}

sub gesscols{        ## page, col-number, min-col-len (? 20) [global $p]
  my $pag=shift;
  my $cn=shift;
  my $mincol=shift || 20;         ## min column lenth
  my $j=0;                        ## number of lines in the page

  my $i;
  my @p;                          ## number of non space in charCol
  my @b;                          ## number of space in charCol
  $pag =~ s/[ 	\r]*\n/\n/g;
  $pag =~ s/\n *\d+ *\n/\n/g;
  
  my $firstS=1000000 ;            ## = ∞
  my $maxlen=0;
  for (split("\n",$pag)){
    next if $skip and /$skip/;
    $j++;
    $i=0;
    for(split('',$_)){
      $i++;
      $b[$i]++ if /\s/;
	  if ( /\S/ ){
        $p[$i]++ ;
        $firstS = $i if $i < $firstS;
	    $maxlen = $i if $i > $maxlen;
	  }
    }
  }

  my @l=([1,1000000]);  ## possible (column-start , confidance)

  for($i=$mincol; $i < @p ; $i++){
    my  $v= int((4*$p[$i+1] +10*$p[$i] + 10*$b[$i-1] + 4*$b[$i-2] + $b[$i-3])/$j * 100);
    if ( $p[$i] == 0 && $p[$i+1] != 0){ push(@l,[$i+1,$v]); } 
  }

  my @r=();

  print Dumper({P => \@p, L => [sort {$b->[1] <=> $a->[1]} @l]}) if $debug;

  for(sort {$b->[1] <=> $a->[1]} @l){ 
    my $g=1;
    for my $e ( @r ){ if( abs( $e - $_->[0]) < $mincol) {$g =0 } }
    push ( @r,$_->[0]) if $g;
    last if @r == $cn;
  }

  @r = sort {$a <=> $b} @r;
  $r[0]=$firstS  if $trim;
  (@r,$maxlen);
}

__END__

=head1 NAME

columnunpaste - linearise two-column texts

=head1 SYNOPSIS

 columnunpaste [options] file
 Options:
   -n=3        3 columns (def: 2)
   -unzip      creates 2 files: one with col1 the other with col2
   -debug 
   -mincol     minimum column lenth (def: 20)
   -notrim    (by default it removes initial common spaces)
   -skip=regexp   skip all alines that match regexp (for headers, footers)
   -q          quiet (does not print page/column seaparator rules)
   -o=prefixout    (def: _columnunpaste_)

=head1 DESCRIPTION

 for each page (split by control-L)
   guess the column separation
   print each column

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut