#!/usr/bin/perl -s

# needs:
# 	abiword
# 	antiword
# 	elinks
# 	lynx
# 	w3m
# 	calibre
# 	gnumeric
#	xmllint
#	docx2txt.pl (found, e.g., in LF aligner)
#	ebook-convert (from calibre)
#   pandoc
#   mdb-tools :  mdb-export -d :: -Q  f.mdb  General
#

use LWP::Simple;
use File::Spec::Functions qw(splitpath catpath);

## Command-line switches.  They are package globals because perl's -s
## (see the #! line) stores "-name" / "-name=value" in $main::name.
## NOTE(review): $latin1 and $uno are declared here but not referenced by
## the code shown in this file.
our ($t,$l,$no_n,$nolayout,$latin1,$fs,$h,$raw,$ly,$elinks,$v,$charset,$noenc,
    $norm, $bookcleaner, $rawpp, $rawpp2, $w ,$htmla, $uno, $rom);

## Extra bookcleaner option selected by -rom.  A ternary is used instead of
## "my ... if ...", whose behaviour perlsyn documents as undefined.
my $romopt = $rom ? '-norom_off=0' : '';
##$t ||= "html";
$fs ||= ' :: ';

## $n: true when the "#==> file <==" header must be printed (no -no_n)
my $n;
$n = not $no_n;

## input charset option handed to w3m
my $w3charset="";
$w3charset="-I $charset" if $charset;
$w3charset="-I cp1252"   if $w;    ## windows legacy and malformed latin1

## -ly / -elinks are shortcuts that force the converter type
$t = "ly" if $ly;
$t = "elinks" if $elinks;

## layout option handed to pdftotext
my $layout = "-layout";
$layout = ""        if ($nolayout or $rawpp2);
$layout = "-raw"    if ($raw or $rawpp);

## From here on $w is a lexical that hides the -w switch above; it holds the
## width option interpolated into the antiword command.
my $w;


## $h is turned from the -h switch into the shell pipeline suffix appended
## to every conversion command.
if($h > 3) { $h = "| grep '[a-zA-Z]' | head -$h" }
elsif($h)  { $h = "| grep '[a-zA-Z]' | head -5" }

if($rawpp or $rawpp2)
           { $h .= "| lawpp | bookcleaner $romopt -pipe - "; }
if($bookcleaner)
           { $h .= "| bookcleaner $romopt -pipe - "; }
if($v)     { $h .= "| view -" }


## -norm stores the converted copies in ./__TXT
if($norm and not -d "__TXT"){
   mkdir("__TXT") or warn "jop: cannot create __TXT: $!\n";
}

## $l becomes the column count of the html dumps; $cont is an optional
## sed stage that turns runs of blanks into the field separator (-l mode).
my $cont;
if($l){
   $w = "-w 0";
   $l = 10000; $cont = " | sed -e 's/    */$fs/g'";
}
else {
   $w = "-w 100";
   $l = 100; # number of columns of the html dumps
   $cont = "";
}
use strict;
#undef $/;

## File extensions that are never converted to text (gett() maps them to
## the "nottext" pseudo type).
my @nottext = (
  qw(jpg png gif svg jpeg ico ppm pbm),
  qw(iso db),
  qw(c java exe o pm py),
  qw(rpm deb dmg),
  qw(css js),
  qw(dtd),
  qw(wmv avi mp4 divx),
  qw(wav mp3 mid midi),
  qw(jar zip bz2 tgz gz xz tar rar),
);

## Same list as a set, for lookups and for building the extension pattern.
my %nottext;
$nottext{$_} = 1 for @nottext;

## Converters shared by several entries of %com.  Every CODE value takes a
## file name and returns the shell command line that prints its text.
my %fun=(
 html => "w3m -dump -cols $l $w3charset -T text/html $cont ",

 ## spreadsheets: ssconvert writes next to the source (see under()), then cat
 csv  => sub {
    my ($src) = @_;
    my $out = under($src) . ".txt";
    return "ssconvert -O 'separator=\" :: \" format=raw quoting-mode=never' -S '$src' '$out'; cat '$out'* ";
 },

 ## xlsx: libreoffice exports a tab separated copy to /tmp, csv2tab prints it
 xlsx => sub {
    my ($src) = @_;
    (my $tsv = $src) =~ s/\.xlsx$/.joptsv/;
    $tsv =~ s!.*/!!;                          # keep only the base name
    return "libreoffice --convert-to joptsv:'Text - txt - csv (StarCalc)':9,34,UTF8 --outdir /tmp '$src'; csv2tab -s='\t' '/tmp/$tsv' ";
 },

 xlsx2 => sub {
    my ($src) = @_;
    return "xlsx2csv '$src' | csv2tab";
 },

 ## office formats: unoconv output is piped through pdftotext
 uno  => sub {
    my ($src) = @_;
    return "unoconv --stdout '$src'| pdftotext -enc UTF-8 $layout - -";
 },

 ## e-books: ebook-convert writes a text copy next to the source, then cat
 ebook => sub {
    my ($src) = @_;
    my $out = under($src) . ".txt";
    return "ebook-convert '$src' '$out'; cat '$out' ";
 },
);

## Type => converter table.  The entries set here are plain strings: shell
## filters that receive the document on stdin.
my %com;
{
   my $w3m_dump = "w3m -dump -cols $l $w3charset -T text/html $cont ";
   %com = (
      nottext => "echo",
      html    => $w3m_dump,
      htm     => $w3m_dump,
      elinks  => "elinks -dump -force-html -no-references -no-numbering -dump-width $l $cont ",
      ly      => "lynx -dump -dont_wrap_pre -force_html -stdin -nolist -nomargins -nonumbers -width=$l $cont ",
   );
}

## Adds the per-extension converters to %com.  Each CODE value takes the
## file name and returns the shell command line that prints the text.
## Entries that write an "a/__b.ext.txt" side file (see under()) are the ones
## wesolved() can later reuse instead of converting again.
%com = (%com,
 rtf  => sub {my $f=shift; my $f1=under($f); "abiword -t txt '$f' -o '$f1.txt'; cat '$f1.txt' "},
##docx=> sub {my $f=shift; my $f1=under($f); "abiword -t txt '$f' -o '$f1.txt'; cat '$f1.txt' "},
 docx => sub {my $f=shift; "docx2txt '$f' "},
 doc  => sub {my $f=shift; "antiword -m UTF-8.txt $w '$f'"},

 ## spreadsheets
# csv  => sub {my $f=shift; "xlscat -s '$fs' '$f' "},
 csv  => $fun{csv},
 xls  => $fun{csv} ,
 gnumeric  => $fun{csv} ,
 ods  => $fun{csv} ,
# xls  => sub {my $f=shift; my $f1=under($f);"ssconvert --export-type=Gnumeric_stf:stf_csv '$f' '$f1.csv'; xlscat -s '$fs' '$f1.csv' "},

 ## postscript goes through ps2pdf and then pdftotext
 eps  => sub {my $f=shift; "ps2pdf '$f' -| pdftotext -enc UTF-8 $layout - -"} ,
 ps   => sub {my $f=shift; "ps2pdf '$f' -| pdftotext -enc UTF-8 $layout - -"} ,

 ## presentations and odt go through unoconv
 pps  => $fun{uno},
 ppsx => $fun{uno},       # this slot used to repeat the "pps" key
 pptx => $fun{uno},
 ppt  => $fun{uno},
 odt  => $fun{uno},

 ## latex: pandoc output is fed to the html filter; the file name is quoted
 ## like in every other entry so that blanks in it do not split the argument
 tex  => sub {my $f=shift; "pandoc -f latex '$f' | $com{html} "},

 pdf  => sub {my $f=shift; "pdftotext -enc UTF-8 $layout '$f' -"} ,
 xml  => sub {my $f=shift; "xmllint -format --encode UTF-8 --recover --noent '$f'"} ,
 ## txt: plain cat with -noenc, otherwise recode to UTF-8
 txt  => sub {my $f=shift; if($noenc){"cat '$f'"} else { ensureutf8($f) }} ,

 ## e-books
 lit  => $fun{ebook},
 mobi => $fun{ebook},
 prc  => $fun{ebook},
 epub => $fun{ebook},
 ## fb2 is first re-encoded to UTF-8 by xmllint into a scratch copy
 fb2  => sub {my $f=shift; my $f1=under($f); "xmllint -format --encode UTF-8 --recover --noent '$f' > '$f.jop_utf8.fb2';  ebook-convert '$f.jop_utf8.fb2' '$f1.txt' > /dev/null ; rm '$f.jop_utf8.fb2'; cat '$f1.txt' "},
# xlsx => sub {my $f=shift; my $f1=under($f); "ssconvert -T Gnumeric_stf:stf_assistant -O 'separator=\" :: \" format=raw quoting-mode=never' -S '$f' '$f1.txt'; cat '$f1.txt' "},
 xlsx => $fun{xlsx},
# xlsx => $fun{xlsx2},

 ## access databases: exports the table named "General"
 mdb => sub { my $f=shift; "mdb-export -d :: -Q  '$f'  General $cont "},

 ## xml whose root element is <mtf>; gett() selects this key
 xml_mtf => sub {my $f=shift; my $f1=under($f); "multiterm2termu '$f' > '$f1.dici'; cat '$f1.dici' "},
);

## under(PATH): prefix the file-name part of PATH with "__",
## keeping volume and directory:  a/b.c -> a/__b.c
sub under{
 my ($path) = @_;
 my ($vol, $dir, $base) = splitpath($path);
 return catpath($vol, $dir, "__" . $base);
}

## dirunder(PATH): name of the normalised copy kept in __TXT.
## The directory part of PATH is dropped and ".txt" is the only suffix added:
##    a/b.c -> __TXT/b.c.txt      a/b.txt -> __TXT/b.txt
sub dirunder{
 my ($path) = @_;
 my ($vol, $dir, $base) = splitpath($path);
 $base =~ s/\.txt$//;              # avoid a doubled ".txt.txt"
 return catpath($vol, "__TXT", $base . ".txt");
}

## wesolved(FILE): when the side copy "__FILE.txt" (see under()) exists and
## was modified more recently than FILE, return the shell command that prints
## it (with the global pipeline suffix $h); otherwise return undef.
sub wesolved{
 my ($src) = @_;
 my $cached = under($src) . ".txt";
 my $fresh  = (-f $cached and -M $cached < -M $src);
 return $fresh ? "cat '$cached' $h" : undef;
}

## wenorm(FILE): same check as wesolved(), but against the normalised copy
## in __TXT (see dirunder()).  Returns the "cat" command line or undef.
sub wenorm{
 my ($src) = @_;
 my $cached = dirunder($src);
 my $fresh  = (-f $cached and -M $cached < -M $src);
 return $fresh ? "cat '$cached' $h" : undef;
}

## gett(FILE): choose the key of %com used to convert FILE.
## Order of the decision:
##   1. *.xml whose root element R has a dedicated "xml_R" converter
##   2. the type forced with -t (also set by -ly / -elinks)
##   3. the extension, when it is a key of %com
##   4. "nottext" for the extensions listed in %nottext
##   5. "html" for everything else
sub gett{
 my $file = shift;
 my $patt  = join("|",keys(%com));
 my $patt2 = join("|",keys(%nottext));
 ##return $1 if ($file =~ /\.(pdf|ps|html?|xml|doc)$/);
 if($file =~ /\.xml$/){
    ## Backticks interpolate like a double-quoted string, so the backslashes
    ## are doubled: a single \K or \w would reach grep as plain "K" / "w".
    ## The file name is quoted for the shell like in the converters.
    my $out = `xmllint -format -encode utf8 '$file' | grep -m 1 -oP '<\\K\\w+'`;
    $out = "" unless defined $out;
    ## Keep only the first name; this also drops the trailing newline, which
    ## chomp cannot do once $/ has been undefined by the main program.
    my ($root) = $out =~ /(\w+)/;
    return "xml_$root"  if(defined $root and $com{"xml_$root"});
 }
 return $t if $t; ## return t if provided
 return $1 if ($file =~ /\.($patt)$/);
 return "nottext" if ($file =~ /\.($patt2)$/);
 return "html";
}

## -htmla: the first argument is a listing file; every line of the form
## "NAME :a:..." contributes NAME to the list of files to convert.
if($htmla){
  my $list = shift(@ARGV) or die("jop: -htmla needs a file argument\n");
  open(my $in, "<", $list) or die("jop: cannot open '$list': $!\n");
  while(my $line = <$in>){
   if($line =~ /^(\S+) :a:/){
      print STDERR "$1\n";         # echo the selected name
      push(@ARGV,$1);
   }
  }
  close $in;
}

## Slurp mode from here on.  Side effect: chomp() removes nothing while $/
## is undefined.
undef $/;

if (@ARGV){
  for my $f (@ARGV){
    ## $h2: redirection into __TXT, only used with -norm
    my $h2="";
    if($norm){
           next if -d $f;
           next if  wenorm($f);                  # copy in __TXT is up to date
           next if  $f =~ m{\b__.*\.txt$};       # our own side files
           ## $h = "> '" . dirunder($f). "'";
           $h2 = "> '" . dirunder($f). "'";
    }
    print "\n#==>$f ($t)<==\n\n" if $n;

    ## URLs are downloaded to a scratch file "__PID[.ext]" in the current
    ## directory.  The extension may not contain "/", otherwise the scratch
    ## name would point into a directory that does not exist.
    my $local;
    if   ($f =~ m!^http.*\.([^/]{1,4})$!){ $local = "__$$.$1" }
    elsif($f =~ m!^http.*! )             { $local = "__$$"    }
    if(defined $local){
      my $rc = getstore($f, $local);
      unless(is_success($rc)){
        print STDERR "jop: cannot download '$f' (HTTP status $rc)\n";
        next;
      }
      $f = $local;
    }

    my $ti = gett($f);
    unless(exists $com{$ti}){
      ## e.g. a mistyped -t=...; an empty filter would break the pipeline
      print STDERR "jop: unknown type '$ti', treating '$f' as html\n";
      $ti = "html";
    }

    if (ref($com{$ti}) eq "CODE"){
      ## Reuse an earlier conversion when it is newer than the source.
      ## With -norm the reused text must still be saved in __TXT, so the
      ## redirection is appended to the wesolved() command as well.
      if   (my $cached = wesolved($f)){ system($cached . $h2) }
      elsif(my $normed = wenorm($f)  ){ system($normed) }
      else                            { system($com{$ti}->($f) . $h .$h2) ;}
    }
    else {
      ## string entries are filters that read the document on stdin
      system ( "cat '$f' | $com{$ti} $h $h2 ") ;
    }
  }
}
else {
    ## No file argument: convert stdin.  $com{html} is a command string, not a
    ## code reference, so it is run directly (calling it as a sub dies under
    ## "strict refs").  A forced string-type converter (-ly, -elinks, -t=...)
    ## is honoured like in the file case.
    my $filter = $com{html};
    $filter = $com{$t} if $t and exists $com{$t} and not ref $com{$t};
    system("$filter $h");
}

## ensureutf8(FILE): return the shell command that prints FILE as UTF-8.
##   - with -charset=X the file is always recoded from X;
##   - otherwise the encoding reported by file(1) selects "cat" or "iconv";
##   - an encoding that is not handled is reported on STDERR and the file is
##     passed through unchanged, so the caller always gets a runnable command.
sub ensureutf8 {
    my $file = shift;

    return "iconv -f $charset -t utf-8 '$file'" if defined $charset;

    ## -b (brief) leaves the file name out of the answer, so names that
    ## contain ":" or blanks cannot confuse the parsing below.
    my $tipo = `file -b --mime-encoding '$file'`;
    $tipo = "" unless defined $tipo;
    $tipo =~ s/^\s+//;
    $tipo =~ s/\s.*//s;     # first word only; chomp is useless while $/ is undef

    ## encoding reported by file(1) => source encoding handed to iconv
    my %iconv_from = (
        'utf-16be'     => 'utf-16be',
        'utf-16le'     => 'utf-16le',
        'unknown-8bit' => 'CP1252',
        'iso-8859-1'   => 'latin1',
    );
    return "iconv -f $iconv_from{$tipo} -t utf-8 '$file'"
        if exists $iconv_from{$tipo};

    ## utf-8 and us-ascii need no conversion; anything else is reported
    print STDERR "socorro: '$tipo' '$file'\n"
        unless $tipo eq "utf-8" or $tipo eq "us-ascii";
    return "cat '$file'";
}


__END__

=head1 NAME

jop - jj open (html, doc, pdf, xml, rtf, xls, csv) as a textual pipe...

=head1 SYNOPSIS

 jop [options] file+
 jop [options] url

=head1 DESCRIPTION

C<jop> extracts the type from the extension, and converts the files to
text.

 
  html, 
  doc, pps, docx
  pdf, ps, eps,
  xml, rtf, xls, csv, 
  odt, 
  lit mobi epub fb2 
  mdb

  and others 

=head2 Options

 -t=type        forces a type (html,ly,rtf,pdf,doc,xml...)
 -l             large  mode
 -no_n          skips the output of "#==> file name <=="
 -nolayout      to suppress layout mode for pdftotext
 -raw           PDF (pdftotext -raw)
 -rawpp         PDF (pdftotext -raw |lawpp - | bookcleaner -c -pipe - )
 -rawpp2        PDF (pdftotext      |lawpp - | bookcleaner -c -pipe - )
 -bookcleaner   TXT (bookcleaner -c -pipe)
 -rom               bookcleaner looks for chapters with Roman numbering
 -fs=!          define a field separator (default ' :: ')
 -v             use vi as a pager
 -charset=cp1251    (forces cp1251 interpretation for html w3m)
 -w             the same as -charset=cp1252 ("ansi" and malformed latin1)
 -ly            uses lynx instead of w3m for html-txt converter
 -elinks        uses elinks instead of w3m for html-txt converter
 -h=20          show just the first 20 lines that contain letters
                     ( | grep '[a-zA-Z]' | head -20 ); values up to 3 show 5 lines
 -noenc			in txt mode: do not change encoding (default is convert 
                     to UTF8)
 -norm          creates a directory "__TXT/" and saves a copy of the converted
                     file
 -uno           forces the use of unoconv

=head2 Dependencies

Jop uses several external tools to make conversions:

 html     w3m -dump -cols ... -T text/html ...
 ly       lynx -dump -dont_wrap_pre -force_html -stdin -nolist 
    -nomargins -nonumbers -width= ...
 elinks   force the use of elinks html converter
 rtf      abiword -t txt ... -o '__....txt'
 docx     docx2txt ...
 doc      antiword $w ...
 pdf      pdftotext -enc UTF-8  ...  -
 xml      xmllint -format ...
 xls,ods,gnumeric   ssconvert -O 'separator=" :: " format=raw quoting-mode=never' -S ... '__....txt'
 csv      ssconvert (same command as xls)
 mobi,lit,epub,fb2      ebook-convert 
 txt      cat ...
 mdb      access : mdb-tools : mdb-export -d :: -Q  f.mdb  General
 xlsx     libreoffice --convert-to joptsv:'Text - txt - csv (StarCalc)':9,34,UTF8 ... | csv2tab
 xlsx2    xlsx2csv f.xlsx  | csv2tab

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut      

w3m