#!/usr/bin/perl -s
# jop - open many document formats (html, doc, pdf, xml, rtf, xls, ...) as a
# plain-text pipe.  Every conversion is delegated to an external tool; this
# script only picks the tool from the file extension and builds the shell
# command line.
#
# needs:
#   abiword
#   antiword
#   elinks
#   lynx
#   w3m
#   calibre
#   gnumeric
#   xmllint
#   docx2txt.pl (found, e.g., in LF aligner)
#   ebook-convert (from calibre)
#   pandoc
#   mdb-tools : mdb-export -d :: -Q f.mdb General
#
# NOTE(review): file names are interpolated between single quotes into shell
# command strings; a file name that itself contains a single quote will break
# (or subvert) the generated command.

use strict;
use LWP::Simple;
use File::Spec::Functions qw(splitpath catpath);

# Command-line switches.  They are set by "perl -s" as package variables
# before the script starts; see the POD at the end for their meaning.
our ($t,$l,$no_n,$nolayout,$latin1,$fs,$h,$raw,$ly,$elinks,$v,$charset,$noenc,
     $norm, $bookcleaner, $rawpp, $rawpp2, $w, $htmla, $uno, $rom);

# Extra bookcleaner option; empty unless -rom was given.
my $romopt = $rom ? '-norom_off=0' : '';

##$t ||= "html";
$fs ||= ' :: ';

# Print the "#==> file <==" banner unless -no_n was given.
my $n = !$no_n;

# Input charset option passed to w3m.
my $w3charset = "";
$w3charset = "-I $charset" if $charset;
$w3charset = "-I cp1252"   if $w;      ## windows legacy and malformed latin1

# Switches that force a converter type for every file.
$t = "ly"     if $ly;
$t = "elinks" if $elinks;
$t = "uno"    if $uno;

# Layout option passed to pdftotext.
my $layout = "-layout";
$layout = ""     if ($nolayout or $rawpp2);
$layout = "-raw" if ($raw or $rawpp);

# From here on $h holds the shell post-processing pipeline that is appended
# to every conversion command ("" when there is nothing to append).
if    ($h > 3) { $h = "| grep '[a-zA-Z]' | head -$h" }
elsif ($h)     { $h = "| grep '[a-zA-Z]' | head -5" }
else           { $h = "" }                 # never append a literal "0"

if ($rawpp or $rawpp2) { $h .= "| lawpp | bookcleaner $romopt -pipe - "; }
if ($bookcleaner)      { $h .= "| bookcleaner $romopt -pipe - "; }
if ($v)                { $h .= "| view -" }

if ($norm) { mkdir("__TXT"); }

# $antiword_w : line-width option for antiword
# $l          : number of columns of the html dumps
# $cont       : extra filter appended to the html dump commands
my $antiword_w;
my $cont;
if ($l) {
    $antiword_w = "-w 0";
    $l          = 10000;
    # NOTE(review): the spacing inside this pattern was lost in the source;
    # it is restored here as "two or more spaces", since a pattern that can
    # match the empty string would insert the separator between every
    # character.  Confirm against the intended column separator.
    $cont = " | sed -e 's/   */$fs/g'";
}
else {
    $antiword_w = "-w 100";
    $l          = 100;
    $cont       = "";
}

#undef $/;

# Extensions that are never converted (only echoed).
my @nottext = qw(
    jpg png gif svg jpeg ico ppm pbm
    iso db c java exe o pm py rpm deb dmg css js dtd
    wmv avi mp4 divx wav mp3 mid midi
    jar zip bz2 tgz gz xz tar rar
);
my %nottext = map { ($_ => 1) } @nottext;

# Converters shared by several extensions.  A string is a filter that reads
# the document on stdin; a code ref receives the file name and returns the
# shell command to run.
my %fun = (
    html => "w3m -dump -cols $l $w3charset -T text/html $cont ",

    csv => sub {
        my $f  = shift;
        my $f1 = under($f);
        "ssconvert -O 'separator=\" :: \" format=raw quoting-mode=never' -S '$f' '$f1.txt'; cat '$f1.txt'* ";
    },

    # NOTE(review): the csv2tab separator below is a single space in the
    # source; LibreOffice is asked for field separator 9 (tab), so this may
    # originally have been a tab character -- confirm.
    xlsx => sub {
        my $f   = shift;
        my $tsv = $f =~ s/\.xlsx$/.joptsv/r;
        $tsv =~ s!.*/!!;                   # LibreOffice writes into --outdir
        "libreoffice --convert-to joptsv:'Text - txt - csv (StarCalc)':9,34,UTF8 --outdir /tmp '$f'; csv2tab -s=' ' '/tmp/$tsv' ";
    },

    xlsx2 => sub { my $f = shift; "xlsx2csv '$f' | csv2tab" },

    uno => sub {
        my $f = shift;
        "unoconv --stdout '$f'| pdftotext -enc UTF-8 $layout - -";
    },

    ebook => sub {
        my $f  = shift;
        my $f1 = under($f);
        "ebook-convert '$f' '$f1.txt'; cat '$f1.txt' ";
    },
);

# Dispatch table: type (normally the file extension) => converter.
my %com = (
    nottext => "echo",
    html    => "w3m -dump -cols $l $w3charset -T text/html $cont ",
    htm     => "w3m -dump -cols $l $w3charset -T text/html $cont ",
    elinks  => "elinks -dump -force-html -no-references -no-numbering -dump-width $l $cont ",
    ly      => "lynx -dump -dont_wrap_pre -force_html -stdin -nolist -nomargins -nonumbers -width=$l $cont ",
);

# Filled in a second statement because the "tex" handler refers to %com,
# which is only visible after its declaration statement has ended.
%com = (%com,
    rtf => sub {
        my $f  = shift;
        my $f1 = under($f);
        "abiword -t txt '$f' -o '$f1.txt'; cat '$f1.txt' ";
    },
    ##docx=> sub {my $f=shift; my $f1=under($f); "abiword -t txt '$f' -o '$f1.txt'; cat '$f1.txt' "},
    docx => sub { my $f = shift; "docx2txt '$f' " },
    doc  => sub { my $f = shift; "antiword -m UTF-8.txt $antiword_w '$f'" },
    # csv => sub {my $f=shift; "xlscat -s '$fs' '$f' "},
    csv      => $fun{csv},
    xls      => $fun{csv},
    gnumeric => $fun{csv},
    ods      => $fun{csv},
    # xls => sub {my $f=shift; my $f1=under($f);"ssconvert --export-type=Gnumeric_stf:stf_csv '$f' '$f1.csv'; xlscat -s '$fs' '$f1.csv' "},
    eps  => sub { my $f = shift; "ps2pdf '$f' -| pdftotext -enc UTF-8 $layout - -" },
    ps   => sub { my $f = shift; "ps2pdf '$f' -| pdftotext -enc UTF-8 $layout - -" },
    pps  => $fun{uno},
    pptx => $fun{uno},
    ppt  => $fun{uno},
    odt  => $fun{uno},
    uno  => $fun{uno},                     # selected by the -uno switch
    tex  => sub { my $f = shift; "pandoc -f latex '$f' | $com{html} " },
    pdf  => sub { my $f = shift; "pdftotext -enc UTF-8 $layout '$f' -" },
    xml  => sub { my $f = shift; "xmllint -format --encode UTF-8 --recover --noent '$f'" },
    txt  => sub { my $f = shift; $noenc ? "cat '$f'" : ensureutf8($f) },
    lit  => $fun{ebook},
    mobi => $fun{ebook},
    prc  => $fun{ebook},
    epub => $fun{ebook},
    fb2  => sub {
        my $f  = shift;
        my $f1 = under($f);
        "xmllint -format --encode UTF-8 --recover --noent '$f' > '$f.jop_utf8.fb2'; ebook-convert '$f.jop_utf8.fb2' '$f1.txt' > /dev/null ; rm '$f.jop_utf8.fb2'; cat '$f1.txt' ";
    },
    # xlsx => sub {my $f=shift; my $f1=under($f); "ssconvert -T Gnumeric_stf:stf_assistant -O 'separator=\" :: \" format=raw quoting-mode=never' -S '$f' '$f1.txt'; cat '$f1.txt' "},
    xlsx => $fun{xlsx},
    # xlsx => $fun{xlsx2},
    mdb     => sub { my $f = shift; "mdb-export -d :: -Q '$f' General $cont " },
    xml_mtf => sub {
        my $f  = shift;
        my $f1 = under($f);
        "multiterm2termu '$f' > '$f1.dici'; cat '$f1.dici' ";
    },
);

# under($path): name of the working copy kept beside the file.
#   a/b.c -> a/__b.c
sub under {
    my $f = shift;
    my ($vol, $dir, $file) = splitpath($f);
    return catpath($vol, $dir, "__$file");
}

# dirunder($path): name of the normalised copy kept in "__TXT" (the original
# directory part is dropped; any trailing ".txt" is not doubled).
#   a/b.c -> __TXT/b.c.txt
sub dirunder {
    my $f = shift;
    my ($vol, $dir, $file) = splitpath($f);
    $file =~ s/\.txt$//;
    return catpath($vol, "__TXT", "$file.txt");
}

# wesolved($path): if under($path).".txt" exists and is newer than $path,
# return the command that prints it (no need to convert again); otherwise
# return nothing.
sub wesolved {
    my $f   = shift;
    my $tmp = under($f) . ".txt";
    return "cat '$tmp' $h" if -f $tmp and (-M $tmp) < (-M $f);
    return;
}

# wenorm($path): same as wesolved, but for the copy stored in __TXT.
sub wenorm {
    my $f   = shift;
    my $tmp = dirunder($f);
    return "cat '$tmp' $h" if -f $tmp and (-M $tmp) < (-M $f);
    return;
}

# _xml_root($path): name of the first element found in the xmllint-formatted
# document, or "" when none is found or xmllint cannot be started.  The pipe
# is opened in list form, so the file name never goes through the shell.
sub _xml_root {
    my $file = shift;
    open(my $xml, "-|", "xmllint", "-format", "-encode", "utf8", $file)
        or return "";
    local $/ = "\n";
    my $root = "";
    while (my $line = <$xml>) {
        # "<?xml" and "<!DOCTYPE" do not match: "<" must be followed by \w.
        if ($line =~ /<(\w+)/) { $root = $1; last }
    }
    close $xml;
    return $root;
}

# gett($path): type used to pick the converter in %com.
#   1. xml files whose root element has a dedicated "xml_<root>" handler;
#   2. the forced type ($t), if any;
#   3. a known extension;
#   4. "nottext" for extensions listed in %nottext;
#   5. "html" otherwise.
sub gett {
    my $file = shift;
    ##return $1 if ($file =~ /\.(pdf|ps|html?|xml|doc)$/);
    if ($file =~ /\.xml$/) {
        my $root = _xml_root($file);
        return "xml_$root" if $root ne "" and $com{"xml_$root"};
    }
    return $t if $t;                       ## return t if provided

    my $patt = join("|", keys(%com));
    return $1 if ($file =~ /\.($patt)$/);

    my $patt2 = join("|", keys(%nottext));
    return "nottext" if ($file =~ /\.($patt2)$/);

    return "html";
}

# -htmla: the first argument is a list file; every line of the form
# "<name> :a:..." adds <name> to the files to be converted.
if ($htmla) {
    my $list = shift or die("can't open html-a arg\n");
    open(my $in, "<", $list) or die "can't open '$list': $!\n";
    while (my $line = <$in>) {
        if ($line =~ /^(\S+) :a:/) {
            print STDERR "$1\n";
            push(@ARGV, $1);
        }
    }
    close $in;
}

if (@ARGV) {
    for my $f (@ARGV) {
        my $h2 = "";
        if ($norm) {
            next if -d $f;
            next if wenorm($f);
            next if $f =~ m{\b__.*\.txt$};
            ## $h = "> '" . dirunder($f). "'";
            $h2 = "> '" . dirunder($f) . "'";
        }
        print "\n#==>$f ($t)<==\n\n" if $n;

        # URLs are downloaded to a scratch file named after the process id;
        # a short alphanumeric extension is kept so gett() can recognise it.
        if ($f =~ m!^http.*\.(\w{1,4})$!) {
            my $local = "__" . $$ . ".$1";
            getstore($f, $local);
            $f = $local;
        }
        elsif ($f =~ m!^http!) {
            my $local = "__" . $$;
            getstore($f, $local);
            $f = $local;
        }

        my $ti      = gett($f);
        my $handler = $com{$ti};
        if (not defined $handler) {
            print STDERR "jop: unknown type '$ti' for '$f'\n";
            next;
        }

        if (ref($handler) eq "CODE") {
            # Reuse an up-to-date earlier conversion when there is one.
            my $cached = wesolved($f) || wenorm($f);
            if ($cached) { system($cached) }
            else         { system($handler->($f) . $h . $h2) }
        }
        else {
            system("cat '$f' | $handler $h $h2 ");
        }
    }
}
else {
    # No file arguments: convert stdin.  Only the string entries of %com are
    # stdin filters, so a forced type is honoured only when it is one of
    # them; html is used otherwise.
    my $filter = ($t and defined $com{$t} and not ref $com{$t})
               ? $com{$t}
               : $com{html};
    system("$filter $h");
}

# ensureutf8($path): shell command that prints the text file as UTF-8.
# -charset=X forces the source encoding; otherwise file(1) is asked to guess.
# When the guess is not one of the handled encodings a warning is printed
# and the file is passed through unchanged.
sub ensureutf8 {
    my $file = shift;
    return "iconv -f $charset -t utf-8 '$file'" if defined $charset;

    # -b: brief output (no "name:" prefix), so a ':' in the file name
    # cannot confuse the parsing.
    my $tipo = `file -b --mime-encoding '$file'`;
    $tipo = "" unless defined $tipo;
    $tipo =~ s/^\s+//;
    $tipo =~ s/\s.*//s;                    # keep the first word only

    my %command_for = (
        "utf-8"        => "cat '$file'",
        "us-ascii"     => "cat '$file'",
        "utf-16be"     => "iconv -f utf-16be -t utf-8 '$file'",
        "utf-16le"     => "iconv -f utf-16le -t utf-8 '$file'",
        "unknown-8bit" => "iconv -f CP1252 -t utf-8 '$file'",
        "iso-8859-1"   => "iconv -f latin1 -t utf-8 '$file'",
    );
    return $command_for{$tipo} if exists $command_for{$tipo};

    print STDERR "socorro: '$tipo' '$file'\n";
    return "cat '$file'";
}

__END__

=head1 NAME

jop - jj open (html, doc, pdf, xml, rtf, xls, csv) as a textual pipe...

=head1 SYNOPSIS

 jop [options] file+
 jop [options] url

=head1 DESCRIPTION

C<jop> extracts the type from the extension, and converts the files to text.

 html, doc, pps, docx pdf, ps, eps, xml, rtf, xls, csv, odt, lit mobi epub fb2 mdb
 and others

=head2 Options

 -t=type       forces a type (html,ly,rtf,pdf,doc,xml...)
 -l            large mode
 -no_n         skips the output of "#==> file name <=="
 -nolayout     to suppress layout mode for pdftotext
 -raw          PDF (pdftotext -raw)
 -rawpp        PDF (pdftotext -raw |lawpp - | bookcleaner -c -pipe - )
 -rawpp2       PDF (pdftotext |lawpp - | bookcleaner -c -pipe - )
 -bookcleaner  TXT (bookcleaner -c -pipe)
 -rom          bookcleaner looks for chapters with Roman numbering
 -fs=!         define a field separator...
 -v            use vi as a pager
 -charset=cp1251  (forces cp1251 interpretation for html w3m)
 -w            the same as -charset=cp1252 ("ansi" and malformed latin1)
 -ly           uses lynx instead of w3m for html-txt converter
 -elinks       uses elinks instead of w3m for html-txt converter
 -h=20         show just the first 20 lines that contain letters
               ( | grep '[a-zA-Z]' | head -20 )
 -noenc        in txt mode: do not change encoding (default is convert to UTF8)
 -norm         creates a directory "__TXT/" and saves a copy of the converted file
 -uno          forces the use of unoconv

=head2 Dependencies

Jop uses several external tools to make conversions:

 html    w3m -dump -cols ... -T text/html ...
 ly      lynx -dump -dont_wrap_pre -force_html -stdin -nolist -nomargins
              -nonumbers -width= ...
 elinks  force the use of elinks html converter
 rtf     abiword -t txt ... -o '__....txt'
 docx    docx2txt ...
 doc     antiword -m UTF-8.txt -w ... ...
 pdf     pdftotext -enc UTF-8 ... -
 xml     xmllint -format ...
 xls     ssconvert -O 'separator=" :: " format=raw quoting-mode=never' -S ...
 csv     ssconvert -O 'separator=" :: " format=raw quoting-mode=never' -S ...
 mobi,lit,epub,fb2   ebook-convert
 txt     cat ...
 mdb     access : mdb-tools : mdb-export -d :: -Q f.mdb General
 xlsx    libreoffice --convert-to joptsv:'Text - txt - csv (StarCalc)':9,34,UTF8
 xlsx2   xlsx2csv f.xlsx | csv2tab

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut

w3m