#!/usr/bin/perl -s use CGI qw(:standard ); use XML::DT ; my $filename = shift; my @stdintmp=(); ## list of temp files (to remove in the end) my $tmp=""; if(not $filename or $filename eq "-"){ $filename="/tmp/__tabextract__$$"; push (@stdintmp,$filename); open(T,">",$filename) or die("cant create temp. file $filename\n"); while(){print T $_;} close(T); $o="a.out"; } use strict; use utf8; my @fs=($filename, grep {not /^\d+$/} @ARGV ); my @ts=(grep {/^\d+$/} @ARGV ); our($csv,$html,$clean,$cleanatr,$rm, $term, $tab,$fields,$f,$grep,$o, $O,$links,$keep,$th, $imgs, $sl); $tab = $csv if $csv; my $ml = not $sl ; $O=1; my @deffields= map {sprintf("_%03d",$_)} 1..100 ; my @fields = (split(/[ ,;\/]+/ , $fields||$f ),@deffields); my @greptabs=(); my $NF=0; $clean ||= "font"; my @r = split(/,/,$clean); my @k = split(/,/,$keep); #keep tags in -term ou -tab my @rmatt = split(/,/,$cleanatr); my %r = map { ($_,1) } @r; my %rmatt = map { ($_,1) } @rmatt; my %k = map { ($_,1) } @k; my $n =0; my %tab=(); my %handlerhtml=( -html => 1, -default => sub{ if ( $r{$q}){ $c } elsif($rmatt{$q}){ %v=(); toxml} else { toxml } }, -pcdata => sub{ $c =~ s/\s+\n/\n/g; $c =~ s/\&/\&/g; $c =~ s//\>/g; $c; }, 'table' => sub{ $n++; if($grep and $c =~ /$grep/){ push(@ARGV,$n)} $tab{$n} = toxml() ; $NF=0; ":::::" . a({href=>"#$n"},"TABELA $n")." ::::"}, ); my %handlerTermTab=( ## common parts for mode term + tab -html => 1, -type => { 'tr' => 'SEQ' }, -default => sub{ if ($k{$q} ){ toxml } elsif($rmatt{$q}){ %v=(); toxml} else { $c } }, -pcdata => sub{ if( ctxt(1) eq "table" or ctxt(1) eq "tbody" or ctxt(1) eq "tr" ){ $c = trim($c); } $c =~ s/\s+\n/\n/g; $c; }, 'img' => sub{ if ( $r{$q}){ "" } elsif($rmatt{$q}){ %v=(); toxml} elsif($imgs and $v{src} and $v{alt}){"[!$v{src}] $v{alt}"} elsif($imgs and $v{src}){"[!$v{src}] IMG"} else { "" } }, 'a' => sub{ if ( $r{$q}){ $c } elsif($rmatt{$q}){ %v=(); toxml} elsif($links and $v{href}){"[$v{href}] $c"} else { $c } }, 'th' => sub{ $c =~ s/\h*\n\h*/ /g; $c =~ s/\|\s*$//g; $c = trim($c); if (defined($v{colspan}) ){ [$c, ("←...") x ($v{colspan}-1)] } else { $c} }, 'td' => sub{ $c =~ s/\h*\n\h*/ /g; $c =~ s/\|\s*$//g; $c = trim($c) ; if (defined($v{colspan}) ){ [$c, ("←...") x ($v{colspan}-1)] } else { $c } }, 'li' => sub{ "|• $c" }, 'p' => sub{ "$c | " }, 'br' => sub{ if($ml) {"|"} else {" "} }, 'hr' => sub{ "|---- " }, ); my %handlerterm=( %handlerTermTab, 'tr' => sub{ my $n=0 ; if($th and ctxt(1) eq "thead"){ ## warn("..." . father()->{"-q"} . "::". join("::",@$c)."\n"); @fields=@$c; } my @aux = map{ if( ref($_) ) { (@$_) } else { $_ } } @$c ; if ($ml){ @aux = map{ s!\|!\n\t|!gr } @aux; } "\n".join("\n",map{ $fields[$n++]." = $_"} @aux )."\n" }, 'table' => sub{ $n++; if($grep and $c =~ /$grep/){ push(@ARGV,$n)} $tab{$n} = "$c" ; $NF=0; ":::::" . a({href=>"#$n"},"TABELA $n")." ::::" }, ); my %handlertab=( %handlerTermTab, 'tr' => sub{ my $n=0 ; my @aux = map{ if( ref($_) ) { (@$_) } else { $_ } } @$c ; my $r = join(" :: ",map{$n++; $_} @aux )."\n"; $NF=$n if $n > $NF; $r }, 'table' => sub{ $c = join(" :: ",map{ $fields[$_-1]} 1..$NF )."\n$c"; $n++; if($grep and $c =~ /$grep/){ push(@ARGV,$n)} $tab{$n} = "$c" ; $NF=0; ":::::" . a({href=>"#$n"},"TABELA $n")." ::::"}, ); for my $filename (@fs){ %tab=(); $n=0; if($O){ $o //= "$filename-"; $o =~ y{:/?}{___}; $o =~ s/\.(html?|asp)-$/-/; } $o ||= "_T"; if($filename=~ m{^https?://}){ ## warn("URL!\n".$o); if ($term){ dturl($filename,%handlerterm); } elsif($tab) { dturl($filename,%handlertab); } else { dturl($filename,%handlerhtml); } } else { if ($term){ dt($filename,%handlerterm); } elsif($tab) { dt($filename,%handlertab); } else { dt($filename,%handlerhtml); } } dumptabs(); } sub dumptabs{ if($html && scalar(@ARGV) ==1){ print start_html(-encoding =>'UTF-8'); for (@ts){ my $x; if (s/\-// ){ $x = expandit($tab{$_}) } else { $x = $tab{$_} } print $x; } print end_html(), } elsif($html){ print start_html(-encoding =>'UTF-8'), ol( li( [ map { a({href => "#$_"}, "Tabela $_") } @ts ] )) ; @ARGV = (sort {$a <=> $b} keys %tab) unless @ts; for (@ts){ my $x; if (s/\-// ){ $x = expandit($tab{$_}) } else { $x = $tab{$_} } print h1( a({name => "$_"}, "Tabela $_")) , $x; } print end_html(), } elsif(@ts){ for (@ts){ if(isstdin()){ print "#==>$_<==\n",$tab{$_}; } else { open(F,">:utf8",sprintf("%s%04d",$o,$_)) or die("cant create '_T$_'\n"); print F $tab{$_}; close F; } } } else { for (keys %tab){ if(isstdin()){ print "#==>$_<==\n",$tab{$_}; } else{ open(F,">:utf8",sprintf("%s%04d",$o,$_)) or die("cant create '_T$_'\n". sprintf("%s%04d",$o,$_)); print F $tab{$_}; close F; } } } } unlink(@stdintmp); if($O){ $o = "$filename-"; $o =~ s/\.(html?|asp)-$/-/; } $o ||= "_T"; sub expandit{ my $t=shift; while( $t =~ s{:::::.*? ::::}{$tab{$1}}g){}; $t } sub isstdin{ return scalar @stdintmp } ## for pipes and similar sub trim{my $a=shift; ## remove initial and final spaces $a =~ s/^\s+|\s+$//gr } __END__ =encoding utf8 =head1 NAME tabextract - extract tables from HTML files =head1 SYNOPSIS tabextract -html file.html+ tabnumber* =head1 DESCRIPTION For each C extract and creats tables (C , C, etc). If tabnumbers are provided, just the correspondent tables are created. The f-0001, etc table files may have the following formats: html (default) csv for options -csv or -tab term for option -term =head2 Options -html -builds a complete HTML output file -clean -list of tags of elements to be removed -clean=img,hr,font -cleanatr=p,li -remove all atributes for elements p and li -grep=piano - extract tables that contain the pattern /piano/ -o=tab_PT - output tables tab_PT0001 ... -O - output tables prefixed by html-file basename -term -"terminology" format ((field=value\n)*\n\n) removing \n, and tags (see -keep, -links) -csv or -tab -"csv" table, columns separated by " :: " removing \n, and tags (see -keep, -links) Just for term and tab format: -links - preserve a/href links "ttt" => "[URL] ttt" -imgs - preserve img/src "ALT/" => "[!URL] ALT" "" => "[!URL] IMG" -keep - preserve list of tags -keep=strong,i,b Just for term format: -sl - value in a single line (separated by |) default: "\n\t|" In mode "term" and "tab"
  • XXX
  • => |• XXX

    XXX

    => XXX |
    => " " (or "\n\t|" if term)
    => |---- \n => " " =head1 AUTHOR J.Joao Almeida, jj@di.uminho.pt =head1 SEE ALSO perl(1). =cut -rm=a,b,c -remove tags a b c