#!/usr/bin/perl -s

# html-a - list, or optionally download, the links found in HTML documents.
#
# Each command-line argument is either a local HTML file or a URL (anything
# starting with "http").  In the default mode every <a href> and <img src>
# is printed together with its text; with -get every <a href> target is
# fetched with curl into the directory given by -dir.

use XML::DT;
use URI::WithBase;
use LWP::Simple;

# Switch variables filled in by perl's -s option processing:
#   -b=URL     base URL used to make relative links absolute
#   -get       download the link targets instead of just listing them
#   -dir=DIR   directory that receives the downloads (default "./")
#   -if=REGEXP in -get mode, only download URLs matching this pattern
our ($b, $get, $dir, $if);

use strict;
use warnings;

binmode STDOUT, ':utf8';

# Normalise the download directory so that it always ends with a slash.
$dir ||= "./";
$dir .= "/" unless $dir =~ m!/$!;
if (!-d $dir and !mkdir $dir) {
    warn "cant create directory $dir: $!\n";
}

my $n = 0;    # running counter used to number the downloaded files
my $log;      # handle on the __LOG file while a document is processed

# Listing mode: print "url :a: link text" / "url :i: alternate text".
# %v (attributes) and $c (element content) are provided by XML::DT.
my %handler = (
    #  '-outputenc' => 'ISO-8859-1',
    #  '-default'   => sub{...},
    '-html' => 1,
    'a'     => sub { print nurl($v{href}), " :a: ", n($c),      "\n" },
    'img'   => sub { print nurl($v{src}),  " :i: ", n($v{alt}), "\n" },
);

# Download mode: fetch every <a href> target and record it in __LOG.
my %handler2 = (
    #  '-outputenc' => 'ISO-8859-1',
    #  '-default'   => sub{...},
    '-begin' => sub {
        open($log, ">>", "__LOG") or die("cant open __LOG file\n");
        print {$log} "------\n";
    },
    '-end'  => sub { close($log) },
    '-html' => 1,
    'a'     => sub {
        # Anchors without an href (e.g. <a name="...">) have nothing to fetch.
        return 0 unless defined $v{href} and length $v{href};

        my $url = nurl($v{href});
        return 0 if ($if and $url !~ /$if/);

        my $dest = sprintf('%s%03d-%s', $dir, ++$n, n2($c));
        my $file = $dest . '._';
        print "$url\n";
        print {$log} "$file $url\n";

        # List form: the URL comes from the page being processed, so it must
        # never be interpolated into a shell command line.  "--url" keeps a
        # URL that starts with "-" from being read as a curl option.
        system('curl', '-o', $file, '--url', "$url");
    },
);

# nurl($url) - URL normaliser.
# Without a base URL ($b) the argument is returned unchanged.  With a base,
# URLs that already start with an http, https, mailto or ftp scheme are
# returned as they are; anything else is resolved against $b.
# An undefined argument is treated as the empty string.
sub nurl {
    my $url = shift;
    $url = '' unless defined $url;

    return $url unless $b;
    return $url if $url =~ m/^(https?|mailto|ftp):/;

    return URI::WithBase->new($url, $b)->abs;
}

# n($text) - flatten a piece of markup for display: every tag becomes a
# space and every run of whitespace collapses to a single space.
sub n {
    my $text = shift;
    $text = '' unless defined $text;
    $text =~ s/<.*?>/ /g;
    $text =~ s/\s+/ /g;
    return $text;
}

# n2($text) - like n(), but produces a file-name fragment: whitespace runs
# become "_" and every remaining non-word character becomes "-".
sub n2 {
    my $text = shift;
    $text = '' unless defined $text;
    $text =~ s/<.*?>/ /g;
    $text =~ s/\s+/_/g;
    $text =~ s/\W/-/g;
    return $text;
}

# Process every argument with the handler table of the selected mode.
# A URL argument also becomes the base used to resolve its relative links.
my %active = $get ? %handler2 : %handler;
for my $source (@ARGV) {
    if ($source =~ /^http/) {
        $b = $source;
        dturl($source, %active);
    }
    else {
        dt($source, %active);
    }
}

__END__

=head1 NAME

html-a - extract HTML links from a html page or file

=head1 SYNOPSIS

 html-a file.html+
 html-a url+
 html-a -b=http://.../.html file1...
 html-a -get -dir="DIR" file1...               extract and download links
 html-a -get -dir="DIR" -if="regexp" file1...  idem if url=~ /regexp/

=head1 DESCRIPTION

Extracts the list of a/href and img/src and respective texts.
Output looks like this:

 file or url :i: alternate text

or

 file or url :a: link text

Example:

 img/flautista.gif :i: flute.gif
 contrib.html :a: contributions and thanks
 vpopular.pdf :a: very popular terms
 mailto:jj@di.uminho.pt :a: jj@di.uminho.pt
 http://ipato.org :a: When the dick is playing

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut