package HTML::REng;

use 5.008;
use strict;
use warnings;
use XML::DT;
use Data::Dumper;
use LWP::Simple;
#use CGI qw(:all);

require Exporter;

our @ISA = qw(Exporter);

# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.

# This allows declaration	use HTML::REng ':all';
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
# will save memory.
our %EXPORT_TAGS = ( 
	'all' => [ qw(tables_str tables links links_str alt_fill_str alt_fill br_after_a_str br_after_a
	              is_menu is_menu_str links_relat links_relat_str less_tables less_tables_str 
				  content_detect content_detect_str de_ascii de_unicode menus menus_str line_count
				  almost_eq java_menus_str java_menus relevant_links_str relevant_links links_content_str
				  links_content) ],
	'tables' => [ qw(tables tables_str less_tables less_tables_str) ],
	'links' => [ qw(links_str links br_after_a_str br_after_a is_menu is_menu_str 
					links_relat links_relat_str menus menus_str java_menus_str java_menus ) ],
	'images' => [ qw(alt_fill_str alt_fill) ],
	'content' => [ qw(content_detect content_detect_str) ],
	'encoding' => [ qw(de_ascii de_unicode) ],
	'aux' => [ qw(line_count almost_eq) ]
);

our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );

our @EXPORT = qw();

our $VERSION = '0.01';

my %unicode_latin1_table = (
   '\xC2\xA1' => '¡', '\xC2\xA2' => '¢', '\xC2\xA3' => '£', '\xC2\xA4' => '¤',
   '\xC2\xA5' => '¥', '\xC2\xA6' => '¦', '\xC2\xA7' => '§', '\xC2\xA8' => '¨',
   '\xC2\xA9' => '©', '\xC2\xAA' => 'ª', '\xC2\xAB' => '«', '\xC2\xAC' => '¬',
   '\xC2\xAD' => '­', '\xC2\xAE' => '®', '\xC2\xAF' => '¯', '\xC2\xB0' => '°',
   '\xC2\xB1' => '±', '\xC2\xB2' => '²', '\xC2\xB3' => '³', '\xC2\xB4' => '´',
   '\xC2\xB5' => 'µ', '\xC2\xB6' => '¶', '\xC2\xB7' => '·', '\xC2\xB8' => '¸',
   '\xC2\xB9' => '¹', '\xC2\xBA' => 'º', '\xC2\xBB' => '»', '\xC2\xBC' => '¼',
   '\xC2\xBD' => '½', '\xC2\xBE' => '¾', '\xC2\xBF' => '¿', '\xC3\x80' => 'À',
   '\xC3\x81' => 'Á', '\xC3\x82' => 'Â', '\xC3\x83' => 'Ã', '\xC3\x84' => 'Ä',
   '\xC3\x85' => 'Å', '\xC3\x86' => 'Æ', '\xC3\x87' => 'Ç', '\xC3\x88' => 'È',
   '\xC3\x89' => 'É', '\xC3\x8A' => 'Ê', '\xC3\x8B' => 'Ë', '\xC3\x8C' => 'Ì',
   '\xC3\x8D' => 'Í', '\xC3\x8E' => 'Î', '\xC3\x8F' => 'Ï', '\xC3\x90' => 'Ð',
   '\xC3\x91' => 'Ñ', '\xC3\x92' => 'Ò', '\xC3\x93' => 'Ó', '\xC3\x94' => 'Ô',
   '\xC3\x95' => 'Õ', '\xC3\x96' => 'Ö', '\xC3\x97' => '×', '\xC3\x98' => 'Ø',
   '\xC3\x99' => 'Ù', '\xC3\x9A' => 'Ú', '\xC3\x9B' => 'Û', '\xC3\x9C' => 'Ü',
   '\xC3\x9D' => 'Ý', '\xC3\x9E' => 'Þ', '\xC3\x9F' => 'ß', '\xC3\xA0' => 'à',
   '\xC3\xA1' => 'á', '\xC3\xA2' => 'â', '\xC3\xA3' => 'ã', '\xC3\xA4' => 'ä',
   '\xC3\xA5' => 'å', '\xC3\xA6' => 'æ', '\xC3\xA7' => 'ç', '\xC3\xA8' => 'è',
   '\xC3\xA9' => 'é', '\xC3\xAA' => 'ê', '\xC3\xAB' => 'ë', '\xC3\xAC' => 'ì',
   '\xC3\xAD' => 'í', '\xC3\xAE' => 'î', '\xC3\xAF' => 'ï', '\xC3\xB0' => 'ð',
   '\xC3\xB1' => 'ñ', '\xC3\xB2' => 'ò', '\xC3\xB3' => 'ó', '\xC3\xB4' => 'ô',
   '\xC3\xB5' => 'õ', '\xC3\xB6' => 'ö', '\xC3\xB7' => '÷', '\xC3\xB8' => 'ø',
   '\xC3\xB9' => 'ù', '\xC3\xBA' => 'ú', '\xC3\xBB' => 'û', '\xC3\xBC' => 'ü',
   '\xC3\xBD' => 'ý', '\xC3\xBE' => 'þ', '\xC3\xBF' => 'ÿ',

   '\xE2\x80\x93' => '--',  '\xE2\x80\x94' => '--',
   '\xE2\x80\x98' => '`',   '\xE2\x80\x99' => '\'',
   '\xE2\x80\x9C' => '``',  #'\xE2\x80\x9D' => '''',
   '\xE2\x80\xA6' => '...',
);

my %ascii_latin1_table = (
#'x{00}' => 'NUL', 'x{01}' => 'SOH',
'x{02}' => 'STX', 'x{03}' => 'ETX', 'x{09}' => 'HT',  'x{7f}' => 'DEL',
'x{0b}' => 'VT',  'x{0c}' => 'FF',  'x{0d}' => 'CR',  'x{0e}' => 'SO',  'x{0f}' => 'OF',  'x{10}' => 'DLE',
'x{11}' => 'DC1', 'x{12}' => 'DC2', 'x{13}' => 'DC3', 'x{0a}' => 'LF',  'x{0b}' => 'BS',  'x{18}' => 'CAN',
'x{19}' => 'EM',  'x{1c}' => 'FS',  'x{1d}' => 'GS',  'x{1e}' => 'RS',  'x{1f}' => 'US',  'x{1c}' => 'FS',
'x{0A}' => 'LF',  'x{17}' => 'ETB', 'x{1b}' => 'ESC', 'x{05}' => 'ENQ', 'x{06}' => 'ACK', 'x{07}' => 'BEL',
'x{16}' => 'SYN', 'x{04}' => 'EOT', 'x{14}' => 'DC4', 'x{15}' => 'NAK', 'x{1a}' => 'SUB',
'x{e2}' => 'â',   'x{e4}' => 'ä',   'x{e0}' => 'à',   'x{e1}' => 'á',   'x{e3}' => 'ã',   'x{e5}' => 'å',
'x{e7}' => 'ç',   'x{f0}' => 'ñ',   'x{a2}' => '¢',   'x{2e}' => '.',   'x{3c}' => '<',   'x{28}' => '(',
'x{2b}' => '+',   'x{7c}' => '|',   'x{26}' => '&',   'x{e9}' => 'é',   'x{ea}' => 'ê',   'x{eb}' => 'ë',
'x{e8}' => 'è',   'x{ed}' => 'í',   'x{ee}' => 'î',   'x{ef}' => 'ï',   'x{ec}' => 'ì',   'x{df}' => 'ß',
'x{21}' => '!',   'x{24}' => '$',   'x{2a}' => '*',   'x{29}' => ')',   'x{3b}' => ';',   'x{5e}' => '^',
'x{2d}' => '-',   'x{2f}' => '/',   'x{c2}' => 'Â',   'x{c4}' => 'Ä',   'x{c0}' => 'À',   'x{c1}' => 'Á',
'x{c3}' => 'Ã',   'x{c5}' => 'Å',   'x{c7}' => 'Ç',   'x{d1}' => 'Ñ',   'x{a6}' => '|',   'x{2c}' => ',',
'x{25}' => '%',   'x{5f}' => '_',   'x{3e}' => '>',   'x{3f}' => '?',   'x{f8}' => 'ø',   'x{c9}' => 'É',
'x{ca}' => 'Ê',   'x{cb}' => 'Ë',   'x{c8}' => 'È',   'x{cd}' => 'Í',   'x{ce}' => 'Î',   'x{cf}' => 'Ï',
'x{cc}' => 'Ì',   'x{5f}' => '`',   'x{3a}' => ':',   'x{23}' => '#',   'x{40}' => '@',   'x{27}' => '\'',
'x{3d}' => '=',   'x{22}' => '"',   'x{d8}' => 'Ø',   'x{61}' => 'a',   'x{62}' => 'b',   'x{63}' => 'c',
'x{64}' => 'd',   'x{65}' => 'e',   'x{66}' => 'f',   'x{67}' => 'g',   'x{68}' => 'h',   'x{69}' => 'i',
'x{ab}' => '«',   'x{bb}' => '»',   'x{f0}' => 'ð',   'x{0d}' => 'ý',   'x{de}' => 'Þ',   'x{b1}' => '±',
'x{b0}' => '°',   'x{6a}' => 'j',   'x{6b}' => 'k',   'x{6c}' => 'l',   'x{6d}' => 'm',   'x{6e}' => 'n',
'x{6f}' => 'o',   'x{70}' => 'p',   'x{71}' => 'q',   'x{72}' => 'r',   'x{aa}' => 'ª',   'x{ba}' => 'º',
'x{e6}' => 'æ',   'x{b8}' => '',    'x{c6}' => 'Æ',   'x{a4}' => '',    'x{b5}' => 'µ',   'x{7e}' => '~',#TODO xb8,xa4
'x{73}' => 's',   'x{74}' => 't',   'x{75}' => 'u',   'x{76}' => 'v',   'x{77}' => 'w',   'x{78}' => 'x',
'x{79}' => 'y',   'x{7a}' => 'z',   'x{a1}' => '¡',   'x{bf}' => '¿',   'x{fe}' => 'Ð',   'x{5B}' => '[',
'x{fe}' => 'þ',   'x{ae}' => '®',   'x{ac}' => '¬',   'x{a3}' => '£',   'x{a5}' => '¥',   'x{95}' => '', #TODO x95
'x{a9}' => '©',   'x{a7}' => '§',   'x{b6}' => '¶',   'x{bc}' => '',    'x{bd}' => '',    'x{be}' => '', #TODO xbc,xbd,xbe
'x{dd}' => 'Ý',   'x{a8}' => '"',   'x{af}' => '¯',   'x{5d}' => ']',   'x{92}' => '',    'x{d7}' => '×',#TODO x92
'x{7b}' => '{',   'x{41}' => 'A',   'x{42}' => 'B',   'x{43}' => 'C',   'x{44}' => 'D',   'x{45}' => 'E ',
'x{46}' => 'F',   'x{47}' => 'G',   'x{48}' => 'H',   'x{49}' => 'I',   'x{9b}' => '-',   'x{f4}' => 'ô',
'x{f6}' => 'ö',   'x{f2}' => 'ò',   'x{f3}' => 'ó',   'x{f5}' => 'õ',   'x{7d}' => '}',   'x{4a}' => 'J',
'x{4b}' => 'K',   'x{4c}' => 'L',   'x{4d}' => 'M',   'x{4e}' => 'N',   'x{4f}' => 'O',   'x{50}' => 'P',
'x{51}' => 'Q',   'x{52}' => 'R',   'x{b9}' => '¹',   'x{fb}' => 'û',   'x{fc}' => 'ü',   'x{f9}' => 'ù',
'x{fa}' => 'ú',   'x{ff}' => 'ÿ',   'x{5c}' => '\\',  'x{f7}' => '÷',   'x{53}' => 'S',   'x{54}' => 'T',
'x{55}' => 'U',   'x{56}' => 'V',   'x{57}' => 'W',   'x{58}' => 'X',   'x{59}' => 'Y',   'x{5a}' => 'Z',
'x{b2}' => '²',   'x{d4}' => 'Ô',   'x{d6}' => 'Ö',   'x{d2}' => 'Ò',   'x{d3}' => 'Ó',   'x{d5}' => 'Õ',
'x{30}' => '0',   'x{31}' => '1',   'x{32}' => '2',   'x{33}' => '3',   'x{34}' => '4',   'x{35}' => '5',
'x{36}' => '6',   'x{37}' => '7',   'x{38}' => '8',   'x{39}' => '9',   'x{b3}' => '³',   'x{db}' => 'Û',
'x{dc}' => 'Ü',   'x{d9}' => 'Ù',   'x{da}' => 'Ú'
);

sub de_ascii {
    my $text = shift;
    map { $text =~ s#\\$_#$ascii_latin1_table{$_}#gs; } keys %ascii_latin1_table;
    $text;
}

sub de_unicode {
    my $text = shift;
    map {$text =~ s#\\$_#$unicode_latin1_table{$_}#gs;} keys %unicode_latin1_table;
    $text;
}

sub content_detect {
	my ($url,$options) = @_;
	my $html = get($url);
    if(wantarray) {my @foo = content_detect_str($html,$options); return @foo}
      else {my $scalar = content_detect_str($html,$options); return $scalar;}
}

sub content_detect_str {
	my ($html,$options) = @_;
	my $estrat = 1;
	if(!defined($options->{est})) {$estrat=1} else {$estrat=$options->{est}}
	
#	Estratégia 1 - (contar words do texto.)
	if($estrat==1) {
		my $words;
		my @found;
		my %handler = (
			'-html' => 1,
			'script' => sub{},
			'a' => sub{},
			'-default' => sub{
				my $content = $c;
				while($content=~m/\w{5,}/gis) { $words++; }
				if($words>30 && length(toxml)>100) {push(@found,toxml);}
			},
		);
		dtstring($html,%handler);
		if(wantarray) {	return @found }
		else {return join("\n",@found)}
	}
	
#	Estratégia 2 - (comparar com meia dúzia de palavras chave)
	if($estrat==2) {
		my @keywords = qw(mas quando porque porquê onde para pelo é que ele ela
						que este isto uma seu sua não sem dos das nos assim tem
						como nos aos ao este esta está do da no na que de
						but when why because where it is him his her for that the
						she he and this
						mais quand parce raison où pour par lui c'est que lui 
						elle que celui-ci ceci sien sien sans dans ainsi des pas
						comme dans des
						);
		my @found; my $words;
		$keywords[-1] = $keywords[-1]." ";
		$keywords[0] = " ".$keywords[0];
		my $keys = join(" ",@keywords);
		
		my %handler = (
			'-html' => 1,
			'script' => sub{},
			'a' => sub{toxml},
			'-default' => sub {
				my $certain;
				my $content = $c;
				while($content=~m/\w{2,}/gis) { 
					my $match = " ".$&." ";
					$words++;
					if($keys =~ m/$match/gis) {$certain++;}
				}
				if($certain>1 && $words > 40) {push(@found,toxml)}
			}
		);
		dtstring($html,%handler);
		if(wantarray) {	return @found }
		else {return join("\n",@found)}
	}
	
}

sub relevant_links {
	my ($url,$options) = @_;
	my $html = get($url);
    if(wantarray) {my @foo = relevant_links_str($html,$options); return @foo}
      else {my $scalar = relevant_links_str($html,$options); return $scalar;}
}

sub relevant_links_str {
	my ($html,$options) = @_;
	my $estrat = 1;
    if(!defined($options->{est})) {$estrat=1} else {$estrat=$options->{est}}
	my @keywords = qw(mas quando porque porquê onde para pelo é que ele ela
                        que este isto uma seu sua não sem dos das nos assim tem
                        como nos aos ao este esta está do da no na que de
                        but when why because where it is him his her for that the
                        she he and this
                        mais quand parce raison où pour par lui c'est que lui
                        elle que celui-ci ceci sien sien sans dans ainsi des pas
                        comme dans des
                        );
    my @found; my $words;
    $keywords[-1] = $keywords[-1]." ";
    $keywords[0] = " ".$keywords[0];
    my $keys = join(" ",@keywords);
	my %dt;
	if($estrat==2) {
	  %dt=(
		'-html' => 1,
	    'img' => sub{},
		a => sub {
			my $certain;
            my $content = $c;
            while($content=~m/\w{2,}/gis) {
                my $match = " ".$&." ";
                $words++;
                if($keys =~ m/$match/gis) {$certain++;}
            }
            if($certain>1 && $words > 3) {
				my $hr = $v{href};
				%v = (href=>$hr);
				push(@found,toxml);
			}
		}
	  );
	} else {
	  %dt = (
		'-html' => 1,
	    'img' => sub{},
		a => sub {
			my $content = $c;
			while($content=~m/\w{2,}/gis) { $words++; }
			if($words > 2) {
				my $hr = $v{href};
				%v = (href=>$hr);
				push(@found,toxml)
			}
		}
	  );	
	}
	dtstring($html,%dt);
	if(wantarray) { return @found }
	else {return join("\n",@found)}
}

sub links_content {
	my ($url,$options) = @_;
	my $html = get($url);
	return links_content_str($html,$options);
}

sub links_content_str {
	my ($html,$options) = @_;
	my $o;
	my %dt = (
        '-html' => 1,
		'-default' => sub {
				my $words;
                my $content = $c;
                while($content=~m/\w{5,}/gis) { $words++; }
                if($words>10 && length(toxml)>30) {print toxml."\n";}
		},
        'a' => sub {
			my $words;
            my $content = $c;
            while($content=~m/\w{2,}/gis) { $words++; }
            if($words > 2) {
                my $hr = $v{href};
                %v = (href=>$hr);
				print toxml."\n";
            }
        },
        'img' => sub{}
      );
	dtstring($html,%dt);
	$o;
}


sub debug {
	my $msg = shift;
	print "DEBUG: $msg\n";
}


#TODO -> o pod que se segue está por fazer e por mover (lá para o fundo)!

=pod

B<content_detect()>

This function 'tries' to detect where is the important content of an HTML
document. By important we mean, for instance, the text of an article in a
news website.
It can follow two different heuristics to accomplish the detection. One is
to count the number of words of every 

=cut

sub less_tables {
	my ($url,$options) = @_;
	my $html = get($url);
	return less_tables_str($html,$options);
}

sub less_tables_str { # removes tables (<table> tags) and it's content
	my ($html,$options) = @_;
	my %handler = (
		'-html' => 1,
		'-default' => sub{toxml},
		'table' => sub{""}
	);
	return dtstring($html,%handler);
}

sub links_relat {
	my ($url,$options) = @_;
	my $html = get($url);
	return links_relat_str($html,$options);
}

sub links_relat_str {
	my ($html,$options) = @_;
	my ($relat,$n,$total_content_length);
	my ($max_content_size,$min_content_size) = (0,0);
	my %handler = (
		'-html' => 1,
		'-default' => sub{toxml},
		'a' => sub {
				$n++;
				$min_content_size = length($c) if($n==1);
				$relat->{$n}->{html}=toxml;
				$relat->{$n}->{content_size}=length($c);
				$total_content_length+=length($c);
				$max_content_size = length($c) if(length($c)>$max_content_size);
				$min_content_size = length($c) if($min_content_size>length($c));
				#$relat->{$n}->{atts}=$v;
			}
	);
	dtstring($html,%handler);
	$relat->{avg_content_size} = $total_content_length / $n;
	$relat->{max_content_size} = $max_content_size;
	$relat->{min_content_size} = $min_content_size;
	$relat;
}

sub is_menu {
	my ($url,$options) = @_;
	my $html = get($url);
	return is_menu_str($html,$options);
}

sub is_menu_str {
	my ($html,$options) = @_;
#	debug($html);
	my $amount;
	if(defined($options->{'ratio'})) {$amount=$options->{ratio};} else {$amount = 0.6;}
	my $html_l = length($html);
	my $a_l = 0;
	my %handler = (
		'-html' => 1,
		'-default' => sub{toxml},
		'a' => sub{$a_l+=length(toxml)}
	);
	dtstring($html,%handler);
	($a_l/$html_l)>$amount? return 1 : return 0; 
}

sub menus {
	my ($url,$options) = @_;
	my $html = get($url);
	return menus_str($html,$options);
}

#Tries do detect menus inside HTML documents. First it will extract
#all the <table>s and for each table executes function is_menu.
#When all tables are processed we'll have (or not) a set of tables
#claiming to be menus, after that we will check if any of them are similar
#outputting only the essential ones (less redundant on results). The result
#will be an array containing (or not) the <table>s looking like menus.

sub menus_str { 
	my ($html,$options) = @_;
	my @menus;
	my @final;
	my $hash = tables_str($html);
	my $n = 0;
	for(keys %$hash) {
		my $number = $_;
		my $text = $hash->{$_};
		if(is_menu_str($text)) {
			$n++;
			if($n == 1) {push(@menus,$text)} 
			else { 
				my $m = 0;
				my @a = @menus;
				my $bool = 0;
				for my $item (@a) {
					my $t2 = almost_eq($item,$text);
					if($t2) { 
						$menus[$m] = $t2; 
					} else {  if(!$bool) {push(@menus,$text); $bool++;} }
					$m++;
				}
			}
		}
	}
	\@menus;
}

sub java_menus {
	my ($url,$options) = @_;
	my $html = get($url);
	return java_menus_str($html,$options);
}

sub java_menus_str {
	my ($html,$options) = @_;
	my $html2;
	my @h = tables_str($html);
	my %handler = (
		'-html' => 1,
		'script' => sub {$html2.=toxml}
	);
	dtstring($html,%handler);
	for my $html3 (@h) {
		my $oc_l = 0;
		my %handler = (
			'-html' => 1,
			'-default' => sub {
				if(defined($v{'onclick'})) {
					$oc_l+=length(toxml);
				}
			}
		);
		dtstring($html3,%handler);
		if($oc_l / length($html3) > 0.5) {$html2.=$html3}
	}
		
	$html2;
}

sub title_detect {
	my ($url,$options) = @_;
	my $html = get($url);
	return title_detect_str($html,$options);
}

sub title_detect_str {
	my ($html,$options) = @_;
}

sub almost_eq { #verifica se 2 strings são 'muito' diferentes. O algoritmo é:
					#- conta numero de linhas de uma e outra string;
					#- faz um 'diff' entre os 2 ficehiros
					#- se (número de linhas que diferem)/(numero de linhas de uma das strings)
					#for maior que um ratio que pode ser especificado entao são almost_eq(uals)
	my ($str1,$str2,$options) = @_;
	my $amount;
	if(defined($options->{'ratio'})) {$amount=$options->{ratio};} else {$amount = 0.7;}
	my ($l1,$l2) = (line_count($str1),line_count($str2));
	my $rand1 = int(rand(50)); 
	my $rand2 = int(rand(50)); 
	my $file1 = "_file$rand1"; my $file2 = "_file$rand2"; 
	open(A,">$file1"); print A $str1; close A;
	open(A,">$file2"); print A $str2; close A;
	my $diff = `diff $file1 $file2`;
	system("rm","-f",$file1,$file2);
	my $changes;
	while($diff=~/(.*)\n/g) { if($1=~m/^(<|>)/) {$changes++;} }
	if(($changes/$l1)<0.2) { 
		if($l1<$l2) { return $str1 } 
		else { return $str2 }
	} else {return 0}
}

sub line_count {
	my $str = shift;
	my $wc = 1;
	while($str=~m/(.*?)\n/gis) {$wc++}
	$wc;
}

sub br_after_a { # Insere <br /> após cada <a>...</a>
	my ($url,$options) = @_;
	my $html = get($url);
	return br_after_a($html,$options);
}

sub br_after_a_str { # Insere <br /> após cada <a>...</a>
	my ($html,$options) = @_;
	my $enc;
	defined($options->{outputenc})? $enc = $options->{outputenc} : $enc = 'ISO-8859-1';
	my %handler = (
	    '-outputenc' => $enc,
		'-html' => 1,
		'-default' => sub{toxml;},
		'a' => sub{
			toxml."<br />";
		}
	);
	return dtstring($html,%handler);
}

sub alt_fill {
	my ($url,$options) = @_;
	my $html = get($url);
	return alt_fill_str($html,$options);
}

sub alt_fill_str {
	my ($html,$options) = @_;
	my $enc;
	defined($options->{outputenc})? $enc = $options->{outputenc} : $enc = 'ISO-8859-1';
	my %handler = (
	    '-outputenc' => $enc,
		'-html' => 1,
		'-default' => sub{toxml;},
		'img' => sub{
			if(!defined($v{alt}) || ($v{alt}!~m/.+/g)) {
				my $src = $v{src};
				$src=~s/(.+\/)*(.+?)(\..+)*/$2/g;
				$v{alt} = $src;
			}
			toxml;
		}
	);
	return dtstring($html,%handler);
}

sub links_str {
	my ($html,$options) = @_;
	my $enc;
	defined($options->{outputenc})? $enc = $options->{outputenc} : $enc = 'ISO-8859-1';
	my (%foo,$id);
	my %handler = (
	    '-outputenc' => $enc,
    	'-html' => 1,
    	'-default'   => sub{toxml;},
     	'a' => sub{
			$id++;
			$foo{$id} = toxml;
		});

	dtstring($html,%handler);
	return \%foo if !wantarray;
	my @foo;
	map {push(@foo,$foo{$_})} sort {$a <=> $b} keys %foo;
	return @foo if wantarray;
}

sub links {
    my ($url,$options) = @_;
    my $html = get($url);
    if(wantarray) {my @foo = links_str($html,$options); return @foo}
      else {my $hash = links_str($html,$options); return $hash;}
}

sub tables_str {
	my ($html,$options) = @_;
	my $enc;
	defined($options->{outputenc})? $enc = $options->{outputenc} : $enc = 'ISO-8859-1';
	my (%foo,$id);
	my %dt_handler1=(
	    '-outputenc' => $enc,
    	'-html'   => 1,
    	'-default'=> sub{toxml;},
    	'table'   => sub{
        	$id++;
        	$foo{$id} = toxml;
    	});
	
	dtstring($html,%dt_handler1);
	return \%foo if !wantarray;
	my @foo;
	map {push(@foo,$foo{$_})} sort {$a <=> $b} keys %foo;
	return @foo if wantarray;
}

sub tables {
	my ($url,$options) = @_;
	my $html = get($url);
	if(wantarray) {my @foo = tables_str($html); return @foo}
	  else {my $hash = tables_str($html); return $hash;}
}


# Preloaded methods go here.

1;
__END__
# Below is stub documentation for your module. You'd better edit it!

=head1 NAME

HTML::REng - Perl extension for reverse engineering of HTML documents

=head1 SYNOPSIS

  use HTML::REng;
  $hashLinks = links_relat($html);
    #   (url * text * name * length)*
    #   (HtmltextOfthelink --> (length|lengthOfTheXML --> int))
  

=head1 DESCRIPTION

B<links_relat()>

This function produces a report concerning the links (I<a> tags) found in an HTML
document. It returns (a reference to) an associative array containing the HTML
for each link and it's content size.
It also contains the shortest/largest content length of all the links and the
average length for all of them.

B<is_menu()>

This function 'tries' to detect whether an HTML document is a menu. It measures
the length of I<a> tags in comparison to the whole document length. It then
tests for a ratio that can be given as an option (default will be 0.7).

  $bool = is_menu($html,{ratio=>0.7});

I<$html> holds an HTML document.

=cut

B<alt_fill()>

  $html = alt_fill($html);

I<$html> holds an HTML document.

Wouldn't it be wonderfull if everyone filled the I<alt> option of the I<img>
tags? This function does it whenever the option is omitted or it's value is
blank. The heuristic adopted to fill is to extract it from the I<src> option.

Given the following html:

  <html>
  <head><title>HTML para testes</title></head>
  <body>
  <h1>My Bookmarks:</h1>
  <a href="http://natura.di.uminho.pt">Projecto Natura</a>
  <a href="http://sim.di.uminho.pt">Grupo de Software, Interacção e Multimédia</a>
  <img src="http://amcarvalho.dyndns.org/ab/image.png">
  </body>
  </html>

calling alt_fill_str will produce:

  <html>
  <head><title>HTML para testes</title></head>
  <body>
  <h1>My Bookmarks:</h1>
  <a href="http://natura.di.uminho.pt">Projecto Natura</a>
  <a href="http://sim.di.uminho.pt">Grupo de Software, Interacção e Multimédia</a>
  <img src="http://amcarvalho.dyndns.org/ab/image.png" alt="image">
  </body>
  </html>

as illustrated above, the function will fill the I<alt> value with the image filename.

B<alt_fill()>

  $html = alt_fill($url);

This function acts in similar way to alt_fill(), it will fetch
and process an HTML document from the URL given as argument.

B<links_str()>

  $hash  = links_str($html);
  @array = links_str($html);

I<$html> holds an HTML document.

Called in scalar context this function returns an associative array with
all the links on an HTML document. Keys will be numbers respecting links
order of appearance on the HTML document.

Called in array context this funtion returns an array with all the links 
on an HTML document. Elements in the array will be sorted to match links
appearance order on the HTML document.

Given the following html:

  $html = '<html>
  <h1>My Bookmarks:</h1>
  <a href="http://natura.di.uminho.pt">Projecto Natura</a><br/>
  <a href="http://sim.di.uminho.pt">Grupo de Soft., Inter. e Multim.</a>
  </html>';

calling links_str in both contexts will produce:

  my $hash = links_str($html);   

  $hash = {
    1 => '<a href="http://natura.di.uminho.pt">Projecto Natura</a><br/>',
    2 => '<a href="http://sim.di.uminho.pt">Grupo de Soft., Inter. e Multim.</a>'
  }

  @array = links_str($html);

  @array = (
    '<a href="http://natura.di.uminho.pt">Projecto Natura</a><br/>',
    '<a href="http://sim.di.uminho.pt">Grupo de Soft., Inter. e Multim.</a>'
  )

B<links()>

  $hash  = links($url);
  @array = links($url);

This function acts in similar way to links_str(), it will fetch
and process an HTML document from the URL given as argument.

B<tables_str()>

  $hash  = tables_str($html);
  @array = tables_str($html);

I<$html> holds an HTML document.

Called in scalar context this function returns an associative array with
all the tables on an HTML document. Keys will be numbers respecting tables 
order of appearance on the HTML document.

Called in array context this funtion returns an array with all the tables
on an HTML document. Elements in the array will be sorted to match tables
appearance order on the HTML document.

Given the following html:

  $html = "<html>
  <table>
  <tr> <th>Country</th>  <th>Population</th> </tr>
  <tr> <td>Portugal</td>  <td>11000000</td>  </tr>
  <tr>  <td>Brasil</td>  <td>184101109</td>  </tr>
  </table>

  <table>
  <tr> <th>Animal</th>  <th>Wings</th> </tr>
  <tr>   <td>Cat</td>    <td>No</td>   </tr>
  <tr> <td>Swallow</td> <td>Yes</td>   </tr>
  </table>

  </html>"

calling tables_str in both contexts will produce:

  my $hash = tables_str($html);   @array = tables_str($html);

  $hash ={                        @array =(
    1 => "<table>                    "<table>
    <tr><th>Country</th><th>         <tr><th>Country</th><th>
    Population</th></tr>             <tr><th>Country</th><th>
    <tr> <td>Portugal</td>           <tr><td>Portugal</td>
    <td>11000000</td></tr>           <td>11000000</td></tr>
    ... ",                           ...",
    2 => "<table>                    "<table>
    <tr><th>Animal</th>              <tr><th>Animal</th>
    <th>Wings</th></tr>              <th>Wings</th></tr>
    <tr><td>Cat</td>                 <tr><td>Cat</td>
    ... "                            ..."
  }                                 )

B<tables()>

  $hash  = tables($url);
  @array = tables($url);

This function acts in similar way to tables_str(), it will fetch 
and process an HTML document from the URL given as argument.

=head1 SEE ALSO

Mention other useful documentation such as the documentation of
related modules or operating system documentation (such as man pages
in UNIX), or any relevant external documentation such as RFCs or
standards.

If you have a mailing list set up for your module, mention it here.

If you have a web site set up for your module, mention it here.

=head1 AUTHOR

Alexandre Martins de Carvalho, E<lt>alexandre@localdomainE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright 2005 by Alexandre Martins de Carvalho

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself. 

=cut
