#!/usr/bin/perl -w -s use XML::DT; use Data::Dumper; use File::Temp; use Term::ReadLine; #use locale; use strict; our ($latin1,$html,$show_att,$expand_att_id,$class); our ($lines,$t,$shell); my (@files)=@ARGV; @ARGV=(); mkxmltypes (@files); sub mkxmltypes { my %type=(); my @files = @_; my %root = (); my %att=(); my %dom=(); my %ele=(); my %elel=(); my %atl=(); my %handler=( # '-outputenc' => 'ISO-8859-1', '-default' => sub{ $c =~ s/,$//; if(not $class){ push(@{$type{$q}}, (eval("[$c]") || "?$c")); $elel{$q}++; if(ctxt(1)){ $ele{ctxt(1)}{$q} ++;} else { $root{$q}++} for(keys(%v)){ $atl{$_}++; $att{$q}{$_}{tipo($v{$_})||"_str"} ++ ; $dom{$q}{$_}{$v{$_}} ++ } "'$q',"; } else{ my $qcl=$q; if($v{class}){$qcl .="+$v{class}"} if($v{id} ){$qcl .="+$v{id}"} push(@{$type{$qcl}}, (eval("[$c]") || "?$c")); $elel{$qcl}++; if(ctxt(1)){ my $fcl=ctxt(1); if(father->{class}){$fcl .="+".father->{class}} if(father->{id} ){$fcl .="+".father->{id}} $ele{$fcl}{$qcl} ++;} else { $root{$qcl}++} for(keys(%v)){ next if ($_ eq "class"); next if ($_ eq "id"); $atl{$_}++; $att{$qcl}{$_}{tipo($v{$_})||"_str"} ++ ; $dom{$qcl}{$_}{$v{$_}} ++ } "'$qcl',"; } }, '-pcdata' => sub{ if ($c =~ /[^ \t\n]/){ $ele{ctxt(1)}{"#PCDATA"}=1; "'#PCDATA'," } else {""}}, ); if ($html) { $handler{'-html'} = 1;} if($latin1) { $handler{'-inputenc'}='ISO-8859-1';} for my $fname (@files){ if($lines){ my $tmpfile = File::Temp->new( UNLINK => 0 )->filename; system("head -$lines $fname | xmllint --recover - > $tmpfile"); $fname = $tmpfile; } dt($fname,%handler); unlink($fname) if $lines; } ### print "DEBUG",Dumper(\%att,\%ele,\%dom,\%atl); print "# ", join(" ",keys %root)," ...", scalar(localtime(time)) ,"\n"; my %resumofinal=(); for (keys %type){ my @tipo=(); for my $lista (@{$type{$_}}){ push (@tipo, processa($lista)) } $resumofinal{$_}=resumele(processa2([@tipo])).resumeatts($att{$_}); } if($shell){ shell($t,\%root,\%ele,\%att,\%dom,\%resumofinal,\%atl); } else{ pprint(\%resumofinal,ordem(\%ele,(($t) ||(keys %root) ))); } } sub shell{ my ($t,$root,$ele,$att,$dom,$resumofinal,$atl) = @_; my $last=(keys %$root)[0]; my $elepat = q{[\w:]+}; my $max = 10; my $term = new Term::ReadLine 'sample'; my $tas = $term->Attribs; $tas->{completion_entry_function}= $tas->{list_completion_function}; $tas->{completion_word} = [ keys(%$ele), keys(%$atl) ]; pprint($resumofinal,ordem($ele,(($t) ||(keys %$root) ))); while ( defined ($_ = $term->readline("\npfs> ")) ) { chomp(); $term->addhistory($_) if /\S/; s/^\s*(.*?)\s*$/$1/; if(/($elepat)\[\@?($elepat)\]/){ print resumeatt($att->{$1}{$2},$dom->{$1}{$2},$max); $last = $1} elsif(/\!max\s*=?\s*(\d+)/){$max=$1;} elsif(/\.($elepat)/){ print resumeatt($att->{$last}{$1},$dom->{$last}{$1},$max);} elsif(!$_ or defined $ele->{$_}) { $last=$_; pprint($resumofinal,ordem($ele,(($_) ||(keys %$root) ))); } else{ for my $e (keys %$att){ for my $a (keys %{$att->{$e}}){ print "$e($a):", resumeatt($att->{$e}{$a},$dom->{$e}{$a},$max) if($a eq $_) } } } } } sub ordem{ my ($rel,@st)=@_; my @r=(); my %visited = ('#PCDATA' => 1); while(@st){ my $next = shift(@st); next if $visited{$next}; push(@r,$next); $visited{$next} = 1; push(@st, (grep {! $visited{$_}} (keys %{$rel->{$next}}))); } \@r; } sub pprint{ my $r = shift; my $order = shift; for (@$order){ print "$_ \t=> $r->{$_}\n";} } sub resumeatts{ my $a=shift; my $r=""; for (keys(%{$a})) { if($expand_att_id){ $r .= "\n\t\t * $_:(".join(",",keys %{$a->{$_}}) . ")" } else { $r .= " * $_" } } $r } sub resumeatt{ my $a=shift; my $d=shift; my $max = shift(@_) || 10; my $r= join("|",keys %{$a}) ; my @domact = (grep {defined $_} ((keys %{$d}))[0..$max]); $domact[$max] = '...' if $domact[$max]; $r . " = {". join(",",@domact) . "}\n"; } sub processa{ my $a=shift; if( @$a == 0 ) { +{ _isa =>"empty"} } elsif( @$a == 1 && $a->[0] eq '#PCDATA') { +{ _isa =>"text" ,$a->[0] =>[1,1]} } elsif( @$a == 1 ) { +{ _isa =>"singleton",$a->[0] =>[1,1]} } else{ my %f = (); for (@$a){$f{$_}[0]++,$f{$_}[1]++} my $dif = scalar keys %f; if($dif == 1) { +{ _isa =>"seq", %f} ; } elsif($dif == @$a) { +{ _isa =>"tup", %f}; } elsif($f{'#PCDATA'}){ +{ _isa =>"mixed", %f }; } else { +{ _isa =>"mtup", %f } } } } sub processa2{ my $a=shift; if ( @$a == 0 ) { die("no sons????") } elsif( @$a == 1 ) { $a->[0] } else{ my %f = (); my %maybe = (); for (@$a){$f{sons2str($_)}++; $maybe{$_->{_isa}}++ } my $dif = scalar keys %f; if ($dif == 1) { $a->[0]; } elsif($maybe{mixed} || $maybe{text}){ +{%{join_sons($a)}, _isa=> "mixed"} } else { my %s= %{join_sons($a)}; if(keys %s == 1) { +{%s, _isa => "seq"}} else { +{%s, _isa => "mtup"} } } } } sub resumele{ my $a=shift; ## print Dumper($a); my $i = $a->{_isa}; delete $a->{_isa}; if ($i eq "text") {"text"} elsif ($i eq "empty") {"empty"} elsif ($i eq "singleton") {join(", ", keys %{$a}) } elsif ($i eq "mixed") {delete $a->{'#PCDATA'}; if(keys %{$a}){ "mixed(".join(", ", keys %{$a}).")"} else {"text"} } elsif ($i eq "tup") {"tup(".join(", ", keys %{$a}).")"} elsif ($i eq "seq") {"seq(".join(", ", keys %{$a}).")"} else { my $r= "mtup("; for(sort keys %$a){ $r .= "$_, " if ( $a->{$_}[0] == 1 && $a->{$_}[1] == 1 ); $r .= "$_?, " if ( $a->{$_}[0] == 0 && $a->{$_}[1] == 1 ); $r .= "$_*, " if ( $a->{$_}[0] == 0 && $a->{$_}[1] > 1 ); $r .= "$_+, " if ( $a->{$_}[0] > 0 && $a->{$_}[1] > 1 ); $r .= Dumper($a) if($r =~ /\($/ ); } $r =~ s/, $//; $r.=")"; } } sub join_sons{ my $a = shift; my %final = ( map { ($_ => [$a->[0]{$_}[0], $a->[0]{$_}[1]])} grep {$_ ne "_isa"} keys %{$a->[0]}); my %todas=(); for (@$a){ my @novas=keys %{$_}; @todas{@novas}= @novas; for my $k (keys %todas){ next if $k =~ /_isa/; $final{$k}[0]=0 unless $final{$k}[0]; unless (exists $_->{$k}){ $final{$k}[0]=0; next} $final{$k}[1]=$_->{$k}[1] if $_->{$k}[1] > ($final{$k}[1] || 0); $final{$k}[0]=$_->{$k}[0] if $_->{$k}[0] < $final{$k}[0]; } } \%final } sub sons2str{ my $a = shift; join(' ',($a->{_isa},map { $_ . ($a->{$_}[0]==1 ? "" : "+") } grep {$_ ne "_isa"} sort keys %$a)); } sub tipo{ my $a=shift; for ($a){ if(/^\s*\d+\s*$/) {return "_int" } elsif(m{^\s*(https?|ftp|file)://\w[~&=?\w:/.-]+\s*$}i){return "_url" } elsif(/^\s*\d+\.\d+\s*$/) {return "_real" } elsif(/^\w+$/) {return "_id" } elsif(m{^\s*[\w.-]+\@\w[\w_:/.-]+\s*$}) {return "_email" } else {return undef } } } __END__ =head1 NAME mkxmltype - Make XML analysis using XML::DT =head1 SYNOPSIS mkxmltype <xmlfile> =head1 DESCRIPTION This command tries to infer DTD and Camlila-like types for a specific XML file; =head1 Options -latin1 input file encoding is forced to be latin1 -html uses html (libxml2) parser -show_att Show attribute values -expand_att_id -lines=20000 just reads the first 20000 lines of the XML file -t -class '<div class="a" id="b">'... is treated as 'div+a+b' -shell Enter interactive shell mode =head1 SEE ALSO XML::DT(1), mkdtskel(1), mkdtdskel and perl(1) =cut