package XML::DT; BEGIN{ use XML::Parser; use Exporter (); use vars qw($c %v $q @dtcontext @dtatributes ); eval "use bytes"; if (my $m = $INC{"bytes.pm"}) {require bytes; import bytes;} @ISA=qw(Exporter); @EXPORT=qw(dt dtstring inctxt ctxt mkdtskel mkdtdskel toxml MMAPON $c %v $q @dtcontext @dtatributes ); $VERSION = '0.16'; } =head1 NAME XML::DT - a package for down translation of XML to strings =head1 SYNOPSIS use XML::DT; %xml=( 'music' => sub{"Music from: $c\n"}, 'lyrics' => sub{"Lyrics from:$c\n (the value of attribute IN is:$v{IN}\n)"}, 'title' => sub{ uc($c) }, '-default' => sub{"$q:$c"}, '-outputenc' => 'ISO-8859-1'); print dt($filename,%xml); print dtstring(" Vejam Bem Zeca Afonso ",%xml); inctxt('music/lyrics') inctxt('music.*') ctxt(1) /* the father element */ mkdtskel($file) mkdtdskel($file) =head1 DESCRIPTION This module processes XML files with an approach similar to OMNIMARK. Down translation function C
receives a filename and a set of expressions (functions) defining the processing and associated values for each element. C is similar but takes input from a string instead of a file. =head2 C function C is true if the actual element path matches the provided pattern. This function is meant to be used in the element functions in order to achieve context dependent processing. =head2 User provided element processing functions The user must provide an HASH with a function for each element, that computes element output. Functions can use the element name C<$q>, the element content C<$c> and the attribute values hash C<%v>. All those global variables are defined in C<$CALLER::>. Each time an element is find the associated function is called. Content is calculated by concatenation of element contents strings and interior elements return values. =head2 C<-default> function When a element has no associated function, the function associated with C<-default> called. If no C<-default> function is defined the default function returns a XML like string for the element. =head2 C<-outputenc> option C<-outputenc> defines the output encoding (default is Unicode UTF8). =head2 C<-inputenc> option C<-inputenc> forces a input encoding type. Whenever that is possible, define the input encoding in the XML file: =head2 C<-pcdata> function C<-pcdata> function is used to define transformation over the contents. Typically this function should look at context (see C function) The default C<-pcdata> function is the identity =head2 C<-begin> function Function to be executed before processing XML file. Example of use: initialization of side-effect variables =head2 C<-end> function Function to be executed after processing XML file. I can use C<$c> content value. The value returned by C<-end> will be the C
return value. Example of use: post-processing of returned contents =head2 C function This is the default "-default" function. It can be used to generate xml based on C<$c> C<$q> and C<%v> variables. Example: add a new attribute to element C without changing it: %handler=( ... ele1 => sub { $v{at1} = "v1"; toxml(); }, ) =head1 Elements with values other than strings (C<-type>) By default all elements return strings, and contents (C<$c>) is the concatenation of the strings returned by the sub-elements. In some situations the XML text contains values that are better processed as a structured type. The following types (functors) are available: STR -> concatenates all the subelements returned values (DEFAULT) all the subelement should return strings to be concatenated SEQ -> makes an ARRAY with all the sub elements contents; attributes are ignored (they should be processed in the subelement). (returns a ref) SEQH -> makes an ARRAY of HASH with all the sub elements (returns a ref); for each subelement: -q => element name -c => contents at1 => at value1 for each attribute MAP -> makes an HASH with the sub elements; keys are the sub-element names, values are their contents. Attributes are ignored. (they should be processed in the subelement) (returns a ref) MULTIMAP -> makes an HASH of ARRAY; keys are the sub-element names; values are lists of contents; attributes are ignored (they should be processed in the subelement); (returns a ref) MMAPON(elementlist) -> makes an HASH with the subelements; keys are the sub-element names, values are their contents; attributes are ignored (they should be processed in the subelement); for all the elements contained in the elementlist, it is created an ARRAY with their contents. (returns a ref) =head2 An example: use XML::DT; %handler = ( contacts => sub{ [ split(";",$c)] }, -default => sub{$c}, -type => { institution => 'MAP', degrees => MMAPON('name') tels => 'SEQ' } ); $a = dt ("f.xml", %handler); with the following f.xml U.M. University of Minho 1111 1112 1113 Portugal J.Joao; J.Rocha; J.Ramalho Computer science Informatica history would make $a { 'name' => [ 'Computer science', 'Informatica ', ' history ' ], 'institution' => { 'tels' => [ 1111, 1112, 1113 ], 'name' => 'University of Minho', 'where' => 'Portugal', 'id' => 'U.M.', 'contacts' => [ 'J.Joao', ' J.Rocha', ' J.Ramalho' ] } }; =head1 DT Skeleton generation It is possible to build an initial processor program based on an example To do this use the function C. Example: perl -MXML::DT -e 'mkdtskel "f.xml"' > f.pl =head1 DTD skeleton generation It makes a naive DTD based on an example(s). To do this use the function C. Example: perl -MXML::DT -e 'mkdtdskel "f.xml"' > f.dtd =head1 BUGS This section is out of date... =head1 Author Jose Joao, jj@di.uminho.pt http://www.di.uminho.pt/~jj/perl/XML/ thanks to Michel Rodriguez José Carlos Ramanlho =cut %ty=(); sub dt { my ($file,%xml)=@_; %ty=(); %ty=(%{$xml{'-type'}}) if defined($xml{'-type'}); $ty{-ROOT} = "NONE"; # create a new XML::Parser instance using Tree Style if (defined($xml{-inputenc}) && ($xml{-inputenc} eq 'ISO-8859-1')){ $parser = new XML::Parser(Style => 'Tree', ErrorContext => 2 , ProtocolEncoding => 'ISO-8859-1'); } else { $parser = new XML::Parser(Style => 'Tree', ErrorContext => 2 , ); } #execute Begin action if she exists if ($xml{-begin}){ &{$xml{-begin}} } # Convert XML to Perl code $tree = $parser->parsefile($file); if($xml{-end}){ $c= omni("-ROOT",\%xml,@$tree); &{$xml{-end}} } else { omni("-ROOT",\%xml,@$tree) } } sub ctxt { my $level = $_[0]; $dtcontext[-$level-1]; } sub inctxt { my $pattern = shift ; join("/",@dtcontext) =~ m!$pattern/[^/]*$! ; } sub dtstring { my ($string,%xml)=@_; %ty=(%{$xml{'-type'}}, -ROOT => "NONE"); # create a new XML::Parser instance using Tree Style if (defined($xml{-inputenc}) && ($xml{-inputenc} eq 'ISO-8859-1')){ $parser = new XML::Parser(Style => 'Tree', ErrorContext => 2 , ProtocolEncoding => 'ISO-8859-1'); } else { $parser = new XML::Parser(Style => 'Tree', ErrorContext => 2 , ); } #execute Begin action if she exists if ($xml{-begin}){ &{$xml{-begin}} } # Convert XML to Perl code (Tree) $tree = $parser->parse($string); if($xml{-end}){ $c= omni("-ROOT",\%xml,@$tree); &{$xml{-end}} } else { omni("-ROOT",\%xml,@$tree) } } sub omni{ my ($par,$xml,@l) = @_; my $type = $ty{$par} || "STR"; my %typeargs=(); if(ref($type) eq "mmapon"){ for(@$type){$typeargs{$_}=1;} $type = "MMAPON"; }; my $r ; if( $type eq 'STR') { $r = "" } elsif( $type eq 'SEQ' or $type eq "ARRAY") { $r = [] } elsif( $type eq 'SEQH' or $type eq "ARRAYOFHASH") { $r = [] } elsif( $type eq 'MAP' or $type eq "HASH") { $r = {} } elsif( $type eq 'MULTIMAP') { $r = {} } elsif( $type eq 'MMAPON' or $type eq "HASHOFARRAY") { $r = {} } elsif( $type eq 'NONE') { $r = "" } my ($name, $val, @val,$atr, $aux); while( @l) { ($name, $val, @l) = @l; if ($name eq "0"){ $name="-pcdata"; $aux= (defined($xml->{-outputenc}) && $xml->{-outputenc} eq 'ISO-8859-1') ?lat1::utf8($val): $val ; if(defined $xml->{-pcdata}) { push(@dtcontext,"-pcdata"); $c=$aux; $aux=&{$xml->{-pcdata}}; pop(@dtcontext); } } else {($atr,@val) = @$val; push(@dtcontext,$name); unshift(@dtatributes,$atr); $aux = omniele($xml, $name, omni($name,$xml,@val), $atr); shift(@dtatributes); pop(@dtcontext); } if ($type eq "STR"){ $r .= $aux ;} elsif($type eq "SEQ" or $type eq "ARRAY"){ push(@$r,$aux) unless whitepc($aux,$name);} elsif($type eq "SEQH" or $type eq "ARRAYHASH"){ push(@$r,{"-c" => $aux, "-q" => $name, %$atr }) unless whitepc($aux,$name);} elsif($type eq "MMAPON"){ if(not whitepc($aux,$name)){ if(! $typeargs{$name}) { warn "duplicated tag ´$name´\n" if(defined($r->{$name})); $r->{$name}=$aux } else { push(@{$r->{$name}},$aux) unless whitepc($aux,$name)}} } elsif($type eq "MAP" or $type eq "HASH"){ if(not whitepc($aux,$name)){ warn "duplicated tag ´$name´\n" if(defined($r->{$name})); $r->{$name}=$aux }} elsif($type eq "MULTIMAP"){ push(@{$r->{$name}},$aux) unless whitepc($aux,$name)} elsif($type eq "NONE"){ $r=$aux;} else { $r="undefined type !!!"} } $r; } sub omniele { my $xml=shift; my $aux; ($q,$c,$aux)=@_; %v=%$aux; if (defined($xml->{-outputenc}) && $xml->{-outputenc} eq 'ISO-8859-1'){ for (keys %v){ $v{$_} = lat1::utf8($v{$_}) ; } } if (defined $xml->{$q}) {&{$xml->{$q}} } elsif(defined $xml->{'-default'}) {&{$xml->{'-default'} }} else {toxml();} } sub toxmlp { my($q,$v,$c ) = @_; if($q eq "-pcdata") { $c} else {"<$q". join("",map {" $_=\"$v->{$_}\""} keys %$v ) . ">$c" } } sub toxml { my %at=(seqitem => "item", @_); if(not ref($c)){ toxmlp($q,\%v,$c)} elsif (ref($c) eq "ARRAY") { if($ty{$q} eq "SEQH") { toxmlp($q,{}, join("",map {my %a=%$_; delete @a{"-q","-c"}; toxmlp($_->{-q},\%a,$_->{-c}) } @{$c} )) } else { toxmlp($q,\%v, join("",map {"<$at{seqitem}>$_\n" } @{$c} ) )} } elsif (ref($c) eq "HASH") { "<$q". join("",map {" $_=\"$v{$_}\""} keys %v ) . ">" . join("",map {($_ ne "-pcdata") ? ( (ref($c->{$_}) eq "ARRAY") ? "<$_>". join("\n<$_>", @{$c->{$_}}). "\n" : "<$_>$c->{$_}\n" ) : () } keys %{$c} ) . "$c->{-pcdata}" } } %mkdtskel=( '-default' => sub{$element{$q}=1; for (keys %v){$att{$q}{$_}=1 }; ""}, '-end' => sub{ print <<'END'; #!/usr/bin/perl use XML::DT ; my $filename = shift; %handler=( # '-outputenc' => 'ISO-8859-1', # '-default' => sub{"<$q>$c"}, END for $name (keys %element){ print " '$name' => sub{\"\$q:\$c\"},"; print '# remember $v{', join('},$v{',keys %{$att{$name}}), '}' if $att{$name}; print "\n"; } print <<'END'; ); print dt($filename,%handler); END }); sub mkdtskel{ dt(shift,%mkdtskel)} sub mkdtdskel { my @files = @_; my %handler=( '-outputenc' => 'ISO-8859-1', '-default' => sub{ $elel{$q}=1; $root = $q unless ctxt(1); $ele{ctxt(1)}{$q} = 1; for(keys(%v)){$att{$q}{$_} = 1 } ; }, '-pcdata' => sub{ if ($c =~ /[^ \t\n]/){ $ele{ctxt(1)}{"#PCDATA"}=1 }}, ); for $filename (@files){ dt($filename,%handler); } print "\n\n"; delete $elel{$root}; for ($root, keys %elel){ putele($_); for $name (keys(%{$att{$_}})) { print( "\t\n"); print( "\t\n"); } } } sub putele { my $e = shift; my @f ; if($ele{$e}){ @f = keys %{$ele{$e}}; print "\n"; } else { print "\n";} } sub whitepc{ $_[1] eq '-pcdata' and $_[0] =~ /^[ \t\n]*$/ }; sub MMAPON{ bless([@_],"mmapon") }; sub SEQOF{ bless([@_],"seqof") }; package lat1; =head1 NAME C - module for unicode utf8 to latin1 translation =head1 SYNOPSIS $latin1string = lat1::utf8($utf8string) =head1 Bugs Translating the latin1 subset of unicode utf8 is very simples and needs no tables. If you need more complex translation, see the perl modules about unicode and the C command. =cut sub utf8{ my $t=shift; $t =~ s/([ÃÂ])(.)/ $1 eq "Ã" ? chr( ord($2) | 0100): $2 /ge; $t; } #sub utf8{ # my $t=shift; # if($] >= 5.006){$t =~ tr/\200-\377//UC;} # else {$t =~ s/([ÃÂ])(.)/ $1 eq "Ã" ? chr( ord($2) | 0100): $2 /ge;} # $t; #} 1;