#!/usr/bin/perl -s #!/usr/contrib/bin/perl -s # option -rc -- remove latex like comments (%...\n) # option -rv -- remove verb and begin{verbatim} keywords # option -ne -- no eval :disable the perl comand # This is used by revert_to_raw_tex sub revert_verbatim { # Modifies $_ s/$verbatim_mark(verbatim_)(\d+)/$verbatim[$2]/go; if($rv){ s/$verbatim_mark(verbatim)(\d+)/$verbatim[$2]/go;} else { s/$verbatim_mark(verbatim)(\d+)/\\begin{verbatim}$verbatim[$2]\\end{verbatim}/go; } } sub revert_verb { # Modifies $_ if($rv){ s/$verb_mark(\d+)/$verb[$1]/go;} else { s/$verb_mark(\d+)/\\verb$verb_delim[$1]$verb[$1]$verb_delim[$1]/go;} s/$verb_mark2(\d+)/$verb[$1]/go; } ####################### Processing Meta Commands ############################ # The special commands (newcommand, newenvironment etc.) # must be processed before translating their arguments, # and before we cut up the document into sections # (there might be sectioning commands in the new definitions etc.). sub substitute_meta_cmds { local ($next_def); local ($cmd, $argn, $body, $before, $after, $new_cmd_rx, $new_env_rx); &tokenize($meta_cmd_rx); # Inserts a space after meta commands ... while (/$meta_cmd_rx /o) { # ... and uses the space ($before, $cmd, $after) = ($`, $1, $'); $next_def = "\\$cmd"; local($cmd_sub) = "get_body_$cmd"; $_ = join('',$before, &$cmd_sub($after)); } # All the definitions bodies # are stored in %new_command and %new_environment # # Now substitute the new commands and environments: # (must do them all together because of cross definitions) ($new_cmd_rx, $new_env_rx) = (&make_new_cmd_rx, &make_new_env_rx); do { while (($cmd, $code) = each %new_command) { $new_command{$cmd} = &expand_code($code);} while (($cmd, $code) = each %new_environment) { $new_environment{$cmd} = &expand_code($code);} &tokenize($new_cmd_rx); # Inserts a space after the new commands ... if ($new_cmd_rx) { while (/$new_cmd_rx /o && (($before, $cmd, $after) = ($`, $1, $'))) { $_ = join('',$before, &substitute_newcmd);} } if ($new_env_rx) { while (/$new_env_rx/o && (($before, $cmd, $after) = ($`, $2, $'))) { $_ = join('',$before, &substitute_newenv);}} } if (%new_command || %new_environment); $_; } sub expand_code { local($_) = @_; # Uses $new_cmd_rx and $new_env_rx set in the caller if ($new_cmd_rx eq "0") { $new_cmd_rx = "<<{this cant possibly match}>>"; } if ($new_env_rx eq "0") { $new_env_rx = "<<{this cant possibly match}>>"; } local($cmd, $before, $after); &tokenize($new_cmd_rx); # Inserts a space after the new commands ... while ((/$new_cmd_rx /o && (($before, $cmd, $after) = ($`, $1, $'))) || (/$new_env_rx/o && (($before, $cmd, $after) = ($`, $2, $')))) { if ($new_command{$cmd}) { # We have a command $_ = join('',$before, &substitute_newcmd); } else { $_ = join('',$before, &substitute_newenv); } &tokenize($new_cmd_rx); # Must do it for any newly inserted code $cmd = 0; } $_; } # Removes the definition from the input string, adds to the preamble # and stores the body in %new_command; sub get_body_newcommand { local($_) = @_; local($argn,$cmd,$body,$tmp); $cmd = &get_next(1); # Get command name $cmd =~ s/^\s*\\//; $argn = &get_next(0); # Get optional no. of args $argn = 0 unless $argn; # Get the body of the code and store it with the name and number of args # UNLESS THE COMMAND IS ALREADY DEFINED $body = &get_next(1); $tmp = "do_cmd_$cmd"; $new_command{$cmd} = join(':!:',$argn,$body) unless (defined &$tmp); $_; } # Like get_body_newcommand above, but for simple raw TeX \defs sub get_body_def { local($_) = @_; local($argn,$cmd,$body,$is_simple_def,$tmp); $cmd = &get_next(2); $cmd =~ s/^\s*\\//; $argn = &get_next(3); $argn = 0 unless $argn; $body = &get_next(1); $tmp = "do_cmd_$cmd"; if ($is_simple_def && !defined (&$tmp)) { $new_command{$cmd} = join(':!:',$argn,$body); } $_; } # Removes the definition from the input string, adds to the preamble # and stores the body in %new_environment; sub get_body_newenvironment { local($_) = @_; local($argn,$env,$begin,$end,$tmp); $env = &get_next(1); # Get the environment name $env =~ s/^\s*\\//; $argn = &get_next(0); # Get optional no. of args $argn = 0 unless $argn; # Get the body of the code and store it with the name and number of args # UNLESS THE COMMAND IS ALREADY DEFINED (see get_body_newcommand) $tmp = "do_env_$env"; $begin = &get_next(1); $end = &get_next(1); $new_environment{$env} = join(':!:', $argn, $begin, $end) unless defined &$tmp; $_; } sub get_body_renewcommand { &get_body_newcommand(@_[0]); } sub get_body_renewenvironment { &get_body_newenvironment(@_[0]); } sub substitute_newcmd { # Modifies $cmd and $after in the caller # Get the body from the new_command array local($argn, $_) = split(/:!:/, $new_command{$cmd}); local($arg); foreach $i (1..$argn) { $arg = $undef_mark; $after =~ s/$next_pair_rx/$arg = $2;''/eo; # Get the next argument # Next argument may not be in braces - get next character - ARGG! $after =~ s/\s*(.)/$arg = $1;''/eo if ($arg eq $undef_mark); s/\#$i/$arg/g;} # Substitute the arguments in the body # Make the body unique (give unique id's to the brackets), # translate, and return it $_ = &revert_to_raw_tex($_); &mark_string; join('',$_,$after); } sub substitute_newenv { # Modifies $cmd and $after in the caller # Get the body from the new_environment array local($argn, $begdef, $enddef) = split(/:!:/, $new_environment{$cmd}); local($arg,$new_def_rx); # Note that latex allows argument substitution only in the # \begin part of the new definition local($_) = $begdef; foreach $i (1..$argn) { $after =~ s/$next_pair_rx/$arg = $2;''/eo; # Get the next argument s/\#$i/$arg/g;} # Substitute the arguments in the body # Make the body unique (Give unique id's to the brackets), # translate, and return it $_ = &revert_to_raw_tex($_); $begdef = &mark_string; # Make unique # Now substitute the \end part: $_ = &revert_to_raw_tex($enddef); $result_str = ''; $enddef = &mark_string; # Make unique $new_def_rx = &make_end_env_rx($cmd); $after =~ s/$new_def_rx/$enddef/; join('',$begdef,$after); } # Modifies $_ in the caller and as a side-effect it modifies $next_def # which is local to substitute_meta_cmds sub get_next { local($what) = @_; local($next, $pat, $tmp); if ($what == 1) { ($next, $tmp, $pat) = &get_next_argument;} elsif ($what == 2) { ($next, $pat) = &get_next_tex_cmd;} elsif ($what == 3) { ($next, $pat) = &get_next_def_arg;} else { ($next, $pat) = &get_next_optional_argument;} $next_def .= &revert_to_raw_tex($pat) if $pat; $next; } # The following get_next_ ARE ALL DESTRUCTIVE. sub get_next_argument { local($next, $br_id, $pat); s/$next_pair_rx/$br_id=$1;$next=$2;$pat=$&;''/eo; ($next, $br_id, $pat); } sub get_next_pair_or_char_pr { local($next, $br_id, $pat); if ( (/^\s*([\w])/o && (! $`))) { ($next, $pat) = ($1, $&) } elsif ( /$next_pair_pr_rx/o && (! $`)) { ($next, $br_id, $pat) = ($2, $1, $&) }; s/$pat// if $pat; ($next, $br_id, $pat); } sub get_next_optional_argument { local($next, $pat); s/$optional_arg_rx/$next=$1;$pat=$&;''/eo if (/\s*[[]/ && (! $`)); # if the first character is a [ # (/^[]/ does not work because it may match the beginning of ANY line s/^\s*\[\]//g unless $pat; # This is not picked by $optional_arg_rx ($next, $pat); } sub get_next_tex_cmd { local($next, $pat); s/$single_cmd_rx/$next = $1; $pat=$&; ''/eo; ($next, $pat); } sub get_next_def_arg { local($next, $pat); # Sets is_simple_def for caller. Start by turning it off, then # turn it on if we find one of the "simple" patterns. # This has got to be hit-or-miss to an extent, given the # thoroughly incestuous relationship between the TeX macroprocessor # ('mouth') and typesetting back-end ('stomach'). Anything which # even does catcode hacking is going to lose BAD. $is_simple_def = 0; # no arguments if (/^$O/ && (! $`)) { $next=0; $pat=''; $is_simple_def=1; $O } # 'simple' arguments if (! $is_simple_def && /$tex_def_arg_rx/o && (! $`)) { s/$tex_def_arg_rx/$next=$1; $pat=$&; $is_simple_def=1; $O/eo; } # MESSY arguments if (! $is_simple_def) { print "Arguments to $cmd are too complex ...\n"; print "It will not be processed unless used in another environment\n"; print "which is passed to LaTeX whole for processing.\n"; s/^[^<]*(<[^<]+)*<) { $result_str .= $_;}; $_ = $result_str; # undef $result_str; # $_; } sub initialise { ############################ Global variables ############################### $* = 1; # Enable multi-line patterns ($O , $C, $OP, $CP) = ('<<' , '>>', '<#', '#>'); # Open/Close Markers $name = 0; # Used in the HREF NAME= field $wrap_toggle = 'end'; $delim = '%:%'; # Delimits items of sectioning information # stored in a string $verbatim_mark = ''; $verb_mark = ''; $verb_mark2 = ''; $undef_mark = ''; # Commands which need to be passed, ALONG WITH THEIR ARGUMENTS, to TeX. # Note that this means that the arguments should *not* be translated, # This is handled by wrapping the commands in the dummy tex2html_wrap # environment before translation begins ... # Note that this code squeezes spaces out of the args of psfig; # that's what the last round did ... ################### Frequently used regular expressions ################### $include_line_rx = "^\\s*\\\\(include|input)\\W[^\n]*\n"; $delimiters = '\'\\s[\\\\<>(=).,#;:~\/!-'; $delimiter_rx = "([$delimiters])"; # $1 : br_id # $2 : $begin_env_rx = "[\\\\]begin\\s*$O(\\d+)$C\\s*([^$delimiters]+)\\s*$O\\1$C\\s*"; $match_br_rx = "\\s*$O\\d+$C\\s*"; $optional_arg_rx = "^\\s*\\[([^]]+)\\]"; # Cannot handle nested []s! # $1 : br_id $begin_cmd_rx = "$O(\\d+)$C"; # $1 : largest argument number $tex_def_arg_rx = "^[#0-9]*#([0-9])$O"; # $1 : declaration or command or newline (\\) $cmd_delims = q|-#,.~/\'`^"=|; # Commands which are also delimiters! $single_cmd_rx = "\\\\([$cmd_delims]|[^$delimiters]+|\\\\)"; # Matches a pair of matching brackets # $1 : br_id # $2 : contents $next_pair_rx = "^\\s*$O(\\d+)$C([\\s\\S]*)$O\\1$C"; $any_next_pair_rx = "$O(\\d+)$C([\\s\\S]*)$O\\1$C"; # Matches a pair of matching brackets # USING PROCESSED DELIMITERS; # (the delimiters are processed during command translation) # $1 : br_id # $2 : contents $next_pair_pr_rx = "^\\s*$OP(\\d+)$CP([\\s\\S]*)$OP\\1$CP"; $any_next_pair_pr_rx = "$OP(\\d+)$CP([\\s\\S]*)$OP\\1$CP"; $meta_cmd_rx = '[\\\\](renewcommand|renewenvironment|newcommand|newenvironment|def)'; # Matches environments that should not be touched during the translation $verbatim_env_rx = "\\s*{(verbatim_?)[*]?}"; } &driver; sub driver { local($FILE, $texfilepath, %unknown_commands); &initialise; # Initialise some global variablesb local($max_id) = (0); local(@verbatim, @verb, @verb_delim, $verb_counter, $verbatim_counter); # local (%new_command, %new_environment); # name-code associations &slurp_input; &mark_string; # and pre_process \verb and \verbatim # Handle newcommand, newenvironment ... &substitute_meta_cmds if /$meta_cmd_rx/; &jj_revert_and_exec; print; } sub jj_revert_and_exec{ local %i =(); s/\\perl(\*)?$O([0-9]+)$C/$perl{$2}++;"\\perl$1\001"/geo; # s/\\perl$O([0-9]+)$C/$perl{$1}++;"\\perl\001"/geo; s/$O([0-9]+)$C/($perl{$1})? "\001" :($i{$1} ++ % 2) ? '}':'{'/geo; &revert_verbatim; if($ne){s/\\perl\001([^\001]*)\001/***** sorry no eval *****/go;} else{s/\\perl\001([^\001]*)\001/$1/geeo; s/\\perl\*\001([^\001]*)\001/ &jjtlpp(scalar eval($1))/geo;} &revert_verb; # s/\\perl\001([^\001]*)\001/eval $1/geo; } # This should not have been necessary if tokenization was done properly ... # It inserts a space after anything that $rx$delimiter_rx matches sub tokenize { local($rx) = @_; # Modifies $_; s/$rx$delimiter_rx/\\\1 \2/g if $rx; } # It is necessary to filter some parts of the document back to raw # tex before passing them to latex for processing. sub revert_to_raw_tex { local($_) = @_; while (s/$O\s_*\d+\s*$C/\{/) { s/$&/\}/; } # The same for processed markers ... while (s/$OP\s*\d+\s*$CP/\{/) { s/$&/\}/; } &revert_verbatim; # &revert_verb; $_; } sub make_new_cmd_rx { "[\\\\](". join("|", keys %new_command) . ")" if %new_command; } sub make_new_env_rx { "[\\\\]begin\\s*$O(\\d+)$C\\s*(". join("|", keys %new_environment) . ")\\s*$O\\1$C" if %new_environment; } # When part of the input text contains special perl characters and the text # is to be used as a pattern then these specials must be escaped. sub escape_rx_chars { local($_) = @_; s/(\W)/\\$1/g; $_; } # Frequently used regular expressions with arguments sub make_end_env_rx { local($env) = @_; $env = &escape_rx_chars($env); "[\\\\]end\\s*$O(\\d+)$C\\s*$env\\s*$O\\1$C"; } sub jjtlpp{ local($_)=shift; &mark_string; # and pre_process \verb and \verbatim # Handle newcommand, newenvironment ... &substitute_meta_cmds; # if /$meta_cmd_rx/; &jj_revert_and_exec; $_; }