#!/usr/bin/perl -s

# bookcleaner - command-line driver that runs the cleaning steps of
# Text::Perfide::BookCleaner over one or more book files.
#
# Options are parsed by perl's own -s switch (see the shebang line), which
# turns "-name" / "-name=value" arguments into the package variables
# declared with "our" below.

use strict;
use warnings;

use Data::Dumper;
$Data::Dumper::Indent = 0;
$Data::Dumper::Terse  = 1;
use Text::Perfide::BookCleaner;
use File::Basename;

# Command-line switches populated by "perl -s".
our ($b,$p1,$p2,$p3,$p4,$p5,$simplify,$normpar,$j,$v,$c,$minhf,
     $pipe,$latin1,$enc,$o,$dir,$dry,$outlist,$nrom_off);

# Per-step "write intermediate output" flags, derived from -v.
my ($v1,$v2,$v3,$v4,$v5);

# Declared separately from the conditional assignment on purpose:
# "my $x = ... if COND" has undefined behaviour in Perl.
my $commit;
$commit = 1 if $c || $pipe;

nrom_off() if $nrom_off;    ## turns off Roman Number titles

# -j=<chars>: "just" run the listed steps. Every step flag is overwritten,
# so steps not mentioned are switched off. 'c' requests the commit step and
# 'p' requests pipe mode (which also forces commit).
if (defined($j)) {
    $p1     = ($j =~ /1/) ? 1 : 0;
    $p2     = ($j =~ /2/) ? 1 : 0;
    $p3     = ($j =~ /3/) ? 1 : 0;
    $p4     = ($j =~ /4/) ? 1 : 0;
    $p5     = ($j =~ /5/) ? 1 : 0;
    $commit = ($j =~ /c/) ? 1 : 0;
    if ($j =~ /p/) { $pipe = 1; $commit = 1 } else { $pipe = 0 }
    #if($j =~ 's')  { $simplify=1 } else { $simplify=0; }
    #if($j =~ 'np') { $normpar=1  } else { $normpar=0;  }
}

# -v=<digits>: write the intermediate text of the listed steps to
# "<file>.ouN".
if (defined($v)) {
    $v1 = ($v =~ /1/) ? 1 : 0;
    $v2 = ($v =~ /2/) ? 1 : 0;
    $v3 = ($v =~ /3/) ? 1 : 0;
    $v4 = ($v =~ /4/) ? 1 : 0;
    $v5 = ($v =~ /5/) ? 1 : 0;
}

# Defaults for anything not given on the command line.
# Step 4 (footnotes) is the only step that is off by default.
$p1       //= 1;
$p2       //= 1;
$p3       //= 1;
$p4       //= 0;
$p5       //= 1;
$commit   //= 0;
$simplify //= 1;
$normpar  //= 1;
$pipe     //= 0;
$enc      //= "";
$minhf    //= 5;

$enc = "CP1252" if $latin1;

# NOTE(review): the four lexicals below are not read anywhere in this
# script; kept as-is in case they document intended tunables.
my $htreshold = $minhf;
my $ftreshold = $minhf;

my $cp1252_simplification = $simplify;
my $utf_simplification    = $simplify;

# Build the list of books: either the contents of a single "*dbooks" list
# file (one path per line) or the command-line arguments themselves.
my @books;
if (@ARGV == 1 and $ARGV[0] =~ /dbooks$/) {
    print STDERR "Argument '$ARGV[0]' contains list of books to clean.\n";
    open my $list_fh, '<', $ARGV[0]
        or die "Could not open file '$ARGV[0]': $!";
    @books = <$list_fh>;
    chomp @books;
    close $list_fh;
}
else {
    @books = @ARGV;
}

files(@books);

# files(@paths)
#
# Runs the enabled cleaning steps over every path in @paths, in order
# pages -> sections -> paragraphs -> footnotes -> chars -> commit, and
# writes the result with puttxt().
#
#  * Paths starting with '#' and blank entries are skipped.
#  * With -dry, only the path is printed and nothing is processed.
#  * With -outlist=FILE, the name of every output file is appended to FILE.
#  * Output goes to -o if given, otherwise to "<outdir>/<basename>.bc_out",
#    where <outdir> is -dir if given, otherwise the directory of the book
#    being processed.
#
# The step functions (pages, sections, ...) and gettxt are not defined in
# this file; presumably they are exported by Text::Perfide::BookCleaner.
sub files {
    my @paths = @_;

    my $outl;
    if (defined($outlist)) {
        open $outl, '>', $outlist
            or die "Could not open file '$outlist': $!";
    }

    __print("\n===");
    for my $path (@paths) {
        next if $path =~ /^#/;
        next if $path =~ /^\s*$/;    # e.g. blank lines in a dbooks list
        if (defined($dry)) { print "$path\n"; next; }
        __print(" $path:\n");

        # Per-step diagnostic info; stays an empty hash for skipped steps.
        my ($di1, $di2, $di3, $di4, $di5) = ({}, {}, {}, {}, {});
        my $pbtab = {};

        my $txt = gettxt($path);

        # Resolve the output directory per file, so that books living in
        # different directories are not all written next to the first one.
        # A copy of the path is used: elements of @_ alias the caller's
        # variables and must not be modified.
        my $outdir = $dir // dirname($path);
        my $name   = basename($path);
        my $out    = $o // "$outdir/$name.bc_out";

        ($di1, $txt, $pbtab) = pages($txt) if $p1;
        printdi($di1);
        puttxt("$outdir/$name.ou1", $txt) if $v1;

        ($di2, $txt) = sections($txt) if $p2;
        printdi($di2);
        puttxt("$outdir/$name.ou2", $txt) if $v2;

        ($di3, $txt) = paragraphs($txt) if $p3;
        printdi($di3);
        puttxt("$outdir/$name.ou3", $txt) if $v3;

        ($di4, $txt) = footnotes($txt) if $p4;
        printdi($di4);
        puttxt("$outdir/$name.ou4", $txt) if $v4;

        ($di5, $txt) = chars($txt) if $p5;
        printdi($di5);
        puttxt("$outdir/$name.ou5", $txt) if $v5;

        $txt = commit($txt) if $commit;

        print {$outl} "$out\n" if defined($outl);
        puttxt($out, $txt);
    }

    if (defined($outl)) {
        close $outl or die "Could not close file '$outlist': $!";
    }
    return;
}

# puttxt($file, $txt)
#
# Hands $txt to writefile(). In pipe mode writefile() is called without a
# handle and $file is ignored; otherwise $file is opened for writing and
# the handle is passed along. Dies if the file cannot be opened or closed.
sub puttxt {
    my ($file, $txt) = @_;

    if ($pipe) {
        writefile($txt);
        return;
    }

    open(my $fd, '>', $file)
        or die "Could not open file '$file' for writing: $!";
    writefile($txt, $fd);
    # Buffered write errors only surface at close time.
    close $fd or die "Could not close file '$file': $!";
    return;
}

# __print(@items)
#
# Progress/diagnostic printer: goes to STDERR in pipe mode (STDOUT is then
# reserved for the cleaned text), to STDOUT otherwise.
sub __print {
    if ($pipe) { print STDERR @_ }
    else       { print @_ }
    return;
}

# printdi($di)
#
# Prints the diagnostic info of one step. For a hash reference each
# "key=value;" pair is printed: reference values are serialised with
# Data::Dumper, plain values are printed only when true. Anything that is
# not a hash reference is printed as "??=<value>".
sub printdi {
    my $di = shift;
    ## print "DEBUG:", Dumper($di);
    if (ref($di) eq "HASH") {
        for my $key (keys %$di) {
            my $val = $di->{$key};
            if (ref($val)) {
                __print("$key=", Dumper($val), ";\n");
            }
            else {
                __print("$key=$val;\n") if $val;
            }
        }
    }
    else {
        __print("??=$di\n");
    }
    return;
}

__END__

=head1 NAME

bookcleaner - prepare books for alignment and other operations

=head1 SYNOPSIS

 bookcleaner [options] file*
 bookcleaner [options] file.dbooks

=head1 DESCRIPTION

Prepare a textual book (or a list of books in a file with the extension
"dbooks", with one book path per line) for future align operations.

The following steps are done:

=head2 Step1 -- pages, headers footers

Step1 -- pages, headers footers (-p1=0 to skip this step)

=head2 Step2 -- sections

Step2 -- sections (-p2=0 to skip this step)

=head2 Step3 -- paragraphs

Step3 -- paragraphs (-p3=0 to skip this step)

=head2 Step4 -- footnotes

Step4 -- footnotes (Deactivated by default. -p4=1 to perform this step.)

=head2 Step5 -- char level cleaning

Step5 -- char level cleaning (-p5=0 to skip this step)

=head2 Commit

Commit

=head1 Options

 -c        Commit at the end (removes several debug marks (_pb, etc)
           before creating output file)
 -j=1c     Just do step 1 and commit
 -j=...p   Just ...
           and send output to STDOUT
 -simplify to do several char level simplifications:
             translate some CP1252 chars to unicode
             translate several dashes, quotes and double quotes to ascii
             default=1
             use -simplify=0 to avoid simplification
 -v=34     Create temporary output files of the step 3 (file.ou3)
           and 4 (file.ou4)
 -minhf=3  removes headers or footers if they appear more than 3 times
           (def:5)
 -pipe     send output to STDOUT
 -latin1   set the encoding to CP1252
 -o=FILE   send output to FILE (default is original file with
           extension bc_out)
 -dry      Dry run (DEBUG option, makes bookcleaner do nothing and just
           output the names of the files received as input)
 -dir=DIR  create all output files under DIR/
 -nrom_off turns off Roman Number titles

=head1 AUTHOR

Andre Santos

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

Text::Perfide::BookCleaner(3pm)

Ontology sections.the

=cut