#!/usr/bin/perl -s

# filealigner - convert two parallel documents to PML, align them with
# xmlalign2cqp / align2tmx and write the result as a TMX file.
# User documentation is in the POD after __END__.

use strict;
use warnings;
##use File::Finder qw(find);

# Switch variables filled in by "perl -s" (-name or -name=value).
our ($tdef, $t, $l1, $l2, $o, $nosentbreak, $isutf8, $wl, $rawpdf, $q0,
     $debug, $num, $sync, $syncatt, $getalt, $cwbconfig, $local);

my $f1 = shift;
my $f2 = shift
    or die("usage: $0 [options] [-l1=pt] [-l2=en] [-t=pdf] [-o=a.tmx] f1.pdf f2.pdf\n");

## options --------------------------------------------------------------
# Every option is normalised to either a ready-to-use command-line fragment
# or the empty string, so it can be interpolated without "undefined" noise.

my $enc     = $isutf8      ? "-isutf8"      : "";
my $outenc  = "";                    # "-latin1output" was used here before
my $nosentb = $nosentbreak ? "-nosentbreak" : "";
my $noimg   = $getalt      ? ""             : "-noimg";

$getalt    = $getalt    ? "-getalt" : "";
$local     = $local     ? "-local"  : "";
$cwbconfig = $cwbconfig ? "-cwbconfig=" . shell_quote($cwbconfig) : "";

# A bare -sync / -syncatt arrives as 1; -syncatt takes precedence over -sync.
if    ($syncatt && $syncatt eq 1) { $sync = "-syncatt" }
elsif ($syncatt)                  { $sync = shell_quote("-syncatt=$syncatt") }
elsif ($sync && $sync eq 1)       { $sync = "-sync" }
elsif ($sync)                     { $sync = shell_quote("-sync=$sync") }
else                              { $sync = "" }

# $debug is only tested for truth below (keep temporary files).
$debug = "-debug" if $debug;

# Forwarded to align2tmx.  The previous code forwarded "-debug" here, which
# looks like a copy/paste slip from the line above.
$q0 = $q0 ? "-q0" : "";

if ($wl) {
    $wl = shell_quote("-wl=$wl");
}
elsif (defined $l1 && defined $l2) {
    # No explicit word list: look for an installed one for this language pair.
    my $dir        = $ENV{'BIWORDSDIR'} || "/usr/share/biwords";
    my $biwords    = "$dir/biwords-$l1-$l2.csv";
    my $biwordsrev = "$dir/biwords-$l2-$l1.csv.rev";
    if    (-e $biwords)    { $wl = shell_quote("-wl=$biwords") }
    elsif (-e $biwordsrev) { $wl = shell_quote("-wl=$biwordsrev") }
}
$wl = "" unless $wl;

# Language ids default to the file name without its last extension.
# They are also used to build the temporary file names.
$l1 ||= $1 if $f1 =~ /(.*)\..*/;
$l2 ||= $1 if $f2 =~ /(.*)\..*/;
$l1 ||= $f1;
$l2 ||= $f2;

# Suffix that keeps temporary files of concurrent runs apart.
$num = "_" . ($num || $$);

## file type ------------------------------------------------------------

if ($f1 =~ /\.sync$/i) {
    $t   ||= "txt";
    $sync  = "-syncatt";
    $enc ||= "-isutf8";
}
if ($f1 =~ /\.bc_out$/i) {
    $t   ||= "txt";
    $enc ||= "-isutf8";
}

unless ($t) {
    # Checked in order against the first file name; the first match wins.
    my @type_rules = (
        [ qr/\.epub$/i,                          "epub" ],
        [ qr/\.x?html?$/i,                       "html" ],
        [ qr/\.aspx?$/i,                         "html" ],
        [ qr/\.pdf\+\+$/i,                       "pdf++" ],
        [ qr/\.pdf$/i,                           ($rawpdf ? "pdf++" : "pdf") ],
        [ qr/\.txt$/i,                           "txt" ],
        [ qr/\.pml$/i,                           "pml" ],
        [ qr/\.docx$/i,                          "docx" ],
        [ qr/\.doc$/i,                           "doc" ],
        [ qr/\.rtf$/i,                           "rtf" ],
        [ qr/\.lit$/i,                           "lit" ],
        [ qr/\.(?:zip|jpg|wav|mp3|png|gif)$/i,   "ignore" ],
        [ qr/[?].*=/i,                           "html" ],   # URL with a query string
    );
    for my $rule (@type_rules) {
        my ($pattern, $type) = @$rule;
        if ($f1 =~ $pattern) {
            $t = $type;
            last;
        }
    }
}
$t ||= $tdef if $tdef;
$t = "" unless defined $t;       # unknown type: file2pml reports it

exit 1 if $t eq "ignore";

$o ||= "_$l1-$l2.tmx";

## main -----------------------------------------------------------------

## html2pml -latin1output -listofpairs -noimg p$G.pairs p$N.$O p$N.$T
## LANG=pt_PT xmlalign2cqp p$N.$O p$N.$T

if ($t eq "aligned") {
    aligned2tmx();
}
else {
    my $pml1 = "_$l1$num.pml";
    my $pml2 = "_$l2$num.pml";

    if ($t ne "pml") {
        file2pml($t, $f1, $pml1);
        file2pml($t, $f2, $pml2);
    }
    else {
        symlink($f1, $pml1) or die("cannot link $f1 to $pml1: $!\n");
        symlink($f2, $pml2) or die("cannot link $f2 to $pml2: $!\n");
    }

    executa(join(" ", grep { length } "xmlalign2cqp", $local, $cwbconfig,
                 $sync, $wl, shell_quote($pml1), shell_quote($pml2)));

    my $align_cmd = join(" ", grep { length } "align2tmx", $local, $q0,
                         shell_quote("-l1=$l1"), shell_quote("-l2=$l2"),
                         shell_quote("$pml1-$pml2.align"), shell_quote($o));
    my $v = `$align_cmd`;
    $v = "" unless defined $v;
    print $v;

    # NOTE(review): "R=" is read from align2tmx's output; a value above 80
    # is treated as a bad alignment -- confirm the exact meaning there.
    if ($v =~ /R=(\d+)/ && $1 > 80) {
        warn("problems: bad alignment($1)\n");
        rename($o, "$o.BADALIGNMENT")
            or warn("cannot rename $o to $o.BADALIGNMENT: $!\n");
    }

    # The intermediate text files are always removed; the rest is kept
    # when -debug is given.
    unlink("_$l1$num.txt", "_$l2$num.txt");
    unless ($debug) {
        unlink($pml1, $pml2, "$pml1-$pml2.align", "$pml2-$pml1.align");
    }
}

## subroutines ----------------------------------------------------------

# aligned2tmx()
# Builds the TMX file $o from $f1 and $f2, which are taken to be aligned
# line by line: line N of $f1 and line N of $f2 form one translation unit.
# Reading stops at the end of $f1; missing lines of $f2 become empty segments.
# Dies when a file cannot be opened or the output cannot be written.
# NOTE(review): the element markup was reconstructed following TMX 1.4;
# confirm it against what the consumers of these files expect.
sub aligned2tmx {
    open(my $in1, "<", $f1) or die("cannot read $f1: $!\n");
    open(my $in2, "<", $f2) or die("cannot read $f2: $!\n");
    open(my $out, ">", $o)  or die("cannot write $o: $!\n");

    print {$out} tmxinit($f1, $f2);

    while (defined(my $u1 = <$in1>)) {
        my $u2 = <$in2>;
        $u2 = "" unless defined $u2;
        chomp($u1);
        chomp($u2);
        print {$out} "\n<tu>\n";
        print {$out} qq{ <tuv xml:lang="}, xmlattr($l1), qq{"><seg>},
                     xmlprotect($u1), "</seg></tuv>\n";
        print {$out} qq{ <tuv xml:lang="}, xmlattr($l2), qq{"><seg>},
                     xmlprotect($u2), "</seg></tuv>\n";
        print {$out} "</tu>\n";
    }

    print {$out} "\n</body>\n</tmx>\n";

    close($in1);
    close($in2);
    close($out) or die("error writing $o: $!\n");
}

# xmlprotect($text) -> $text with "&", "<" and ">" replaced by XML entities
# and the control character 0x15 replaced by "#".
sub xmlprotect {
    my $f = shift;
    $f =~ s/&/&amp;/g;      # must come first, or the entities below get re-escaped
    $f =~ s/</&lt;/g;
    $f =~ s/>/&gt;/g;
    $f =~ s/\x{15}/#/g;
    return $f;
}

# xmlattr($text) -> $text escaped for use inside a double-quoted attribute.
sub xmlattr {
    my $v = xmlprotect(shift);
    $v =~ s/"/&quot;/g;
    return $v;
}

# tmxinit(@notes) -> opening part of a TMX document, up to and including
# "<body>".  Each optional note is stored as a <note> element of the header.
# The declared encoding follows the -isutf8 setting, because aligned2tmx
# copies the input bytes unchanged.
# NOTE(review): header attributes were chosen to satisfy TMX 1.4; adjust
# creationtoolversion / segtype / datatype if other values are expected.
sub tmxinit {
    my @notes   = @_;
    my $charset = $enc ? "UTF-8" : "ISO-8859-1";
    my $srclang = xmlattr($l1);

    my $head = qq{<?xml version="1.0" encoding="$charset"?>\n}
             . qq{<tmx version="1.4">\n}
             . qq{<header creationtool="filealigner" creationtoolversion="1.0"}
             . qq{ segtype="sentence" o-tmf="txt" adminlang="en"}
             . qq{ srclang="$srclang" datatype="plaintext">\n};
    $head .= "<note>" . xmlprotect($_) . "</note>\n" for @notes;
    return $head . "</header>\n<body>\n";
}

# file2pml($type, $in, $out)
# Converts the input file $in of type $type into the PML file $out.
#   html, txt, txtll : html2pml reads the input directly;
#   other types      : the input is first converted to a UTF-8 text file
#                      named after $out (".pml" replaced by ".txt"), which
#                      is then fed to "html2pml -txt".
# Dies for an unknown type.  Failing external commands only produce a
# warning (see executa).
sub file2pml {
    my ($type, $in, $out) = @_;

    (my $stem = $out) =~ s/\.pml\z//;
    my $txt = "$stem.txt";
    my $raw = "$stem.raw.txt";          # pdf++ only
    my ($qin, $qout, $qtxt, $qraw) = map { shell_quote($_) } $in, $out, $txt, $raw;

    if ($type eq "html") {
        executa("html2pml $getalt $outenc $noimg $nosentb $qin > $qout");
        return;
    }
    if ($type eq "txt" or $type eq "txtll") {
        executa("html2pml $getalt $outenc $enc $nosentb -$type $qin > $qout");
        return;
    }

    my @to_text;
    if ($type eq "rawpdf2") {
        @to_text = ("jop -rawpp2 $qin > $qtxt");
    }
    elsif ($type eq "rawpdf") {
        @to_text = ("jop -rawpp $qin > $qtxt");
    }
    elsif ($type eq "pdf++") {
        @to_text = ("pdftotext -raw -enc UTF-8 $qin $qraw",
                    "pdfrawpp $qraw > $qtxt");
    }
    elsif ($type eq "pdf") {
        @to_text = ("pdftotext -enc UTF-8 $qin $qtxt");
    }
    elsif ($type eq "epub" or $type eq "doc" or $type eq "docx" or $type eq "rtf") {
        @to_text = ("jop $qin > $qtxt");
    }
    elsif ($type eq "lit") {
        @to_text = ("ebook-convert $qin $qtxt");
    }
    else {
        die("please informe me of the type of files [$in:$type] (txt|html|pdf|pdf++)\n");
    }

    executa($_) for @to_text;
    executa("html2pml $getalt -isutf8 $outenc -txt $qtxt > $qout");

    unlink($raw) if $type eq "pdf++" and not $debug;
    return;
}

# executa($cmd)
# Echoes the shell command on STDERR and runs it; a non-zero exit status
# is reported with a warning and processing continues.
sub executa {
    my $cmd = shift;
    print STDERR "$cmd\n";
    system($cmd) == 0 or warn "system $cmd failed: $?\n";
}

# shell_quote($string) -> $string, single-quoted for /bin/sh when it
# contains anything other than word characters and . , : = + / -
sub shell_quote {
    my $s = shift;
    return $s if $s =~ m{\A[\w.,:=+/-]+\z};
    $s =~ s/'/'\\''/g;
    return "'$s'";
}

__END__

=head1 NAME

filealigner - align bitexts (html, pdf, or text) and build a TMX

=head1 SYNOPSIS

 filealigner [-t=html|pdf|txt|pdf++] -l1=en -l2=pt -o=en.pt.tmx f_en.html f_pt.html
 filealigner [-t=html|pdf|txt|pdf++] -l1=en -l2=pt -o=en.pt.tmx f_en.pdf f_pt.pdf
 filealigner [-isutf8] -l1=en -l2=pt -o=en.pt.tmx f_en.txt f_pt.txt

 filealigner -num=777 ...
     temporary files and corpora have _777 in their id, to avoid
     clashing with other filealigners

=head1 DESCRIPTION

Makes file alignment for HTML, PDF, or TXT inputs.

Output is a TMX (translation memory exchange) file.

=head2 Options

=head3 C<-t>

Unless the C<-t> option is provided, filealigner tries to guess the type
from the extension of the first file.

 -t=pdf++     pdfrawpp(bookcleaner(pdftotext -raw (file.pdf)))
 -t=txtll     split text by \n
 -t=txt
 -t=aligned   the two files are already aligned line by line

=head3 C<-tdef>

If the C<-tdef> option is provided and the file extension is unknown,
tdef is used as filetype.

=head3 C<-q0>

By default "0:1" correspondences are removed from the TMX.
Use the option C<-q0> to obtain 0:1 correspondences in the output.

=head3 C<-nosentbreak>

Unless the C<-nosentbreak> option is provided, filealigner tries to split
sentences inside the paragraphs (in HTML files).

If the alignment is very bad, the output is renamed to "file.BADALIGNMENT".

=head3 C<-isutf8>

Unless the C<-isutf8> option is provided, filealigner assumes that latin1
input is used.

=head3 C<-wl=filename>

The user can provide a (word-pair)-list of words and translations in order
to help the alignment process.

When C<-wl> is not given and both C<-l1> and C<-l2> are, filealigner looks
for F<biwords-L1-L2.csv> (or F<biwords-L2-L1.csv.rev>) in the directory
named by the C<BIWORDSDIR> environment variable (default
F</usr/share/biwords>).

=head3 C<-sync=synctag>

(synctag defaults to "sync" if no synctag is provided.)
Use the C<synctag> element as synchronization tag; the number of
synchronization tags should be equal in both files.

=head3 C<-syncatt=synctag>

(synctag defaults to "sync" if no synctag is provided.)
Use the C<synctag> element as synchronization tag for equal "id" values.

=head3 C<-rawpdf>

If C<-rawpdf> is provided, PDF files are handled as type C<pdf++>: they are
converted with C<pdftotext -raw> and the result is preprocessed with
C<pdfrawpp>.

=head3 C<-debug>

If the C<-debug> option is provided, temporary files are kept.

=head3 C<-getalt>

If C<-getalt> is provided, images in HTML are expanded to
"IMG image-alt-attribute".

=head2 Prerequisites

 cwb-utils  -- html2pml xmlalign2cqp align2tmx pdfrawpp jop
 ebook-convert
 cwb        -- corpora workbench (Stuttgart)
 pdftotext  -- from xpdf

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=head1 TODO

build translators for other types of files (PS, LaTeX, XML)

=cut