#!/usr/bin/perl -s

# pdfaligner - turn two PDF files into a sentence-aligned TMX file.
#
# The switches -l1=, -l2=, -o= and -local are parsed by perl's own -s option,
# which stores them in the package variables declared with `our` below.
# See the POD after __END__ for usage.

use strict;
use warnings;
use File::Basename qw(basename);

our ($l1, $l2, $o, $local);    ## options (filled in by perl -s)

# Intermediate files; removed by the END block however the script terminates.
my @temp_files;
END { unlink @temp_files if @temp_files }

die "Usage: $0 [-l1=LABEL1] [-l2=LABEL2] [-o=OUT.tmx] [-local] f1.pdf f2.pdf\n"
    unless @ARGV >= 2;
my ($f1, $f2) = @ARGV;

# -local is a boolean switch that is forwarded verbatim to the alignment tools.
my @local_opt = $local ? ('-local') : ();

# The labels are embedded in file names created in the current directory, so
# the defaults use only the base name of each input (a directory part would
# point the temporary files at a non-existent "_dir/" directory).
$l1 ||= basename($f1);
$l2 ||= basename($f2);
die "Labels for the two files must differ (both are '$l1'); use -l1= / -l2=\n"
    if $l1 eq $l2;
$o ||= "_$l1-$l2.tmx";

## Older invocation notes kept from the original script:
## html2pml -latin1output -listofpairs -noimg p$G.pairs p$N.$O p$N.$T
## LANG=pt_PT xmlalign2cqp p$N.$O p$N.$T

my ($txt1, $pml1) = ("_$l1.txt", "_$l1.pml");
my ($txt2, $pml2) = ("_$l2.txt", "_$l2.pml");
my $align = "$pml1-$pml2.align";

# The alignment file is removed under both possible orderings of its name,
# as the original script did.
@temp_files = ($txt1, $txt2, $pml1, $pml2, $align, "$pml2-$pml1.align");

# Step 1: PDF -> plain text -> PML, once per input.
for my $job ([$f1, $txt1, $pml1], [$f2, $txt2, $pml2]) {
    my ($pdf, $txt, $pml) = @{$job};
    run('pdftotext', '-enc', 'UTF-8', $pdf, $txt);
    run_to_file($pml, 'html2pml', '-isutf8', '-latin1output', '-txt', $txt);
}

# Step 2: align the two PML files, then convert the alignment to TMX.
run('xmlalign2cqp', @local_opt, $pml1, $pml2);
run('cqpalign2tmx', @local_opt, $align, $o);

# Run an external command without going through the shell (so file names
# with spaces or metacharacters are passed intact); die if it cannot be
# started or exits with a non-zero status.
sub run {
    my @cmd = @_;
    system(@cmd) == 0
        or die "$cmd[0] failed: "
             . ($? == -1 ? $! : 'exit status ' . ($? >> 8)) . "\n";
    return;
}

# Like run(), but capture the command's standard output into $out_file
# (the shell-free equivalent of "cmd args > file").
sub run_to_file {
    my ($out_file, @cmd) = @_;

    open(my $out, '>', $out_file)
        or die "Cannot write $out_file: $!\n";
    open(my $pipe, '-|', @cmd)
        or die "Cannot run $cmd[0]: $!\n";
    binmode $out;     # copy bytes unchanged, whatever encoding the tool emits
    binmode $pipe;

    while (my $line = <$pipe>) {
        print {$out} $line
            or die "Cannot write $out_file: $!\n";
    }

    # close() on a pipe waits for the child and reports its exit status in $?.
    close($pipe)
        or die "$cmd[0] failed: "
             . ($! ? $! : 'exit status ' . ($? >> 8)) . "\n";
    close($out)
        or die "Cannot write $out_file: $!\n";
    return;
}

__END__

=head1 NAME

pdfaligner - align PDF bitexts

=head1 SYNOPSIS

 pdfaligner -l1=en -l2=pt -o=en.pt.tmx f_en.pdf f_pt.pdf

=head1 DESCRIPTION

Each PDF file is converted to text with C<pdftotext> and then to PML with
C<html2pml>. The two PML files are aligned with C<xmlalign2cqp> and the
alignment is converted to TMX with C<cqpalign2tmx>. Intermediate files are
created in the current directory and removed afterwards.

=head2 Options

=over 4

=item B<-l1=>I<label>, B<-l2=>I<label>

Labels used to name the intermediate files and the default output file.
They default to the base names of the two input files.

=item B<-o=>I<file>

Name of the TMX file to write. Defaults to F<_LABEL1-LABEL2.tmx>.

=item B<-local>

Passed on to C<xmlalign2cqp> and C<cqpalign2tmx>.

=back

=head2 Prerequisites

 cwb-utils (http://natura.di.uminho.pt)
 cwb -- Corpora WorkBench (Stuttgart)
 pdftotext -- Xpdf

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut