#!/usr/bin/perl -s

# bitextcheck - compares files (or the documents behind URLs) for similarity
# by handing them to Guesser::compareFiles.
#
# Three modes, selected by command-line switches:
#   -f   every argument is a list file; each line holds tab-separated file
#        names that are compared with each other;
#   -u   like -f, but each line holds tab-separated URLs that are first
#        fetched with wget into "$outdir/files";
#   none the arguments themselves are the files to compare.
#
# Switches are parsed by perl's own -s flag (see the shebang line), which
# stores them in package variables of the same name before the script runs.

use strict;
use warnings;

use Cwd qw(getcwd);
use Guesser;

# Populated by `perl -s`; declared with `our` so they satisfy strict.
our ($f, $h, $q, $u, $v, $lang1, $lang2, $outdir);

my $usage = "
Usage : bitextcheck [OPTIONS] file1 file2 [ file3 ... ]

Compares files for similarity

Options:
  -f            processes all lines of a file comparing the ones in each line
  -h            prints this help and exits
  -lang1=...    forces the language of the first file
  -lang2=...    forces the language of the second file
  -outdir=...   specifies the directory to use when retrieving files from the web
  -q            quiet mode (displays only the result)
  -u            processes all lines of a file comparing the urls in each line
  -v            verbose mode
";

$outdir = "." unless defined $outdir;

if ($h) {
    print $usage;
    exit;
}

# -s has already removed the switches, so whatever is left are file names.
# Copying @ARGV (instead of shifting until a false value) keeps names like "0".
my @files = @ARGV;

if ($f) {
    for my $list_file (@files) {
        open(my $list_fh, '<', $list_file)
            or die("could not open file $list_file ($!).\n");

        # Test definedness of the read, not chomp's return value, so that a
        # final line without a trailing newline is still processed.
        while (defined(my $line = <$list_fh>)) {
            chomp $line;
            my @fs = split(/\t/, $line);

            next unless languages_match($line, @fs);

            my $r = Guesser::compareFiles(@fs);
            if ($r) {
                print $line;
                print "\t$r" if $v;
                print "\n";
            }
            elsif ($v) {
                print $line, "\n";
            }
        }

        close($list_fh);
    }
}
elsif ($u) {
    # Create the output directories before anything is written into them.
    # Existing directories (including the default ".") are accepted as-is.
    for my $dir ($outdir, "$outdir/files") {
        next if -d $dir;
        mkdir($dir) or die("could not create dir '$dir' ($!).\n");
    }

    # NOTE(review): the strings written to index.html below look as if their
    # markup was lost at some point; they are kept exactly as found - confirm
    # against the upstream script before relying on the page layout.
    open(my $index_fh, '>', "$outdir/index.html")
        or die("could not open file $outdir/index.html ($!).\n");
    print {$index_fh} "\n";

    # Remember where we started so relative list-file names keep working
    # after each excursion into "$outdir/files".
    my $local_dir = getcwd();
    defined $local_dir
        or die("could not determine current dir ($!).\n");

    # These three outlive the loops because the closing index.html line
    # reports their last values.
    my $c;
    my @fs;
    my $r;

    for my $list_file (@files) {
        open(my $list_fh, '<', $list_file)
            or die("could not open file $list_file ($!).\n");
        chdir("$outdir/files")
            or die("could not change to dir '$outdir/files' ($!).\n");

        while (defined(my $line = <$list_fh>)) {
            chomp $line;
            @fs = split(/\t/, $line);

            # List-form system: the URL is passed as a single argument and is
            # never seen by a shell; "--" stops wget from reading it as an
            # option.
            for my $url (@fs) {
                system('wget', '-q', '--no-clobber', '--force-directories',
                    '--', $url);
            }

            next unless languages_match($line, @fs);

            $r = Guesser::compareFiles(@fs);
            if ($r) {
                print $line, "\t";
                print "$r" if $v;
                print "\n";
                $c++;
                print {$index_fh} " ";
            }
            else {
                # Not similar: discard what was fetched for this line.
                unlink $_ for @fs;
                print $line, "\n" if $v;
            }
        }

        close($list_fh);
        chdir($local_dir)
            or die("could not change to dir '$local_dir' ($!).\n");
    }

    # Undefined values (nothing matched / nothing read) print as empty
    # strings, exactly as they did before warnings were enabled.
    my ($count, $first, $second, $result) =
        map { defined $_ ? $_ : '' } ($c, $fs[0], $fs[1], $r);
    print {$index_fh} "\n\n$count $first $second $result\n\n";

    close($index_fh)
        or die("could not close file $outdir/index.html ($!).\n");
}
else {
    my $r = Guesser::compareFiles(@files);
    $r = '' unless defined $r;
    print join("\t", @files), "\t" unless $q;
    print "$r\n";
}

# languages_match($line, @fields)
#
# Applies the -lang1 / -lang2 filters to the first and second field of a
# line by running the external `langident` program on each and comparing its
# output with the requested language.  Returns 1 when every requested filter
# matches (or none was requested).  On the first mismatch it prints the line
# tagged "unwanted languages" (verbose mode only) and returns 0; the second
# field is then not checked.
sub languages_match {
    my ($line, @fields) = @_;
    my @wanted = ($lang1, $lang2);

    for my $i (0 .. 1) {
        next unless $wanted[$i];

        my $target   = defined $fields[$i] ? $fields[$i] : '';
        my $detected = capture_output('langident', $target);
        next if $detected eq $wanted[$i];

        print $line, "\tunwanted languages\n" if $v;
        return 0;
    }

    return 1;
}

# capture_output(@command)
#
# Runs @command without a shell (list-form pipe open) and returns everything
# it wrote to standard output with one trailing newline removed.  Returns the
# empty string when the command cannot be started or prints nothing.
sub capture_output {
    my @command = @_;

    open(my $pipe, '-|', @command) or return '';
    my $output = do { local $/; <$pipe> };
    close($pipe);

    $output = '' unless defined $output;
    chomp $output;
    return $output;
}

__END__

=head1 NAME

bitextcheck - compares files for similarity

=head1 SYNOPSIS

  bitextcheck [options] file1 file2 [ file3 ... ]

=head1 DESCRIPTION

bitextcheck compares files for similarity using perl module Guesser
(these can be pairs or, more generally, tuples).

=head1 USAGE

Without C<-f> or C<-u>, the files given on the command line are compared
with each other. With C<-f>, each argument is a file whose lines contain
tab-separated file names to compare. With C<-u>, each argument is a file
whose lines contain tab-separated URLs, which are retrieved into the
directory given by C<-outdir> before being compared.

=head1 OPTIONS

=head2 -f

processes all lines of a file comparing the ones in each line

=head2 -h

prints help and exits

=head2 -lang1=..

forces the language of the first file

=head2 -lang2=..

forces the language of the second file

=head2 -outdir=..

specifies the directory to use when retrieving files from the web
(defaults to the current directory)

=head2 -q

quiet mode (displays only the result)

=head2 -u

processes all lines of a file comparing the urls in each line

=head2 -v

verbose mode

=head1 AUTHOR

José Alves de Castro, jac@natura.di.uminho.pt

=head1 SEE ALSO

perl(1), Guesser(3)

=cut