#!/usr/bin/perl -ws
use strict;
use warnings;
use Text::Perfide::WordBags;
use Text::Perfide::BookPairs;
use utf8::all;

# Command-line switches. They are populated by perl's -s flag (see the
# shebang line), which turns "-name" / "-name=value" arguments that precede
# the first non-switch argument into package variables of the same name.
our($bpairs,$rv,$av,$warn,$debug,$v,$nr,$same,$dv,$recalc,$normbf);

# Defaults for the numeric switches (documented in the POD below).
$nr //= 3;
$rv //= 0.2;
$av //= 0.4;
$dv //= 0.9;

# Options hash handed to the pairing routines. Boolean switches are only
# present as keys when they were given on the command line.
my $options = {};
$options->{warn}   = 1 if defined($warn);
$options->{debug}  = 1 if defined($debug);
$options->{recalc} = 1 if defined($recalc);
$options->{v}      = 1 if defined($v);
$options->{nr}     = $nr;
$options->{rv}     = $rv;
$options->{av}     = $av;
$options->{dv}     = $dv;

# calcbagfiles, calc_dupvers, calc_bpairs, calc_default and rmbagfiles are
# not defined in this file; presumably they are exported by the
# Text::Perfide modules loaded above.
if (@ARGV < 1) {
    usage();
}
elsif (@ARGV == 1 and defined($same)) {
    # Mode: look for duplicate versions inside a single list of books.
    print STDERR "File '$ARGV[0]' contains list of books to process.\n";
    my $list = open_list($ARGV[0]);
    calcbagfiles($list, $options);
    calc_dupvers($list, $options);
    rmbagfiles($list) unless defined($normbf);
}
elsif (@ARGV == 2) {
    # Mode: pair every book of the first list with the books of the second.
    # Note that exactly two arguments are always read as two list files,
    # never as "book + one candidate".
    print STDERR "Files '$ARGV[0]' and '$ARGV[1]' are lists of books to pair.\n";
    my ($list_file1, $list_file2) = @ARGV;
    my ($list1, $list2) = double_open_lists($list_file1, $list_file2);
    # NOTE(review): unlike the other modes, calcbagfiles is not called here
    # before pairing - confirm that the pairing routines build the bag
    # files themselves.
    foreach my $file1 (@$list1) {
        listpairs([$file1, @$list2], $options);
    }
    rmbagfiles($list1) unless defined($normbf);
    rmbagfiles($list2) unless defined($normbf);
}
else {
    # Mode: first argument is the book, the remaining ones its candidates.
    print STDERR "Finding pairs for book '$ARGV[0]'.\n";
    calcbagfiles(\@ARGV, $options);
    listpairs(\@ARGV, $options);
    rmbagfiles(\@ARGV) unless defined($normbf);
}

# listpairs(ARGS...)
# Forwards its arguments unchanged to the routine selected by the
# command-line switches: calc_dupvers when -same is set, calc_bpairs when
# -bpairs is set, calc_default otherwise. Returns whatever that routine
# returns. The switches are tested for truth here, whereas the main
# dispatch above tests -same with defined().
sub listpairs {
    if ($same) {
        calc_dupvers(@_);
    }
    elsif ($bpairs) {
        calc_bpairs(@_);
    }
    else {
        calc_default(@_);
    }
}

# usage()
# Prints a short synopsis of the supported invocations to STDERR.
# It does not terminate the program.
sub usage {
    print STDERR "Usage:\n\tpairbooks [options] book candidates*\n";
    print STDERR "\tpairbooks [options] list1 list2\n";
    print STDERR "\t\t(pair books from list1 with books from list2)\n";
    print STDERR "\tpairbooks -same [options] book_list\n";
    print STDERR "\t\t(find similar versions in books from book_list)\n";
}

# open_list(LIST_FILE)
# Reads LIST_FILE and returns a reference to an array holding its lines,
# with the trailing newline removed from each one.
# Dies if the file cannot be opened.
sub open_list {
    my ($list_file) = @_;
    open my $fh, '<', $list_file
        or die "Could not open '$list_file': $!";
    my @list = <$fh>;
    close $fh;
    chomp @list;
    return \@list;
}

# double_open_lists(LIST_FILE1, LIST_FILE2)
# Reads both list files and returns two array references
# (lines of LIST_FILE1, lines of LIST_FILE2), newlines removed.
# Dies if either file cannot be opened.
sub double_open_lists {
    my ($list_file1, $list_file2) = @_;
    my $list1 = open_list($list_file1);
    my $list2 = open_list($list_file2);
    return ($list1, $list2);
}

__END__

=head1 NAME

pairbooks - for a given book, finds its most probable pairs in a collection

=head1 SYNOPSIS

 pairbooks [options] book candidates*

 pairbooks [options] book_list1 book_list2

 pairbooks -same [options] book_list

=head1 DESCRIPTION

In its first form, pairbooks looks for the most probable pairs of C<book>
among the given candidates. In its second form, every book listed in
C<book_list1> is paired against the books listed in C<book_list2>. With
C<-same>, it looks for similar versions among the books listed in
C<book_list>.

List files contain one book per line.

=head1 Options

 -nr=3      Returns the 3 most similar candidates

 -bpairs    Output results in .bpairs format (1 pair of books per line,
            separated with a \t)

 -rv=LOW    Reject value - book pairs with pairability value lower than
            LOW will be automatically rejected. Default is 0.2

 -av=HIGH   Accept value - book pairs with pairability value equal or
            above HIGH will be automatically approved. Default is 0.4

 -dv=VAL    Duplicates value - book pairs with pairability value equal or
            above VAL will be considered duplicates (the same book in the
            same language). Default is 0.9. Use with -same.

 -warn      Comments pairs with pairability value under HIGH (see -av).
            Rejected pairs will have a leading '# X', while dubious pairs
            will have a leading '# ?'.

 -same      Instead of finding pairs, tries to find candidates to be
            *the same book in the same language* (see -dv)

 -debug     Prints debug information

 -recalc    Calculate file.bag even if it exists already.

 -normbf    Do not remove .bag files at the end

 -v         Sets the 'v' option passed to the pairing routines
            (presumably verbose output)

=head1 AUTHOR

Andre Santos, andrefs@cpan.org

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut