#!/usr/bin/env perl
use strict;
use warnings;

use Text::Perfide::PartialAlign qw/
    usage
    subcorpora2files
    calc_pairs
    _log
    get_corpus
    calc_common_tokens
    build_chain
/;
use Data::Dumper;
use Getopt::Long;

# Chunk size used when neither --max nor a sixth positional argument is given.
my $DEFAULT_MAX_CHUNK_SIZE = 5000;

# --- Command-line options -------------------------------------------------

my $options = {};

# 'cf' takes a string value: the POD documents it as "--cf=<file>", which a
# bare flag specification would reject.
# On a parse failure GetOptions has already printed a diagnostic; hand over to
# usage() instead of running with a half-parsed command line.
# NOTE(review): usage() is assumed to terminate the program -- confirm.
GetOptions(
    $options,
    'rs=s',
    'sec',
    'debug',
    'v',
    'cf=s',
    'max=i',
    'all',
) or usage(@ARGV);

# Record separator: newline by default, '_sec' when splitting by section.
$options->{rs} //= "\n";
$options->{rs} = '_sec' if $options->{sec};

# Deliberately global (not local): the corpus readers imported above are
# presumably driven by $/ -- verify against Text::Perfide::PartialAlign.
$/ = $options->{rs};

# --sec implies --all, and --all implies --max=1.
$options->{all} = 1 if $options->{sec};
$options->{max} = 1 if $options->{all};

# --- Positional arguments -------------------------------------------------

# Validate the positional count BEFORE looking at --max. Appending --max to
# @ARGV first would let a command line that is missing one positional
# argument pass this check, with the --max value taken as a language name.
usage(@ARGV) unless (@ARGV == 5 or @ARGV == 6);

my ($huFilename, $enFilename, $outputFilename, $huLangName, $enLangName)
    = @ARGV[0 .. 4];

# Chunk size precedence: --max, then the optional sixth positional argument,
# then the built-in default.
my $maximalChunkSize
    = $options->{max} ? $options->{max}
    : @ARGV == 6      ? $ARGV[5]
    :                   $DEFAULT_MAX_CHUNK_SIZE;

# --- Alignment pipeline ---------------------------------------------------

_log("Reading corpora...");
my ($huCorpus, $huOffsets, $huTextRef) = get_corpus($huFilename);
my ($enCorpus, $enOffsets, $enTextRef) = get_corpus($enFilename);
_log("Done.");

my ($commonHap, $huPositions, $enPositions)
    = calc_common_tokens($huCorpus, $enCorpus);

my $pairs = calc_pairs(
    $commonHap,
    $huPositions, $enPositions,
    $huCorpus,    $enCorpus,
);

my $chain = build_chain($pairs, $maximalChunkSize, $options);

subcorpora2files(
    $chain,
    $huTextRef, $enTextRef,
    $huOffsets, $enOffsets,
    $outputFilename,
    $huLangName, $enLangName,
);

__END__

=head1 NAME

partial_align2 - partially align two texts and split them into corresponding files

=head1 SYNOPSIS

    partial_align2 [options] file1 file2 output_prefix l1 l2 [max_size]

=head1 DESCRIPTION

Reads the two input files, computes the tokens they have in common, builds a
chain of matching positions and writes the resulting sub-corpora to files
named after C<output_prefix>, C<l1> and C<l2>.

The optional C<max_size> argument sets the maximum chunk size; it defaults to
5000 and is overridden by C<--max>.

=head2 Options

=over 4

=item B<--rs=...>

Define record separator (Perl's C<$/>). Default is the newline character
(C<\n>).

=item B<--sec>

Split by section annotations added by Text::Perfide::BookCleaner
(same as C<--rs=_sec --all>).

=item B<-v>

Create several files with dumps of auxiliary structures.

=item B<--debug>

Accepted and forwarded to the aligner together with the other options.

=item B<--cf=...>

Pass an additional file containing correspondences between the two
languages. File must follow the format

    term(,term)* = term(,term)*

=item B<--all>

Try to split in as many files as possible (same as C<--max=1>).

=item B<--max=...>

Maximum size of the split files (in bytes). Takes precedence over the
positional C<max_size> argument.

=back

=head2 EXPORT

=head1 AUTHOR

Andre Santos, andrefs@cpan.org

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut