#!/usr/bin/perl -ws

use strict; use warnings;
main(@ARGV);

sub _log {
	print STDERR "$_[0]\n";
}

sub tokenFreq {
	my $corpus = shift;
	my $freq = {};
	for my $l (@$corpus) {
		for my $t (@$l) {
			$freq->{$t}++;
		}
	}
	return $freq;
}

sub hapaxes {
	my $freq = shift;
	my $hapaxes = {};
	while(my ($token, $count) = each(%$freq)) {
		$hapaxes->{$token} = 1 if $count == 1;
	}
	return $hapaxes;
}

sub hapaxPositions {
	my ($hapaxes, $corpus) = @_;
	my $hapaxPos = {};
	my $corpus_size = @$corpus;
	for(my $ind = 0; $ind < $corpus_size; $ind++){
		my $l = $corpus->[$ind];
		for my $t (@$l) {
			$hapaxPos->{$t} = $ind if (defined($hapaxes->{$t}));
		}
	}
	return $hapaxPos;
}

sub uniqSort {
	my $l = shift;
	my $hash = {};
	my $uniqSorted = [];
	map { $hash->{$_->[0]}{$_->[1]} = 1 } @$l;
	for my $x (sort { $a <=> $b } keys %$hash){
		for my $y (sort { $a <=> $b } keys %{$hash->{$x}}){
			push @$uniqSorted, [$x,$y];
		}
	}
	return $uniqSorted;
}

sub less {
	my ($a,$b) = @_;
	if ($a->[0] < $b->[0] and $a->[1] < $b->[1])
							{ return 1;	}
	else 					{ return 0; }
}


sub maximalChain {
	my $pairs = shift;
	# print Dumper @$pairs;
	my $lattice = {};
	for my $p (@$pairs) {
		my $bestLength = 0;
		my $bestPredessor = undef;
		for my $q (@$pairs) {
			if(less($q,$p) and defined($lattice->{$q->[0]}{$q->[1]})){
				my ($length,$dummy) = @{$lattice->{$q->[0]}{$q->[1]}};
				if($bestLength < $length+1){
					# print "$bestLength < $length\n";
					$bestLength = $length+1;
					$bestPredessor = $q;

				}
			}
		}
		$lattice->{$p->[0]}{$p->[1]} = [$bestLength,$bestPredessor];
		#print "$bestLength @$p $bestPredessor\n";
	}
	my $x = [ map { [$lattice->{$_->[0]}{$_->[1]}[0],$_] } @$pairs ] ;
	my $y = (sort { $b->[0] <=> $a->[0] } @$x)[0];
	my ($bestLength,$p) = @$y;
	my $chain = [];
	while($p){
		push @$chain, $p;
		(my $length, $p) = @{$lattice->{$p->[0]}{$p->[1]}} ;
	}
	return [reverse @$chain ];
}

sub selectFromChain {
	my ($chain,$maximalChunkSize) = @_;
	my $forced = 0;
	my $cursor;
	my $filteredChain = [];

	my $chain_size = @$chain;
	for (my $ind = 0; $ind < $chain_size; $ind++) {
		my $p = $chain->[$ind];
		if($ind == 0) {
			push @$filteredChain, $p;
			$cursor = $p;
			next;
		}
		if(	$p->[0] - $cursor->[0] > $maximalChunkSize or
			$p->[1] - $cursor->[1] > $maximalChunkSize) 	{
			my $lastPos;
			$lastPos = ($ind!=0 ? $chain->[$ind-1] : [0,0]);
			if ($lastPos != $cursor)	{ push @$filteredChain, $lastPos }
			else						{
				push @$filteredChain,$p;
				$forced = 1;
			}
			$cursor = $filteredChain->[-1];
		}
	}


	push @$filteredChain, $chain->[-1] unless(defined($filteredChain->[-1]) and
										$filteredChain->[-1]==$chain->[-1]);

	return ($filteredChain,$forced);
}

sub main {
	unless (@_ == 5 or @_ == 6){
		_log("A preprocessor for hunalign.");
		_log("Cuts a very large sentence-segmented unaligned bicorpus into smaller parts manageable by hunalign.");
		_log("");
		_log("Usage: $0 huge_text_in_one_language huge_text_in_other_language output_filename name_of_first_lang name_of_second_lang [ maximal_size_of_chunks=5000 ] > hunalign_batch");
		_log("");
		_log("The two input files must have one line per sentence. Whitespace-delimited tokenization is preferred.");
		_log("The output is a set of files named output_filename_[123..].name_of_lang");
		_log("The standard output is a batch job description for hunalign, so this can and should be followed by:");
		_log("hunalign dictionary.dic -batch hunalign_batch");
		exit -1;
	}

	my $maximalChunkSize = (@_==6 ? $_[5] : 5000);
	my ($huFilename,$enFilename,$outputFilename,$huLangName,$enLangName) = @_[0..5];

	_log("Reading corpora...");
# Falta o trim
	open my $hufh, '<', $huFilename;
	my $huCorpus = [ map { [ split ]} <$hufh> ]; # Array of sentences, sentence = Array of words
	open my $enfh, '<', $enFilename;
	my $enCorpus = [ map { [ split ]} <$enfh> ];
	_log("Done.");

	print "num sents: ",scalar @$huCorpus,"\n";

	my $huFreq = tokenFreq($huCorpus);  # Map word => frequency (number of times word appears in corpus)
	my $enFreq = tokenFreq($enCorpus);
	my $huHap  = hapaxes($huFreq);  	# Words which have frequency = 1
	my $enHap  = hapaxes($enFreq);


	my %hash;
	@hash{keys %$huHap} = (1) x keys %$huHap;
	my $commonHap = [ grep { $hash{$_} } keys %$enHap ] ;
	my $huPositions = hapaxPositions($huHap, $huCorpus); # Map word => id_sentence
	my $enPositions = hapaxPositions($enHap, $enCorpus);


	my $pairs = [];					# (id_sentence_file1, id_sentence_file2)
	for my $t (@$commonHap) {
		# print "$huPositions->[$t]\t$enPositions->[$t]\t$t\n";
		push @$pairs, [$huPositions->{$t}, $enPositions->{$t}];
	}
	push @$pairs, [0,0];

	my $corpusSizes = [ scalar @$huCorpus, scalar @$enCorpus ];
	push @$pairs, $corpusSizes;

	$pairs = uniqSort($pairs);

	_log("Computing maximal chain in poset...");
	my $chain = maximalChain($pairs);
	_log("Done.");
	_log(length(@$chain)." long chain found in ".length(@$pairs)." sized poset...");

	if($maximalChunkSize > 0) {
		_log("Selecting at most $maximalChunkSize sized chunks...");
		($chain,my $forced) = selectFromChain($chain,$maximalChunkSize);
		_log(scalar(@$chain)." chunks selected.");
		_log("Done.");
		_log("WARNING: maximalChunkSized could not be obeyed.") if $forced;
	}

	my $debug = 0;
	if($debug) {
		my $justResult = 1;
		my $chainToPrint = ($justResult ? @$chain[0..@$chain-2] : @$pairs[0..@$pairs-2] );

		for my $ref (@$chainToPrint) {
			my ($huPos,$enPos) = @$ref;
			my $s = (join ' ',$huCorpus->[$huPos]) . "\t" . (join ' ',$enCorpus->[$enPos]);
			if($justResult){
				print "$s\n";
			} else {
				$s+="\t<<<<<<<<" if grep { $_->[0] == $huPos and $_->[1] == $enPos } @$chain;
				print "$s\n";
				print;
			}
		}
	}
	else {
		my $justPrintChain = 0;
		if($justPrintChain) {
			for my $p (@$chain) {
				print "$p->[0]\n$p->[1]\n";
			}
		}
		else {
			_log("Writing subcorpora to files...");
			my $lastPos = [0,0];
			my $ind = 1;
			for my $pos (@$chain) {
				next if $pos->[0] == $lastPos->[0] and $pos->[1] == $lastPos->[1];
				my $baseFilename = "${outputFilename}_$ind";
				my $huSubCorpus = strInterval($huCorpus, $lastPos->[0], $pos->[0]);
				my $enSubCorpus = strInterval($enCorpus, $lastPos->[1], $pos->[1]);
				my $huFilename = "$baseFilename.$huLangName";
				open my $huFile, '>', $huFilename;
				print $huFile $huSubCorpus;
				close $huFile;


				my $enFilename = "$baseFilename.$enLangName";
				open my $enFile, '>', $enFilename;
				print $enFile $enSubCorpus;
				close $enFile;

	#			print "$huFilename\t$enFilename\t$baseFilename.align\n";

				$lastPos = $pos;
				$ind++;
			}
			_log("Done.");
		}
	}
}


sub strInterval {
	my ($corpus,$start,$end) = @_;
	my $s;
	for my $line (@$corpus[$start..$end-1]){
		$s.= (join ' ', @$line) . "\n";
	}
	#print "=======================\n$s\n";
	return $s;
}