#!/usr/bin/perl -s

use common::sense;

# Emit n-grams on STDOUT through the :utf8 layer.
binmode STDOUT, ":utf8";

# Switches filled in by perl's -s option (see the shebang line):
#   -add           merge the top n-grams into the features file
#   -n=<value>     selects which out/<n>-*.dmp and <n>-features.txt are used
#   -ssize=<count> how many top-ranked n-grams to report / add
our ($add, $n, $ssize);
my ($l1, $l2) = @ARGV;

# Fall back to the defaults when a switch is missing (or false).
$n     = 3  unless $n;
$ssize = 10 unless $ssize;

unless ($l1 && $l2) {
	die "Need two languages";
}

# Per-language statistics, then one difference score per n-gram.
my ($l1weight, $l1grams) = load_grams($l1);
my ($l2weight, $l2grams) = load_grams($l2);

my $diffs = compute_diffs($l1weight, $l2weight, $l1grams, $l2grams);

# Highest difference score first.
my @ranked = sort { $diffs->{$b} <=> $diffs->{$a} } keys %$diffs;

# Print "<n-gram> TAB <language 1 value> TAB <language 2 value>" for the
# leading entries; $N counts down from $ssize and the loop stops at zero.
my $N = $ssize;
foreach my $gram (@ranked) {
	last if !$N;
	print join("\t", $gram, $l1grams->{$gram}, $l2grams->{$gram}), "\n";
	$N -= 1;
}

# With -add, merge the top-ranked n-grams into "$n-features.txt" (one n-gram
# per line, duplicates collapsed). Without it, only print a hint.
if ($add) {
	my $features_file = "$n-features.txt";
	my %feature;

	# A missing file just means no features have been collected yet, so the
	# first -add run creates it instead of dying.
	if (-e $features_file) {
		open my $in, "<:utf8", $features_file
			or die "Cannot read $features_file: $!";
		while (my $line = <$in>) {
			chomp $line;
			$feature{$line} = 1;
		}
		close $in;
	}

	# Same ranking and same count-down cut-off as the report loop earlier in
	# the script, so the n-grams added here are the ones that were printed.
	my $remaining = $ssize;
	for my $k (sort {$diffs->{$b} <=> $diffs->{$a}} keys %$diffs) {
		last unless $remaining;
		$feature{$k} = 1;
		$remaining--;
	}

	# Sorted output keeps the file stable between runs (hash order is random).
	open my $out, ">:utf8", $features_file
		or die "Cannot write $features_file: $!";
	for my $k (sort keys %feature) {
		print $out "$k\n";
	}
	# Buffered write errors only surface when the handle is closed.
	close $out or die "Cannot finish writing $features_file: $!";

} else {
	print "\nUse -add to add to $n-features file\n";
}

# Score every n-gram by how differently the two languages use it.
#
# Arguments (all hash references keyed by n-gram):
#   $l1w, $l2w - weights for language 1 and language 2
#   $l1g, $l2g - values for language 1 and language 2
#
# Returns a hash reference mapping each n-gram that has a weight in either
# language to  |weight1 - weight2| * |value1 - value2|.  An entry missing
# (or false) on one side counts as 0.
sub compute_diffs {
	my ($l1w, $l2w, $l1g, $l2g) = @_;

	# Union of the n-grams known to either weight table.
	my %union;
	@union{ keys(%$l1w), keys(%$l2w) } = ();

	my %score;
	foreach my $gram (keys %union) {
		my $weight_gap = abs(($l1w->{$gram} || 0) - ($l2w->{$gram} || 0));
		my $value_gap  = abs(($l1g->{$gram} || 0) - ($l2g->{$gram} || 0));
		$score{$gram} = $weight_gap * $value_gap;
	}

	return \%score;
}

# Load and combine the n-gram dump files of one language.
#
# Argument: the language tag used in the dump file names.
#
# Every file matching out/$n-<language>_*.dmp (with the file-level $n) is
# evaluated with `do` and is expected to yield a hash reference mapping
# n-gram => number. A file that cannot be loaded, or that yields something
# else, is reported on STDERR and contributes nothing.
#
# Returns a two-element list:
#   $weight - hash ref: n-gram => number of dump files that contain it
#   $hash   - hash ref: n-gram => sum of its values over the dump files,
#             divided by the number of files matched by the pattern
sub load_grams {
	my $lang = shift;
	my @files = glob("out/$n-${lang}_*.dmp");
	my $hash = {};
	my $weight = {};
	for my $f (@files) {
		# `do FILE` looks a relative path up in @INC unless it starts with
		# "./" or "../", and "." is no longer in @INC since Perl 5.26, so
		# anchor the path to the current directory explicitly.
		my $x = do "./$f";
		unless (ref($x) eq 'HASH') {
			my $why = $@         ? $@
			        : defined $x ? "it did not evaluate to a hash reference"
			        :              "$!";
			chomp $why;
			warn "Skipping $f: $why\n";
			next;
		}
		for my $k (keys %$x) {
			$hash->{$k} += $x->{$k};
			$weight->{$k} ++;
		}
	}

	# $hash is only non-empty when @files is, so this never divides by zero.
	for my $k (keys %$hash) {
		$hash->{$k} /= scalar(@files);
	}

	return ($weight, $hash);
}
