use strict;
use warnings;
use utf8;
use Data::Dumper;
use FindBin qw($Bin);
use lib "$Bin/../lib";
require "$Bin/common.pl";

# Driver: takes a directory as the first command-line argument, locates every
# "ptd.pt-LANG.dmp" dump below it, pairs each one with its "ptd.LANG-pt.dmp"
# sibling, and writes the biwords produced by gen_biwords() as CSV files
# under output/ (one aggregate file per language plus one per prefix).
my $dir = shift;
die "Usage: $0 <directory>\n" unless defined $dir && length $dir;

# Single-quoted so the backslash in "\.dmp" actually reaches find(1); inside
# a double-quoted string Perl would silently reduce "\." to ".".
my $find_regex = '.*ptd.pt-[a-z]*.*\.dmp';

# List-form pipe open: find is executed directly, without a shell, so a
# directory name containing spaces or metacharacters is passed through intact.
open my $find_fh, '-|', 'find', $dir, '-regex', $find_regex
    or die "Could not run find on $dir: $!";
my @files = <$find_fh>;
chomp @files;
# find exits non-zero for e.g. unreadable sub-directories; the matches it did
# report are still usable, so only warn.
close $find_fh
    or warn "find reported a problem while scanning $dir (exit status $?)\n";

FILE:
for my $file (@files) {
    # Expected layout: .../<prefix>/<subdir>/ptd.pt-<lang>.dmp
    # Skip anything that does not fit instead of continuing with undefined
    # (or stale) capture variables.
    my ( $prefix, $lang ) =
        $file =~ m{ / ([^/]+?) / [^/]+ / ptd\.pt-(\w+)\.dmp }x;
    next FILE unless defined $prefix && defined $lang;

    # Path of the reverse-direction dump. Only the file-name component is
    # rewritten, so a directory that happens to contain "pt-<lang>" is left
    # untouched.
    ( my $right = $file ) =~ s{/ptd\.pt-\Q$lang\E\.dmp}{/ptd.$lang-pt.dmp};

    next FILE unless -e $file && -e $right;

    # List-form system() keeps $prefix away from the shell.
    system( 'mkdir', '-p', "output/$prefix" ) == 0
        or die "Could not create directory output/$prefix (exit status $?)";

    my $biwords = gen_biwords( $file, $right );
    dump_biwords( $biwords, "output/biwords-pt-$lang.csv" );
    dump_biwords( $biwords, "output/$prefix/biwords-pt-$lang.csv" );

    # Second pass with the "t" option enabled; results go to the "-all" files.
    $biwords = gen_biwords( $file, $right, t => 1 );
    dump_biwords( $biwords, "output/biwords-pt-$lang-all.csv" );
    dump_biwords( $biwords, "output/$prefix/biwords-pt-$lang-all.csv" );
}

# dump_biwords( $biwords, $f )
#
# Appends one line per biword to the file $f, creating the file if it does
# not exist yet.
#
#   $biwords - array reference of hash references; the 'l' and 'r' entries of
#              each hash are written as "l r" on a line of their own.
#   $f       - path of the output file (opened in append mode, so repeated
#              calls with the same path accumulate lines).
#
# Dies if the file cannot be opened or if closing it reports a write error.
sub dump_biwords {
    my $biwords = shift;
    my $f = shift;

    # '>>' already creates a missing file, so no separate (shell-based)
    # "touch" is needed.
    open my $out_fh, '>>', $f or die "Could not open $f: $!";

    for my $bw (@{$biwords}){
        my $left  = $bw->{'l'};
        my $right = $bw->{'r'};
        print {$out_fh} "$left $right\n";
    }

    # Buffered write errors only surface when the handle is closed.
    close $out_fh or die "Could not close $f: $!";
}
