#!/usr/bin/perl

use common::sense;

use MLDBM;
use XML::RSS::Parser;
use FileHandle;
use LWP::Simple qw.$ua getstore is_success.;
use Term::ANSIColor;
use Try;
use Digest::MD5 'md5_hex';
use Fcntl;  
use Encode qw(encode_utf8);

# Number of languages that are still short of documents once this run is over.
our $NOT_YET = 0;

# Everything printed is decoded text, so both streams get a UTF-8 layer.
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

# Persistent registry of the feed items that were already saved, kept in
# DB.db and shared with dump_feed() through the tied hash.
my %DB;
my $DB = tie %DB, 'MLDBM', 'DB.db', O_CREAT|O_RDWR, 0640 or die $!;

# File listing the feeds, one "<language code><TAB><feed url>" per line.
my $url_file      = "RSSs";

my %url = load_urls($url_file);

my $parser = XML::RSS::Parser->new;

# Present a browser-like User-Agent on the requests made through LWP::Simple.
$ua->agent("Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0");

# Languages named on the command line, or every language of the URL file.
my @langs = @ARGV ? @ARGV : (sort keys %url);

for my $lang (@langs) {
    say STDERR yellow("Processing $lang");

    # Documents of a language live in a directory named after its code.
    # Without that directory nothing can be stored, so do not fetch anything.
    unless (-d $lang or mkdir($lang)) {
        say STDERR red("Can't create directory $lang: $!");
        $NOT_YET++;
        next;
    }

    if (enough($lang)) {
        say green(" enough documents for $lang");
        next;
    }

    # The "|| []" keeps a language code without any known feed harmless.
    foreach my $f (@{ $url{$lang} || [] }) {
        # Stop downloading as soon as the quota has been reached.
        last if enough($lang);
        grab($lang, $f, $lang);
    }

    # Judge completeness after fetching, so that a language filled up during
    # this run is not reported as incomplete.
    $NOT_YET++ unless enough($lang);
}

print "Languages not yet complete: $NOT_YET\n";

# Drop the extra reference to the tied object first; otherwise untie cannot
# release it (see "The untie Gotcha" in perltie).
undef $DB;
untie %DB;

# enough($folder)
#
# True when $folder already holds the wanted number of documents (200), as
# reported by count(). The comparison is ">=" rather than "==" so that a
# folder which somehow ended up with more than 200 files is still considered
# complete instead of being fetched for ever.
sub enough {
    my ($folder) = @_;
    return count($folder) >= 200;
}

# count($folder)
#
# Returns how many entries match "$folder/*.txt".
sub count {
    my $dir = shift;

    # Assigning to the empty list forces list context on glob; the scalar
    # assignment around it then yields the number of matches.
    my $total = () = glob "$dir/*.txt";

    return $total;
}

# dump_feed($lang, $feed, $output)
#
# Stores the items of a parsed feed as text files inside $output.
#
#   $lang   - language code; prefixes the key under which an item is
#             remembered in %DB
#   $feed   - parsed feed object; its '//item' nodes are processed in order
#   $output - directory receiving the out_NNN.txt files
#
# Processing stops as soon as enough($output) is true. An item whose key is
# already in %DB is skipped, and so is an item that has no identifying
# element or no content element (a notice goes to STDERR). Each saved file
# holds the title, when there is one, followed by the tag-stripped content.
# Dies if a file cannot be opened or closed. Always returns 1.
sub dump_feed {
    my ($lang, $feed, $output) = @_;

    my $i = 1;

    # item => content
    for my $item ($feed->query('//item')) {
        last if enough($output);

        # The first of these elements that exists identifies the item.
        my $id_node = $item->query('guid')    ||
                      $item->query('dc:date') ||
                      $item->query('link')    ||
                      $item->query('pubDate');

        # One incomplete item must not abort the rest of the feed.
        unless ($id_node) {
            say STDERR "  skipping item without guid, dc:date, link or pubDate";
            next;
        }

        my $guid = $lang . md5_hex(encode_utf8($id_node->text_content));

        next if $DB{$guid};

        my $title = $item->query('title');
        $title = $title->text_content if $title;

        my $content_node = $item->query('content:encoded') ||
                           $item->query('description');

        unless ($content_node) {
            say STDERR "  skipping item without content:encoded or description";
            next;
        }

        my $content = $content_node->text_content;

        # Pick the next free out_NNN.txt name. $output is passed as an
        # argument so that a '%' in it cannot corrupt the format string.
        my $file;
        do {
            $file = sprintf("%s/out_%03d.txt", $output, $i++);
        } while -f $file;

        open my $ofh, ">:utf8", $file or die "$!";
        print $ofh $title, "\n\n" if $title;
        print $ofh clean_up($content);
        # Buffered write errors only show up when the handle is closed.
        close $ofh or die "Can't close $file: $!";

        # Remember the item only once it is safely on disk; marking it
        # earlier would lose it for good if the write failed.
        $DB{$guid} = 1;

        say "  saved $file";

    }

    my $c = count($output);
    say " $lang has $c documents";

    return 1;
}

# clean_up($text)
#
# Returns a copy of $text in which every "<...>" sequence holding at least
# one character between the brackets is replaced by a single space. The
# argument itself is left untouched.
sub clean_up {
    (my $stripped = $_[0]) =~ s{<[^>]+>}{ }g;
    return $stripped;
}

# grab($lang, $url, $output)
#
# Downloads the feed at $url into the scratch file "tmp.rss", parses it and
# passes the parsed feed to dump_feed() so its items end up in $output.
# A failed download, and any error raised while parsing or dumping, is
# reported on STDERR in red. The scratch file is removed before returning.
sub grab {
    my ($lang, $url, $output) = @_;

    my $scratch = "tmp.rss";

    say " - trying $url";

    my $status = getstore($url, $scratch);

    if (!is_success($status)) {
        say STDERR red("error fetching $lang rss feed: $status");
    } else {
        # Pause after every successful download.
        sleep 2;

        my $rss_fh = FileHandle->new($scratch);

        try {
            my $parsed = $parser->parse_file($rss_fh);

            dump_feed($lang, $parsed, $output);

        } catch {
            say STDERR red("Can't parse fetched feed: $_");
        }
    }

    # The next call must not find a stale download.
    unlink $scratch;
}


# load_urls($file)
#
# Reads the feed list in $file and returns a hash (as a flat list) mapping
# each language code to an array reference of its feed URLs, in file order.
#
# Every useful line is "<code><TAB><url>". Anything from a '#' to the end of
# the line is a comment, and blank lines are ignored. Surrounding whitespace
# (including the "\r" of CRLF files) is trimmed, and several tabs between
# the two columns are accepted. A line that does not provide both a code and
# a URL is skipped with a warning rather than stored as an undefined URL.
#
# Dies if $file cannot be opened.
sub load_urls {
    my $file = shift;
    my %url;
    open my $fh, "<", $file or die "Error opening $file: $!";
    # A named lexical is used so that the caller's $_ is not clobbered.
    while (my $line = <$fh>) {
        chomp $line;
        $line =~ s/#.*$//;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        next if $line eq '';

        my ($code, $url) = split /\t+/, $line;

        unless (defined $code && length $code && defined $url && length $url) {
            warn "load_urls: skipping malformed line $. of $file\n";
            next;
        }

        push @{$url{$code}}, $url;
    }
    close $fh;
    return %url;
}

# Returns the arguments wrapped by _colored() using the 'bright_yellow' colour.
sub yellow {
    return _colored(bright_yellow => @_);
}
# Returns the arguments wrapped by _colored() using the 'bright_red' colour.
sub red {
    return _colored(bright_red => @_);
}
# Returns the arguments wrapped by _colored() using the 'bright_green' colour.
sub green {
    return _colored(bright_green => @_);
}

# _colored($color, @text)
#
# Returns a list: the Term::ANSIColor sequence for $color, then the
# remaining arguments unchanged, then the 'reset' sequence.
sub _colored {
    my ($color, @text) = @_;
    return (color($color), @text, color('reset'));
}

