#!/usr/bin/perl
# PODNAME: dmoss-corpus
# ABSTRACT: command line tool to create a corpus

use warnings;
use strict;

use File::Slurp qw/slurp/;
use File::Find;
use File::Copy;
use File::Temp qw/tempdir/;
use File::Which;
use DMOSS::Oracle;
use DMOSS::File;
use HTML::FormatText::Html2text;
use Path::Iterator::Rule;
use Lingua::Identify::CLD;

my $root = shift;
die unless $root;

my $dispatcher = {
  'README INSTALL TEXT'
    => sub { `cat $_[0]` },
  'MAN' => sub { `/usr/bin/man ./$_[0] | col -b` },
  'HTML' => sub {
      return HTML::FormatText::Html2text->format_file($_[0]);
    },
  'JAVA' => sub { _handle_java($_[0]) },
};

my $ora = DMOSS::Oracle->new;
find(\&proc_file, $root);

sub proc_file {
  return if $_ =~ m/^\.+$/;

  my $name = $_;
  my $path = $File::Find::name;
  return if ($path =~ m/\.gen/ or -d $path);

  my $file = DMOSS::File->new('', "$path/$name");
  my $type = $ora->type($file);
  return unless $type;
  print STDERR "FILE $path <- TYPE $type\n";

  my $cld = Lingua::Identify::CLD->new();
  foreach (keys %$dispatcher) {
    if (grep {$type eq $_} split /\s+/, $_) {
      my $sub = $dispatcher->{$_};
      my $cnt =  &$sub($name);

      my $lang = $cld->identify($cnt);
      if (lc($lang) eq 'english') {
        print STDERR " + Adding by RE [$_] LANG [$lang]\n";
        print $cnt;
      }
    }
  }
}

sub _handle_java {
  my $file = shift;
  my $dir = tempdir( CLEANUP => 1 );

  my $javadoc_bin = which('javadoc');
  my $javadoc_opts = '-notimestamp -nodeprecatedlist -notree -noindex -nohelp -nonavbar -quiet'; 
  return unless $javadoc_bin;

  `$javadoc_bin $javadoc_opts -d $dir $file`;

  my $target = $file;
  $target =~ s/\.java$/\.html/i;
  my $rule = Path::Iterator::Rule->new;
  $rule->name($target);
  foreach ( $rule->all($dir) ) {
    return HTML::FormatText::Html2text->format_file($_);
  }
}

__END__

=encoding UTF-8

=head1 SYNOPSIS

    $ tar zxvf tree-1.5.3.tgz
    $ dmoss-corpus tree-1.5.3/ > corpus.txt

=head1 DESCRIPTION

This tools' goal is to create a corpus of the software documentation,
and other natural language content files. The corpus is printed to the
standard output. The files type are computed by L<DMOSS::Oracle>.