#!/usr/bin/perl # PODNAME: dmoss-corpus # ABSTRACT: command line tool to create a corpus use warnings; use strict; use File::Slurp qw/slurp/; use File::Find; use File::Copy; use File::Temp qw/tempdir/; use File::Which; use DMOSS::Oracle; use DMOSS::File; use HTML::FormatText::Html2text; use Path::Iterator::Rule; use Lingua::Identify::CLD; my $root = shift; die unless $root; my $dispatcher = { 'README INSTALL TEXT' => sub { `cat $_[0]` }, 'MAN' => sub { `/usr/bin/man ./$_[0] | col -b` }, 'HTML' => sub { return HTML::FormatText::Html2text->format_file($_[0]); }, 'JAVA' => sub { _handle_java($_[0]) }, }; my $ora = DMOSS::Oracle->new; find(\&proc_file, $root); sub proc_file { return if $_ =~ m/^\.+$/; my $name = $_; my $path = $File::Find::name; return if ($path =~ m/\.gen/ or -d $path); my $file = DMOSS::File->new('', "$path/$name"); my $type = $ora->type($file); return unless $type; print STDERR "FILE $path <- TYPE $type\n"; my $cld = Lingua::Identify::CLD->new(); foreach (keys %$dispatcher) { if (grep {$type eq $_} split /\s+/, $_) { my $sub = $dispatcher->{$_}; my $cnt = &$sub($name); my $lang = $cld->identify($cnt); if (lc($lang) eq 'english') { print STDERR " + Adding by RE [$_] LANG [$lang]\n"; print $cnt; } } } } sub _handle_java { my $file = shift; my $dir = tempdir( CLEANUP => 1 ); my $javadoc_bin = which('javadoc'); my $javadoc_opts = '-notimestamp -nodeprecatedlist -notree -noindex -nohelp -nonavbar -quiet'; return unless $javadoc_bin; `$javadoc_bin $javadoc_opts -d $dir $file`; my $target = $file; $target =~ s/\.java$/\.html/i; my $rule = Path::Iterator::Rule->new; $rule->name($target); foreach ( $rule->all($dir) ) { return HTML::FormatText::Html2text->format_file($_); } } __END__ =encoding UTF-8 =head1 SYNOPSIS $ tar zxvf tree-1.5.3.tgz $ dmoss-corpus tree-1.5.3/ > corpus.txt =head1 DESCRIPTION This tools' goal is to create a corpus of the software documentation, and other natural language content files. The corpus is printed to the standard output. The files type are computed by L.