#!/usr/bin/perl -s

use strict;
use POSIX qw(locale_h);
setlocale(&POSIX::LC_ALL, "pt_PT");
use locale;
use warnings;

use Data::Dumper;
use YAML qw/LoadFile Bless Dump/;

## Command-line switches.  They are package variables because perl's -s
## option (see the #! line) sets them from "-name" arguments that precede
## the metafile.
our( $addradical, $tokenize, $cqpcode, $all, $xmlclean);

my $usage = "./mkCorpus [-xmlclean] [-addradical|-tokenize|-cqpcode|-all] <metafile>\n";

my $metafile = shift or die($usage);
my $struct = LoadFile($metafile); ## $metafile is a YAML file.

## Everything below reads keys out of $struct, so insist on a mapping here
## rather than failing later with an obscure "not a HASH reference" error.
ref($struct) eq 'HASH'
    or die("$metafile: expected a YAML mapping with a FILES entry\n");

## FILES is normally a comma-separated string ("f1,f2,f3"); a YAML sequence
## is accepted as well.  Empty entries (e.g. from "f1,,f2") are dropped.
my $filelist = $struct->{FILES};
my @files = ref($filelist) eq 'ARRAY' ? @$filelist
          : defined($filelist)        ? split(/\s*,\s*/, $filelist)
          :                             ();
@files = grep { defined($_) && length($_) } @files;
@files or die("$metafile: no FILES listed\n");

## Main loop: run the requested processing step on every file listed in the
## metafile.
##
## For each file the token (.token) and annotation (.rad) output names are
## derived from the input name.  If -xmlclean was given the file is cleaned
## first and the cleaned copy becomes the input of the following step.  The
## step is chosen by the first switch set among -addradical, -tokenize,
## -cqpcode and -all; with no switch it is chosen from the input extension
## (.cor -> tokenize, .token -> addRadical, .rad -> cqpcode, anything
## else -> all).
for my $inputfile (@files){
    print STDERR "Processing [$inputfile]\n";
    my $tokenizeFile = $inputfile;
    my $annotationFile = $inputfile;

    # Swap the extension of the file name (or append one when there is
    # none).  '/' is excluded so that a dot in a directory name is never
    # mistaken for the start of the extension.
    $tokenizeFile   =~ s/(\.[^.\/]+)?$/.token/;
    $annotationFile =~ s/(\.[^.\/]+)?$/.rad/;

    if($xmlclean) {
        # Prefix the file name itself, not the whole path, so that the
        # cleaned copy of "dir/f.cor" is "dir/__f.cor" and not "__dir/f.cor".
        (my $cleanedFile = $inputfile) =~ s{([^/]*)$}{__$1};
        xmlclean($inputfile, $cleanedFile);
        $inputfile = $cleanedFile;
    }
    if($addradical) {
        addRadical($inputfile,$annotationFile);
    }
    elsif($tokenize) {
        tokenize($inputfile,$tokenizeFile);
    }
    elsif($cqpcode) {
        cqpcode($inputfile,$metafile);
    }
    elsif($all) {
        all($inputfile,$tokenizeFile,$annotationFile);
    }
    else {
        # No explicit step: decide from the extension.  The tests are
        # anchored with the dot escaped; an unanchored /(.+).cor/ would also
        # match names such as "mycorpus.txt".
        if ($inputfile =~ /\.cor$/) {
            tokenize($inputfile,$tokenizeFile);
        }
        elsif ($inputfile =~ /\.token$/) {
            addRadical($inputfile,$annotationFile);
        }
        elsif ($inputfile =~ /\.rad$/) {
            cqpcode($inputfile,$metafile);
        }
        else {
            all($inputfile,$tokenizeFile,$annotationFile);
        }
    }
}

## Tokenization / segmentation step.
##
## Runs the external `sentences` tool with -tokenize=cqp on $input and
## redirects its standard output to $output (overwriting it).
##
##   $input  - file to segment
##   $output - token file to write
##
## Progress goes to STDERR.  A non-zero exit status is reported there
## instead of a misleading "done"; the sub does not die, so the remaining
## files are still processed.
##
## NOTE(review): both names are interpolated into a shell command unquoted,
## so names containing spaces or shell metacharacters will not work.
sub tokenize {
  my ($input,$output) = @_;
  print STDERR " - running segmenter tool...";
  # system() instead of backticks: stdout is already redirected to $output,
  # so there is nothing to capture, and system() returns the wait status.
  my $status = system("sentences -tokenize=cqp $input > $output");
  if ($status != 0) {
    print STDERR "FAILED (wait status $status)\n";
    return;
  }
  print STDERR "done [$output]\n";
}

## XML cleaning step.
##
## Runs the external `xmlclean` tool on $input, passing the metafile's
## "TAGS V" entry after -tags, then renames the file "outfilecorpus.cor"
## (in the current directory) to $output.
##
##   $input  - file to clean
##   $output - name the cleaned file is given
##
## Progress goes to STDERR.  A non-zero exit status and a failed rename are
## both reported there; the sub never dies.
sub xmlclean {
  my ($input, $output) = @_;
  print STDERR " - running XML cleaning...";
  # An absent "TAGS V" entry yields the same command line as before, just
  # without an "uninitialized value" warning.
  my $tags = defined $struct->{"TAGS V"} ? $struct->{"TAGS V"} : '';
  # Backticks are kept on purpose: whatever the tool prints on stdout is
  # captured and discarded, as it always was.
  `xmlclean $input -tags $tags `;
  my $status = $?;
  print STDERR "[xmlclean wait status $status] " if $status != 0;
  # The rename is still attempted after a non-zero status, since the tool's
  # exit-code conventions are not known here.
  if (rename("outfilecorpus.cor", $output)) {
    print STDERR "done [$output]\n";
  }
  else {
    print STDERR "FAILED (cannot rename outfilecorpus.cor to $output: $!)\n";
  }
}

## Annotation step.
##
## Runs the external `addradical` tool on $input and redirects its standard
## output to $output (overwriting it).
##
##   $input  - file to annotate
##   $output - annotated file to write
##
## Progress goes to STDERR, including the final "done" (it used to go to
## STDOUT, splitting one progress line across two streams).  A non-zero exit
## status is reported instead of "done"; the sub does not die.
##
## NOTE(review): both names are interpolated into a shell command unquoted,
## so names containing spaces or shell metacharacters will not work.
sub addRadical {
  my ($input, $output) = @_;
  print STDERR " - adding radicals...";
  # system() instead of backticks: stdout is already redirected to $output,
  # so there is nothing to capture, and system() returns the wait status.
  my $status = system("addradical $input > $output");
  if ($status != 0) {
    print STDERR "FAILED (wait status $status)\n";
    return;
  }
  print STDERR "done [$output]\n";
}

## CQP encoding step.
##
## Runs the external `tokens2cqp.pl` script with $input and $meta as its two
## arguments.
##
##   $input - file to encode
##   $meta  - path of the metafile
##
## Progress goes to STDERR, including the final "done" (it used to go to
## STDOUT, splitting one progress line across two streams).  A non-zero exit
## status is reported instead of "done"; the sub does not die.
sub cqpcode {
    my ($input,$meta) = @_;
    print STDERR " - codifying for CQP...";
    # Backticks are kept on purpose: whatever the script prints on stdout is
    # captured and discarded, as it always was.
    `tokens2cqp.pl $input $meta`;
    my $status = $?;
    if ($status != 0) {
        print STDERR "FAILED (wait status $status)\n";
        return;
    }
    print STDERR "done\n";
}

## Full pipeline: run every processing step in sequence.
##
## Arguments, in order: the file to process, the name for the tokenized
## output and the name for the annotated output.  Each output name has its
## last extension replaced (or one appended) with .token and .rad
## respectively before use.  The final encoding step is handed the
## file-level $metafile.
sub all {
    my ($source, $tokenPath, $radPath) = @_;

    # Settle both output names before launching any tool.
    s/(\.[^.]+)?$/.token/ for $tokenPath;
    s/(\.[^.]+)?$/.rad/   for $radPath;

    tokenize($source, $tokenPath);
    addRadical($tokenPath, $radPath);
    cqpcode($radPath, $metafile);
}


__END__

=head1 NAME

mkCorpus - prepare a corpus to be searched by nat-based CWB

=head1 SYNOPSIS

 mkCorpus [options] corpus.meta 

=head1 DESCRIPTION

Given a metafile with the required information about the corpus, the script
prepares the corpus files so that they can be searched with a nat-based CWB,
either directly or through a web interface.

The metafile has a pre-defined syntax and structure that will be 
described later.

Options given before the metafile select which processing steps are run;
some attributes of the metafile may then be ignored.

=head2 Syntax of META

  CORPUS: corpusname
  FILES: corpusfiles
  ATTRIBUTES: corpusattributes

  TAGS S: corpustagss
  TAGS V: corpustagsp
  
corpusname is the name to give to the corpus.
corpusfiles is the comma-separated list of files that make up the corpus.
corpusattributes is the set of attributes the corpus should have.
corpustagss is the set of XML S tags that the corpus files have.
corpustagsp is the set of XML P tags that the corpus files have.

Example:
   CORPUS: name
   FILES: f1,f2,f3
   ATTRIBUTES: alema, apos

   TAGS S:  
   TAGS V: text, p, s

=head2 Options

 -addradical
      This option adds to all words that compose the corpus two types
      of syntactic information. The first one is the lemma and the
      second one is the morphosyntactic properties.
      
 -tokenize
      This option tokenizes the corpus. It prepares the corpus,
      putting one word per line.
      
 -cqpcode
      This option codes the tokens, preparing them to be searched
      by nat-based CWB.
      
 -all
      This option performs all the steps needed for the corpus to be
      properly searched. It is also what runs when no step option is
      given and the file extension is not .cor, .token or .rad.

=head1 AUTHOR

J. Joao Almeida <jj@di.uminho.pt>
Alberto Simoes <ambs@di.uminho.pt>
Joana Vilas Boas <joana.vboas@gmail.com>

=head1 SEE ALSO

perl(1), sentences(1), tokens2cqp.pl(1), addradical(1), jspell(1).

=cut