#!/usr/bin/perl -s

# mkCorpus - drive the corpus preparation pipeline described by a YAML
# metafile: optional XML cleaning, tokenization, radical (lemma) annotation
# and CQP encoding.  Switches are parsed by perl's own -s option, so they
# must appear on the command line BEFORE the metafile name.

use strict;
use warnings;

use POSIX qw(locale_h);
use locale;

use Data::Dumper;
use File::Spec;
use YAML qw/LoadFile Bless Dump/;

setlocale(LC_ALL, "pt_PT");

# Command-line switches, populated by "perl -s".  -makeall is the spelling
# used in the documentation; it is accepted as an alias of -all.
our ($addradical, $tokenize, $cqpcode, $all, $makeall, $xmlclean);
$all ||= $makeall;

my $usage =
    "./mkCorpus [-xmlclean] [-addradical|-tokenize|-cqpcode|-all] corpus.meta\n";

my $metafile = shift or die($usage);

## $metafile is a YAML file.
my $struct = LoadFile($metafile);
ref $struct eq 'HASH'
    or die "$metafile: expected a YAML mapping at top level\n";
defined $struct->{FILES}
    or die "$metafile: missing FILES entry\n";

# FILES is a comma-separated list; drop empty entries such as "a,,b".
my @files = grep { length } split /\s*,\s*/, $struct->{FILES};

FILE:
for my $inputfile (@files) {
    print STDERR "Processing [$inputfile]\n";

    # Output names are always derived from the ORIGINAL input name, even
    # when the input is replaced by its XML-cleaned copy below.
    my $tokenizeFile   = _with_extension($inputfile, 'token');
    my $annotationFile = _with_extension($inputfile, 'rad');

    my $current = $inputfile;
    if ($xmlclean) {
        my $cleaned = _cleaned_name($inputfile);
        if (!xmlclean($inputfile, $cleaned)) {
            warn "Skipping [$inputfile]: XML cleaning failed\n";
            next FILE;
        }
        $current = $cleaned;
    }

    if ($addradical) {
        addRadical($current, $annotationFile);
    }
    elsif ($tokenize) {
        tokenize($current, $tokenizeFile);
    }
    elsif ($cqpcode) {
        cqpcode($current, $metafile);
    }
    elsif ($all) {
        all($current, $tokenizeFile, $annotationFile);
    }
    # No explicit step requested: pick the next step from the extension.
    # The patterns are anchored and the dot is escaped so that a name such
    # as "mycorpus.txt" is not mistaken for a ".cor" file.
    elsif ($current =~ /\.cor\z/) {
        tokenize($current, $tokenizeFile);
    }
    elsif ($current =~ /\.token\z/) {
        addRadical($current, $annotationFile);
    }
    elsif ($current =~ /\.rad\z/) {
        cqpcode($current, $metafile);
    }
    else {
        all($current, $tokenizeFile, $annotationFile);
    }
}

## Helper: return $file with its last extension replaced by ".$ext" (or
## with ".$ext" appended when the last path component has no extension).
## "/" is excluded from the extension so that a dot inside a directory
## name ("dir.d/file") is never taken for the extension.
sub _with_extension {
    my ($file, $ext) = @_;
    (my $renamed = $file) =~ s{(?:\.[^./]+)?\z}{.$ext};
    return $renamed;
}

## Helper: return the name used for the XML-cleaned copy of $path, i.e.
## the same path with "__" prepended to its last component
## ("dir/f.cor" -> "dir/__f.cor").
sub _cleaned_name {
    my ($path) = @_;
    (my $cleaned = $path) =~ s{([^/]*)\z}{__$1};
    return $cleaned;
}

## Helper: run the command in the array ref $argv without going through a
## shell (file names with spaces or metacharacters are passed verbatim)
## and write its standard output to the file $output.
## Returns 1 on success; on any failure prints a warning and returns 0.
sub _run_to_file {
    my ($argv, $output) = @_;

    my $out;
    if (!open $out, '>', $output) {
        warn "\ncannot write [$output]: $!\n";
        return 0;
    }

    my $pipe;
    if (!open $pipe, '-|', @{$argv}) {
        warn "\ncannot run [$argv->[0]]: $!\n";
        close $out;
        return 0;
    }

    # Copy bytes untouched; the tools decide the encoding of their output.
    binmode $pipe;
    binmode $out;

    my $ok = 1;
    while (read($pipe, my $chunk, 65536)) {
        print {$out} $chunk or $ok = 0;
    }

    # Closing a pipe handle waits for the child and sets $?.
    close $pipe or $ok = 0;
    my $status = $?;
    close $out or $ok = 0;

    warn "\n[$argv->[0]] failed (wait status $status)\n" unless $ok;
    return $ok;
}

## Tokenization/segmentation step: runs "sentences -tokenize=cqp" on
## $input and stores its output in $output.  Returns true on success.
sub tokenize {
    my ($input, $output) = @_;

    print STDERR " - running segmenter tool...";
    my $ok = _run_to_file(['sentences', '-tokenize=cqp', $input], $output);
    print STDERR $ok ? "done [$output]\n" : "FAILED\n";
    return $ok;
}

## Cleaning step: runs "xmlclean" on $input with the tags listed under the
## metafile's "TAGS V" key, then moves the file the tool leaves behind
## ("outfilecorpus.cor") to $output.  Returns true on success.
sub xmlclean {
    my ($input, $output) = @_;

    print STDERR " - running XML cleaning...";

    my $tags = $struct->{"TAGS V"};
    $tags = '' unless defined $tags;

    # The tag list is split on whitespace, which is how the shell used to
    # split it when the command was built as a single string.
    my $ok = _run_to_file(
        ['xmlclean', $input, '-tags', split(' ', $tags)],
        File::Spec->devnull,
    );

    if ($ok && !rename("outfilecorpus.cor", $output)) {
        warn "\ncannot rename outfilecorpus.cor to [$output]: $!\n";
        $ok = 0;
    }

    print STDERR $ok ? "done [$output]\n" : "FAILED\n";
    return $ok;
}

## Annotation step: runs "addradical" on $input and stores its output in
## $output.  Returns true on success.
sub addRadical {
    my ($input, $output) = @_;

    print STDERR " - adding radicals...";
    my $ok = _run_to_file(['addradical', $input], $output);
    print STDERR $ok ? "done [$output]\n" : "FAILED\n";
    return $ok;
}

## CQP encoding step: runs "tokens2cqp.pl $input $meta".  Whatever the
## tool prints on standard output is discarded, as it always was.
## Returns true on success.
sub cqpcode {
    my ($input, $meta) = @_;

    print STDERR " - codifying for CQP...";
    my $ok = _run_to_file(['tokens2cqp.pl', $input, $meta],
        File::Spec->devnull);
    print STDERR $ok ? "done\n" : "FAILED\n";
    return $ok;
}

## Runs every step in order (tokenize, addRadical, cqpcode), stopping at
## the first step that fails so later steps never consume a missing or
## truncated intermediate file.  Returns true when all steps succeeded.
sub all {
    my ($input, $tokenizeF, $annotationF) = @_;

    # Normalise the intermediate names; a no-op when the caller already
    # passes ".token" / ".rad" names.
    $tokenizeF   = _with_extension($tokenizeF,   'token');
    $annotationF = _with_extension($annotationF, 'rad');

    return tokenize($input, $tokenizeF)
        && addRadical($tokenizeF, $annotationF)
        && cqpcode($annotationF, $metafile);
}

__END__

=head1 NAME

mkCorpus - prepare a corpus to be searched by nat-based CWB

=head1 SYNOPSIS

  mkCorpus [options] corpus.meta

=head1 DESCRIPTION

Given a metafile with the required information about the corpus, the
script prepares the corpus files to be searched by a nat-based CWB, or
through a web interface to it.

The metafile has a pre-defined syntax and structure, described below.

By giving certain options before the metafile, we can run only the steps
that we want, and ignore some attributes of the metafile.

=head2 Syntax of META

  CORPUS: corpusname
  FILES: corpusfiles
  ATTRIBUTES: corpusattributes
  TAGS S: corpustagss
  TAGS V: corpustagsp

The corpusname is the name that we want the corpus to have.

The corpusfiles is the comma-separated set of file names that compose
the corpus.

The corpusattributes is the set of attributes that we want the corpus to
have.

The corpustagsp is the set of XML S tags that the corpus files have.
This is the list given to C<xmlclean> when B<-xmlclean> is used.

The corpustagss is the set of XML P tags that the corpus files have.

Example:

  CORPUS: name
  FILES: f1,f2,f3
  ATTRIBUTES: alema, apos
  TAGS S:
  TAGS V: text, p, s

=head2 Options

Options must be given before the metafile name.

=over 4

=item -xmlclean

Cleans each corpus file with C<xmlclean> before any other step.  The
cleaned copy is named after the original file, prefixed with C<__>.  It
can be combined with any of the options below.

=item -addradical

Adds two kinds of syntactic information to every word of the corpus: the
first is the lemma and the second is the morphosyntactic properties.

=item -tokenize

Tokenizes the corpus, putting one word per line.

=item -cqpcode

Encodes the tokens, preparing them to be searched by nat-based CWB.

=item -all

Runs all the steps needed for the corpus to be properly searched:
tokenization, annotation and CQP encoding.  B<-makeall> is accepted as a
synonym.

=back

When none of B<-addradical>, B<-tokenize>, B<-cqpcode> or B<-all> is
given, the step is chosen from the extension of each file: C<.cor> files
are tokenized, C<.token> files are annotated, C<.rad> files are encoded,
and any other file goes through all the steps.

=head1 AUTHOR

J. Joao Almeida

Alberto Simoes

Joana Vilas Boas

=head1 SEE ALSO

perl(1), sentences(1), xmlclean(1), tokens2cqp.pl(1), addradical(1),
jspell(1).

=cut