#!/usr/bin/perl -w -s

use CWB;
#use Lingua::PT::PLN;
use Lingua::PT::PLNbase;
use Data::Dumper;
use strict;

# Command-line switches; they are filled in by perl's -s option before the
# script body runs (see the POD at the end of the file).
our ($local, $html, $quebra, $lema, $text, $filename);

# -text is shorthand for handing the "-text" option to quebraxmlsent.
# NOTE(review): a bare -quebra sets $quebra to 1 (perl -s semantics), so the
# filter is then invoked as "quebraxmlsent 1 < file" -- confirm that this is
# what quebraxmlsent expects.
$quebra = "-text" if $text;

my $corpname = shift or die("$0 ... CorpusName\n");

# Registry directory: the one reported by CWB, or ~/registry with -local.
chomp(my $regis = `cwb-config -r`);
$regis = "$ENV{HOME}/registry" if $local;

my $corpdir = "/corpora";

# The tokenised corpus goes to "$corpname.corpus", optionally filtered
# through the external addradical command when -lema is given.
my $corpus_fh;
if ($lema) {
    open($corpus_fh, '|-', "addradical > $corpname.corpus")
        or die("Error in addradical");
}
else {
    open($corpus_fh, '>', "$corpname.corpus")
        or die("cant create $corpname.corpus\n");
}

# Tag names collected over all input files, keyed by the two categories
# returned by cqptokens ("s" and "v").
my %t = (s => {}, v => {});

# cqptokens is given no output handle, so the corpus handle is made the
# default one while the input files are processed (presumably cqptokens
# prints to the currently selected handle -- TODO confirm).
select($corpus_fh);
for my $file (@ARGV) {
    my $input = $file;
    if    ($html)   { $input = "html2pml $file |" }
    elsif ($quebra) { $input = "quebraxmlsent $quebra < $file|" }
    print STDERR "$input\n";

    my $tags = cqptokens({ enc => ":utf8", outenc => ":utf8" }, $input);

    # Merge the tags of this file into the global sets; a missing category
    # is skipped instead of dying on an undefined hash reference.
    for my $kind (qw(s v)) {
        my $found = ($tags || {})->{$kind} or next;
        @{ $t{$kind} }{ keys %$found } = values %$found;
    }
    #    while(){
    #      print $_ ;
    #      xmlsentences($_);
    #    }
}
select(STDOUT);

# A failed close means a write error or, with -lema, a failing addradical:
# in both cases the corpus file cannot be trusted.
close($corpus_fh) or die("Error closing $corpname.corpus ($?)$!\n");
print STDERR "$corpname.corpus\n";

my ($r, $o) = tags2options($corpname, %t);

# The registry directory must exist before the registry entry is written.
_ensure_dir($regis);
open(my $registry_fh, '>', "$regis/$corpname")
    or die("cant create $regis/$corpname: $!\n");
print {$registry_fh} $r;
close($registry_fh) or die("cant write $regis/$corpname: $!\n");

_ensure_dir("$corpdir/$corpname");
# NOTE(review): the purpose of the "c1" directory is not visible in this
# script; it is kept as a best-effort mkdir exactly as before.
mkdir("$corpdir/c1");

# Start from an empty data directory.
unlink(glob("$corpdir/$corpname/*"));

my $encode_cmd =
    "$CWB::Encode -d $corpdir/$corpname -0 corpus -s $o < $corpname.corpus";
print STDERR $encode_cmd;
system($encode_cmd) == 0
    or die("Erro Encode -d $corpdir/$corpname -s $o < $corpname.corpus ($?)$!\n");

system("$CWB::Makeall $corpname") == 0
    or die("Erro Makeall ($?)$!\n");

## TODO (to review): compression steps, arguments still to be determined
##   !system ("$CWB::CompressRdx ??? $corpname") || die ("Erro Makeall ($?)$@$!\n");
##   !system ("$CWB::Huffcode ??? $corpname") || die ("Erro Makeall ($?)$@$!\n");
## TODO (to review): delete some of the .cr?? .corpus corpuz.rdx corpus.rev
##   files and the others that the tools report as removable

# _ensure_dir($dir)
# Creates $dir (one level only) unless it already exists; dies when the
# directory cannot be created.
sub _ensure_dir {
    my ($dir) = @_;
    return if -d $dir;
    mkdir($dir) or die("cant create directory $dir: $!\n");
    return;
}

# tags2options($name, %t)
#
# Builds the CWB registry entry and the cwb-encode option string for the
# corpus $name.
#
#   $name  corpus name, used for NAME, ID and the HOME directory
#   %t     ( s => { tag => ... }, v => { tag => ... } ): tag names collected
#          from the input; only the keys are used
#
# Reads the file-level $corpdir, $lema and $filename.
# Tags whose name contains "corpus" are ignored (the encoder is already
# called with "-0 corpus"); a tag present in both sets is declared once, as
# a -V attribute.
#
# Returns the list ($registry_text, $encode_options).
sub tags2options {
    my ($name, %t) = @_;

    my @v = grep { !/corpus/ } keys %{ $t{v} };

    # Exact-name lookup: tag names must not be interpolated into a regex,
    # where characters such as "." would act as metacharacters.
    my %is_v = map { $_ => 1 } @v;
    my @s = grep { !/corpus/ && !$is_v{$_} } keys %{ $t{s} };

    my $registry = <<"END_REGISTRY";

NAME "$name"
ID $name
HOME $corpdir/$name
ATTRIBUTE word
END_REGISTRY

    $registry .= "STRUCTURE $_\n" for @v, @s;
    $registry .= "ATTRIBUTE alema\n" if $lema;
    $registry .= "ATTRIBUTE apos\n"  if $lema;
    $registry .= "ATTRIBUTE tit\n"   if $filename;

    my $options = "";
    $options .= " -P alema -P apos " if $lema;
    $options .= " -P tit "           if $filename;
    $options .= " -V $_" for @v;
    $options .= " -S $_" for @s;

    return ($registry, $options);
}

__END__

=head1 NAME

txt2cqp - Perl script to make CWB-cqp corpora from pure text or XML

=head1 SYNOPSIS

 txt2cqp [options] corpusName file*

 options
   -lema        to make lemmatization and POS attributes ("alema" and "apos")
   -text        to process plain text files
   -html        to process html input
   -quebra      to break text in sentences
   -quebra=p    for XML texts, to break p elements in sentences
   -filename    to make a "tit" attribute       ### Not working
   -local       to use $HOME/registry as the registry directory

=head1 DESCRIPTION

The input files are tokenized into the file F<corpusName.corpus>, a registry
entry for the corpus is written to the CWB registry directory, and the corpus
is then encoded and indexed under F</corpora/corpusName> with the CWB encode
and makeall tools.

=head1 Options

C<-text> - to process text with empty lines as paragraphs

C<-html> - to process html input

C<-lema> - it uses the jspell morphological analyser (through the
C<addradical> command) to evaluate a lemma and a pos, and builds the
C<alema> (ambiguous lemma) and C<apos> (ambiguous pos) attributes.

C<-filename> - to enable the creation of an attribute with the filename,
called C<tit>

C<-quebra> - to break sentences and insert a sentence split tag

C<-quebra=tagname> - to break the XML elements named C<tagname> in sentences
and insert a sentence split tag

C<-local> - to write the registry entry in F<$HOME/registry> instead of the
directory reported by C<cwb-config -r>

=head1 Install

In order to install this script you need:

 . CWB (to build and query the corpus)
 . jspell (in order to use the C<-lema> option)
 . Lingua::PT::PLNbase (to tokenize the input for cqp)

=head1 Bugs

option C<-filename> is not working

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

cwb -- corpus workbench (Stuttgart)

jspell

perl(1).

Lingua::PT::PLN

=cut