#!/usr/bin/perl -CSDL

use strict;
use warnings;

use NAT::Client;
use Data::Dumper;
$Data::Dumper::Indent = 1;
$Data::Dumper::Terse  = 1;

# Name of the local corpus (looked up relative to the current directory).
my $id = "__jj1-constituicao";

my $client = NAT::Client->new( Local => "./$id" );

# Main loop: every input line holds 1 to 3 words.  For each line the most
# frequent continuations are printed, one per output line, prefixed by the
# value found in the column that follows the words of the n-gram (printed as
# the occurrence count).
LINE:
while ( my $line = <> ) {
    chomp $line;

    # split ' ' ignores leading whitespace, so " foo" is one word, not two.
    my @words = split ' ', $line;
    next LINE unless @words;

    if ( @words > 3 ) {
        # Only tetragrams are printed; with more than 3 context words there
        # is no column layout below that would make sense.
        warn "skipping '$line': expected 1 to 3 words, got "
          . scalar(@words) . "\n";
        next LINE;
    }

    # Normalized query prefix: words separated by exactly one blank, so the
    # token count computed inside sngrams() matches the count used here.
    my $word = join ' ', @words;
    my $ws   = @words;

    if ( $ws == 3 ) {
        # 3 words + 1 wildcard: the 4th word is at index 3, count at index 4.
        for my $gram ( sngrams( { max => 10 }, "$word *" ) ) {
            printf "%-13s |", "$gram->[4] $gram->[3]";
            print "\n";
        }
    }
    elsif ( $ws == 2 ) {
        # 2 words + 1 wildcard: the 3rd word is at index 2, count at index 3.
        for my $tri ( sngrams( { max => 10 }, "$word *" ) ) {
            printf "%-13s |", "$tri->[3] $tri->[2]";

            # Expand each trigram with its most frequent 4th word.
            for my $tetra ( sngrams( { max => 10 }, "$word $tri->[2] *" ) ) {
                print "| $tetra->[3] ";
            }
            print "\n";
        }
    }
    else {
        # 1 word + 1 wildcard: the 2nd word is at index 1, count at index 2.
        for my $bi ( sngrams( { max => 10 }, "$word *" ) ) {
            printf "%-13s |", "$bi->[2] $bi->[1]";

            # Expand each bigram with its most frequent 3rd and 4th words.
            for my $tetra ( sngrams( { max => 10 }, "$word $bi->[1] * *" ) ) {
                print "| $tetra->[2] $tetra->[3]";
            }
            print "\n";
        }
    }
    print "\n";
}

# sngrams( [\%options,] $pattern )
#
# Queries the corpus for the n-grams matching $pattern (blank-separated
# tokens, "*" being used by the callers as a wildcard) and returns them
# sorted in descending numeric order of the element that follows the tokens
# of the n-gram (index == number of tokens in $pattern).
#
# Options:
#   max  - maximum number of entries returned (default 50); a false value
#          disables the limit.
#
# Returns a list of array references.  Undefined entries are never returned,
# and the list is not padded when fewer than "max" n-grams are available.
sub sngrams {
    my %opt = ( max => 50 );
    if ( ref( $_[0] ) eq "HASH" ) {
        %opt = ( %opt, %{ shift @_ } );
    }
    my $exp = shift;

    # Number of tokens in the pattern; split ' ' skips leading whitespace so
    # the result really is the index of the column used for sorting.
    my @tokens = split ' ', $exp;
    my $ng     = @tokens;

    # "|| []" keeps an undefined answer from dying under "use strict".
    my @sorted =
      sort { $b->[$ng] <=> $a->[$ng] }
      grep { defined }
      @{ $client->ngrams($exp) || [] };

    # Truncate instead of slicing past the end (which would pad with undef).
    if ( $opt{max} && @sorted > $opt{max} ) {
        splice @sorted, $opt{max};
    }
    return @sorted;
}

__END__

=head1 NAME

jj-8 - compact tetragrams from 1 to 3 words

=head1 SYNOPSIS

 jj-8 [file ...]

Each input line (from the given files or from standard input) must hold
1, 2 or 3 words.

=head1 DESCRIPTION

(Previously: jj-1 -> to create the corpus)

Print compact tetragrams.

Example of output:

 == os tribunais
 3 de          || primeira | segunda | comarca
 2 da          || relacao
 2 sao         || os | independentes
 2 judiciais   || de | sao
 2 comuns      || . | em
 1 .           || (null)
 1 tem         || direito
 1 previstos   || nos
 1 aplicar     || normas

 == tribunais
 10 judiciais  || de primeira| de segunda| e o| sao os| ou de| formam um
 5 de          || qualquer instancia| conflitos .| primeira instancia| comarca ,
 5 administrativos || e fiscais
 4 sao         || os orgaos| publicas ,| independentes e| obrigatorias para
 3 militares   || com competencia
 2 e           || do ministerio| os demais

In order to be accurate, a big corpus is necessary...

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut