#!/usr/bin/perl -w use NAT::Corpus; use NAT::Lexicon; my $lex = shift; my $crp; my %chunks = (); while ($crp = shift @ARGV) { my $corpus = NAT::Corpus::new($crp); my $corpusIterator = $corpus->iterator; while($sentence = $corpusIterator->next) { chunk($sentence); } $corpus->free; } my $lexicon = NAT::Lexicon::open($lex); my $foo = 0; for my $context (keys %chunks) { $foo++; DumpEntry($context,$chunks{$context}); } print STDERR ">> $foo contexts\n"; sub chunk { my $sentence = shift; my @sentence = @{$sentence}; my $nrchunks = $#sentence; @sentence = ("0","0",@sentence,'0','0'); for (0..$nrchunks) { my @slice = @sentence[$_..$_+4]; $chunks{sprintf("%X %X|%X %X",$slice[0],$slice[1],$slice[3],$slice[4])} .= sprintf("%X ",$slice[2]); } } sub DumpEntry { my $context = shift; my $string = shift; my %e = (); $e{$_}++ for (split /\s/, $string); if (scalar(keys %e) > 1) { print "CHUNK: ",idstosentence($context),"\n"; for (keys %e) { print " ",$lexicon->word_from_id(hex($_))," ($e{$_})\n"; } print "\n"; } } sub idstosentence { my $x = shift; $x =~ s/\b0\b/^/g; $x =~ s/[0-9A-F]+/$lexicon->word_from_id(hex($&))/ge; return $x; }