#!/usr/bin/perl

# Reads a text file in blank-line-separated chunks, runs each chunk through
# the FL3 tokenizer, splitter, morphological analyzer and NE classifier, and
# reports, for every multiword tagged with one of the known NE tags, the
# fraction of its occurrences that fell into each class.
#
# Usage: script [-l lang] filename
#   -l lang   language code passed to the FL3 processors (defaults to "en")

use strict;
use warnings;

use FL3;
use Getopt::Std;

my %options;
getopt('l', \%options);

my $lang     = $options{l} || "en";
my $filename = shift;

# A three-argument open() with an undefined filename does not fail: it opens
# an empty anonymous temporary file, so the script would silently print
# nothing.  Refuse to run without an input file instead.
die "Usage: $0 [-l lang] filename\n" unless defined $filename;

## init the morph analyzer
morph(
    $lang,
    ProbabilityAssignment => 'yes',
    QuantitiesDetection   => 'no',
    MultiwordsDetection   => 'yes',
    NumbersDetection      => 'no',
    DatesDetection        => 'yes',
    #OrthographicCorrection => 'no',
    NERecognition         => 'yes'
);

# Read the input one "\n\n"-terminated record at a time.
local $/ = "\n\n";

open my $fh, "< :utf8", $filename
    or die "Cannot open file $filename: $!\n";
binmode STDOUT, ":utf8";

# NE tags of interest and the label printed for each of them.
my %classes = (
    NP00SP0 => 'Person',
    NP00G00 => 'Geographical',
    NP00O00 => 'Organization',
    NP00V00 => 'Others',
);

# $counts{$entity}{_}      total number of occurrences of the entity
# $counts{$entity}{$class} occurrences classified as $class
my %counts;

while (my $line = <$fh>) {
    my $tokens    = tokenizer($lang)->tokenize($line);
    my $sentences = splitter($lang)->split($tokens);
    $sentences = morph($lang)->analyze($sentences);
    $sentences = nec($lang)->analyze($sentences);

    for my $sentence (@$sentences) {
        for my $word ($sentence->words) {
            next unless $word->is_multiword;

            my $tag = $word->tag;
            next unless exists $classes{$tag};

            my $class = $classes{$tag};

            # NOTE(review): get_mw_words() is called in scalar context and its
            # result is used as the hash key -- confirm against the FL3 docs
            # that this yields the multiword text and not a component count.
            my $fw = $word->get_mw_words();

            $counts{$fw}{_}++;
            $counts{$fw}{$class}++;
        }
    }
}

# All input has been consumed; release the handle before reporting.
close $fh;

# Sorted iteration keeps the report stable from run to run.
for my $mw (sort keys %counts) {
    my $total = $counts{$mw}{_};

    print $mw;
    for my $class (sort keys %{ $counts{$mw} }) {
        next if $class eq "_";

        # The label goes through %s so it can never be read as a format
        # directive.
        printf "\t%s (%.4f)", $class, $counts{$mw}{$class} / $total;
    }
    print "\n";
}