#!/usr/bin/perl
#
# Annotate the segments of TMX translation memories with FreeLing.
#
# Usage: pass one or more TMX files as arguments.  Each file is read with
# XML::TMX::Reader; the $l1 and $l2 segments of every translation unit are
# tokenized, sentence-split, morphologically analysed and PoS-tagged, then
# replaced by the XML rendering of that analysis.  The result for FILE is
# written to "FILE._".

use strict;
use warnings;

use XML::TMX::Reader;
use libmorfo_perl;
use Analysis;

# Location of the FreeLing installation and of its per-language data.
my $FREELINGDIR = '/home/jorge/tools/';
my $DATA        = $FREELINGDIR . 'share/FreeLing/';

# The two languages whose segments get analysed.
my $l1 = "pt";
my $l2 = "en";

# Cache of analyser bundles, keyed by language code (see init()).
my %analysers = ();

#sub apt{ uc($_[0])}
#sub aen{ uc($_[0])}

for my $file (@ARGV) {
    my $tm = XML::TMX::Reader->new($file);

    # Initialize the analysers for exactly the languages processed below,
    # once per language across all files.  Keying this on $l1/$l2 (rather
    # than on whatever the file declares) guarantees analyse() always finds
    # its analyser.
    for my $lang ($l1, $l2) {
        $analysers{$lang} ||= init($lang);
    }

    #if (not defined $l1) {
    #    ($l1,$l2) = sort ($tm->languages);
    #    print "Using languages: $l1/$l2\n"
    #}

    $tm->for_tu(
        { output => "$file._" },
        sub {
            my $tu = shift;

            for my $lang ($l1, $l2) {
                # A unit may lack one of the languages; leave it untouched
                # instead of feeding undef to the tokenizer.
                next unless defined $tu->{$lang};
                $tu->{$lang} = analyse($lang, $tu->{$lang})->to_xml;
            }

            return $tu;
        }
    );
}

# init($lang)
#
# Build the FreeLing processing chain for language $lang from the data files
# under "$DATA$lang/".
#
# Returns a hash reference with the keys 'tokenizer', 'splitter', 'maco' and
# 'tagger', each holding the corresponding libmorfo_perl object.
sub init {
    my $lang = shift;

    my $tokenizer = libmorfo_perl::tokenizer->new("$DATA$lang/tokenizer.dat");
    my $splitter  = libmorfo_perl::splitter->new("$DATA$lang/splitter.dat");

    my $options = libmorfo_perl::maco_options->new($lang);

    # NOTE(review): ten positional on/off flags, the last two disabled;
    # which morphological sub-module each position controls depends on the
    # installed FreeLing version -- confirm against its documentation.
    $options->set_active_modules(1, 1, 1, 1, 1, 1, 1, 1, 0, 0);

    # Data files for the morphological analyser; the punctuation table is
    # shared between languages, everything else is per-language.
    $options->set_data_files(
        "$DATA$lang/locucions.dat",
        "$DATA$lang/quantities.dat",
        "$DATA$lang/afixos.dat",
        "$DATA$lang/probabilitats.dat",
        "$DATA$lang/maco.db",
        "$DATA$lang/np.dat",
        "$DATA/common/punct.dat",
        "$DATA$lang/corrector/corrector.dat"
    );

    my $maco = libmorfo_perl::maco->new($options);

    # NOTE(review): the meaning of the trailing (1, 2) arguments is defined
    # by the FreeLing hmm_tagger constructor -- confirm before changing.
    my $tagger =
        libmorfo_perl::hmm_tagger->new($lang, "$DATA$lang/tagger.dat", 1, 2);

    return {
        tokenizer => $tokenizer,
        splitter  => $splitter,
        maco      => $maco,
        tagger    => $tagger,
    };
}

# analyse($lang, $str)
#
# Run $str through the analyser chain registered for $lang in %analysers:
# tokenizer, sentence splitter, morphological analyser and PoS tagger.
#
# Returns an Analysis object whose 'sentences' field holds the tagged
# sentences.  Dies if no analyser has been initialised for $lang.
sub analyse {
    my $lang = shift;
    my $str  = shift;

    my $chain = $analysers{$lang}
        or die "No analyser initialised for language '$lang'\n";

    my $words     = $chain->{tokenizer}->tokenize($str);
    my $sentences = $chain->{splitter}->split($words, 1);

    # morphological analysis
    $sentences = $chain->{maco}->analyze($sentences);

    # PoS tagging
    $sentences = $chain->{tagger}->analyze($sentences);

    my $analysis = Analysis->new();
    $analysis->{sentences} = $sentences;
    return $analysis;
}