#!/usr/bin/perl
use libmorfo_perl;
# initialisation of the FreeLing objects
# Read standard input one line at a time; each line is copied from $_ into
# the lexical $line for the processing steps below.
while (<STDIN>) {
my $line = $_;
# process the text with FreeLing
# display the result
}
# Tokenizer, configured from tokenizer.dat under "$DATA$LANG".
# NOTE(review): $DATA and $LANG are not defined in the visible code — confirm
# where they are set.
my $tokenizer =
libmorfo_perl::tokenizer->new("$DATA$LANG/tokenizer.dat");
# Splitter, configured from splitter.dat under the same path prefix.
my $splitter =
libmorfo_perl::splitter->new("$DATA$LANG/splitter.dat");
# Options object for the morphological analyser, created for $LANG.
my $options =
libmorfo_perl::maco_options->new($LANG);
# Ten positional flags, the last two set to 0; presumably one flag per
# analyser sub-module — TODO confirm against the FreeLing documentation.
$options->set_active_modules(1,1,1,1,1,1,1,1,0,0);
# The argument list is elided ("...") in this listing.
$options->set_data_files(...);
# Analyser object built from the options configured above.
my $analyser =
libmorfo_perl::maco->new($options);
# Tagger loaded from tagger.dat; the meaning of the trailing 1 and 2
# arguments is not shown here — TODO confirm.
my $tagger =
libmorfo_perl::hmm_tagger->new($LANG,"$DATA$LANG/tagger.dat",1,2);
# Tokenise the current input line.
my $words = $tokenizer->tokenize($line);
# Group the tokens into sentences.
my $sentences = $splitter->split($words, 0);
# Morphological analysis, then part-of-speech tagging.
$sentences = $analyser->analyze($sentences);
$sentences = $tagger->analyze($sentences);
# Emit one "form lemma tag" line per word and a blank line after each sentence.
for my $sentence (@$sentences) {
    my $sentence_words = $sentence->get_words;
    for my $word (@$sentence_words) {
        # Scalar assignments keep the accessors in scalar context.
        my $form  = $word->get_form;
        my $lemma = $word->get_lemma;
        my $tag   = $word->get_parole;
        print "$form $lemma $tag\n";
    }
    print "\n";
}
"(...) consiste em apresentar as linhas gerais do Projecto Per-Fide, (...)"
(tirado do resumo da apresentação do projecto Per-Fide)
| (...) | ||
| consiste | consistir | VMIP3S0 |
| em | em | SPS00 |
| apresentar | apresentar | VMN0000 |
| as | o | DA0FP0 |
| linhas | linha | NCFP000 |
| gerais | geral | AQ0CP0 |
| de | de | SPS00 |
| o | o | DA0MS0 |
| Projecto_Per-Fide | projecto_per-fide | NP00000 |
| , | , | Fc |
| (...) | ||
...
use XML::TMX::Reader;
use XML::TMX::Writer;
use Lingua::FreeLing::Simple;
...
# Process each TMX file named on the command line. For every file a reader
# and a writer are created, and the writer is started with
# OUTPUT => "$file.freeling".
for my $file (@ARGV) {
    my $tmin  = XML::TMX::Reader->new($file);
    my $tmout = XML::TMX::Writer->new();

    # Initialise analysers: one per language reported by the reader. An
    # entry already present (and true) in %analysers is reused, so each
    # language's analyser is constructed at most once across all files.
    # NOTE(review): %analysers is not declared in this block — confirm it is
    # declared earlier in the file.
    my @langs = $tmin->languages;
    for my $lang (@langs) {
        $analysers{$lang} ||= Lingua::FreeLing::Simple->new($lang);
    }

    $tmout->start_tmx(DATATYPE => 'xml', OUTPUT => "$file.freeling");
    ### process the TMX file
    $tmout->end_tmx();
}
# Walk every translation unit of the input TMX. For each language that has a
# defined entry in the unit, the text is analysed and each sentence is
# rendered through to_xml; the rendered sentences are joined with newlines
# and the resulting per-language map is added to the output TMX.
$tmin->for_tu(
    {},
    sub {
        my ($unit) = @_;
        my %annotated;

        LANG:
        for my $lang (@langs) {
            next LANG unless defined $unit->{$lang};

            my $analysed  = $analysers{$lang}->analyse($unit->{$lang});
            my @xml_parts = Lingua::FreeLing::Simple::forall(
                $analysed,
                { sentence => \&to_xml, returns => 1 },
            );
            $annotated{$lang} = join "\n", @xml_parts;
        }

        $tmout->add_tu(%annotated);
    });
# Escape the characters that are special inside a double-quoted XML attribute
# value. An undefined value is rendered as the empty string.
sub _xml_attr_escape {
    my ($value) = @_;
    $value = '' unless defined $value;
    $value =~ s/&/&amp;/g;    # must run first so later entities are not re-escaped
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/"/&quot;/g;
    return $value;
}

# Serialise one sentence as a <sentence> element containing one
# <word ... /> element per word.
#
# Parameters:
#   $sentence - object providing get_words; the words are converted with
#               Lingua::FreeLing::Simple::get_tuple (lemma and tag requested).
# Returns:
#   A string of the form "<sentence>\n\t<word .../>\n<word .../></sentence>\n".
#   Each key of a word tuple becomes an attribute; keys are emitted in sorted
#   order so the output is reproducible, and values are XML-escaped.
sub to_xml {
    my ($sentence) = @_;
    my $ws    = $sentence->get_words;
    my @words = Lingua::FreeLing::Simple::get_tuple($ws, { lemma => 1, tag => 1 });

    my @elements;
    for my $w (@words) {
        my @attrs = map { $_ . '="' . _xml_attr_escape($w->{$_}) . '"' }
                    sort keys %$w;
        push @elements, '<word ' . join(' ', @attrs) . ' />';
    }

    return "<sentence>\n\t" . join("\n", @elements) . "</sentence>\n";
}
<tu><!--1:1-->
<tuv lang='en'>
<seg>Eastern Europe fell under the domination of the Soviet Union .</seg>
</tuv>
<tuv lang='pt'>
<seg>A Europa de Leste caiu sob o domínio da União Soviética .</seg>
</tuv>
</tu>
<tu>
<tuv xml:lang="en">
<seg>
<sentence>
<word form="Eastern" tag="JJ" lemma="eastern" />
<word form="Europe" tag="NNP" lemma="europe" />
<word form="fell" tag="VBD" lemma="fall" />
<word form="under" tag="IN" lemma="under" />
<word form="the" tag="DT" lemma="the" />
<word form="domination" tag="NN" lemma="domination" />
<word form="of" tag="IN" lemma="of" />
<word form="the" tag="DT" lemma="the" />
<word form="Soviet_Union" tag="NNP" lemma="soviet_union" />
<word form="." tag="Fp" lemma="." />
</sentence>
</seg>
</tuv>
<tuv xml:lang="pt">
<seg>
<sentence>
<word form="A" tag="DA0FS0" lemma="o" />
<word form="Europa_de_Leste" tag="NP00000" lemma="europa_de_leste" />
<word form="caiu" tag="VMIS3S0" lemma="cair" />
<word form="sob" tag="SPS00" lemma="sob" />
<word form="o" tag="DA0MS0" lemma="o" />
<word form="domínio" tag="NCMS000" lemma="domínio" />
<word form="de" tag="SPS00" lemma="de" />
<word form="a" tag="DA0FS0" lemma="o" />
<word form="União_Soviética" tag="NP00000" lemma="união_soviética" />
<word form="." tag="Fp" lemma="." />
</sentence>
</seg>
</tuv>
</tu>
<tu>
<tuv xml:lang="en">
<seg>
<sentence>
<word form="The" tag="DT" lemma="the" />
<word form="Eastern_Europe" tag="NNP" lemma="eastern_europe" />
<word form="fell" tag="VBD" lemma="fall" />
<word form="under" tag="IN" lemma="under" />
<word form="the" tag="DT" lemma="the" />
<word form="domination" tag="NN" lemma="domination" />
<word form="of" tag="IN" lemma="of" />
<word form="the" tag="DT" lemma="the" />
<word form="Soviet_Union" tag="NNP" lemma="soviet_union" />
<word form="." tag="Fp" lemma="." />
</sentence>
</seg>
</tuv>
<tuv xml:lang="pt">
<seg>
<sentence>
<word form="A" tag="DA0FS0" lemma="o" />
<word form="Europa_de_Leste" tag="NP00000" lemma="europa_de_leste" />
<word form="caiu" tag="VMIS3S0" lemma="cair" />
<word form="sob" tag="SPS00" lemma="sob" />
<word form="o" tag="DA0MS0" lemma="o" />
<word form="domínio" tag="NCMS000" lemma="domínio" />
<word form="de" tag="SPS00" lemma="de" />
<word form="a" tag="DA0FS0" lemma="o" />
<word form="União_Soviética" tag="NP00000" lemma="união_soviética" />
<word form="." tag="Fp" lemma="." />
</sentence>
</seg>
</tuv>
</tu>