#!/usr/bin/perl
use strict;
use warnings;
use XML::TMX::Reader;
use Encode;
use Lingua::FreeLing;
use Lingua::FreeLing::Simple;
# Language codes of the two segments of each translation unit that get analysed.
my $l1 = "pt";
my $l2 = "en";

# Analysers keyed by language code. Declared outside the file loop so an
# analyser built for one input file is reused for the following ones.
my %analysers = ();

# Each TMX file named on the command line is rewritten to "<file>.freeling",
# with the $l1 and $l2 segments replaced by their analysis as rendered by
# to_string().
FILE:
for my $file (@ARGV) {
    my $tm    = XML::TMX::Reader->new($file);
    my @langs = $tm->languages;

    # initialize analysers (only for languages not seen in an earlier file)
    for my $lang (@langs) {
        next if $analysers{$lang};
        $analysers{$lang} = Lingua::FreeLing::Simple->new($lang);
    }

    # Analysers are only created for the languages the file declares, so a
    # file lacking $l1 or $l2 would otherwise die inside the callback with a
    # method call on an undefined value. Report it and move to the next file.
    for my $lang ($l1, $l2) {
        next if $analysers{$lang};
        warn "$file: no analyser available for language '$lang'; skipping file\n";
        next FILE;
    }

    # process TMX file
    $tm->for_tu(
        { output => "$file.freeling" },
        sub {
            my $tu = shift;
            for my $lang ($l1, $l2) {
                # Leave a unit's missing segment alone instead of feeding
                # undef to the analyser.
                next if not defined $tu->{$lang};

                # The text is handed to the analyser as ISO-8859-1 octets;
                # to_string() decodes the analyser output back.
                my $octets = encode("iso-8859-1", $tu->{$lang});
                $tu->{$lang} = to_string($analysers{$lang}->analyse($octets));
            }
            return $tu;
        }
    );
}
# Render analysed sentences as plain text, one word per line.
#
# Parameters:
#   $sentences - array ref of sentence objects. Each must provide get_words(),
#                returning an array ref of word objects that offer get_form(),
#                get_lemma() and get_parole().
#
# Returns a string holding one line per word, in input order, of the shape
# "\tFORM LEMMA TAG\n". The three values are decoded from ISO-8859-1 before
# being written. An empty sentence list yields the empty string.
sub to_string {
    my $sentences = shift;
    my $text = "";

    for my $s (@$sentences) {
        my $ws = $s->get_words;
        for my $w (@$ws) {
            my $form  = decode("iso-8859-1", $w->get_form);
            my $lemma = decode("iso-8859-1", $w->get_lemma);
            my $tag   = decode("iso-8859-1", $w->get_parole);

            # Assemble the line directly instead of filling a "#FORM# #LEMMA#
            # #TAG#" template with successive substitutions: a form or lemma
            # that itself contains "#LEMMA#" or "#TAG#" would be rewritten by
            # the later substitutions.
            $text .= "\t" . join(" ", $form, $lemma, $tag) . "\n";
        }
    }

    return $text;
}
# Render analysed sentences using a per-word template wrapped in per-sentence
# opening and closing lines.
#
# Parameters:
#   $sentences - array ref of sentence objects. Each must provide get_words(),
#                returning an array ref of word objects that offer get_form(),
#                get_lemma() and get_parole().
#
# Returns the concatenation, for every sentence, of "\t\n", one
# "\t\t<word>\n" line per word, and a closing "\t\n".
#
# NOTE(review): the word template below is empty, so none of the placeholders
# ever match and every word line comes out as just "\t\t\n"; the sentence
# wrapper lines likewise carry only whitespace. The markup appears to be
# missing -- confirm the intended element names before relying on this output.
sub to_xml {
    my $sentences = shift;

    my $template_word = qq||;
    my @pieces;

    for my $s (@$sentences) {
        push @pieces, "\t\n";

        my $ws = $s->get_words;
        for my $w (@$ws) {
            # Placeholder patterns paired with their values, in the order in
            # which they are applied to the template.
            my @fills = (
                [ qr/#FORM#/,  scalar $w->get_form ],
                [ qr/#LEMMA#/, scalar $w->get_lemma ],
                [ qr/#TAG#/,   scalar $w->get_parole ],
            );

            my $word = $template_word;
            for my $fill (@fills) {
                my ($pattern, $value) = @$fill;
                $word =~ s/$pattern/$value/g;
            }

            push @pieces, "\t\t$word\n";
        }

        push @pieces, "\t\n";
    }

    return join "", @pieces;
}