#!/usr/bin/perl

use strict;
use warnings;

use XML::TMX::Reader;
use Encode;
use Lingua::FreeLing;
use Lingua::FreeLing::Simple;

# For every TMX file named on the command line, run the segments of the two
# languages below through a FreeLing analyser and replace each segment with
# the textual analysis produced by to_string(). The for_tu() call is given
# "<file>.freeling" as its output option (presumably the file the modified
# translation units are written to -- confirm against XML::TMX::Reader).

# Language codes of the segments that get analysed.
my $l1 = "pt";
my $l2 = "en";

# One analyser per language code, created on first need and shared by all
# files processed in this run.
my %analysers = ();

for my $file (@ARGV) {
    my $tm = XML::TMX::Reader->new($file)
        or die "Cannot open TMX file '$file'\n";
    my @langs = $tm->languages;

    # initialize analysers
    for my $lang (@langs) {
        $analysers{$lang} ||= Lingua::FreeLing::Simple->new($lang);
    }

    # Without this check a missing language only shows up as "Can't call
    # method on an undefined value" in the middle of the file.
    for my $lang ($l1, $l2) {
        warn "$file: no analyser for language '$lang'; "
            . "its segments are left untouched\n"
            unless $analysers{$lang};
    }

    # process TMX file
    $tm->for_tu(
        { output => "$file.freeling" },
        sub {
            my $tu = shift;

            for my $lang ($l1, $l2) {
                my $analyser = $analysers{$lang};

                # Skip units that lack this language instead of analysing
                # undef (and inventing a segment that was never there).
                next unless $analyser && defined $tu->{$lang};

                # The analyser is fed ISO-8859-1 bytes; characters outside
                # that repertoire are replaced by Encode's substitution
                # character.
                my $bytes = encode("iso-8859-1", $tu->{$lang});
                $tu->{$lang} = to_string($analyser->analyse($bytes));
            }

            return $tu;
        }
    );
}

# to_string($sentences)
#
# Renders analysed sentences as plain text, one word per line:
#   <TAB>form lemma tag<NEWLINE>
# where the tag is whatever the word's get_parole method returns.
#
#   $sentences - array reference of sentence objects; each must provide
#                get_words, returning an array reference of word objects
#                with get_form, get_lemma and get_parole.
#
# The three word attributes are decoded from ISO-8859-1 before being
# inserted. Returns the concatenated lines ("" when there are no words).
sub to_string {
    my $sentences = shift;

    my $template_word = qq|\t#FORM# #LEMMA# #TAG#|;
    my $text          = "";

    for my $s (@$sentences) {
        for my $w (@{ $s->get_words }) {
            my $form  = decode("iso-8859-1", $w->get_form);
            my $lemma = decode("iso-8859-1", $w->get_lemma);
            my $tag   = decode("iso-8859-1", $w->get_parole);

            my %value_of = (FORM => $form, LEMMA => $lemma, TAG => $tag);

            # Fill all placeholders in ONE pass so that a form or lemma
            # which itself contains e.g. "#TAG#" is not substituted again.
            (my $line = $template_word) =~ s/#(FORM|LEMMA|TAG)#/$value_of{$1}/g;

            $text .= "$line\n";
        }
    }

    return $text;
}

# to_xml($sentences)
#
# Variant of to_string() intended to produce markup; it is not called
# anywhere in this script. Takes the same kind of argument as to_string().
#
# NOTE(review): the word template is an empty string and the sentence
# opening/closing lines contain only a tab, so every word currently yields
# the line "\t\t\n". The original markup appears to have been lost; restore
# it here (using the #FORM#, #LEMMA# and #TAG# placeholders) before using
# this sub.
# NOTE(review): unlike to_string(), the word attributes are NOT decoded from
# ISO-8859-1 here -- confirm whether that is intended.
sub to_xml {
    my $sentences = shift;

    my $template_word = qq||;
    my $xml           = "";

    for my $s (@$sentences) {
        my $sentence = "\t\n";

        for my $w (@{ $s->get_words }) {
            my $form  = $w->get_form;
            my $lemma = $w->get_lemma;
            my $tag   = $w->get_parole;

            my %value_of = (FORM => $form, LEMMA => $lemma, TAG => $tag);

            # Single-pass placeholder substitution (see to_string).
            (my $word = $template_word) =~ s/#(FORM|LEMMA|TAG)#/$value_of{$1}/g;

            $sentence .= "\t\t$word\n";
        }

        $sentence .= "\t\n";
        $xml .= $sentence;
    }

    return $xml;
}