#!/usr/bin/perl

# Encoding experiment for Lingua::FreeLing::Simple.
#
# The same literal sentence is handed to the "pt" analyser twice: once as
# the Perl string produced by the `use utf8` source literal, and once after
# being encoded to ISO-8859-1 octets.  Both analyses are printed through
# to_string() so the two results can be compared by eye.

use strict;
use warnings;
use utf8;

use Encode;
use Lingua::FreeLing;
use Lingua::FreeLing::Simple;

my %analysers = ();
my @langs     = qw|pt|;

# initialize analysers (one per language code, created only once)
for my $lang (@langs) {
    $analysers{$lang} ||= Lingua::FreeLing::Simple->new($lang);
}

# Disabled earlier experiments, kept for reference:
# Encode->from_to($tu->{$l1}, "utf8", "iso-8859-1");
# Encode->from_to($tu->{$l2}, "utf8", "iso-8859-1");
# $tu->{$l1} = to_string($analysers{$l1}->analyse($tu->{$l1}));
# $tu->{$l2} = to_string($analysers{$l2}->analyse($tu->{$l2}));
# Encode->from_to($tu->{$l1}, "iso-8859-1", "utf8");
# Encode->from_to($tu->{$l2}, "iso-8859-1", "utf8");

binmode(STDOUT, ":encoding(UTF-8)");

my $str1 = "mudanças é que é bonito.";

# Copy taken before the first analyse() call, so the second run starts from
# the untouched literal whatever analyse() does with its argument.
my $str2 = $str1;

##Encode::decode_utf8(Foo::Bar::escape_html(Encode::encode_utf8($what)));
##Encode::from_to($str2, "utf8", "iso-8859-1");
#Encode->from_to($str2, "utf8", "iso-8859-1");

# First run: the string exactly as written in the source.
my $s1 = $analysers{"pt"}->analyse($str1);

# Second run: the same text as ISO-8859-1 octets.
$str2 = encode("iso-8859-1", $str2);
print "'$str1'\n";
#print "'$str2'\n";
my $s2 = $analysers{"pt"}->analyse($str2);

print to_string($s1);
print "----------\n";
print to_string($s2);

# _fill_word_template($template, FORM => ..., LEMMA => ..., TAG => ...)
#
# Returns a copy of $template with every #FORM#, #LEMMA# and #TAG#
# placeholder replaced by the matching value.  All placeholders are
# replaced in a single pass, so a value that itself happens to contain
# placeholder text (e.g. a form of "#TAG#") is left untouched instead of
# being rewritten by a later substitution.
sub _fill_word_template {
    my ($template, %value_of) = @_;

    $template =~ s/#(FORM|LEMMA|TAG)#/$value_of{$1}/g;

    return $template;
}

# to_string($sentences)
#
# $sentences is an array reference of sentence objects; each must answer
# get_words() with an array reference of word objects that provide
# get_form(), get_lemma() and get_parole().
#
# Returns one line per word, "\t<form> <lemma> <tag>\n", for all sentences
# concatenated in order (no separator between sentences).  Every field is
# decoded from ISO-8859-1 before being inserted.  Returns "" when there are
# no sentences.
sub to_string {
    my $sentences = shift;

    my $template_word = qq|\t#FORM# #LEMMA# #TAG#|;
    my $text          = "";

    for my $sentence (@$sentences) {
        my $words = $sentence->get_words;

        for my $word (@$words) {
            my $form  = decode("iso-8859-1", $word->get_form);
            my $lemma = decode("iso-8859-1", $word->get_lemma);
            my $tag   = decode("iso-8859-1", $word->get_parole);

            my $line = _fill_word_template(
                $template_word,
                FORM  => $form,
                LEMMA => $lemma,
                TAG   => $tag,
            );
            $text .= "$line\n";
        }
    }

    return $text;
}

# to_xml($sentences)
#
# Same traversal and input contract as to_string(), but the fields are used
# as returned by the word objects (no decoding), each sentence is wrapped in
# an opening and a closing "\t\n" line, and each word line is prefixed with
# two tabs.
#
# NOTE(review): the word template is an empty string and the sentence
# wrappers contain only whitespace, so no form/lemma/tag ever reaches the
# output.  Presumably the markup that used to be in these literals was
# lost; confirm the intended tags before relying on this sub.  It is not
# called anywhere in this script.
sub to_xml {
    my $sentences = shift;

    my $template_word = qq||;
    my $xml           = "";

    for my $sentence (@$sentences) {
        $xml .= "\t\n";

        my $words = $sentence->get_words;
        for my $word (@$words) {
            my $form  = $word->get_form;
            my $lemma = $word->get_lemma;
            my $tag   = $word->get_parole;

            my $entry = _fill_word_template(
                $template_word,
                FORM  => $form,
                LEMMA => $lemma,
                TAG   => $tag,
            );
            $xml .= "\t\t$entry\n";
        }

        $xml .= "\t\n";
    }

    return $xml;
}