#!/usr/bin/perl -s

# PODNAME: tmx-POStagger
# ABSTRACT: POStaggers translation units on a tmx file.

use strict;
use warnings;

use File::Basename qw/ fileparse /;
use File::Temp qw/ :POSIX /;

# Scratch file that carries the tagged TMX from the first pass to the second.
my $tmpName = tmpnam();

# Command-line switches; they are filled in by perl's own -s option
# (see the shebang line), e.g. "-o=out.tmx" or "-compact".
our (
     $o,       # output filename
     $mwe,     # tag mwe -- STILL NOT SUPPORTED
     $dates,   # do dates analysis...
     $ner,     # try to do NER....
     $compact, # compact form
     $s,       # use <s> tags (on by default)
    );

$s = 1 unless defined $s;

# FreeLing is a soft dependency: load it at run time so that a missing
# installation produces a readable message instead of a compile error.
eval { require FL3 };
die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@;
FL3->import();

eval { require Lingua::FreeLing3::Word };
die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@;
Lingua::FreeLing3::Word->import();

# Languages for which _init_ma() has already been run.
my %initted = ();

use XML::TMX::Reader '0.25';

my $file = shift or die "You must supply the name of the file to tokenize";

my $reader = XML::TMX::Reader->new($file);

# Default output is the input file name prefixed with "pos_", placed next to
# the input.  The prefix goes on the file name only: prefixing the whole path
# would turn "dir/file.tmx" into "pos_dir/file.tmx".
my ($file_name, $file_dir) = fileparse($file);
my $output = $file_dir . "pos_" . $file_name;
$output = $o if $o;

# Language codes (as captured by the regexp below) that were tagged.
my $pos_languages = {};

binmode STDOUT, ":utf8";

# First pass: tag every supported language of every translation unit and
# write the result to the scratch file.
$reader->for_tu(
    {
        -output   => $tmpName,
        -verbose  => 1,
        -verbatim => 1,
    },
    sub {
        my $tu = shift;
        for my $lang (keys %$tu) {
            # Anchored at the start of the key so that only the primary
            # language subtag decides: "ca-ES" must not be tagged as Spanish
            # just because the region contains "es".
            next unless $lang =~ /\A(pt|es|gl|it|fr|ru|en)/i;

            $pos_languages->{$1} = 1;
            my $ln = lc $1;

            # Non-compact output is wrapped in a CDATA section.
            my $seg = $compact ? "" : "<![CDATA[";
            my $txt = $tu->{$lang}{-seg} // '';

            # split ' ' (unlike /\s+/) drops leading whitespace, so no empty
            # word is handed to FreeLing.
            my @tokens = map { Lingua::FreeLing3::Word->new($_) } split ' ', $txt;

            my $sentences = splitter($ln)->split(\@tokens);
            $sentences = morph($ln)->analyze($sentences);
            $sentences = hmm($ln)->tag($sentences);

            for my $stc (@$sentences) {
                $seg .= "<s>\n" if $s;
                $seg .= $compact ? _dump_compact($stc->words)
                                 : _dump_words($stc->words);
                $seg .= "</s>\n" if $s;
            }

            $tu->{$lang}{-iscdata} = $compact ? 0 : 1;
            $seg .= "]]>" unless $compact;
            $tu->{$lang}{-seg} = $seg;
        }
        return $tu;
    });

# Second pass: re-read the scratch file, add the header properties that
# describe the annotation (non-compact mode only) and escape segments that
# were not wrapped in CDATA.
$reader = XML::TMX::Reader->new($tmpName);
$reader->for_tu(
    {
        -output   => $output,
        -verbatim => 1,
        -raw      => 1,
        -prop     => ($compact
                      ? {}
                      : {
                          # sorted so the property value is reproducible
                          'pos-tagged'       => join(",", sort keys %$pos_languages),
                          'pos-fields'       => "word,lemma,pos",
                          'pos-s-attributes' => ['s'],
                        }),
    },
    sub {
        $_ = $_[0];
        # A unit that contains a CDATA section anywhere is left untouched.
        if (!/\[CDATA\[/) {
            # "." does not match a newline here (no /s modifier), so only
            # segments that sit on a single line are rewritten.
            s{(<seg>)(.*?)(</seg>)}
             {my ($before, $middle, $after) = ($1, $2, $3);
              for ($middle) { s/&/&amp;/g; s/</&lt;/g; s/>/&gt;/g; }
              $before . $middle . $after
             }ge;
        }
        return $_;
    });

#print STDERR "$tmpName\n";
unlink $tmpName if -e $tmpName;

# _init_ma($lang)
#
# Calls morph() for $lang with the detection options derived from the
# command-line switches, and records the language in %initted.
#
# NOTE(review): nothing in this script calls _init_ma, so the -mwe, -dates
# and -ner switches currently never reach the analyzer.  Confirm whether a
# call per language was intended before morph($ln) is first used.
sub _init_ma {
    my $lang = shift;
    morph($lang,
          QuantitiesDetection => 0,
          MultiwordsDetection => ($mwe   // 0),
          NumbersDetection    => 0,
          DatesDetection      => ($dates // 0),
          NERecognition       => ($ner   // 0));
    $initted{$lang}++;
}

# _dump_words(@words)
#
# Returns the verbose rendering of a list of word objects: one
# "form<TAB>lemma<TAB>tag" line per word.  A multiword is rendered as an
# <mwe> element that carries its lemma and tag and encloses the lines of
# its component words (recursively).
#
# NOTE(review): the element and attribute names (mwe, lemma, pos) were
# restored from the sprintf arguments and the -mwe switch; confirm them
# against the canonical script.
sub _dump_words {
    my @words = @_;
    my $seg = "";
    for my $t (@words) {
        if ($t->is_multiword) {
            $seg .= sprintf("<mwe lemma='%s' pos='%s'>\n", $t->lemma(), $t->tag());
            $seg .= _dump_words($t->get_mw_words);
            $seg .= "</mwe>\n";
        }
        else {
            $seg .= $t->form() . "\t" . $t->lemma() . "\t" . $t->tag() . "\n";
        }
    }
    return $seg;
}

# _dump_compact(@words)
#
# Returns the compact rendering of a list of word objects: a single line of
# space-separated "lemma/T" items, where T is the first character of the
# tag.  The lowercased form is used when a word has no lemma, and the parts
# of a multiword are joined with "_".  The line is entity-escaped because
# compact segments are not wrapped in CDATA.
sub _dump_compact {
    my $seg = join(" " => map {
        my $t = $_;
        ($t->is_multiword
            ? join("_", map { $_->lemma() || $_->lc_form() } $t->get_mw_words)
            : ($t->lemma() || $t->lc_form()))
        . "/" . substr($t->tag(), 0, 1);
    } @_) . "\n";
    $seg =~ s/&/&amp;/g;
    $seg =~ s/</&lt;/g;
    $seg =~ s/>/&gt;/g;
    return $seg;
}

=encoding UTF-8

=head1 SYNOPSIS

   tmx-POStagger file.tmx    # creates pos_file.tmx

   tmx-POStagger -o=out.tmx file.tmx

   -ner      ... tags multiword named entities
   -dates
   -compact

=head1 DESCRIPTION

Although this script is bundled in C<XML::TMX>, it has a soft dependency
on C<Lingua::FreeLing3>. Soft means that the dependency is not ensured at
install time, and other features of the module can still be used without
C<Lingua::FreeLing3>. Nevertheless, if you want to use this tool you
should install that module.

At the moment the script tags segments whose language code starts with
C<pt>, C<es>, C<gl>, C<it>, C<fr>, C<ru> or C<en>, relying on FreeLing3
for the analysis. If your TMX file includes any other language, it will
be maintained without a change. This behavior can change in the future,
as a basic regexp based POS tagger might be implemented.

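=head2 Options

=over 4

=item C<-ner>

Groups proper names into a single multiword unit, wrapped in an
C<mwe> tag:

    which     which     WDT
    includes  include   VBZ
    <mwe ...>
    Edward
    Cole
    </mwe>

=item C<-compact>

Writes the compact form.

=item C<-dates>

Enables dates analysis.

=back

=head1 SEE ALSO

L<XML::TMX>, L<Lingua::FreeLing3>

=cut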