#!/usr/bin/perl -s
# PODNAME: tmx-tokenize
# ABSTRACT: Tokenizes translation units on a tmx file.

use strict;
use warnings;

use File::Basename qw(fileparse);

# Populated by perl's -s switch when the script is called with -o=FILE.
our $o;

# Lingua::FreeLing3 (loaded through FL3) is a soft dependency, so it is
# required at run time in order to fail with a readable message when absent.
# The trailing "1" makes the check independent of the state of $@.
eval { require FL3; 1 }
    or die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n";
FL3->import();

use XML::TMX::Reader '0.25';

# Primary language subtags that are handed to the FreeLing tokenizer.  The
# pattern is anchored so that only the leading subtag is considered: a
# region subtag such as the "ES" in "ca-ES" or the "IT" in "de-IT" must not
# select the Spanish or Italian tokenizer.
my $SUPPORTED_LANG = qr/\A(pt|es|it|ru|en|gl)(?![a-z])/i;

my $file = shift or die "You must supply the name of the file to tokenize";
my $reader = XML::TMX::Reader->new($file);

# Default output name: the input file name prefixed with "t_", next to the
# input file.  Only the basename is prefixed, so "dir/file.tmx" becomes
# "dir/t_file.tmx" rather than the unusable "t_dir/file.tmx".
my ($basename, $dir) = fileparse($file);
my $output = $file eq $basename ? "t_$basename" : "${dir}t_$basename";
$output = $o if $o;

binmode STDOUT, ":utf8";

# The callback receives one translation unit (a hash reference keyed by
# language) and returns it after tokenizing the segments it knows how to
# handle; -output is the file name passed on to for_tu for the result.
# NOTE(review): the other options use a leading dash; confirm that for_tu
# honours the dash-less "verbose" spelling.
$reader->for_tu(
    {
        -output => $output,
        -prop   => { tokenized => "true" },
        verbose => 1,
    },
    sub {
        my $tu = shift;

        LANG:
        for my $lang (keys %$tu) {
            my ($code) = $lang =~ $SUPPORTED_LANG
                or next LANG;

            my $txt = $tu->{$lang}{-seg};

            # Nothing to tokenize in a missing or whitespace-only segment.
            next LANG if !defined $txt || $txt =~ /\A\s*\z/;

            my $tokens = tokenizer(lc $code)->tokenize($txt, to_text => 1);
            $tu->{$lang}{-seg} = join(" ", @{$tokens});
        }

        return $tu;
    }
);

=encoding UTF-8

=head1 SYNOPSIS

   tmx-tokenize file.tmx              # creates t_file.tmx

   tmx-tokenize dir/file.tmx          # creates dir/t_file.tmx

   tmx-tokenize -o=out.tmx file.tmx   # creates out.tmx

=head1 DESCRIPTION

Although this script is bundled in C<XML::TMX>, it has a soft dependency
on C<Lingua::FreeLing3>. Soft means that the dependency is not ensured at
install time, and other features of the module can still be used
without C<Lingua::FreeLing3>. Nevertheless, if you want to use this tool
you should install that module.

At the moment the supported languages are the ones this script hands over
to FreeLing3: English, Spanish, Russian, Portuguese, Italian and Galician.
The language is taken from the first subtag of the language code, so
C<pt-PT> and C<PT-BR> are both tokenized as Portuguese. If your TMX file
includes any other languages, their segments will be kept unchanged. This
behavior can change in the future, as a basic regexp based tokenizer might
be implemented.

=head1 SEE ALSO

XML::TMX, Lingua::FreeLing3

=cut