#!/usr/bin/perl -s

# PODNAME: tmx-tokenize
# ABSTRACT: Tokenizes translation units on a tmx file.

use strict;
use warnings;

# Set by perl's -s switch (see the shebang line) when the script is
# invoked with "-o=FILE"; declared so that it is legal under strict.
our $o;

# FL3 is an optional dependency: load it at run time, so that a missing
# module produces a friendly message instead of a compile-time failure.
eval { require FL3; 1 }
    or die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n";

# A run-time require does not import anything, so do it explicitly.
FL3->import();
use XML::TMX::Reader '0.25';

# File::Basename is a core module; it is loaded here because only the
# computation of the default output name needs it.
use File::Basename qw(fileparse);

# The TMX file to tokenize is the first positional argument.  Test for
# definedness/length rather than truth so a file named "0" is accepted.
my $file = shift;
die "You must supply the name of the file to tokenize"
    unless defined $file && length $file;

my $reader = XML::TMX::Reader->new($file);

# Default output name: prefix the *file name* with "t_", not the whole
# path.  "dir/in.tmx" becomes "dir/t_in.tmx" (previously the bogus
# "t_dir/in.tmx").  fileparse() reports "./" as the directory of a bare
# file name; drop it in that case so "in.tmx" still yields "t_in.tmx".
my ($name, $dir) = fileparse($file);
my $output = ($name eq $file ? "" : $dir) . "t_$name";

# An explicit -o=FILE (set through perl's -s switch) overrides the default.
$output = $o if $o;

binmode STDOUT, ":utf8";

# Language keys that get tokenized.  The pattern is anchored on the primary
# language subtag ("pt", "pt-BR", "EN_us", ...) so that a code which merely
# *contains* one of these pairs of letters -- e.g. "ca-ES" or "eu-ES" --
# is not mistaken for a supported language, and so that non-language keys
# of the translation unit are never touched.
my $supported_lang = qr{\A(pt|es|it|ru|en|gl)(?:[-_]|\z)}i;

# Run the callback over the translation units of the input file.  The
# callback receives a hash reference keyed by language code and returns it
# after replacing each supported language's segment by its tokens joined
# with single spaces.
# NOTE(review): "verbose" has no leading dash, unlike "-output"/"-prop";
# confirm against XML::TMX::Reader::for_tu which spelling it honours.
$reader->for_tu( {
                  -output => $output,
                  -prop => { tokenized => "true" },
                  verbose => 1
                 },
                 sub {
                     my $tu = shift;
                     for my $lang (keys %$tu) {
                         next unless $lang =~ $supported_lang;
                         my $ln = lc $1;

                         # Only entries that are hashes holding a segment
                         # with some non-blank text are worth tokenizing;
                         # everything else is left exactly as it was.
                         my $entry = $tu->{$lang};
                         next unless ref $entry eq 'HASH';
                         my $txt = $entry->{-seg};
                         next unless defined $txt && $txt =~ /\S/;

                         # tokenize() hands back an array reference here;
                         # presumably to_text => 1 makes its elements plain
                         # strings -- verify against Lingua::FreeLing3.
                         my $tokens = tokenizer($ln)->tokenize($txt,
                                                               to_text => 1);
                         $entry->{-seg} = join(" ", @$tokens);
                     }
                     return $tu;
                 });

=encoding UTF-8

=head1 SYNOPSIS

   tmx-tokenize file.tmx  # creates t_file.tmx

   tmx-tokenize -o=out.tmx file.tmx

=head1 DESCRIPTION

Although this script is bundled in C<XML::TMX>, it has a soft
dependency on C<Lingua::FreeLing3>. Soft means that the dependency is
not ensured at install time, and other features of the module can
still be used without C<Lingua::FreeLing3>. Nevertheless, if you want
to use this tool you should install that module.

At the moment the supported languages are the same as those supported by
FreeLing3: English, Spanish, Russian, Portuguese, Italian and Galician.

If your TMX file includes any other language, its segments will be kept
unchanged.  This behavior may change in the future, as a basic
regexp-based tokenizer might be implemented.

=head1 SEE ALSO

XML::TMX, Lingua::FreeLing3

=cut