#!/usr/bin/perl
# PODNAME: tsv2tmx
# ABSTRACT: Create a TMX from a TSV file

use warnings;
use strict;

use Getopt::Long;
use Pod::Usage;
use utf8::all;
use XML::TMX::Writer;

### Option variables
my $man     = 0;
my $help    = 0;
my ($sl, $tl);             # source / target language names
my $verbose = 1;           # on by default; --no-verbose turns logging off
my $header;                # tri-state: undef = not given, 1 = --header, 0 = --no-header
my $columns = "0,1";
my $fs      = qr'\t';      # field separator; a user-supplied value is used as a regex

#### ------
GetOptions(
    'help|h'      => \$help,
    'man'         => \$man,
    'source|sl=s' => \$sl,
    'target|tl=s' => \$tl,
    'header!'     => \$header,
    'columns|c=s' => \$columns,
    'fs=s'        => \$fs,
    'verbose|v!'  => \$verbose,
) or pod2usage(2);

pod2usage(1) if $help;
pod2usage(-exitval => 0, -verbose => 2) if $man;

## Column selection: two zero-based indexes, "source,target".
my ($c1, $c2);
if ($columns =~ /\A(\d+),(\d+)\z/) {
    ($c1, $c2) = ($1, $2);
}
else {
    die "Columns definition should be a pair of integers. Ex: 1,2\n";
}

## Work out the language names and consume the heading line when there is one.
if ($sl and $tl) {
    # Both names were given on the command line. Historically the first line
    # was then always treated as data, so it is only skipped when --header
    # was passed explicitly.
    readLine() if $header;
}
elsif (defined $header ? $header : 1) {
    # At least one name is missing: a heading line is expected unless
    # --no-header was given, and it supplies whichever name is undefined.
    my ($l1, $l2) = readLine("find langs in header");
    die "No header line found in input, and one of the source or target languages not defined!\n"
        unless defined $l1 and defined $l2;
    $sl = strip($l1) unless defined $sl;
    $tl = strip($l2) unless defined $tl;
}
else {
    die "No header, and one of the source or target languages not defined!\n";
}

## Main loop: one add_tu() call per data row, wrapped by start_tmx/end_tmx.
my $tmx = XML::TMX::Writer->new();
$tmx->start_tmx(id => 'tsv2tmx');

while (my @row = readLine()) {
    $tmx->add_tu($sl => $row[0], $tl => $row[1]);
}

$tmx->end_tmx();

# strip($text)
# Returns a copy of $text without leading and trailing whitespace.
sub strip {
    my $v = shift;
    $v =~ s/^\s+//;
    $v =~ s/\s+$//;
    return $v;
}

# _log(@message)
# Prints @message followed by a newline on STDERR when verbose mode is on.
sub _log {
    print STDERR @_, "\n" if $verbose;
    return;
}

# readLine([$isheader])
# Reads lines from the files named in @ARGV (or STDIN) until one yields a
# usable row, and returns the pair (source, target) taken from columns $c1 and
# $c2, stripped of surrounding whitespace. Blank lines, lines starting with
# '#', and rows whose two selected cells are both empty are skipped.
# Returns the empty list at end of input.
#
# When $isheader is true the row is validated as a heading: it must have at
# least two fields and both selected cells must be non-empty, otherwise the
# function dies with an "Invalid header" message.
sub readLine {
    my $isheader = shift || 0;

    # A loop (rather than recursion) keeps $isheader in force across skipped
    # lines and avoids deep recursion on long runs of blank/comment lines.
    while (defined(my $line = <>)) {
        chomp $line;

        # NOTE(review): as written, every match is replaced by itself, so the
        # line is left unchanged. Presumably a bare '&' was meant to be
        # escaped here -- confirm against what XML::TMX::Writer already does
        # before changing it.
        $line =~ s{&((\w+;)?)}{ if ($1){ "&$1" } else { "&"} }ge;

        next if $line =~ /^\s*$/;    # blank line
        next if $line =~ /^#/;       # comment line

        my @fields = split /$fs/, $line;
        die("Invalid header:$line\n") if $isheader and @fields < 2;

        # Use defined-or and length so that a cell containing "0" is kept.
        my $src = strip($fields[$c1] // "");
        my $tgt = strip($fields[$c2] // "");

        if ($isheader) {
            die("Invalid header: $line\n")
                unless length $src and length $tgt;
            _log("From header:$src - $tgt");
        }

        return ($src, $tgt) if length $src or length $tgt;
    }

    return;
}

__END__

=encoding utf-8

=head1 SYNOPSIS

   tsv2tmx [options] file.tsv ...

    Options:
      --help                  brief help message
      --man                   full documentation
      --verbose | -v          activate verbose mode (on by default)
      --no-verbose            silence verbose messages
      --sl=EN --tl=PT         describe source and target language names
      --header                treat first line as a heading
      --no-header             the file has no heading line
      --columns=1,2           specify which columns to extract
      --fs='::'               set field separator. (separated by '::')

=head1 OPTIONS

=over 8

=item B<--help>

Prints a brief help message and exits.

=item B<--man>

Prints the manual page and exits.

=item B<--verbose> | B<-v> | B<--no-verbose>

Activates the verbose mode, in which informational messages are written to
standard error. Verbose mode is on by default; use B<--no-verbose> to turn
it off.

=item B<--sl> | B<--tl>

Use these options to specify the names for the source and target
languages. The long forms B<--source> and B<--target> are also accepted.

=item B<--header> | B<--no-header>

Tells whether the TSV file includes a first line with a heading.

If the source or the target language name is not specified, a heading is
expected by default and the first line is used to guess the missing
names; with B<--no-header> the program stops with an error instead.

If both language names are specified, the first line is treated as data
unless B<--header> is given explicitly, in which case it is skipped.

=item B<--columns=1,2>

Specify which columns should be extracted. Needs to be a pair of
integers, separated by a comma. Column indexes start at 0. Defaults to
C<0,1>.

=item B<--fs='::'>

Sets the field separator. The value is used as a regular expression when
splitting each line, so characters that are special in regular
expressions need to be escaped. Defaults to a tab character.

=back

=head1 DESCRIPTION

Useful to create translation memories from TSV files, which can be easily
exported from spreadsheets or other applications.

Empty lines and lines starting with C<#> are ignored, as are rows in
which both selected columns are empty.

=head1 SEE ALSO

XML::TMX

=cut