#!/usr/bin/perl
# NATools - Package with parallel corpora tools
# Copyright (C) 2002-2012 Alberto Simões
#
# This package is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
use POSIX qw(locale_h);
setlocale(LC_CTYPE, "pt_PT.UTF-8");
use locale;
use warnings;
#use Data::Dumper;
use Lingua::NATools::Client;
use Lingua::NATools::CGI;
use CGI qw/:standard :cgi-lib/ ;
# Create a new client
my $server = Lingua::NATools::Client->new();
# Get the list of available corpora
my $corpora = $server->list();
# Current corpus if undefined, without a name
my $crp = undef;
my $name;
# Check if we got a corpus identifier
if (param("crp")) {
$crp = $corpora->{param("crp")}{id} || undef;
$name = param("crp");
}
# Ok, we didn't get a corpus identifier, just a corpus name
if (param("corpus") && !param("crp")) {
$crp = param("corpus");
for (keys %$corpora) {
$name = $_ if $corpora->{$_}{id} == $crp;
}
}
# We didn't get a corpus identifier nor a corpus name, so get randomly one.
($name) = keys %$corpora unless $name;
# Create JavaScript combo-box to change corpus being queried
my $s = join("\n",
join("\n", map {
"source[\"$_\"]=\"$corpora->{$_}{source}\";"} keys %$corpora),
join("\n", map {
"target[\"$_\"]=\"$corpora->{$_}{target}\";"} keys %$corpora));
my $JSCRIPT = <<"EOS";
var source = new Array();
var target = new Array();
$s
function changeLanguages() {
var corpus = document.getElementById('crp').value;
document.getElementById('source').innerHTML = source[corpus];
document.getElementById('target').innerHTML = target[corpus];
}
function go(l,c) {
if (parseInt(navigator.appVersion)>=4)
if (navigator.userAgent.indexOf("MSIE")>0) { //IE 4+
var sel=document.selection.createRange();
sel.expand("word");
window.location="nat-dict.cgi?compact=1&corpus=" + c + "&" + l + "=" + escape(sel.text)
} else // NS4+
window.location="nat-dict.cgi?compact=1&corpus=" + c + "&" + l + "=" + escape(document.getSelection())
}
function help() {
window.open('nat-search.cgi?HELP=1','NAT-QI Quick Help',
'menubar=no,height=600,width=800,resizable=yes,toolbar=no,location=no,status=no');
}
EOS
print Lingua::NATools::CGI::my_header(jscript => $JSCRIPT);
#my $x = Vars;
#print pre(Dumper($x));
# Check if we were asked for help
if (param("HELP")) {
print Lingua::NATools::CGI::close_window();
print_help();
print Lingua::NATools::CGI::my_footer();
exit;
}
# Print form HTML
print div({-class=>"hlpbt",
-onclick=>"help()"}, "Help ");
print h1("NAT-QI: NATools Corpora Query Interface");
print start_form({-class=>"main"});
print "
\n";
print Tr(td({-rowspan=>'3'},submit("Search")),
td({-rowspan=>'3'}, " "),
td({-colspan=>6, -style=>"text-align: left"},
"Corpus: ",popup_menu(-onchange=>"changeLanguages();",
-name=>'crp',
-id => 'crp',
-default=>$name,
-values=>[keys %$corpora])));
print Tr(td(["Search on ",
span({id=>"source"}, $corpora->{$name}{source}), " language: ",
textfield("l1"),
" ",
]),
td({-style=>"text-align: left"},label(checkbox(-name=>'sequence', -checked=>0,
-value=>'ON', -label=>'Pattern Matching'))),
td([" ",
"Result-set size",popup_menu(-name=>'count',
-values=>['20','50','100','500'])
]));
print Tr(td(["Search on ",
span({id=>"target"}, $corpora->{$name}{target}), " language: ",
textfield("l2"),
" "]),
td({-style=>"text-align: left"},
label(checkbox(-name=>'horiz', -checked=>0,
-value=>'ON', -label=>'Horizontal Mode')),
));
print "
";
print end_form;
my $count = param("count") || 20;
# If we have a corpus, and at least one word in one of the two
# languages, then query the server
if ($crp && (param("l1") || param("l2"))) {
# param("l1", lc(param("l1"))) if param("l1");
# param("l2", lc(param("l2"))) if param("l2");
# print the corpus name and a link to the information page
print h1($name);
print "",
a({-style=>"font-size: small;", -href=>"nat-about.cgi?corpus=$crp"},
"meta-information"), "",br;
# variable to store the query results
my $results;
my $ptds;
# Check if we are looking for a pattern or a set of words
$mod = (param("sequence") && param("sequence") eq "ON") ? "=" : "-";
if (param("l1") && !param("l2")) {
# We have just source language...
$results = $server->conc({count => $count,
crp => $crp,
direction => "$mod>"}, param("l1"));
# get PTDs for all searched words
$ptds = get_ptds($server, $crp, "~>", lc(param("l1")));
} elsif (param("l2") && !param("l1")) {
# We have just the target language
$results = $server->conc({count => $count,
crp => $crp,
direction => "<$mod"}, param("l2"));
# get PTDs for all searched words
$ptds = get_ptds($server, $crp, "<~", lc(param("l2")));
} else {
# We have both languages
$results = $server->conc({count => $count,
crp => $crp,
direction => "<$mod>"}, param("l1"), param("l2"));
$ptds = [];
}
$_->[1]{'**KEYS**'} = [sort {$_->[1]{$b} <=> $_->[1]{$a}} keys %{$_->[1]} ] for @$ptds;
# Start to print results
print "";
# print table header accordingly with the horizontal vs vertical
# user request
unless (param("horiz")) {
print Tr(th({-class=>'first'},"#"),
th(["%","Source Language","Target Language","Tools"]))
} else {
print Tr(th({-class=>'first'},"#"),
th(["%","Source/Target Language","Tools"]))
}
my $i = 0;
# print the results
for (@$results) {
$i++;
# Code backwards compatibility O:-)
$_->[4]=$_->[2]?sprintf("%.1f%%", 100*$_->[2]):"";
$_->[2]=$_->[0];
$_->[3]=$_->[1];
# Highlight l1 if defined
if (param("l1")) {
$_->[0] = highlite($_->[0], param("l1"), $mod eq '='?1:0);
$_->[1] = highlite_translations($_->[1], $ptds) if ($ptds);
}
# Highlight l2 if defined
if (param("l2")) {
$_->[1] = highlite($_->[1], param("l2"), $mod eq "="?1:0);
$_->[0] = highlite_translations($_->[0], $ptds) if ($ptds);
}
# Create the form to ask for the matrix diagonalization tool
# my $FORM = start_form(-method => "POST",
# -action => "nat-matrix.cgi");
# $FORM .= hidden("corpus", $crp);
# $FORM .= hidden("s1", $_->[2]);
# $FORM .= hidden("s2", $_->[3]);
# $FORM .= submit("\\");
# $FORM .= end_form;
my $FORM = "n/a";
# Print the data accordingly with the requested format (horizontal
# vs vertical)
if (param("horiz")) {
print Tr(td({-class=>$i%2?"entry1":"entry2", -rowspan=>2},
$i),
td({-class=>$i%2?"entry1":"entry2", -rowspan=>2},
$_->[4]),
td({-class=>$i%2?"entry1":"entry2",
-ondblclick=>"go('l1','$crp')"},
$_->[0]),
td({-class=>$i%2?"entry1":"entry2", -rowspan=>2},
$FORM));
print Tr(td({-class=>$i%2?"entry1":"entry2", -ondblclick=>"go('l2','$crp')"},
$_->[1]));
} else {
print Tr(td({-class=>$i%2?"entry1":"entry2"},
$i),
td({-class=>$i%2?"entry1":"entry2"},
$_->[4]),
td({-class=>$i%2?"entry1":"entry2", -ondblclick=>"go('l1','$crp')"},
$_->[0]),
td({-class=>$i%2?"entry1":"entry2", -ondblclick=>"go('l2','$crp')"},
$_->[1]),
td({-class=>$i%2?"entry1":"entry2"},
$FORM));
print "\n";
}
}
print "
";
} else {
# if no corpus is selected, and/or no word was passed as parameter,
# print help usage and go out.
print_help();
}
print Lingua::NATools::CGI::my_footer();
# This is the help code
sub print_help {
while() {
print
}
}
sub highlite {
my ($text, $keywords, $seq) = @_;
my $x = $keywords;
$x =~ s/^\s*(.*?)\s*$/$1/;
if ($seq) {
$x =~ s/\*/\\S+/g;
$text =~ s/\b($x)\b/$1<\/span>/gi;
} else {
$x =~ s/\*//g;
for my $y (split /\s+/, $x) {
$text =~ s/\b($y)\b/$1<\/span>/gi;
}
}
return $text
}
sub highlite_translation {
my ($changed, $text, $word, $perc) = (0, @_);
my $class = class_from_perc($perc);
$changed = $text =~ s/\b(\Q$word\E)\b/$1<\/span>/gi;
return ($text, $changed);
}
sub class_from_perc {
my $perc = shift;
if ($perc < .3) {
return "guessed30"
} elsif ($perc < .6) {
return "guessed60"
} else {
return "guessed100"
}
}
sub get_ptds {
my ($server, $crp, $dir, $words) = @_;
return [ map { $server->ptd({crp => $crp,
direction=>$dir},$_) }
grep { $_ !~ m"\*" } split /\s+/, $words ];
}
sub highlite_translations {
my ($text, $ptds) = @_;
for my $word (@$ptds) {
for my $t (@{$word->[1]{'**KEYS**'}}) {
($text, my $changed) = highlite_translation($text, $t, $word->[1]{$t});
last if $changed;
}
}
return $text
}
__DATA__
NAT-QI Help
NAT-QI (NATools Query Interface) is a web frontend to query and
browse Parallel Corpora. For details about its architecture and
associated tools see this page.
This interface is querying a server (NATServer) with a
specific parallel corpora, and a specific pair of
languages.
Toolbar Usage:
- Simple Search:
- Enter a word in the search on source language or
search on target language entries (or in both) to
search for concordancies in the parallel corpora. Words are
searched with no specific order.
- Pattern Search:
- If you want to search for a specific sequence of words, click
the Pattern Matching radio button. This option applies
to both source and target language entries. It also has the
feature of searching for patterns: enter some '*' in places
you want any, not specific, word.
- Horizontal Mode:
- The Horizontal Mode radio button let you change the
layout of the output. Instead of two columns, one for each
language, you get two lines, one for each language.
- Result-set Size:
- The Result-set size combo-box let you specify how many
results you want.
Output Description:
The standard output is a five column table:
- number which corresponds to the order in the result-set. Not
very useful.
- a quality measure of the sentence pair.
- sentence in the source language.
- sentence in the target language.
- links to tools you can apply to the sentence pair.
Related Tools Integration:
-
Double-click any word in the source or target language to
access its probabilistic dictionary entry. This tool will let you
navigate through the dictionary.
- Use the [/] button in the tools column to access a
sub-segment aligner, and sentence generalization tool.