#!/usr/bin/perl -w
#
# getwebbitext - look for bilingual document pairs (bitexts) on the web.
#
# The script runs a web search for the given keywords (restricted to one
# site), and for every hit tries to guess the URL of the same document in the
# second language by swapping language markers inside the URL path
# (e.g. ".../pt/..." -> ".../en/...").  When the guessed URL can be downloaded
# and is an HTML or PDF document, both documents are stored in the current
# directory as FFF-<lang>-<n>.<ext> and the pair is recorded in
# out.<l1>.<l2>.pairs.

use strict;
use warnings;

use Getopt::Std;
use File::Type;
use Google::Search;
use LWP::Simple;

my $usage = <<"_USAGE_";
Usage: $0... só vendo
_USAGE_

# ---------------------------------------------------------------------------
# Static tables
# ---------------------------------------------------------------------------

# File extensions (as produced by mime2ext) accepted for the translated
# document.
my %valid = ( html => 1, pdf => 1, 'pdf++' => 1 );

# For each supported language pair: the list of [source, target] markers that
# translateurl() swaps inside a URL path, tried in this order.
my %Trlangs = (
    "pt:en" => [
        [ "pt",         "en" ],
        [ "PT",         "EN" ],
        [ "por",        "eng" ],
        [ "portuguese", "english" ],
        [ "Portuguese", "English" ],
        [ "portugues",  "english" ],
        [ "Portugues",  "English" ],
    ],
    "pt:fr" => [
        [ "pt",         "fr" ],
        [ "PT",         "FR" ],
        [ "por",        "fre" ],
        [ "portuguese", "french" ],
        [ "Portuguese", "French" ],
        [ "Portuguese", "Francais" ],
        [ "Portugues",  "Frances" ],
        [ "portugues",  "french" ],
        [ "Portugues",  "French" ],
    ],
    "pt:de" => [
        [ "pt",         "de" ],
        [ "PT",         "DE" ],
        [ "portuguese", "german" ],
        [ "por",        "deu" ],
        [ "por",        "ger" ],
    ],
    "pt:es" => [
        [ "pt",         "es" ],
        [ "pt",         "sp" ],
        [ "PT",         "ES" ],
        [ "por",        "esp" ],
        [ "portuguese", "spanish" ],
        [ "portugues",  "espanhol" ],
    ],
);

# Language name -> "lang_xx" code.  Not referenced by the code below; kept
# for reference (presumably meant for the commented-out lr() call -- confirm
# before relying on it).
my %langs = (
    ""           => "",
    "Arabic"     => "lang_ar",
    "Chinese_CN" => "lang_zh-CN",
    "Chinese_TW" => "lang_zh-TW",
    "Czech"      => "lang_cs",
    "Danish"     => "lang_da",
    "Dutch"      => "lang_nl",
    "English"    => "lang_en",
    "Estonian"   => "lang_et",
    "Finnish"    => "lang_fi",
    "French"     => "lang_fr",
    "German"     => "lang_de",
    "Greek"      => "lang_el",
    "Hebrew"     => "lang_iw",
    "Hungarian"  => "lang_hu",
    "Icelandic"  => "lang_is",
    "Italian"    => "lang_it",
    "Japanese"   => "lang_ja",
    "Korean"     => "lang_ko",
    "Latvian"    => "lang_lv",
    "Lithuanian" => "lang_lt",
    "Norwegian"  => "lang_no",
    "Portuguese" => "lang_pt",
    "Polish"     => "lang_pl",
    "Romanian"   => "lang_ro",
    "Russian"    => "lang_ru",
    "Spanish"    => "lang_es",
    "Swedish"    => "lang_sv",
    "Turkish"    => "lang_tr",
);

# ---------------------------------------------------------------------------
# Command line
# ---------------------------------------------------------------------------

my %opt = ( s => "" );
getopts( 'dl:m:s:', \%opt ) or die $usage;
$opt{l} ||= "pt:en";
$opt{m} ||= 10;

# Accept "-s host", "-s http://host/" or "-s https://host/path/": drop the
# scheme and a trailing slash, then build the "site:" search restriction.
my $site = $opt{s};
$site =~ s{\Ahttps?://}{};
$site =~ s{/\z}{};
$site = $site ne "" ? "site:$site" : "site:eur-lex.europa.eu";

# Keywords containing whitespace are quoted so they are searched as phrases.
my $keywords = join " ", map { /\s/ ? qq{"$_"} : $_ } @ARGV;
$keywords ||= '"materias perigosas" "acido sulfurico"';

my ( $l1, $l2 ) = $opt{l} =~ m{\A(\w+):(\w+)\z};
die "undefined language pair $opt{l}\n"
    unless defined $l1 && defined $Trlangs{ $opt{l} };

## NOTE: an API key used to be hard-coded here (commented out); keep
## credentials out of the source.

my $ft = File::Type->new();

# ---------------------------------------------------------------------------
# Debug mode: read URLs from input and print their translated form
# ---------------------------------------------------------------------------

if ( $opt{d} ) {
    while ( my $line = <> ) {
        chomp $line;
        print join( "\n", translateurl( $line, $opt{l} ) ), "\n";
    }
    exit 0;
}

# ---------------------------------------------------------------------------
# Search and download
# ---------------------------------------------------------------------------

print STDERR "$keywords + $site\n";

my $search = Google::Search->Web(
    query => "$keywords $site",
    ##  __ => $key,
    ##  max_results=>$opt{m}
);
## $search->lr("lang_$l1");

my $pairs_file = "out.$l1.$l2.pairs";
open( my $pairs_fh, '>', $pairs_file )
    or die "can't create output $pairs_file: $!\n";

my $i = 1;    # sequence number of the next pair to be stored

RESULT:
while ( my $result = $search->next ) {
    my $url = $result->uri;
    print STDERR "----------url $url\n";

    my @trurl = translateurl( $url, "$l1:$l2" );
    print STDERR "---- not OK $url\n" unless @trurl;

    last RESULT if $i > $opt{m};

    for my $trurl (@trurl) {
        my $src_tmp = "FFF-$l1-$i";
        my $tgt_tmp = "FFF-$l2-$i";

        # Fetch the guessed translation first: most guesses do not exist, so
        # this avoids downloading the source document needlessly.
        if ( is_error( getstore( $trurl, $tgt_tmp ) ) ) {
            unlink $tgt_tmp;
            next;
        }

        my $ext = mime2ext( $ft->mime_type($tgt_tmp) );
        if ( !$valid{$ext} ) {
            unlink $tgt_tmp;
            next;
        }

        if ( is_error( getstore( $url, $src_tmp ) ) ) {
            # Do not leave a half pair behind.
            unlink $src_tmp, $tgt_tmp;
            next;
        }

        rename( $src_tmp, "$src_tmp.$ext" );
        rename( $tgt_tmp, "$tgt_tmp.$ext" );
        print {$pairs_fh} "$src_tmp.$ext $tgt_tmp.$ext\n";
        print STDERR "++++ OK $url\n";
        $i++;
    }
}

close($pairs_fh) or die "can't write output $pairs_file: $!\n";

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# mime2ext($mime_type)
#
# Turns a MIME type such as "text/html" into the extension used for stored
# files: everything up to the last "/" is removed, and "pdf" is mapped to
# "pdf++".  Returns the empty string when the type is undefined.
sub mime2ext {
    my ($type) = @_;
    return "" unless defined $type;

    $type =~ s!.*/!!;
    $type = "pdf++" if $type eq "pdf";
    return $type;
}

# translateurl($url, $langs)
#
# $url   - an http:// or https:// URL
# $langs - a language pair that is a key of %Trlangs, e.g. "pt:en"
#
# Replaces every source-language marker found in the path part of $url by the
# matching target-language marker.  A marker only counts when it is delimited
# on both sides by a word boundary, an underscore or a digit.  The host part
# of the URL is never modified.
#
# Returns a one-element list with the rewritten URL, or the empty list when
# the URL has no path, is not http(s), or contains no marker.
# Dies when $langs is not a known language pair.
sub translateurl {
    my ( $url, $langs ) = @_;

    my ( $base, $file ) = ( $url =~ m{(https?://.*?/)(.*)} );
    return () unless $base;
    return () unless $file;

    die "undefined language pair $langs\n" unless defined $Trlangs{$langs};

    my $wb      = qr{\b|[_0-9]};
    my $old     = $file;
    my $changes = 0;

    for my $pair ( @{ $Trlangs{$langs} } ) {
        my ( $from, $to ) = @{$pair};
        $changes++ if $file =~ s!($wb)\Q$from\E($wb)!$1$to$2!g;
    }

    print STDERR "... ($old ==> $file)\n";

    return $changes ? ("$base$file") : ();
}

__END__

=head1 NAME

getwebbitext - extraction of bitexts from the web

=head1 SYNOPSIS

 getwebbitext [options] keyword+

=head1 DESCRIPTION

Searches the web for the given keywords and, for each result, tries to
download the document together with its translation.  The URL of the
translation is guessed by swapping language markers in the URL path.
Stored pairs are listed in F<out.L1.L2.pairs>.

=head2 options

 -l pt:en    Language pair (default pt:en; also pt:fr, pt:de, pt:es)
 -d          debug translation Url function
 -s site     site to search (default eur-lex.europa.eu)
 -m max      max number of documents (default 10)

The original documentation also lists C<-k>, C<-c>, ... C<-n>; the option
parser does not accept them.

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut