#!/usr/bin/perl -w
#
# getwebbitext - query Yahoo for documents matching some keywords on a given
# site, ask Lingua::GetWebBitext for the URL(s) of the translated version of
# each hit, download both versions and record every stored pair in
# <dir>/out.<l1>.<l2>.pairs.  See the POD after __END__ for the options.

use Getopt::Std;
use File::Type;
use Data::Dumper;
use Yahoo::Search;
use LWP::Simple;
use Lingua::GetWebBitext;
use YAML;
use strict;

my %opt = (s => "");
getopts('F:LUl:m:s:d:t:f:', \%opt);

$opt{l} ||= "pt:en";     # language pair  "source:target"
$opt{m} ||= 100;         # maximum number of document pairs to store
$opt{d} ||= "BITEXT";    # output directory
$opt{d} .= "/" unless $opt{d} =~ m{/$};

if ($opt{L}) { Lingua::GetWebBitext::dump_lang();          exit 0; }
if ($opt{U}) { Lingua::GetWebBitext::test_trurl($opt{l});  exit 0; }

my $chunk  = 100;                 # results requested per Yahoo query
my $finish = $opt{F} || 1000;     # maximum number of search results examined

# Build the "site:" restriction.  Accept a bare host as well as a full URL:
# drop the scheme (http or https) and any trailing slash, keep a path if given.
my $site_arg = $opt{s};
$site_arg =~ s{\Ahttps?://}{}i;
$site_arg =~ s{/+\z}{};
my $site = length($site_arg) ? "site:$site_arg" : "site:eur-lex.europa.eu";

# Keywords containing whitespace are sent as quoted phrases.
my $keywords = join(" ", map { /\s/ ? qq{"$_"} : $_ } @ARGV);
$keywords ||= '"materias perigosas" "acido sulfurico"';

# Both languages end up in file names and in the query, so refuse a
# malformed pair instead of carrying undefined values around.
my ($l1, $l2) = $opt{l} =~ m{(\w+):(\w+)}
    or die("invalid language pair '$opt{l}' (expected something like pt:en)\n");

my $ft = File::Type->new();

# Extensions (as produced by mime2ext) of the document types that are kept.
my %valid = (
    html    => 1,
    pdf     => 1,
    'pdf++' => 1,
);

## my $chunk = Yahoo::Search::MaxCount("Doc")
#warn ("undefined language pair $opt{l}=$l1 :: $l2\n")
#   unless defined $Lingua::GetWebBitex::Trlangs{$opt{l}};

# NOTE: the usage text is currently not printed anywhere.
my $usage = <<"_USAGE_";
Usage: $0... só vendo
_USAGE_

print STDERR "$keywords + $site\n";

unless (-d $opt{d}) {
    mkdir($opt{d}) or die("cant create directory $opt{d}: $!\n");
}

my $type  = $opt{t} || "any";
my $yahoo = Yahoo::Search->new();

my $Request = $yahoo->Request(
    Doc          => "$keywords $site",
    AppId        => "Natura bitext",
    Start        => 0,
    Count        => $chunk,
    Type         => $type,    ## html msword pdf ppt rss txt xls
    Language     => $l1,      ## FIX ME
    AutoContinue => 1,        ## automatic next ... (dangerous)
);
warn $@ if $@;
# Presumably Request() returns nothing on failure (with the reason in $@);
# stop here with a readable message rather than on a method call on undef.
die("could not build the search request\n") unless $Request;

my $r = $Request->Fetch();
warn $@ if $@;

my $i  = 1;    # index of the next pair to store
my $tf = 0;    # number of search results examined so far

die("no results is bad results... \n:-(\n") unless $r;

# Start a fresh pairs file whose first line records the query.
my $pairs_file = $opt{d} . "out.$l1.$l2.pairs";
open(my $pairs_out, ">", $pairs_file)
    or die("cant create $pairs_file: $!\n");
print {$pairs_out} "##$keywords $site\n";
close($pairs_out) or die("cant write $pairs_file: $!\n");

while (my $result = $r->NextResult) {
    my $url = $result->Url;
    print STDERR "----------url $url\n";

    my @trurl = Lingua::GetWebBitext::translateurl($url, "$l1:$l2");
    print STDERR "---- not OK $url\n" unless @trurl;

    $tf++;
    last if $i  > $opt{m};
    last if $tf > $finish;

    for my $trurl (@trurl) {
        # Both sides are first downloaded to temporary files in the current
        # directory and only moved to the output directory once the pair is
        # complete.  Every failure path removes what was downloaded so far.
        my $tmp1 = "FFF-$l1-$i";
        my $tmp2 = "FFF-$l2-$i";

        if (is_error(getstore($trurl, $tmp2))) {
            unlink($tmp2);
            next;
        }

        my $t = mime2ext($ft->mime_type($tmp2));
        unless ($valid{$t}) {
            unlink($tmp2);
            next;
        }

        if (is_error(getstore($url, $tmp1))) {
            unlink($tmp1, $tmp2);
            next;
        }

        my $name1 = "FFF-$l1-$i.$t";
        my $name2 = "FFF-$l2-$i.$t";
        unless (rename($tmp1, $opt{d} . $name1)
            and rename($tmp2, $opt{d} . $name2))
        {
            warn("cant move the downloaded pair for $url to $opt{d}: $!\n");
            unlink($tmp1, $tmp2, $opt{d} . $name1);
            next;
        }

        # The pairs file is reopened for every entry so that each stored
        # pair is on disk even if a later download aborts the run.
        open(my $pairs_app, ">>", $pairs_file)
            or die("can't create output: $!\n");
        print {$pairs_app} "$name1 $name2\n";
        close($pairs_app) or die("cant write $pairs_file: $!\n");

        print STDERR "++++ OK $url\n";
        $i++;
    }
}

# mime2ext($mime_type) - turn a MIME type such as "text/html" into the file
# extension used for stored documents: everything up to the last "/" is
# dropped and "pdf" becomes "pdf++".  Returns "" for an undefined type, which
# is never a valid extension.
sub mime2ext {
    my ($mime) = @_;
    return "" unless defined $mime;

    (my $ext = $mime) =~ s!.*/!!;
    return $ext eq "pdf" ? "pdf++" : $ext;
}

__END__

=head1 NAME

getwebbitext - extraction of bitexts from the web

=head1 SYNOPSIS

 getwebbitext [options] keyword+

=head1 DESCRIPTION

=head2 options

 -l pt:en    Language pair (default: pt:en)
 -s site     the site (default: eur-lex.europa.eu)
 -m max      max number of documents (default: 100)
 -d dir      output directory (default: BITEXT)
 -F max      stop after examining this many search results (default: 1000)
 -t type     force a document type
               Ex: -t html
               ## html msword pdf ppt rss txt xls
 -U          debug translation Url function
 -L          get the list of available languages

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut