#!/usr/bin/perl -s

# langsplit-by-page: split a multilingual document into one file per language.
#
# The input is cut into chunks (pages), each chunk is classified with
# Lingua::Identify::CLD (optionally backed by a local word list), and the
# chunk is appended to "<file>-<lang>.txt". See the POD after __END__.

use strict;
use warnings;

use XML::DT ;
use Lingua::PT::PLNbase;
use utf8::all;
use Lingua::Identify qw(:language_identification);
use Lingua::Identify::CLD;
use Data::Dumper;
$Data::Dumper::Indent = 0;
$Data::Dumper::Terse  = 1;

# Command-line switches, filled in by perl's -s option.
our ($ignore, $isep, $dir, $debug, $osep, $tld, $flucky, $end, $langwl);

# CLD answers below this confidence may be overridden by the local word list.
my $CLD_MIN_CONFIDENCE = 75;
# The local word-list score must exceed this value to override CLD.
my $LOCAL_MIN_SCORE = 50;

# Input chunk separator. The documented default is "newpage", i.e. form feed.
$isep   = $isep   ? qr{$isep}   : qr{\f+};
# Chunks matching this pattern are skipped.
$ignore = $ignore ? qr{$ignore} : qr{^#};
# Processing stops at the first chunk matching this pattern.
$end    = $end    ? qr{$end}    : qr{__END__};

if ($osep) {
    # SECURITY NOTE: the -osep value is expanded with a string eval so that
    # escapes such as \n work. It executes whatever the command line
    # contains; never feed it untrusted input.
    my $expanded = eval "qq{$osep}";
    die "Error: invalid -osep value: $@\n" unless defined $expanded;
    $osep = $expanded;
}
# NOTE(review): the separator between the newlines may originally have been a
# form feed (matching the -isep default) -- confirm against the original file.
$osep //= "\n \n";

# Optional local word list: every word found in "$langwl.wl" is stored
# lower-cased under the language name given by -langwl.
my %localdic;
if ($langwl) {
    open(my $wordlist, "<", "$langwl.wl")
        or die("Error: $langwl.wl not found\n");
    while (my $line = <$wordlist>) {
        $localdic{$langwl}{ lc $_ } = 1 for $line =~ m/(\w+)/g;
    }
    close $wordlist;
}

# Built with a ternary: "my @x = ... if COND" has undefined behaviour.
my @cld_args = $tld ? (tld => $tld) : ();
my $cld      = Lingua::Identify::CLD->new(@cld_args);

my $filename = shift;
die "usage: $0 [-dir=D] [options] file.txt|file.pdf\n"
    unless defined $filename;

### get the text
my $text;
{
    my $in;
    if ($filename =~ /\.pdf$/) {
        # PDF input is converted by the external pipeline "jop -raw | lawpp".
        # The file name is single-quoted so the shell cannot interpret it.
        (my $quoted = $filename) =~ s/'/'\\''/g;
        open($in, "-|", "jop -raw '$quoted' | lawpp")
            or die "can't run jop/lawpp on $filename: $!\n";
    }
    else {
        open($in, "<", $filename)
            or die "can't read $filename: $!\n";
    }
    local $/;                       # slurp the whole input at once
    $text = <$in>;
    close $in;
    $text //= '';
}

# With -dir, output files go to that directory, named after the input file.
if ($dir) {
    $filename =~ s!.*/!!;
    $filename = "$dir/$filename";
    -d $dir or mkdir($dir) or die "can't create directory $dir: $!\n";
}

my $chunknumber = 0;
CHUNK:
for my $chunk (split($isep, $text, 0)) {
    $chunknumber++;
    last CHUNK if $chunk =~ /$end/;

    # Skip ignored chunks and chunks without two consecutive non-blanks.
    if ($chunk =~ /$ignore/ or $chunk !~ /\S{2}/) {
        print "=>($chunknumber) ignored\n";
        print $chunk, "\n" if $debug;
        next CHUNK;
    }

    # Used below as: $aux[1] language id, $aux[2] confidence,
    # $aux[3] reliability flag.
    my @aux  = $cld->identify($chunk);
    my @aux2 = identify_local($chunk, \%localdic);
    print STDERR "\n",
        Dumper([$chunknumber, $aux[1], $aux[3], $aux[2], $aux2[0]]);

    # $local is [lang, score], or [] when the word list gave no answer.
    my $local       = $aux2[0];
    my $local_score = $local->[1] // 0;
    my $cld_trusted = ($aux[3] and ($aux[2] // 0) >= $CLD_MIN_CONFIDENCE);

    # When CLD is unreliable or not confident enough, a good local
    # word-list score replaces its answer.
    if (not $cld_trusted and $local_score > $LOCAL_MIN_SCORE) {
        @aux = ($local->[0], $local->[0], $local->[1], 1);
        print STDERR Dumper([$chunknumber, @aux]);
    }

    my ($lang, $confidence, $reliable) = map { $_ // '' } @aux[1, 2, 3];

    if ($lang and $reliable) {
        print "=>($chunknumber) $lang=$confidence/$reliable\n";
        print $chunk, "\n" if $debug;
        _append_chunk("$filename-$lang.txt", "$chunk$osep");
    }
    else {
        print "=>($chunknumber) Don't know: $lang=$confidence/$reliable\n";
        print "=> $local->[0]=$local->[1]\n" if @$local;
        print $chunk, "\n" if $debug;

        # -flucky: "I'm feeling lucky" -- file the chunk under the best
        # (unsafe) guess instead of DONTKNOW.
        my $suffix = $flucky ? $lang : 'DONTKNOW';
        _append_chunk("$filename-$suffix.txt", "$chunk$osep");
    }
}

# _append_chunk($path, $content)
# Appends $content to the file $path, creating it if needed.
# Dies when the file cannot be opened or written.
sub _append_chunk {
    my ($path, $content) = @_;
    open(my $out, ">>", $path) or die("can't write $path\n");
    print {$out} $content;
    close $out or die("can't write $path: $!\n");
    return;
}

# identify_local($chunk, $localdic)
# Scores $chunk against every language in $localdic, a hash reference of the
# form { language => { lower-cased word => 1 } }.
#
# Only words of two or more word characters are counted. A language's score
# is 130 * (words found in its list) / (words in the chunk), so it can exceed
# 100. NOTE(review): the factor 130 presumably compensates for word lists
# that do not cover every word of the language -- confirm.
#
# Returns an array reference [language, score] for the best-scoring language,
# or [] when the chunk has fewer than 10 words or no word list is loaded.
sub identify_local {
    my ($chunk, $localdic) = @_;
    $localdic //= {};

    my @words = map { lc($_) } ($chunk =~ m/(\w\w+)/g);
    my $total = @words;
    return [] if $total < 10 or not %$localdic;

    my %score_of;
    for my $lang (keys %$localdic) {
        my $known = grep { $localdic->{$lang}{$_} } @words;
        $score_of{$lang} = 130 * $known / $total;
    }

    # Highest score first; ties are broken by name so the result is stable.
    my ($best) = sort { $score_of{$b} <=> $score_of{$a} or $a cmp $b }
                 keys %score_of;
    return [ $best, $score_of{$best} ];
}

# Placeholder, not implemented yet.
sub dict_base_lang {
    my $t = shift;
    ### FIXME: tbp
}

__END__

=encoding utf8

=head1 NAME

langsplit-by-page - splits a multilingual doc by language

=head1 SYNOPSIS

 langsplit-by-page [-dir=D] file.txt
 langsplit-by-page [-dir=D] file.pdf
    -dir=D        write the output files in directory D
    -isep=RE      input chunk separator (def=newpage)
    -osep=STR     output chunk separator
    -ignore=RE    chunks matching RE are skipped (def=^#)
    -end=RE       stop at the first chunk matching RE (def=__END__)
    -tld=TLD      passed to Lingua::Identify::CLD as its tld option
    -debug        also print each chunk
    -flucky       (I'm feeling lucky --> use best guess)
    -langwl=tetum word list read from tetum.wl, used when CLD is unsure

=head1 DESCRIPTION

Separates pages by their language.

 for each page p in file
    id = language-of(p)
    append p to file-id.txt

Pages whose language cannot be determined are appended to
file-DONTKNOW.txt, unless C<-flucky> is given.

=head1 dependencies

 Lingua::Identify::CLD;
 jop   (for pdf)
 lawpp (for pdf)

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut