#!/usr/bin/perl -s #use File::Spec; #use File::Spec::Functions; use utf8::all; use Cwd 'abs_path'; use Text::Hunspell; use strict; our($minratio,$l, $i, $pcase, $pacc, $txt, $min, $max, $lat1, $latin1, $q, $br, $oco, $debug,$c,$std,$log,$ptall); my $pp=""; my %L=(pt => "cetemGordo.txt", en => "bnc.txt"); $l //= "pt"; my $lll=$l; $lll = "pt_PT" if $l eq "pt"; $lll = "en_US" if $l eq "en"; my $s; if($c){ $s= Text::Hunspell->new( "/usr/share/hunspell/$lll.aff", "/usr/share/hunspell/$lll.dic",); # $s->set_option('lang',$lll); } my $dir="/usr/local/share/oco"; my $op = " -sub "; $op.= " -minratio=$minratio " if $minratio; $op.= " -pcase " if $pcase; $op.= " -pacc " if $pacc; if($br) { $std //= "$dir/tokens.todos_br" } if($ptall) { $std //= "$dir/tokens.todos_ptall" } if($q) { $std //= "$dir/cetemMagro.txr" } if($l) { $std //= "$dir/".$L{$l} } if($i) { $op .= " -ignorecase " } if($oco){ $op .= " -oco " } else { $op .= " -istxt1 " } if($txt){ $op .= " -istxt1 " } if($min){ $op .= " -minoco=$min " } else { $op .= " -minoco=30 " } if($max){ $pp .= " |head -$max "} if($lat1){ $pp .= " | iconv -t CP1252 -t utf8 "} my $tmp="/tmp/__$$"; my $x= shift or die("wrong usage; try perldoc susprise\n"); my $charset= `file -i $x`; if(not $lat1 and $charset =~ m!text/\S+\s*charset=(\S+)!){ if ($1 eq "utf-8") {linkit($x, $tmp)} elsif($1 eq "utf-16") {syst("iconv -c -f utf16 -t utf8 $x > $tmp")} elsif($1 eq "unknown") {syst("iconv -c -f cp1252 -t utf8 $x > $tmp")} elsif($1 eq "iso-8859-1"){syst("iconv -c -t utf8 -f latin1 $x > $tmp")} else {linkit($x, $tmp)} } else {linkit($x, $tmp)} my $aux="/tmp/_surprise$$"; if($log){syst("freqcomp -log $op $tmp $std |sort -nr $pp | cut -c '1-7,12-' > $aux"); } else { syst("freqcomp $op $tmp $std |sort -r $pp | cut -c '1-7,12-' > $aux"); } open(F,"<",$aux); while(){ if($c){ if(/(\S+)\s+(\S{3,})(.*)/){ if ($s->check($2)){ print "+"} else { print "-"} } } print $_; } close(F); unlink("__$$",$aux); sub linkit{ my ($a,$tmp)=@_; my $abs_path = abs_path($a); syst("ln -s $abs_path $tmp"); } sub syst{ my $com = shift; print "Debug $com\n" if $debug; system($com); if ($!) { print STDERR "Erro ($com): $_\n"; } } __END__ =encoding utf8 =head1 NAME surprise - extracting unexpected words from text =head1 SYNOPSIS surprise [options] file.oco surprise -txt [options] file.txt =head1 DESCRIPTION Compare the number of ocurences of the words in a text and the words in a big reference corpus. Output is sorted by "what is new in this text" =head2 Options -i ignore case -pcase preferencial case -pacc preferencial accents -l=EN set language (def: PT) -txt input is a text (def: list of (word ocurrences) -min=10 minimum of occurences/million of words to be analysed (def: 30) -max= Maximum number of words in output -lat1 input is encoded in latin1 (def: utf8) -oco input is in oco format (see freqoco command) -std=file.oco user provided word-oco database -q quick (compare text with smaller word-oco database) -br compare agains pt_br tokens (instead of pt_pt tokens) -ptall compare agains (pt_br+pt_pt) tokens -loq use log compare/sort strategies -minratio minimum logaritmic difference eg(minratio=2.30 (=log(10)) -- 10 times bigger) =head1 AUTHOR J.Joao Almeida, jj@di.uminho.pt =head1 SEE ALSO perl(1). freqcomp =cut