#!/usr/bin/perl -w -s our($p,$o,$i,$log,$rel); $log//=0; $i //=0; $p //=2 if $log; $p //=0; my $format=".$p"; $p ||=""; use utf8::all; use strict; my $hoco; my $lmax=1000000; my $maxlog=13.815; ## = log(1000000) my $magicF=$maxlog/log($lmax); my $op = shift or usage(); if(-f $op){ unshift(@ARGV,$op); $op = "add"} if($op eq "add"){ for my $f (@ARGV){ open(F1,"<",$f); while(){ if(/(\d+)[ \t]+(\S+)/){ if($i){ $hoco->{lc($2)} +=$1} else { $hoco->{$2} +=$1} } } close F1; } } elsif($op eq "sub"){ my $f = shift or usage(); open(F1,"<",$f); while(){ if(/(\d+)[ \t]+(\S+)/){ if($i){ $hoco->{lc($2)} +=$1} else { $hoco->{$2} +=$1} } } close F1; $f = shift or usage(); open(F1,"<",$f); while(){ if(/(\d+)[ \t]+(\S+)/){ if($i){ $hoco->{lc($2)} -=$1} else { $hoco->{$2} -=$1} } } close F1; } else { usage() } my $tot=0; my $permillion; if($rel or $log){ while (my ($k, $v) = each %$hoco){ $tot += $v if($v > 0); } $permillion = 1000000/$tot; } if($o){ open(F,"|-","sort -nr > '$o'"); } else { open(F,"|-","sort -nr "); } my ($k,$v); while (($k, $v) = each %$hoco){ if($rel){ print F sprintf("%".$format."f\t%s\n",$v*$permillion,$k) if($v > 0); } elsif($log){ print F sprintf("%".$format."f\t%s\n",logit($v*$permillion),$k) if($v > 0); } else { print F "$v $k\n" if($v > 0); } } close F; sub _off_utf8oco{ my %opt =(ignorecase => 0); if(ref($_[0]) eq "HASH") {%opt = (%opt , %{shift(@_)}) } ; my %oco; my @fs = @_; for my $f (@fs){ open(F1,"<",$f); while(){ for my $w (m/(\w(?:[-'"'"']\w|\w)+)/g){ if($opt{ignorecase}){ $oco{lc($w)}++ } else { $oco{$w }++ } } } close F1; } return \%oco; } sub usage{ die ("usage: ococalc [options] f.oco(s) ... to add multiseds sub f1 f2 ... to subtract options: -i ignore case (ouput is all in lower case) -o=out output is sent to 'out' (def: file.oco) -log output is in logaritmic scale (log(oco / million)) -rel output is in per-million relative format -p=2 number of decimal digits (in rel or log formats) (def=0) "); } sub setlogmax{ $maxlog = shift; $magicF=$maxlog/log($lmax); } sub setmax{ $lmax = shift; $magicF=$maxlog/log($lmax); } sub logit{ my $n=shift; return 0 unless $n; ## print STDERR "...$n,", log($n*$magicF) ,"\n" ; log($n)*$magicF } __END__ =head1 NAME ococalc - add / subtract word ocurrences =head1 SYNOPSIS ococalc f.oco(s) ... to add multiseds ococalc sub f1.oco f2.oco ... to subtract options: -i ignore case (ouput is all in lower case) -o=out output is sent to 'out' (def: file.oco) -log output is in logaritmic scale (log(oco / million)) -rel output is in per-million relative format -p=2 number of decimal digits (in rel or log formats) (def=0) =head1 DESCRIPTION UTF-8 input expected. =head1 AUTHOR J.Joao Almeida, jj@di.uminho.pt =head1 SEE ALSO frecoco 123oco Lingua::PT::PLN perl(1). =cut