#!/usr/bin/perl -s -w use Encode::Guess; use encoding "utf8"; use Data::Dumper; use strict; use warnings; our( $w ,$m, $debug, $lat1, $utf8, $ac, $ctx, $i,$cctx, $wa,$maxwl); my $f1=shift; my $f2=shift or die("usage $0 [options] f1 f2 -cctx same as -ctx -m -ac -s -wa same as -w -ac -s -w compare words -m compare characters -lat1 force reading input as latin 1 -utf8 force reading input as utf8 -ac output to sort | uniq -c | sort -nr -ac=file starts with the previous acumulated results -ctx one (char or word) context (left and rigth) -i ignore case "); $ctx = $m = 1 if ($cctx); $w = 1 if ($wa); $ac ||= 1 if ($cctx|| $wa); $maxwl ||= 8 if $m; $maxwl ||= 20 if $w; my $cm1={}; my $cm2={}; my $text1 = ""; my $text2 = ""; my $tmp = "_cmp_$$"; if($ac and $ac ne "1" and -f $ac){ ($cm1,$cm2)=read_ac_file($ac); } sub read_ac_file{ my ($ac) = @_; my $tot = do($ac); unless ($tot) { warn "couldn't do $ac: $!\n" unless defined $tot; warn "couldn't parse $ac: $@\n" if $@; warn "couldn't run $ac\n" unless $tot; $tot = [{},{}]; } return($tot->[0],$tot->[1]); } my ($pattern, $fs, $handle_enc); my $opts = ""; ## if ($ac){ open(OUT,"|-:utf8", "sort | uniq -c | sort -nr") or die; } if ($i) { $opts.="-i " } ## ignore case $handle_enc = encguess($f1); open(F1,"-|:utf8","$handle_enc $f1"); open(TEMP1,">:utf8","$tmp-1"); $text1=normalize(*F1,*TEMP1,1); close F1; $handle_enc = encguess($f2); open(F2,"-|:utf8","$handle_enc $f2"); open(TEMP2,">:utf8","$tmp-2"); $text2=normalize(*F2,*TEMP2,1); close F2; open(F,"-|:utf8", "diff $opts -y $tmp-1 $tmp-2") or die; my $st = 0; my @pend = (); while(){ chomp; my @a=split(/\t+/, $_); if ($a[1] =~ m/ *>$/ ) { add2($a[2]) ; $st = 1; } elsif($a[1] =~ m/ *<$/ ) { add1($a[0]) ; $st = 1; } elsif($a[1] =~ m/ *\|$/) { add12($a[0],$a[2]) ; $st = 1; } elsif($ctx && $st==0) { shift(@pend) if @pend ; add12($a[0],$a[1]); $st = 0; } elsif($ctx && $st==1) { add12($a[0],$a[1]); $st = 0; flushpendctx(); add12($a[0],$a[1]); } else { flushpend(); } } flushpend(); close F; fix_cm($cm1,$text2); fix_cm($cm2,$text1); dump_cm($cm1,$cm2); ## $dump_ac($mat) if $ac; unlink ("$tmp-1", "$tmp-2",) unless $debug; #sub dump_ac{ my ($m1,$m2)=@_; # for(sort {$oco{$b} <=> $oco{$a} or $a cmp $b} keys %oco ){ # my ($a1,$b1)=split(/\s*=>\s*/); # my $aa = 0; # my $bb = ""; # my $con = " "; # my $k = $_; # s/(^|\t) /:/g; # s/ ($|\t)/:/g; # $aa = $mat->{$a1}{$a1} if ($mat and $mat->{$a1} and $mat->{$a1}{$a1}); # # if($aa == 0) { $con = "!!"; $bb="" } # elsif($aa < $oco{$k}) { $con = "! "; $bb="($aa)"; } # else { $con = " "; $bb="($aa)"; } # print "$con\t$oco{$k}\t$_\t$bb\n"; # } #} sub fix_cm{ my ($mat,$text)=@_; for (keys %$mat){ my $v ; if ($w){ $v = ($text =~ s/\b($_)\b/$1/g) } else { $v = ($text =~ s/($_)/$1/g); } $mat->{$_}{$_} += $v if $v } } sub dump_cm{ open(D,">CM.dumper"); print D Dumper([@_]); close D; } sub encguess{ if ( defined $lat1){ return "iconv -f latin1 -t utf8" } elsif ( defined $utf8){ return "cat"} else{ my $fn = shift; local $/; open (FILE,$fn) or die("Cant open $fn\n");; my $data ; read(FILE,$data,2048); ## avoid just empty lines close FILE; my $enc = guess_encoding($data,"utf8"); if (!ref $enc) { return "iconv -f latin1 -t utf8" } elsif ($enc->name eq "utf8"){ return "cat" } else { return "iconv -f ".$enc->name." -t utf8" } ## also works for UFT16 } } sub syst { my $cmd = shift; print STDERR "$cmd\n" if $debug ; if(system ($cmd) != 0){ warn "** ERROR ********* system $cmd failed: $!$?\n"; return 0 } return 1 } sub flushentrydi{ return if length($_[0]) > $maxwl or length($_[1]) > $maxwl or $_[0] =~ m/ / or $_[1] =~ m/ / ; $cm1->{$_[0]}{$_[1]} ++; $cm2->{$_[1]}{$_[0]} ++; } sub flushpend { for (@pend){ flushentrydi($_->[0],$_->[1]) }; @pend =(); } sub add12{ push(@pend,[@_]); } sub flushpendctx{ flushentrydi(join($fs,(map {$_->[0]} @pend)),join($fs,(map {$_->[1]} @pend))); @pend=(); } sub add1{ my $a = shift; my $w0 = join($fs,(map {$_->[0]} @pend),$a); my $w1 = join($fs,(map {$_->[1]} @pend)); @pend=([$w0,$w1]); } sub add2{ my $a = shift; my $w0 = join($fs,(map {$_->[0]} @pend)); my $w1 = join($fs,(map {$_->[1]} @pend),$a); @pend=([$w0,$w1]); } sub normalize{ my($in,$out,$tt) = @_; my $text=""; while(<$in>){ $_=lc($_) if $i; $text.=$_ if $tt; if (defined($m)){ $fs = ""; s/^\s*\n$//g; s/^\s+//g; s/\s+/ /g; s/\n/ /g; s/[^\w\'\-\s]//g; s/['\-]\B//g; s/\B['\-]//g; s/(.)/$1\n/g; print $out $_; } else{ $fs = " "; my $pattern = q{\w+(['-]\w+)*}; while(/($pattern)/g){ print $out "$1\n"; } } } if ($tt){ return $text;} } __END__ =head1 NAME lexcm - lexical confusion matrix calculator =head1 SYNOPSIS lexcm [options] f1 f2 Options: -cctx same as -ctx -m -ac -s ==> char level changes -wa same as -w -ac -s ==> word level changes -s supress common tokens -w compare words -m compare characters -ac output to sort | uniq -c | sort -nr -ac=file starts with the previous acumulated results -ctx one (char or word) context (left and rigth) -i ignore case -debug keep temp. files -lat1 force reading input in ISO latin 1 encoding -utf8 force reading input is UTF-8 encoding =head1 DESCRIPTION lexcm performs word by word or char by char comparations and calculation of th confusion matrix. =head2 Example In order to calculate the differences between a set of files in the current directory and their correspondents in NEW/ folder, we can do: for a in *.txt; \ do lexcm -w -ac=CM $a new/$a > CM2; \ mv CM2 CM ; \ done =head1 AUTHOR Andre F. Santos, pg15973@alunos.uminho.pt J.Joao Almeida, jj@di.uminho.pt =head1 SEE ALSO perl(1), diff(1) =cut __END__