#!/usr/bin/perl -s -w use Encode::Guess; use encoding "utf8"; use Data::Dumper; use strict; use warnings; our( $s, $w ,$m, $debug, $lat1, $utf8, $ac, $ctx, $i,$cctx, $wa,$maxwl, $maxdlen); my $f1=shift; my $f2=shift or die("usage $0 [options] f1 f2 -cctx same as -ctx -m -ac -s -wa same as -w -ac -s -s supress common tokens -w compare words -m compare characters -lat1 force reading input as latin 1 -utf8 force reading input as utf8 -ac output to sort | uniq -c | sort -nr -ac=file starts with the previous acumulated results -ctx one (char or word) context (left and rigth) -i ignore case -maxwl max word length (def: 8 for chars, 20 for words) -maxdlen max diff between length original and target "); my %oco; my $mat={}; $ctx = $m = $s = 1 if ($cctx); $w = $s = 1 if ($wa); $ac ||= 1 if ($cctx|| $wa); $maxwl ||= 8 if $m; $maxwl ||= 20 if $w; $maxdlen ||= 5 if $w; $maxdlen ||= 20; if($ac and $ac ne "1" and -f $ac){ read_ac_file($ac,\%oco,$mat); } sub read_ac_file{ my ($ac,$roco,$mat) = @_; open(F,"<:utf8",$ac) or die("cant open $ac\n"); while(){ if(/(\d+)\t(.+?)\t=>\t(.*?)\t(.*)?\t(.*)/){ my($v1,$v2,$v3,$v4,$v5)=($1,$2,$3,$4,$5); $v4 =~ s/\+//; $v4||= 0; $mat->{$v2}{$v3}=$v1; $mat->{$v2}{$v2}=$v4; $roco->{"$v2\t=>\t$v3"} += $v1; } } close F; } my ($pattern, $fs, $handle_enc); my $opts = ""; my $text = ""; my $tmp = "_cmp_$$"; ## if ($ac){ open(OUT,"|-:utf8", "sort | uniq -c | sort -nr") or die; } if ($i) { $opts.="-i " } ## ignore case $handle_enc = encguess($f1); open(F1,"-|:utf8","$handle_enc $f1"); open(TEMP1,">:utf8","$tmp-1"); normalize(*F1,*TEMP1); close F1; $handle_enc = encguess($f2); open(F2,"-|:utf8","$handle_enc $f2"); open(TEMP2,">:utf8","$tmp-2"); $text=normalize(*F2,*TEMP2,1); close F2; open(F,"-|:utf8", "diff $opts -y $tmp-1 $tmp-2") or die; my $st = 0; my @pend = (); while(){ chomp; my @a=split(/\t+/, $_); if ($a[1] =~ m/ *>$/ ) { add2($a[2]) ; $st = 1; } elsif($a[1] =~ m/ *<$/ ) { add1($a[0]) ; $st = 1; } elsif($a[1] =~ m/ *\|$/) { add12($a[0],$a[2]) ; $st = 1; } elsif($ctx && $st==0) { shift(@pend) if @pend ; add12($a[0],$a[1]); $st = 0; } elsif($ctx && $st==1) { add12($a[0],$a[1]); $st = 0; flushpendctx(); add12($a[0],$a[1]); } else { flushpend(); flushentry(@a) unless $s || $ac; } } flushpend(); close F; fix_cm($mat,$text); dump_cm($mat); dump_ac($mat) if $ac; unlink ("$tmp-1", "$tmp-2",) unless $debug; sub dump_ac{ my $mat=shift; for(sort {$oco{$b} <=> $oco{$a} or $a cmp $b} keys %oco ){ my ($a1,$b1)=split(/\s*=>\s*/); next if (abs(length($a1) - length($b1)) > $maxdlen); my $aa = 0; my $bb = ""; my $con = " "; my $k = $_; s/(^|\t) /:/g; s/ ($|\t)/:/g; $aa = $mat->{$a1}{$a1} if ($mat and $mat->{$a1} and $mat->{$a1}{$a1}); if($aa == 0) { $con = "(=>)"; $bb="" } elsif($aa < $oco{$k}) { $con = "(=>?)"; $bb="+$aa"; } else { $con = ""; $bb="+$aa"; } print "$oco{$k}\t$_\t$bb\t$con\n"; } } sub fix_cm{ my ($mat,$text)=@_; for (keys %$mat){ my $v ; if ($w){ $v = ($text =~ s/\b($_)\b/$1/g) } else { $v = ($text =~ s/($_)/$1/g); } $mat->{$_}{$_} += $v if $v } } sub dump_cm{ my $mat=shift; open(D,">CM.dumper"); print D Dumper($mat); close D; } sub encguess{ if ( defined $lat1){ return "iconv -f latin1 -t utf8" } elsif ( defined $utf8){ return "cat"} else{ my $fn = shift; local $/; # open(FILE,"head -20 $fn |"); # my $data = ; # close FILE; open (FILE,$fn) or die("Cant open $fn\n");; my $data ; read(FILE,$data,2048); ## avoid just empty lines close FILE; my $enc = guess_encoding($data,"utf8"); if (!ref $enc) { return "iconv -f latin1 -t utf8" } elsif ($enc->name eq "utf8"){ return "cat" } else { return "iconv -f ".$enc->name." -t utf8" } ## also works for UFT16 } } sub syst { my $cmd = shift; print STDERR "$cmd\n" if $debug ; if(system ($cmd) != 0){ warn "** ERROR ********* system $cmd failed: $!$?\n"; return 0 } return 1 } sub flushentrydi{ return if length($_[0]) > $maxwl or length($_[1]) > $maxwl or $_[0] =~ m/ / or $_[1] =~ m/ / ; $mat->{$_[0]}{$_[1]} ++; if($ac){ $oco{ "$_[0]\t=>\t$_[1]"}++} else { print "$_[0]\t=>\t$_[1]\n" } } sub flushentry { $mat->{$_[0]}{$_[1]} ++; print "$_[0]\t=\t$_[1]\n" } sub flushpend { for (@pend){ flushentrydi($_->[0],$_->[1]) }; @pend =(); } sub add12{ push(@pend,[@_]); } sub flushpendctx{ flushentrydi(join($fs,(map {$_->[0]} @pend)),join($fs,(map {$_->[1]} @pend))); @pend=(); } sub add1{ my $a = shift; my $w0 = join($fs,(map {$_->[0]} @pend),$a); my $w1 = join($fs,(map {$_->[1]} @pend)); @pend=([$w0,$w1]); } sub add2{ my $a = shift; my $w0 = join($fs,(map {$_->[0]} @pend)); my $w1 = join($fs,(map {$_->[1]} @pend),$a); @pend=([$w0,$w1]); } sub normalize{ my($in,$out,$tt) = @_; $text=""; while(<$in>){ $_ = lc($_) if $i; $text.=$_ if $tt; if (defined($m)){ $fs = ""; s/^\s*\n$//g; s/^\s+//g; s/\s+/ /g; s/\n/ /g; s/[^\w\'\-\s]//g; s/['\-]\B//g; s/\B['\-]//g; s/(.)/$1\n/g; print $out $_; } else{ $fs = " "; my $pattern = q{\w+(['-]\w+)*}; while(/($pattern)/g){ print $out "$1\n"; } } } if ($tt){ return $text;} } __END__ =head1 NAME lexdiff - lexical comparation of 2 files =head1 SYNOPSIS lexdiff [options] f1 f2 Options: -cctx same as -ctx -m -ac -s ==> char level changes -wa same as -w -ac -s ==> word level changes -s supress common tokens -w compare words -m compare characters -ac output to sort | uniq -c | sort -nr -ac=file starts with the previous acumulated results -ctx one (char or word) context (left and rigth) -i ignore case -debug keep temp. files -lat1 force reading input in ISO latin 1 encoding -utf8 force reading input is UTF-8 encoding -maxwl max word length (def: 8 for chars, 20 for words) -maxdlen max diff between length original and target =head1 DESCRIPTION lexdiff performs word by word or char by char comparations =head2 Example In order to calculate the differences between a set of files in the current directory and their correspondents in NEW/ folder, we can do: for a in *.txt; \ do lexdiff -w -ac=dif $a new/$a > aux; \ mv aux dif ; \ done =head1 AUTHOR Andre F. Santos, pg15973@alunos.uminho.pt J.Joao Almeida, jj@di.uminho.pt =head1 SEE ALSO perl(1), diff(1) =cut __END__