#!/usr/bin/perl -s -w

use Encode::Guess;
use encoding "utf8";
use Data::Dumper;
use strict;
use warnings;

our( $s, $w ,$m, $debug, $lat1, $utf8, $ac, $ctx, $i,$cctx, $wa,$maxwl, $maxdlen);

my $f1=shift;
my $f2=shift or die("usage $0 [options] f1 f2
     -cctx            same as -ctx -m -ac -s
     -wa              same as -w -ac -s
     -s               supress common tokens
     -w               compare words
     -m               compare characters
     -lat1            force reading input as latin 1
     -utf8            force reading input as utf8
     -ac              output to sort | uniq -c | sort -nr
     -ac=file         starts with the previous acumulated results
     -ctx             one (char or word) context (left and rigth)
     -i               ignore case
     -maxwl           max word length (def: 8 for chars, 20 for words)
     -maxdlen         max diff between length original and target
");
my %oco;
my $mat={};

$ctx = $m = $s = 1 if ($cctx);
       $w = $s = 1 if ($wa);              
$ac ||= 1          if ($cctx|| $wa);
$maxwl   ||= 8  if $m;
$maxwl   ||= 20 if $w;
$maxdlen ||= 5  if $w;
$maxdlen ||= 20;


if($ac and $ac ne "1" and -f $ac){ read_ac_file($ac,\%oco,$mat); }

sub read_ac_file{
  my ($ac,$roco,$mat) = @_;
  open(F,"<:utf8",$ac) or die("cant open $ac\n");
  while(<F>){ 
     if(/(\d+)\t(.+?)\t=>\t(.*?)\t(.*)?\t(.*)/){ 
        my($v1,$v2,$v3,$v4,$v5)=($1,$2,$3,$4,$5);
	    $v4 =~ s/\+//;
        $v4||= 0;
        $mat->{$v2}{$v3}=$v1;
        $mat->{$v2}{$v2}=$v4;
        $roco->{"$v2\t=>\t$v3"} += $v1;   } 
  }
  close F; 
}

my ($pattern, $fs, $handle_enc);
my $opts = "";

my $text = "";
my $tmp = "_cmp_$$";

## if ($ac){ open(OUT,"|-:utf8", "sort | uniq -c | sort -nr") or die; }
if ($i) { $opts.="-i " }  ## ignore case


$handle_enc = encguess($f1);
open(F1,"-|:utf8","$handle_enc $f1");
open(TEMP1,">:utf8","$tmp-1");
normalize(*F1,*TEMP1);
close F1; 

$handle_enc = encguess($f2);
open(F2,"-|:utf8","$handle_enc $f2");
open(TEMP2,">:utf8","$tmp-2");
$text=normalize(*F2,*TEMP2,1);
close F2;

open(F,"-|:utf8", "diff $opts -y $tmp-1 $tmp-2") or die;

my $st = 0;
my @pend = ();
while(<F>){
	chomp;
	my @a=split(/\t+/, $_);

	if   ($a[1] =~ m/ *>$/ ) { add2($a[2])         ; $st = 1;   } 
	elsif($a[1] =~ m/ *<$/ ) { add1($a[0])         ; $st = 1;   } 
	elsif($a[1] =~ m/ *\|$/) { add12($a[0],$a[2])  ; $st = 1;   } 
	elsif($ctx && $st==0)    { shift(@pend) if @pend ;
                           	   add12($a[0],$a[1]);   $st = 0;   }
	elsif($ctx && $st==1)    { add12($a[0],$a[1]);   $st = 0;   
                               flushpendctx();
                               add12($a[0],$a[1]);  }
	else                     { flushpend();
                               flushentry(@a) unless $s || $ac; }
}

flushpend();
close F;

fix_cm($mat,$text);
dump_cm($mat);
dump_ac($mat) if $ac;

unlink ("$tmp-1", "$tmp-2",) unless $debug;

sub dump_ac{ my $mat=shift;
  for(sort {$oco{$b} <=> $oco{$a} or $a cmp $b} keys %oco ){
     my ($a1,$b1)=split(/\s*=>\s*/);
     next if (abs(length($a1) - length($b1)) > $maxdlen);
     my $aa = 0;
     my $bb = "";
     my $con = "  ";
     my $k = $_;
     s/(^|\t) /:/g;
     s/ ($|\t)/:/g;
     $aa = $mat->{$a1}{$a1} if ($mat and $mat->{$a1} and $mat->{$a1}{$a1});

        if($aa == 0)       { $con = "(=>)"; $bb=""      }
     elsif($aa < $oco{$k}) { $con = "(=>?)"; $bb="+$aa"; }
     else                  { $con = ""; $bb="+$aa"; }
     print "$oco{$k}\t$_\t$bb\t$con\n";
  }
}

sub fix_cm{
  my ($mat,$text)=@_; 
  for (keys %$mat){
    my $v ;
    if ($w){ $v = ($text =~ s/\b($_)\b/$1/g) }
    else   { $v = ($text =~ s/($_)/$1/g);    }
    $mat->{$_}{$_} += $v if $v
  }
}

sub dump_cm{ my $mat=shift;
  open(D,">CM.dumper");
  print D Dumper($mat);
  close D;
}

sub encguess{
  if    ( defined $lat1){ return "iconv -f latin1 -t utf8"  }
  elsif ( defined $utf8){ return "cat"}
  else{ my $fn = shift;
    local $/; 
#   open(FILE,"head -20 $fn |");
#   my $data = <FILE>;
#   close FILE;
    open (FILE,$fn) or die("Cant open $fn\n");;
    my $data ; read(FILE,$data,2048);  ## avoid just empty lines
    close FILE;
    my $enc = guess_encoding($data,"utf8");
    if (!ref $enc) { return "iconv -f latin1 -t utf8"  }
    elsif ($enc->name eq "utf8"){ return "cat" }
    else {  return "iconv -f ".$enc->name." -t utf8" } ## also works for UFT16
  }
}

sub syst {
  my $cmd = shift;
  print STDERR "$cmd\n"    if $debug ;
  if(system ($cmd) != 0){ 
     warn "** ERROR ********* system $cmd failed: $!$?\n"; 
     return 0 }
  return 1
}

sub flushentrydi{ 
   return if length($_[0]) > $maxwl
          or length($_[1]) > $maxwl
          or $_[0] =~ m/  /
          or $_[1] =~ m/  /
          ;
   $mat->{$_[0]}{$_[1]} ++; 
   if($ac){ $oco{ "$_[0]\t=>\t$_[1]"}++}
   else   { print "$_[0]\t=>\t$_[1]\n" }  
}

sub flushentry  { $mat->{$_[0]}{$_[1]} ++;  print "$_[0]\t=\t$_[1]\n" }

sub flushpend { for (@pend){ flushentrydi($_->[0],$_->[1]) }; @pend =(); }

sub add12{ push(@pend,[@_]); }

sub flushpendctx{
 flushentrydi(join($fs,(map {$_->[0]} @pend)),join($fs,(map {$_->[1]} @pend)));
 @pend=();
}

sub add1{
 my $a = shift;
 my $w0 = join($fs,(map {$_->[0]} @pend),$a);
 my $w1 = join($fs,(map {$_->[1]} @pend));
 @pend=([$w0,$w1]);
}

sub add2{
 my $a = shift;
 my $w0 = join($fs,(map {$_->[0]} @pend));
 my $w1 = join($fs,(map {$_->[1]} @pend),$a);
 @pend=([$w0,$w1]);
}

sub normalize{
	my($in,$out,$tt) = @_;
    $text="";
	while(<$in>){
        $_ = lc($_) if $i;
        $text.=$_ if $tt;
		if (defined($m)){
			$fs = "";
			s/^\s*\n$//g;
			s/^\s+//g;
			s/\s+/ /g;
			s/\n/ /g;
			s/[^\w\'\-\s]//g;
			s/['\-]\B//g; s/\B['\-]//g;
			s/(.)/$1\n/g;
			print $out $_;
		}
		else{
			$fs = " ";
			my $pattern = q{\w+(['-]\w+)*};
			while(/($pattern)/g){
				print $out "$1\n";
			}
		}
	}	
    if ($tt){ return $text;}
}

__END__

=head1 NAME

lexdiff - lexical comparation of 2 files

=head1 SYNOPSIS

 lexdiff [options] f1 f2
   Options:
     -cctx            same as -ctx -m -ac -s  ==> char level changes
     -wa              same as      -w -ac -s  ==> word level changes
     -s               supress common tokens
     -w               compare words
     -m               compare characters
     -ac              output to sort | uniq -c | sort -nr
     -ac=file         starts with the previous acumulated results
     -ctx             one (char or word) context (left and rigth)
     -i               ignore case
     -debug           keep temp. files
     -lat1            force reading input in ISO latin 1 encoding
     -utf8            force reading input is UTF-8 encoding
     -maxwl           max word length (def: 8 for chars, 20 for words)
     -maxdlen         max diff between length original and target

=head1 DESCRIPTION

lexdiff performs word by word or char by char comparations

=head2 Example

In order to calculate the differences between a set of files
in the current directory and their correspondents in NEW/ folder,
we can do:

 for a in *.txt;                           \
   do  lexdiff -w -ac=dif $a new/$a > aux; \
       mv aux dif ;                        \
   done


=head1 AUTHOR

Andre F. Santos, pg15973@alunos.uminho.pt

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1), diff(1)

=cut      
__END__
