package XML::XCES;

use XML::DT;

use warnings;
no warnings 'recursion';
use strict;

=head1 NAME

XML::XCES - Perl module to handle XCES xml files

=head1 VERSION

Version 0.02

=cut

our $VERSION = '0.02';

=head1 SYNOPSIS

    use XML::XCES;

    XML::XCES->align2pair("File.xml", "prefix");

    XML::XCES->encode_file("File.txt");

=head1 ABSTRACT

This module intends to incorporate tools to manage XML XCES files
(XCES stands for XML Corpus Encoding Standard).

I am developing it according to my own needs, which means it
probably does not do what you want. Meanwhile, feel free to send
comments, bug reports, requests and patches.

=head1 FUNCTIONS

XML::XCES provides the following functions:

=head2 encode_file

This function takes a filename and replaces the file with (or creates
a new instance of it) encoded using XCES syntax. Given that this
process needs to detect sentence and word boundaries, which is
not a simple and language-independent task, the module uses basic
functions that can be overridden by external functions.

=cut



#    p id: <p>
#    s id: s<p>
#    w id: w<p>.<s>.<w>

### Well, OPUS does not use the XCES format. I need to look into this carefully.

# encode_file([\%conf,] $filename)
#
# Work-in-progress XCES encoder. May be called as a plain function or as
# the class method XML::XCES->encode_file(...).
#
#   \%conf     optional hash reference of options (accepted but not yet
#              consulted by this implementation)
#   $filename  path of the text file to read
#
# Prints the XML declaration to the currently selected output handle and
# then reads the input one record at a time. Records are currently read
# but not processed.
#
# Dies with "Cannot open file ..." if the file cannot be opened.
# Returns a true value on success.
sub encode_file {
  # Drop the class name when invoked as a class method.
  shift if defined $_[0] && $_[0] eq "XML::XCES";
  my $conf = {};
  $conf = shift if ref $_[0] eq "HASH";
  my $filename = shift;

  ## FIXME:
  ##  By default, use an empty line as record separator.
  ##  Later this should be changed.

  # local() guarantees the caller's record separator is restored on every
  # exit path, including the die below.
  local $/ = "\n\n";

  # Three-argument open with a lexical handle: the filename can never be
  # interpreted as an open mode, and the handle cannot clash with a global.
  open my $input, '<', $filename
    or die "Cannot open file '$filename': $!\n";
  print '<?xml version="1.0" encoding="iso-8859-1"?>',"\n";

  # A lexical loop variable keeps the caller's $_ untouched.
  while (defined(my $record = <$input>)) {
    # Records are not processed yet.
  }
  close $input;

  return 1;
}

=head2 align2pair

This function receives an XCES xml filename with sentence alignment
content, and, optionally, a prefix for the output files.

Note that the paths of the aligned files should be absolute or relative
to the directory from which the command is run.

=cut

# align2pair($xces [, $prefix])
#
# May be called as a plain function or as XML::XCES->align2pair(...).
#
#   $xces    filename of the XCES alignment document, processed with
#            XML::DT's dt()
#   $prefix  optional prefix for the output files; defaults to $xces
#
# Writes "$prefix-source.nat" and "$prefix-target.nat" (UTF-8 layer). For
# every <link> of every <linkGrp> whose fromDoc and toDoc files both exist,
# the text collected for each referenced id is written one per line,
# followed by a line containing only '$'. Link groups whose documents are
# missing are skipped silently. Progress is reported on STDERR.
#
# Dies if an output file cannot be created or closed.
# Returns the total number of links (translation units) processed.
sub align2pair {
  # Drop the class name when invoked as a class method.
  shift if defined $_[0] && $_[0] eq "XML::XCES";
  my $xces = shift;
  my $prefix = shift || $xces;

  my $tuCount = 0;

  my $source_file = "$prefix-source.nat";
  my $target_file = "$prefix-target.nat";

  # Lexical handles are captured by the handler closures below and are
  # closed explicitly once the whole document has been processed.
  open my $source_out, ">:utf8", $source_file
    or die "Cannot create file '$source_file': $!\n";
  open my $target_out, ">:utf8", $target_file
    or die "Cannot create file '$target_file': $!\n";

  my %handler = (
    -type => { linkGrp => 'SEQ' },

    # <link xtargets="s1 s2 ; t1">: build [ \@source_ids, \@target_ids, certainty ].
    'link' => sub {
      my $xtargets = defined $v{xtargets} ? $v{xtargets} : '';
      my ($s, $t) = split /\s*;\s*/, $xtargets;
      # Either side may be empty or absent; treat it as "no ids".
      $s = '' unless defined $s;
      $t = '' unless defined $t;
      my @s = grep { /./ } split /\s+/, $s;
      my @t = grep { /./ } split /\s+/, $t;
      return [[@s],[@t],$v{certainty}];
    },

    # <linkGrp fromDoc=... toDoc=...>: $c holds the list of link triples.
    'linkGrp' => sub {
      my ($source,$target) = ($v{fromDoc},$v{toDoc});
      return unless -f $source and -f $target;

      my $cont = $c;
      printf STDERR "+ %s * %s ", _last26($source), _last26($target);

      my (%source_text, %target_text);
      # Points at whichever of the two hashes is being filled.
      my $ACTIVE;
      my %h2 = (
        -type => { linkGrp => 'SEQ' },

        -outputenc => 'UTF-8',

        # Store the trimmed content of every element whose id was
        # requested by some link.
        -default => sub {
          $c = _trim($c);
          if ($v{id} && exists($ACTIVE->{$v{id}})) {
            $ACTIVE->{$v{id}} = $c;
          }
          $c
        });

      my $tu = 0;
      for my $link (@$cont) {
        $tu++;
        # List repetition, (1) x N, gives every id its own placeholder.
        # A bare "1 x N" is string repetition and would leave all ids
        # after the first undefined.
        @source_text{@{$link->[0]}} = (1) x @{$link->[0]};
        @target_text{@{$link->[1]}} = (1) x @{$link->[1]};
      }
      print STDERR "($tu TUs)\n";
      $tuCount+=$tu;

      $ACTIVE = \%source_text;
      dt($source, %h2);

      $ACTIVE = \%target_text;
      dt($target, %h2);

      for my $link (@$cont) {
        print {$source_out} (map { "$_\n" } (@source_text{@{$link->[0]}},'$'));
        print {$target_out} (map { "$_\n" } (@target_text{@{$link->[1]}},'$'));
      }
    },
  );

  dt($xces, %handler);

  # Buffered write errors only surface at close, so check it.
  close $source_out or die "Cannot close file '$source_file': $!\n";
  close $target_out or die "Cannot close file '$target_file': $!\n";

  return $tuCount;
}


# Normalise whitespace: strip it from both ends of the string and squeeze
# every internal run of whitespace down to a single space.
sub _trim {
  my ($text) = @_;
  for ($text) {
    s/^\s+//;
    s/\s+$//;
    s/\s+/ /g;
  }
  return $text;
}

# Shorten a string for display to at most 26 characters: strings longer
# than that are replaced by "..." followed by their final 23 characters;
# anything else is returned unchanged.
sub _last26 {
  my ($label) = @_;
  return $label unless length($label) > 26;
  my $tail = substr($label, -23, 23);
  return "..." . $tail;
}



=head1 AUTHOR

Alberto Simoes, C<< <ambs@cpan.org> >>

=head1 BUGS

Please report any bugs or feature requests to
C<bug-xml-xces@rt.cpan.org>, or through the web interface at
L<http://rt.cpan.org>.  I will be notified, and then you'll automatically
be notified of progress on your bug as I make changes.

=head1 ACKNOWLEDGEMENTS

=head1 COPYRIGHT & LICENSE

Copyright 2004-2005 Alberto Simoes, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of XML::XCES
