
package Lingua::PT::NamedEntity;

use warnings;
use strict;
use locale;


require Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw();
our @EXPORT_OK = qw();
our @EXPORT_TAGS = qw(:DEFAULT => []);

our $version = "0.01";


## perEntity => &func
## perText => &func
## perChunk => &func
## text => $string

sub process {
    my $self = shift;

    my %options = @_;

    my $perChunk = $options{perChunk};
    my $perEntity = $options{perEntity};
    my $perText = $options{perText};

    my $m = "\x01";
    my $n = "\x02";

    die "Can't process entities without text.\n" if not defined $options{text};
    my $text = "$m$options{text}$m";

    $text =~ s/($self->{ent})/$m($1)$m/sg;
    # marca as marcas (!) quando estas estão no ínicio do texto a processar ou são precedidas de pontuação
    $text =~ s/^($m\s*[${\ '"\'«'}]?\s*)$m\(/$1$m$n(/sg;
    $text =~ s/((?:[.?!]\s*[${\ '"\'«'}]?|:\s*\")\s*)$m\(/$1$m$n(/sg;
    
    while ($text =~ /($m$n?\((.*?)\)$m)|($m(.*?)$m)/smg) {
	if (defined($1) and (defined($perEntity) or defined($perChunk))) {
	    my $marcacao = $1;
	    my $entidade = $2;
	    if (defined($perChunk)) {
		&{$perChunk} (type => "entity",
			      text => $entidade,
			      ($marcacao =~ /$n/) ? (beginning =>  1) : ());
	    } else {
		&{$perEntity} ($entidade, $marcacao =~ /$n/);
	    }
	} elsif (defined($3)) {
	    if (defined($perChunk)) {
		&{$perChunk} (type => "raw",
			     text => $4);
	    } elsif (defined($perText)) {
		&{$perText} ($4);
	    }
	} else {
	    die "Uh?"; # XXX isto não pode acontecer ...
	}
	pos($text) = pos($text) - 1;
    }
}

sub new {
    my $proto = shift || __PACKAGE__;
    my $class = ref($proto) || $proto;

    my $abrev1 = q![[:upper:]]\.!;
    my $parte = q![[:upper:]][[:alpha:]\-\']+!;
    my $ent = qr{(?:(?:(?:$parte\s+)|(?:$abrev1\s*))(?:d[eao]s?\s+)?)*(?:(?:$parte)|(?:$abrev1))(?:\s*\d+)?};

    my $self = { ent =>  $ent };
    bless($self, $proto);

    return $self;
}

1;

__END__

=head1 NAME

Lingua::PT::NamedEntity - Named Entity Extraction for portuguese texts

=head1 SYNOPSIS

  use Lingua::PT::NamedEntity;

  sub myCallback {
      my $b = $_[1] ? " (begin)" : "";
      print "ENTITY$b: '$_[0]'\n";
  }

  my $ne = new Lingua::PT::NameEntity;
  while (<>) {
    $ne->process(text => $_, perEntity => \&myCallback);
  }

=head1 DESCRIPTION

=head1 AUTHOR

=head1 COPYRIGHT AND LICENSE

=cut
