#!/usr/bin/perl # This file is part of The New Aspell # Copyright (C) 2004 by Kevin Atkinson under the GNU LGPL # license version 2.0 or 2.1. You should have received a copy of the # LGPL license along with this library if you did not you can find it # at http://www.gnu.org/. use strict; use warnings; no locale; use IO::File; use Data::Dumper; use constant { CHAR => 0, TYPE => 1, DISPLAY=> 2, UPPER => 3, LOWER => 4, TITLE => 5, PLAIN => 6, SCRIPT => 7, NAME => 8, }; use constant {COMPAT => 1, PRIVATE => 2}; sub uni_char($$$); # uni def what sub print_map(@); sub codepoint($); my (%unidata, %decomp, %fdecomp, %uni_char, %char_uni, %compexcl); sub cdecomp ( $ ) {local $_ = $fdecomp{$_[0]}; return $_[0] unless defined $_; return $_;} open IN, "unicode.txt" or die "Can't open \"unicode.txt\": $!\n"; while () { chop; s/\s*#\s*(.*)$//; my $n = $1; next unless length $_ > 0; my @data = split / /; $n =~ /#\s*(.+?)\s*$/; push @data, $1; $unidata{$data[0]} = \@data; } open IN, "decomp.txt" or die "Can't open \"decomp.txt\": $!\n"; while () { chop; s/\s*#.*$//; next unless length $_ > 0; my ($c, $f, $d) = /^([0-9A-F]+) (.) (.+)$/ or die; $d =~ s/ / /g; # flags one-way, compat, private my $a = [ $d, $f eq '!', grep {0xE000 <= hex($_) && hex($_) < 0xF800} split(' ', $d)]; push @{$decomp{$c}}, $a; $fdecomp{$c} = $d unless $a->[COMPAT] || $a->[PRIVATE]; $compexcl{$c} = $d if $f eq '>' && ! exists $compexcl{$c}; } foreach my $file (@ARGV) { my %scripts; my ($base) = $file =~ /^(.+)\.txt/i or die "$file does not end in \".txt\"\n"; $base = lc $base; open OUT, ">$base\.cset" or die "Can't create \"$base\.cset\": $!\n"; open MAP, ">$base\.cmap" or die "Can't create \"$base\.cmap\": $!\n"; my $nl = 0; my @in = (new IO::File); open $in[0], "<:utf8", "$base\.txt" or die "Can't open \"$base.txt\": $!\n"; my $dir = ''; $base =~ s/^(.+)\/// and $dir = $1; my @ord; push @ord, "= $base"; my @ascii = (0,16,32..64,91..96,123..127); my @chardata; undef %char_uni; undef %uni_char; my %ldata; my @base_letters; my %base_letter; my @vowels; my @special_case; my $no_ascii = 0; while (@in) { while (local $_ = $in[0]->getline) { chop; s/\s*#.*$//; s/^\s+//; next unless $_; if (/^include\s+(.+)/i) { unshift(@in, new IO::File); open $in[0], "<:utf8", "$dir/$1" or die "Unable to open \"$dir/$1\": $!\n"; } elsif (/^no-latin/i) { $nl = 1; } elsif (/^(<|==|>)\s*(.+)/) { if ($1 eq '==') {$ord[0] = "= $2"} else {push @ord, "$1 $2"} } elsif (/^(letter|vowel)s?\s+(.+)/i) { my $w = $1; my $l = $2; my (@l) = split /\s+/, $2; foreach (@l) { my $c = codepoint $_; push @base_letters, $c; push @vowels, $c if $w =~ /^vowel/i; } } elsif (/^case\s+(.+)/i) { push @special_case, [map {codepoint $_} (split /\s+/, $1)]; } else { s/\=|0x|U\+//g; my ($char, $uni) = /^\s*([a-fA-F0-9.]+)\s+\=?\s*([a-fA-F0-9.]+)/ or next; my @char = map {sprintf "%02X", $_} ($char =~ /(.+?)\.\.(.+)/ ? (hex($1)..hex($2)) : hex $char); my @uni = map {sprintf "%04X", $_} ($uni =~ /(.+?)\.\.(.+)/ ? (hex($1)..hex($2)) : hex $uni); die "bad range in $base\n" unless @char == @uni; my $num = @char; for (my $i = 0; $i != $num; $i++) { next unless $num == 1 || $unidata{$uni[$i]}; my $char = hex($char[$i]); my $uni = hex($uni[$i]); next if $char == $uni && ($char < 0x20 || (0x80 <= $char && $char < 0xA0)); next if exists $uni_char{$uni[$i]}; printf("Warning remapping '%c' (0x%X) may cause problems with Aspell.\n", $char, $char) if $char != $uni && ($char == 0x00 || $char == 0x10 || (0x20 <= $char && $char <= 0x40) || (0x5B <= $char && $char <= 0x60) || (0x7B <= $char && $char <= 0x7F)); $no_ascii = 2 if $char != $uni && (0x20 <= $char && $char <= 0x7F); $char_uni{$char[$i]} = $uni[$i]; $uni_char{$uni[$i]} = $char[$i]; } } } $in[0]->close; shift @in; } foreach my $i (@ascii) { my $char = sprintf("%02X",$i); my $unichar = "00".$char; next if defined $char_uni{$char}; $char_uni{$char} = $unichar; $uni_char{$unichar} = $char; } foreach (keys %uni_char) { $ldata{$_} = [@{$unidata{$_}}] if defined $unidata{$_}; } foreach my $case (@special_case) { foreach my $u (@$case) { $ldata{$u}[UPPER] = $case->[0]; $ldata{$u}[LOWER] = $case->[1]; $ldata{$u}[TITLE] = $case->[2] || $case->[0]; } } foreach my $u (@base_letters) { $base_letter{$ldata{$u}[UPPER]} = 1; $base_letter{$ldata{$u}[LOWER]} = 1; $base_letter{$ldata{$u}[TITLE]} = 1; } foreach my $u (keys %ldata) { my $plain = $u; $plain = $unidata{$plain}[PLAIN] while (!$base_letter{$plain} && $plain ne $unidata{$plain}[PLAIN]); $ldata{$u}[PLAIN] = defined $ldata{$plain} ? $plain : $u; } if (@vowels) { $_->[TYPE] =~ tr/vV/lL/ foreach (values %ldata); foreach (@vowels) { my $u = $ldata{$_}[PLAIN]; $ldata{$ldata{$u}[UPPER]}[TYPE] = 'V'; $ldata{$ldata{$u}[LOWER]}[TYPE] = 'V'; $ldata{$ldata{$u}[TITLE]}[TYPE] = 'V'; } foreach my $u (keys %ldata) { next unless uc $ldata{$u}[TYPE] eq 'L'; $ldata{$u}[TYPE] = 'V' if $ldata{$ldata{$u}[PLAIN]}[TYPE] eq 'V'; } } foreach my $i (0x00..0xFF) { my $char = sprintf("%02X",$i); my $unichar = $char_uni{$char}; my $info = $ldata{$unichar} if defined $unichar; my $letter = $info->[SCRIPT] eq 'Latn' if defined $info; if (defined $info && (!$nl || !$letter)) { $scripts{$info->[SCRIPT]} = 1; $chardata[$i] = [$char, $unichar, $info->[TYPE], # 2 $info->[DISPLAY], # 3 uni_char($info->[UPPER], $char, "upper"), # 4 uni_char($info->[LOWER], $char, "lower"), # 5 uni_char($info->[TITLE], $char, "title"), # 6 uni_char($info->[PLAIN], $char, "plain"), # 7 '00', # 8 '00', # 9 $info->[NAME], ]; } else { my $u = (defined $unichar ? hex($unichar) : $no_ascii && 0x20 <= $i && $i < 0x80 ? $i + 0xE000 : $i >= 0xA0 ? $i + 0xE000 : $i); my $n = ($u >= 0xE000 ? '' : defined $unichar ? ($letter ? '' : '') : $u < 0x20 ? '' : $u < 0x80 ? '' : $u < 0xA0 ? '' : die "$base $u"); $chardata[$i] = [$char, sprintf("%04X", $u), '.', ($n =~ /letter/ ? 'Y': 'N'), $char, $char, $char, '00', '00', '00', $n]; } } foreach my $char (sort keys %char_uni) { my $unichar = $char_uni{$char}; my $info = $ldata{$unichar}; next unless defined $info; my $inf = $chardata[hex $char]; my $t = uc($inf->[2]); if ($t eq 'L' || $t eq 'V') { $inf->[2] =~ tr/vV/lL/; $inf->[8] = $t eq 'V' ? '2A' : $chardata[hex $inf->[5]][7]; $inf->[9] = $t eq 'V' ? '00' : $chardata[hex $inf->[5]][7]; } else { $inf->[7] = '00'; } } #push @ord, '> ascii' unless $no_ascii || $base eq 'ascii'; print OUT "# Aspell Character Data File.\n"; foreach (@ord) {print OUT "$_\n";} print OUT "/\n"; print OUT "# <plain>\n"; print OUT "# <sl-first> <sl-rest>\n"; foreach my $i (0..255) { my @d = @{$chardata[$i]}; print OUT "@d[0..9] # $d[10]\n"; } my %map; my %rmap; my %nmap; foreach my $i (0..255) { my $char = $chardata[$i][0]; my $unichar = $chardata[$i][1]; next if 0xE000 <= hex($unichar) && hex($unichar) <= 0xE100; $map{$unichar} = $char; $rmap{$char} = [$unichar]; foreach my $d (@{$decomp{$unichar}}) { next if $d->[COMPAT]; $map{$d->[0]} = $char; push @{$rmap{$char}}, $d->[0]; } } while (my ($u, $dl) = each %decomp) { next if $uni_char{$u}; foreach my $d (@$dl) { my @d = split ' ', $d->[0]; my $dont_use; my $unsupported; local $_ = ''; foreach my $c (@d) { my $i = $uni_char{$c}; my $t = $chardata[hex $i][2] if defined $i; if (defined $i && $t ne '.') {$_ .= "$i ";} elsif ($unidata{$c} && $unidata{$c}[2] eq 'M') {$unsupported = 1;} else {$dont_use = 1;} } die if exists $map{$u}; next if $dont_use; next unless length $_ > 0; chop; if ($unsupported) {$_ .= " # unsup: $d->[0]"} elsif ($d->[COMPAT]) {$_ .= ' # compat'} $map{$u} = $_; push @{$rmap{$_}}, $u unless $unsupported || $d->[COMPAT]; last; } } my $use_comb = $scripts{Latn} || $scripts{Grek} || $scripts{Cyrl}; if ($use_comb) { foreach my $u (0x300..0x345) { my $uni = sprintf "%04X", $u; next unless defined $unidata{$uni}; next if $map{$uni}; $map{$uni} = "0 # unsup"; } } #$map{"FFFF FFFF FFFF"} = "FD"; my @INTERNAL; foreach (sort keys %map) { push @INTERNAL, "$_ > $map{$_}\n"; } my @STRICT; foreach (sort keys %map) { next if $map{$_} =~ /\#/; push @STRICT, "$_ > $map{$_}\n"; } my @NFD; foreach my $k (sort keys %rmap) { my $d = join ' ', map {cdecomp $_} split ' ', $rmap{$k}[-1]; my $d0 = $k !~ / / ? '' : join ' ', (map {cdecomp $_} map {$chardata[hex $_][1]} split ' ', $k); push @NFD, "$k > $d\n" unless $d0 eq $d; } my @NFC; foreach my $k (sort keys %rmap) { my $d0 = ''; my $d = $rmap{$k}[0]; while ($d0 ne $d) { $d0 = $d; $d = join ' ', map {$compexcl{$_} || $_} split ' ', $d0; } push @NFC, "$k > $d\n" unless $k eq $d; } my @COMP; foreach my $k (sort keys %rmap) { push @COMP, "$k > $rmap{$k}[0]\n" } print MAP "INTERNAL\n/\n"; print MAP print_map(@INTERNAL), "\n"; print MAP "\nSTRICT\n"; if ("@INTERNAL" ne "@STRICT") {print MAP "/\n", print_map(@STRICT), "\n"} else {print MAP "= INTERNAL\n.\n"} print MAP "\nNFD\n/\n"; print MAP print_map(@NFD), "\n"; print MAP "\nNFC\n"; if ("@NFC" ne "@NFD") {print MAP "/\n", print_map(@NFC), "\n"} else {print MAP "= NFD\n.\n"} print MAP "\nCOMP\n"; if ("@COMP" ne "@NFC") {print MAP "/\n", print_map(@COMP), "\n"} else {print MAP "= NFC\n.\n"} } sub codepoint($) { local ($_) = @_; my $c = -1; $c = ord $_ if length $_ == 1; $c = hex $2 if /^(U\+|0x)([A-F0-9]+)$/i; die if $c == -1; return sprintf("%04X", $c); } sub uni_char($$$) { my ($uni, $def, $what) = @_; my $chr = $uni_char{$uni}; return $chr if defined $chr; print STDERR "Warning U+$uni mot mapped. It is needed for the \"$what\" mapping of 0x$def.\n" if ($unidata{$uni} eq 'L' || $unidata{$uni} eq 'V') && ($unidata{$char_uni{$def}} eq 'L' || $unidata{$char_uni{$def}} eq 'V'); return $def; } sub print_map(@) { my (@map) = @_; my $i = 0; my $f; $f = sub { my $c = 0; my $l = ''; my $p = ''; while ($i < @map) { my ($k,$r) = $map[$i] =~ /^(.+) > (.+)\n$/ or die; $k =~ s/^$_[0]// || last if length $_[0] > 0; my @k = split / /, $k; $c++ if $k[0] ne $p; if (@k == 1) { $l .= "\n"; $l .= ' 'x(length $_[0]); $l .= "$k[0] > $r"; $i++; } else { if ($k[0] ne $p) { $l .= "\n"; $l .= ' 'x(length $_[0]); $l .= "$k[0] > -"; } $l .= " /\n"; $l .= &$f("$_[0]$k[0] ") } $p = $k[0]; } return join ('', ' 'x(length $_[0]), 'N ', $c, $l, "\n", ' 'x(length $_[0]), '.'); }; &$f(''); }