#!perl
#
# This auxiliary script makes locale .pl files and copes with Korean.pm
# used by Unicode::Collate::Locale.
#
# Usage:
#    <do 'mklocale'> in perl, or <perl mklocale> in command line
#
# Input files:
#    data/*.txt
#    Collate/CJK/Korean.pm
#    Collate/allkeys.txt
#
# Output files:
#    Locale/*.pl (need to be moved to Collate/Locale/*.pl to install them)
#    Korean.pm   (need to be moved to Collate/CJK/Korean.pm to install it)
#
# == Examples of the Rules for data/*.txt ==
#
# 00F1;n+1 ===> primary weight of 00F1 is greater than that of n by 1.
#    Among literals, only [A-Za-z] can be the base.
#    +1  primary weight greater by 1.
#    -1  primary weight lesser by 1.
#    ++1 and --1 for secondary weight, +++1 and ---1 for tertiary weight.
#    a number followed by + or - is decimal.
#
# 01FD;<00E6><0301> ===> U+01FD eq U+00E6,U+0301
#    <XXXX> can be the base followed by +1 etc.
#    Ex. 1D2D;<00C6>+++12
#
# 0064 0335;=
# 0111;d++1<0335>
#    '=' saves DUCET weights as it is.
#    0064 0335;= prevents 0064 0335 from being equal to 0111.
#
# {ch};c+1   ===> 0063 0068;c+1
# {K'};Q++1  ===> 004B 0027;Q++1
#     { } before ; encloses a literal: [A-Za-z'] (alphabets or apostrophe).
#
# {gh}0335;<{g}0127> ===> U+0067,U+0068,U+0335 eq U+0067,U+0127
# {dZ}030C;<{d}017D> ===> U+0064,U+005A,U+030C eq U+0064,U+017D
#     < > after ; can enclose a unit comprising plural XXXX or {literal}.
#
# backwards
#    backwards => 2
#
# upper
#    upper_before_lower => 1
#
# suppress
#    suppress 0400-0417 041A-0437 043A-045F
#        will be converted to
#    suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
#
# use
#
# overrideCJK and overrideHangul
#
use 5.006;
use strict;
use warnings;
use Carp;
use File::Spec;

BEGIN {
    unless ("A" eq pack('U', 0x41)) {
	die "Unicode::Collate cannot stringify a Unicode code point\n";
    }
}

sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g }
sub trim { $_[0] =~ s/^\ +//; $_[0] =~ s/\ +\z// }

sub ce {
    my $var = shift;
    my $vc = $var ? '*' : '.';
    my $hx = join '.', map { sprintf '%04X', $_ } @_;
    return "[$vc$hx]";
}

our $PACKAGE = 'Unicode::Collate, locale';
our $ENT_FMT = "%-9s ; %s # %s\n";
our $RE_CE   = '(?:\[[0-9A-Fa-f\.\*]+\])';


use constant SBase  => 0xAC00;
use constant SFinal => 0xD7A3;
use constant NCount =>    588;
use constant TCount =>     28;
use constant LBase  => 0x1100;
use constant VBase  => 0x1161;
use constant TBase  => 0x11A7;

sub decHangul {
    my $code = shift;
    my $si = $code - SBase;
    my $li = int( $si / NCount);
    my $vi = int(($si % NCount) / TCount);
    my $ti =      $si % TCount;
    return (
	LBase + $li,
	VBase + $vi,
	$ti ? (TBase + $ti) : (),
    );
}


my (%Keys, %Code, %Equiv, %Name, $vDUCET, %JamoWt1);
my $JamoWt2 = 0;

my @OtherEquiv = split /\n=/, <<'OTHEREQUIV';
=00C2= (a-circ)
1EA7;<00E2><0300>
1EA6;<00C2><0300>
1EA5;<00E2><0301>
1EA4;<00C2><0301>
1EAB;<00E2><0303>
1EAA;<00C2><0303>
1EA9;<00E2><0309>
1EA8;<00C2><0309>
1EAD;<00E2><0323>
1EAC;<00C2><0323>
=00C4= (a-uml)
01DF;<00E4><0304>
01DE;<00C4><0304>
=00C5= (a-ring)
01FB;<00E5><0301>
01FA;<00C5><0301>
=00C6= (ae-lig)
1D2D;<00C6>+++12
01FD;<00E6><0301>
01FC;<00C6><0301>
01E3;<00E6><0304>
01E2;<00C6><0304>
=00CA= (e-circ)
1EC1;<00EA><0300>
1EC0;<00CA><0300>
1EBF;<00EA><0301>
1EBE;<00CA><0301>
1EC5;<00EA><0303>
1EC4;<00CA><0303>
1EC3;<00EA><0309>
1EC2;<00CA><0309>
1EC7;<00EA><0323>
1EC6;<00CA><0323>
=00D4= (o-circ)
1ED3;<00F4><0300>
1ED2;<00D4><0300>
1ED1;<00F4><0301>
1ED0;<00D4><0301>
1ED7;<00F4><0303>
1ED6;<00D4><0303>
1ED5;<00F4><0309>
1ED4;<00D4><0309>
1ED9;<00F4><0323>
1ED8;<00D4><0323>
=00D5= (o-tilde)
1E4D;<00F5><0301>
1E4C;<00D5><0301>
022D;<00F5><0304>
022C;<00D5><0304>
1E4F;<00F5><0308>
1E4E;<00D5><0308>
1EE1;<00F5><031B>
1EE0;<00D5><031B>
=00D6= (o-uml)
022B;<00F6><0304>
022A;<00D6><0304>
=00D8= (o-slash)
01FF;<00F8><0301>
01FE;<00D8><0301>
=00DC= (u-uml)
01DC;<00FC><0300>
01DB;<00DC><0300>
01D8;<00FC><0301>
01D7;<00DC><0301>
01D6;<00FC><0304>
01D5;<00DC><0304>
01DA;<00FC><030C>
01D9;<00DC><030C>
=0102= (a-breve)
1EB1;<0103><0300>
1EB0;<0102><0300>
1EAF;<0103><0301>
1EAE;<0102><0301>
1EB5;<0103><0303>
1EB4;<0102><0303>
1EB3;<0103><0309>
1EB2;<0102><0309>
1EB7;<0103><0323>
1EB6;<0102><0323>
=01A0= (o-horn)
1EDD;<01A1><0300>
1EDC;<01A0><0300>
1EDB;<01A1><0301>
1EDA;<01A0><0301>
1EE1;<01A1><0303>
1EE0;<01A0><0303>
1EDF;<01A1><0309>
1EDE;<01A0><0309>
1EE3;<01A1><0323>
1EE2;<01A0><0323>
=01AF= (u-horn)
1EEB;<01B0><0300>
1EEA;<01AF><0300>
1EE9;<01B0><0301>
1EE8;<01AF><0301>
1EEF;<01B0><0303>
1EEE;<01AF><0303>
1EED;<01B0><0309>
1EEC;<01AF><0309>
1EF1;<01B0><0323>
1EF0;<01AF><0323>
=0301= (acute)
00E1;a<0301>
00C1;A<0301>
00E9;e<0301>
00C9;E<0301>
00ED;i<0301>
00CD;I<0301>
00F3;o<0301>
00D3;O<0301>
00FA;u<0301>
00DA;U<0301>
00FD;y<0301>
00DD;Y<0301>
=0303= (tilde)
00E3;a<0303>
00C3;A<0303>
1EBD;e<0303>
1EBC;E<0303>
0129;i<0303>
0128;I<0303>
00F5;o<0303>
00D5;O<0303>
0169;u<0303>
0168;U<0303>
1EF9;y<0303>
1EF8;Y<0303>
=0309= (hook-above)
1EA3;a<0309>
1EA2;A<0309>
1EBB;e<0309>
1EBA;E<0309>
1EC9;i<0309>
1EC8;I<0309>
1ECF;o<0309>
1ECE;O<0309>
1EE7;u<0309>
1EE6;U<0309>
1EF7;y<0309>
1EF6;Y<0309>
=0323= (dot-below)
1EA1;a<0323>
1EA0;A<0323>
1EB9;e<0323>
1EB8;E<0323>
1ECB;i<0323>
1ECA;I<0323>
1ECD;o<0323>
1ECC;O<0323>
1EE5;u<0323>
1EE4;U<0323>
1EF5;y<0323>
1EF4;Y<0323>
=0629= (arabic-teh-marbuta)
FE93;<0629>
FE94;<0629>
=1EB8= (e-dot-below)
1EC7;<1EB9><0302>
1EC6;<1EB8><0302>
=1ECC= (o-dot-below)
1ED9;<1ECD><0302>
1ED8;<1ECC><0302>
1EE3;<1ECD><031B>
1EE2;<1ECC><031B>
=1EE4= (u-dot-below)
1EF1;<1EE5><031B>
1EF0;<1EE4><031B>
OTHEREQUIV

my %OtherEquiv;
for my $o (@OtherEquiv) {
    my @ln = split /\n/, $o;
    my $uv = shift @ln;
    $uv =~ s/ *\([a-z-]+\) *//;
    $uv =~ tr/=//d;
    croak "$PACKAGE: $uv is invalid in OTHEREQUIV" if $uv !~ /^[0-9A-F]+\z/;
    $OtherEquiv{$uv} = \@ln;
}

##### read DUCET #####

{
    my($f, $fh);
    foreach my $d ('.') {
	$f = File::Spec->catfile($d, "Collate", "allkeys.txt");
	last if open($fh, $f);
	$f = undef;
    }
    croak "$PACKAGE: Collate/allkeys.txt is not found" if !defined $f;

    while (my $line = <$fh>) {
	chomp $line;
	next if $line =~ /^\s*#/;
	$vDUCET = $1 if $line =~ /^\@version\s*(\S*)/;

	next if $line !~ /^\s*[0-9A-Fa-f]/;

	my $name = '';
	$line =~ s/[#%]\s*(.*)// and $name = $1;

	# gets element
	my($e, $k) = split /;/, $line;
	trim($e);
	trim($k);
	$name =~ s/; QQ[A-Z]+//;

	next if $k eq '[.0000.0000.0000.0000]';

	croak "Wrong Entry: <charList> must be separated by ';' ".
	      "from <collElement>" if ! $k;
	push @{ $Equiv{$k} }, $e if exists $Code{$k};

	$Keys{$e} = $k;
	$Code{$k} = $e;
	$Name{$e} = $name;

	if ($e =~ /^11[0-9A-F]{2}\z/) { # Hangul Jamo (modern only)
	    my @ec = _getHexArray($e);
	    my @kc = _getHexArray($k);
	    if (@ec == 1) {
		$JamoWt1{$ec[0]} = $kc[0]; # each jamo
		$JamoWt2 = $kc[1] if $JamoWt2 < $kc[1]; # max
	    }
	}
    }
    close $fh;
}

##### Korean.pm #####

{
    my($f, $fh);
    foreach my $d ('.') {
	$f = File::Spec->catfile($d, "Collate", "CJK", "Korean.pm");
	last if open($fh, $f);
	$f = undef;
    }
    croak "$PACKAGE: Collate/CJK/Korean.pm is not found" if !defined $f;

    my %KO_jamo;
    my $KO_head = '';
    my $KO_foot = '';
    my $KO_state = 0;
    my $KO_data  = 0;
    while (my $line = <$fh>) {
	chomp $line;
	if ($KO_state == 0) {
	    $KO_head .= "$line\n";
	}
	if ($KO_state == 2) {
	    $KO_foot .= "$line\n";
	}
	$KO_state = 1 if $line =~ /^my %jamo2prim/;
	$KO_state = 2 if $line =~ /^\); # for DUCET/;

	$KO_data = 1 if $line =~ /^__DATA__/;
	$KO_data = 0 if $line =~ /^__END__/;

	if ($KO_data) {
	    next if $line !~ /^[A-D]/;
	    my @u = _getHexArray($line);
	    croak "unexpected $line" if @u != 1;
	    my $uv = $u[0];
	    croak "unexpected $line" unless SBase <= $uv && $uv <= SFinal;
	    $KO_jamo{$_} = 1 for decHangul($uv);
	}
    }
    close $fh;

    open my $pm, ">Korean.pm" or die 'Korean.pm';
    print $pm $KO_head;

    my $count = 0;
    for (sort keys %KO_jamo) {
	print $pm ' ' if ($count % 4) == 0;
	++$count;
	print $pm '   ';
	print $pm sprintf '0x%04X, 0x%04X,', $_, $JamoWt1{$_};
	print $pm "\n" if ($count % 4) == 0;
    }
    print $pm "\n" if $count % 4 != 0;

    my $v = $vDUCET ? " # for DUCET v$vDUCET" : '';
    print $pm ");$v\n";

    $JamoWt2 = sprintf "0x%02X", $JamoWt2;
    $KO_foot =~ s/(\$wt = )[0-9x]+/$1$JamoWt2/;
    print $pm $KO_foot;
    close $pm;
}

##### Locale/*.pl #####

opendir DIR, "data" or croak "no data";
my @txts = grep !/^\./, readdir DIR;
closedir DIR;

mkdir 'Locale', 0666;
for my $txt (@txts) {
    my($fh, $ph);
    my %locale_keys;
    my $txtfile = File::Spec->catfile('data', $txt);
    my $pl = $txt;
       $pl =~ s/\.txt\z/.pl/ or croak "$PACKAGE: data/$txt is not .txt";
    my $plfile = File::Spec->catfile('Locale', $pl);

    open($fh, $txtfile) or croak "$PACKAGE: data/$txt is not found";
    open($ph, ">$plfile") or croak "$PACKAGE: locale/$pl can't be made";

    my $ptxt  = '';
    my $entry = '';
    while (<$fh>) {
	chomp;
	if (/^use/) {
	    print $ph "$_;\n";
	    next;
	}
	if (/^backwards$/) {
	    $ptxt .= "   backwards => 2,\n";
	    next;
	}
	if (s/^override(CJK|Hangul)[ \t]+//) {
	    $ptxt .= "   override$1 => $_,\n";
	    next;
	}
	if (/^upper$/) {
	    $ptxt .= "   upper_before_lower => 1,\n";
	    next;
	}
	if (s/^suppress//) {
	    my @c = split;
	    for (@c) {
		s/(?:0[xX])?([0-9A-Fa-f]+)/0x$1/g;
		s/-/../g;
	    }
	    my $list = join ", ", @c;
	    $ptxt .= "   suppress => [$list],\n";
	    next;
	}
	if (/^\s*(#\s*)/) {
	    $ptxt .= "$_\n" if $1 ne '#';
	    next;
	}
	next if /^\s*\z/;

	my($e,$rule) = split_e_rule($_);
	my $name = getname($e);
	my $eq_rule = $rule eq '=';
	$rule = join '', map "<$_>", split ' ', $e if $eq_rule;
	my ($newce, $simpdec) = parserule($e, $rule, \%locale_keys);

	if (!$locale_keys{$e}) {
	    $entry .= sprintf $ENT_FMT, $e, $newce, $name if !$eq_rule;
	    $locale_keys{$e} = $newce;
	}

	if (!$simpdec && $Keys{$e}) { # duplicate for the decomposition
	    my $key = $Keys{$e};
	    my @ce = $key =~ /$RE_CE/go;
	    if (@ce > 1) {
		my $ok = 1;
		my $ee = '';
		for my $c (@ce) {
		    $ok = 0, last if !$Code{$c};
		    $ee .= ' ' if $ee ne '';
		    $ee .= $Code{$c};
		}
		if ($ok && !$locale_keys{$ee}) {
		    $entry .= sprintf $ENT_FMT, $ee, $newce, $name;
		    $locale_keys{$ee} = $newce;
		}
		if ($ee =~ s/ 030([01])/ 034$1/ &&
		    $ok && !$locale_keys{$ee}) {
		    $entry .= sprintf $ENT_FMT, $ee, $newce, $name;
		    $locale_keys{$ee} = $newce;
		}
	    }
	    if ($Equiv{$key}) {
		for my $eq (@{ $Equiv{$key} }) {
		    next if $locale_keys{$eq};
		    $entry .= sprintf $ENT_FMT, $eq, $newce, $Name{$eq};
		    $locale_keys{$eq} = $newce;
		}
	    }
	}

	if ($OtherEquiv{$e}) {
	    for my $o (@{ $OtherEquiv{$e} }) {
		my($e,$rule) = split_e_rule($o);
		my $name = getname($e);
		(my $newce, undef) = parserule($e, $rule, \%locale_keys);
		if (!$locale_keys{$e}) {
		    $entry .= sprintf $ENT_FMT, $e, $newce, $name;
		    $locale_keys{$e} = $newce;
		}
	    }
	}
    }
    if ($entry) {
	my $v = $vDUCET ? " # for DUCET v$vDUCET" : '';
	$ptxt .= "   entry => <<'ENTRY',$v\n";
	$ptxt .= $entry;
	$ptxt .= "ENTRY\n";
    }

    print $ph "+{\n$ptxt};\n";
    close $fh;
    close $ph;
}

sub getunicode {
    my $c = shift;
    my @c = split //, $c;
    return join ' ', map { sprintf '%04X', unpack 'U', $_ } @c;
}

sub parse_element {
    my $e = shift;
    $e =~ s/\{([A-Za-z']+)\}/' '.getunicode($1).' '/ge;
    $e =~ s/ +/ /g;
    trim($e);
    return $e;
}

sub split_e_rule {
    my $line = shift;
    my($e, $r) = split /;/, $line;
    return (parse_element($e), $r);
}

sub getname {
    my $e = shift;
    return $Name{$e} if $Name{$e};  # single collation element (without <>)
    my @e = split ' ', $e;
    my @name = map { $Name{$_} ? $Name{$_} : 'unknown' } @e;
    return sprintf '<%s>', join ', ', @name;
}

sub parserule {
    (my $e   = shift) =~ s/ .*\z//;
    my $rule = shift;
    my $lockeys = shift;
    my $result = '';
    my $simple_decomp = 1; # rules containing only [A-Za-z] or <XXXX>

    for (my $prerule = $rule; $rule ne ''; $prerule = $rule) {
	$rule =~ s/^ +//;
	last if $rule =~ /^#/;
	if ($rule =~ s/^($RE_CE)//o) {
	    my $k = $1;
	    my $var = $k =~ /^\[\*/ ? 1 : 0;
	    my @c = _getHexArray($k);
	    $result .= ce($var, @c);
	    next;
	}

	my $key;
	if ($rule =~ s/^(<[0-9A-Za-z'{ }]+>|[A-Za-z])//) {
	    my $e = $1;
	    my $c = $e =~ tr/<>//d ? parse_element($e) : getunicode($e);
	    croak "<$c> is too short" if 4 > length $c;
	    $key = $lockeys->{$c} || $Keys{$c};
	}

	my @base;
	for my $k ($key =~ /$RE_CE/go) {
	    my $var = $k =~ /^\[\*/ ? 1 : 0;
	    push @base, [$var, _getHexArray($k)];
	}
	croak "the rule seems wrong at $prerule" if !@base;

	my $replaced = 0;
	while ($rule =~ s/^(([+-])\2*)(\d+)//) {
	    my $idx = length($1);
	    my $num = $2 eq '-' ? -$3 : $3;
	    $base[0][$idx] += $num;
	    ++$replaced;
	}

	$simple_decomp = 0 if $replaced;
	for my $c (@base) {
	    $c->[4] = hex $e if $replaced;
	    $result .= ce(@$c);
	}
	croak "something wrong at $rule" if $prerule eq $rule;
    }
    return($result, $simple_decomp);
}
