#!/usr/bin/perl -w
# Copyright (c) 2010-2011 Sullivan Beck.  All rights reserved.
# This program is free software; you can redistribute it and/or modify it
# under the same terms as Perl itself.

###############################################################################
###############################################################################
# This script is used to automatically generate the Locale::Codes module
# which contain the actual codes.

require 5.000000;
use YAML;
use IO::File;
use strict;
use warnings;
use Archive::Zip;
use Encode;
use Text::CSV::Slurp;

use lib "./internal";

our $VERSION;
$VERSION='3.16';

our $DEBUG;
$DEBUG = 0;

# so the CPAN indexer won't treat this as a POD file
our $podstr = '=pod';
our $hdstr  = '=head1';

###############################################################################
# GLOBAL VARIABLES
###############################################################################

# We need to create the following variables:
#
#  %Country{COUNTRY_ID}           => [ COUNTRY, COUNTRY, ... ]
#                                    A list of all valid country names that
#                                    correspond to a given COUNTRY_ID.
#                                    The names are all real (i.e. correct
#                                    spelling and capitalization).
#  %CountryAlias{ALIAS}           => [ COUNTRY_ID, I ]
#                                    A hash of all aliases for a country.
#                                    Aliases are all lowercase.
#  %Code2CountryID{CODESET}{CODE} => [ COUNTRY_ID, I ]
#                                    In a given CODESET, CODE corresponds to
#                                    the I'th entry list of countries.
#  %CountryID2Code{CODESET}{COUNTRY_ID} => CODE
#                                    In the given CODESET, the COUNTRY_ID
#                                    corresponds to the given CODE.

our($CountryID,%Country,%CountryAlias,%Code2CountryID,%CountryID2Code);
our(%country_alias);

$CountryID = "0001";

our($ModDir,$CountryModule);

$ModDir        = "lib/Locale/Codes";
$CountryModule = "Country";

#
# We'll first read data from the official ISO 3166.
#
# Data available consists only of the country names and 2-character
# codes. Country names include non-ASCII characters encoded in
# ISO-8859-1. Also, they're all uppercase! Every line in the file ends
# with one unprintable character. In other words, they're distributed
# in the most unfriendly fashion you could ask for! We'll store the
# first country for error checking.
#

our($country_iso_url,$country_iso_file,$country_iso_1st);
our(%country_iso_orig);

$country_iso_url    = "http://www.iso.org/iso/list-en1-semic-3.txt";
$country_iso_1st    = "AFGHANISTAN";
($country_iso_file) = $country_iso_url =~ m,/([^/]*)$,;

#
# The UN Stats Division contains some (but not all) of the ISO 3166
# 3-character codes and 3-digit codes. Since they are the maintainers
# of this data, this is an official source.
#

our($country_un_url,$country_un_file);
our(%country_un_orig);

$country_un_url    = "http://unstats.un.org/unsd/methods/m49/m49alpha.htm";
($country_un_file) = $country_un_url =~ m,/([^/]*)$,;

#
# The National Geospatial-Intelligence Agency is the official source
# for FIPS 10 codes.
#

our($country_nga_url,$country_nga_file);
our(%country_nga_orig,%country_nga_ignore);

$country_nga_url    = "http://earth-info.nga.mil/gns/html/digraphs.htm";
($country_nga_file) = $country_nga_url =~ m,/([^/]*)$,;

#
# IANA domains
#

our($country_iana_url,$country_iana_file);
our(%country_iana_orig);

$country_iana_url    = "http://www.iana.org/domains/root/db/";
($country_iana_file) = "index.html";

#
# This is the CIA World Factbook, which is assumed to be a reliable
# source of this data. Due to the complexity of the data, we'll store
# the last country so we know when to stop.
#
# We have to force-override some codes.
#

our($country_cia_url,$country_cia_file,$country_cia_last);
our(%country_cia_ignore,%country_cia_orig,%country_cia_codes);

$country_cia_url = "https://www.cia.gov/library/publications/the-world-factbook/appendix/print_appendix-d.html";
($country_cia_file) = $country_cia_url =~ m,/([^/]*)$,;
$country_cia_last = "Zimbabwe";

require "data.country.pl";

########################################

# We need to create the following variables:
#
#  %Language{LANGUAGE_ID}           => [ LANGUAGE, LANGUAGE, ... ]
#                                     A list of all valid language names that
#                                     correspond to a given LANGUAGE_ID.
#                                     The names are all real (i.e. correct
#                                     spelling and capitalization).
#  %LanguageAlias{ALIAS}           => [ LANGUAGE_ID, I ]
#                                     A hash of all aliases for a language.
#                                     Aliases are all lowercase.
#  %Code2LanguageID{CODESET}{CODE} => [ LANGUAGE_ID, I ]
#                                     In a given CODESET, CODE corresponds to
#                                     the I'th entry in the list of languages.
#  %LanguageID2Code{CODESET}{LANGUAGE_ID} => CODE
#                                     In the given CODESET, the LANGUAGE_ID
#                                     corresponds to the given CODE.

our($LanguageID,%Language,%LanguageAlias,%Code2LanguageID,%LanguageID2Code);
our(%language_alias);

$LanguageID = "0001";

our($LanguageModule);

$LanguageModule = "Language";

#
# We'll first read data from the official ISO 639.
#
# Data available consists of the language names and 2-letter and
# 3-letter codes. Language names include non-ASCII characters encoded in
# UTF-8. And (amazingly enough) it's available in a field delimited file!!!
#

our($language_iso_url,$language_iso_file);
our(%language_iso_orig);

$language_iso_url    = "http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt";
($language_iso_file) = $language_iso_url =~ m,/([^/]*)$,;

require "data.language.pl";

########################################

# We need to create the following variables:
#
#  %Currency{CURRENCY_ID}           => [ CURRENCY, CURRENCY, ... ]
#                                      A list of all valid currency names that
#                                      correspond to a given CURRENCY_ID.
#                                      The names are all real (i.e. correct
#                                      spelling and capitalization).
#  %CurrencyAlias{ALIAS}            => [ CURRENCY_ID, I ]
#                                      A hash of all aliases for a currency.
#                                      Aliases are all lowercase.
#  %Code2CurrencyID{CODESET}{CODE}  => [ CURRENCY_ID, I ]
#                                      In a given CODESET, CODE corresponds to
#                                      the I'th entry in the list of currencies.
#  %CurrencyID2Code{CODESET}{CURRENCY_ID} => CODE
#                                      In the given CODESET, the CURRENCY_ID
#                                      corresponds to the given CODE.

our($CurrencyID,%Currency,%CurrencyAlias,%Code2CurrencyID,%CurrencyID2Code);
our(%currency_alias);

$CurrencyID = "0001";

our($CurrencyModule);

$CurrencyModule = "Currency";

#
# We'll first read data from the official ISO 4217.
#

our($currency_iso_url,$currency_iso_file,$currency_csv_file);
our(%currency_iso_orig,%currency_iso_ignore);

$currency_iso_url    = "http://www.currency-iso.org/dl_iso_table_a1.xls";
($currency_iso_file) = $currency_iso_url =~ m,/([^/]*)$,;
$currency_csv_file   = "currency.csv";

require "data.currency.pl";

########################################

# We need to create the following variables:
#
#  %Script{SCRIPT_ID}              => [ SCRIPT, SCRIPT, ... ]
#                                     A list of all valid script names that
#                                     correspond to a given SCRIPT_ID.
#                                     The names are all real (i.e. correct
#                                     spelling and capitalization).
#  %ScriptAlias{ALIAS}             => [ SCRIPT_ID, I ]
#                                     A hash of all aliases for a script.
#                                     Aliases are all lowercase.
#  %Code2ScriptID{CODESET}{CODE}   => [ SCRIPT_ID, I ]
#                                     In a given CODESET, CODE corresponds to
#                                     the I'th entry in the list of scripts.
#  %ScriptID2Code{CODESET}{SCRIPT_ID} => CODE
#                                     In the given CODESET, the SCRIPT_ID
#                                     corresponds to the given CODE.

our($ScriptID,%Script,%ScriptAlias,%Code2ScriptID,%ScriptID2Code);
our(%script_alias);

$ScriptID = "0001";

our($ScriptModule);

$ScriptModule = "Script";

#
# We'll first read data from the official ISO 15924.
#
# Data available consists of the script names and 2-letter and
# 3-letter codes. Script names include non-ASCII characters encoded in
# UTF-8. And (amazingly enough) it's available in a field delimited file!!!
#

our($script_iso_url,$script_iso_file,$script_iso_tmp);
our(%script_iso_orig,%script_iso_ignore);

$script_iso_url    = "http://www.unicode.org/iso15924/iso15924.txt.zip";
($script_iso_file) = $script_iso_url =~ m,/([^/]*)$,;
$script_iso_tmp    = "iso15924.txt";

require "data.script.pl";

###############################################################################
# HELP
###############################################################################

our($usage);
my $COM = $0;
$COM =~ s/^.*\///;

$usage=
  "usage: $COM OPTIONS
      -h/--help       : Print help.

      -a/--all        : Do all steps

      -c/--country    : Get the country codes
      -l/--language   : Get the language codes
      -r/--currency   : Get the currency codes
      -s/--script     : Get the script codes
      -C/--clean      : Clean up all temporary files
";

###############################################################################
# PARSE ARGUMENTS
###############################################################################

my $do_all      = 0;
my $do_country  = 0;
my $do_language = 0;
my $do_currency = 0;
my $do_script   = 0;
my $do_clean    = 0;

while ($_ = shift) {

   (print $usage),   exit  if ($_ eq "-h"   ||  $_ eq "--help");

   $do_all = 1,      next  if ($_ eq "-a"   ||  $_ eq "--all");

   $do_country = 1,  next  if ($_ eq "-c"   ||  $_ eq "--country");
   $do_language = 1, next  if ($_ eq "-l"   ||  $_ eq "--language");
   $do_currency = 1, next  if ($_ eq "-r"   ||  $_ eq "--currency");
   $do_script = 1,   next  if ($_ eq "-s"   ||  $_ eq "--script");
   $do_clean = 1,    next  if ($_ eq "-C"   ||  $_ eq "--clean");
}

############################################################################
# MAIN PROGRAM
############################################################################

do_country()    if ($do_all  ||  $do_country);
do_language()   if ($do_all  ||  $do_language);
do_currency()   if ($do_all  ||  $do_currency);
do_script()     if ($do_all  ||  $do_script);
do_clean()      if ($do_all  ||  $do_clean);

############################################################################
# DO_COUNTRY
############################################################################

sub do_country {
   print "Country codes...\n";

   do_country_iso();
   print_table("country")  if ($DEBUG == 2);

   do_country_un();
   print_table("country")  if ($DEBUG == 2);

   do_country_nga();
   print_table("country")  if ($DEBUG == 2);

   do_country_iana();
   print_table("country")  if ($DEBUG == 2);

   do_country_cia();
   print_table("country")  if ($DEBUG == 2);

   # Go through all aliases to pick up any that haven't already been
   # added (since some aliases are for human convenience rather than
   # dealing with variations between codesets).
   do_aliases("country");
   print_table("country")  if ($DEBUG);

   write_module("country");
}

########################################
sub do_country_iso {

   ###
   ### The first set we'll do is the ISO 3166-1 2-character
   ### codes. These country names must be adjusted (since they're all
   ### uppercase). Also, the lines all end with some strange
   ### unprintable character.
   ###

   my $codeset = "alpha2";

   system("wget -N -q $country_iso_url");
   my @in = `cat $country_iso_file`;
   chomp(@in);
   chop(@in);

   my $in = join("\n",@in);
   $in = encode('UTF-8',decode('ISO-8859-1',$in));
   @in = split("\n",$in);

   # File is a line of text followed by a blank line followed by the
   # codes as ORIGNAME;CODE .

   if ($in[1]  ||  $in[2] !~ /^$country_iso_1st;/) {
      die "ERROR [iso]: country code file format changed!\n";
   }

   shift(@in);
   shift(@in);

   foreach my $line (@in) {
      next  if (! $line);
      if ($line !~ /^(.*);(.*)$/) {
         die "ERROR [iso]: line invalid\n" .
             "             $line\n";
      }
      my($country,$code) = ($1,$2);
      $code = lc($code);
      if (exists $country_iso_orig{$country}) {
         $country = $country_iso_orig{$country};
      } else {
         print "WARNING [iso]: unknown country: $country\n";
         next;
      }

      my $countryID = $CountryID++;

      $Country{$countryID}                  = [ $country ];
      $CountryAlias{lc($country)}           = [ $countryID, 0 ];
      $Code2CountryID{$codeset}{$code}      = [ $countryID, 0 ];
      $CountryID2Code{$codeset}{$countryID} = $code;
   }
}

########################################
sub do_country_un {

   print "\nINFO [un]: expect Channel Islands\n";

   ###
   ### The UN data contains most of the alpha-3 and numeric code sets.
   ###

   system("wget -N -q $country_un_url");
   my @in = `cat $country_un_file`;
   chomp(@in);
   my $in = join("",@in);
   $in = encode('UTF-8',decode('ISO-8859-1',$in));

   # Clean up some things that could cause problems in parsing:

   strip_tags(\$in,1,"br","p","strong","div");
   $in =~ s,\&nbsp;, ,g;
   $in =~ s,\s+, ,g;

   # Look for a table who's first row has the header:
   #    ISO ALPHA-3 code

   my $found = jump_to_row(\$in,"ISO ALPHA-3 code");
   if (! $found) {
      die "ERROR [un]: country code file format changed!\n";
   }

   while (1) {
      my @row = get_row("un",\$in);
      last  if (! @row);

      my($num,$country,$alpha3) = @row;
      $alpha3 = lc($alpha3);

      if ($num) {
         if ($num !~ /^\d+$/  ||  length($num) > 3) {
           print "WARNING [un]: Invalid numeric code: $country => $num\n";
           next;
        }
      }

      if ($alpha3  &&  $alpha3 !~ /^[a-z][a-z][a-z]$/) {
         print "WARNING [un]: Invalid alpha-3 code: $country => $alpha3\n";
         next;
      }

      if (exists $country_un_orig{$country}) {
         $country = $country_un_orig{$country};
      }

      my($err,$countryID,$i,$t) = get_countryID("un",$country);
      next  if ($err);

      if ($t eq "new") {
         print "INFO [un]: new country: $alpha3/$num\t$country\n";
      }

      if ($num) {
         $num = "0$num"  while (length($num) < 3);
         $Code2CountryID{"num"}{$num} = [ $countryID, $i ];
         $CountryID2Code{"num"}{$countryID} = $num;
      }

      if ($alpha3) {
         $Code2CountryID{"alpha3"}{$alpha3} = [ $countryID, $i ];
         $CountryID2Code{"alpha3"}{$countryID} = $alpha3;
      }
   }
}

########################################
sub do_country_nga {

   print "\nINFO [nga]: expect Serbia, Ashmore, Baker, Bassas, Clipperton,\n" .
           "            Coral, Europa, Gaza, Glorioso, Howland, Jan Mayen,\n" .
           "            Jarvis, Johnston, Juan, Kingman, Midway, Navassa, Netherlands,\n" .
           "            Palmyra, Paracel, Spratly, Svalbard, Tromelin, Wake, West\n";

   ###
   ### The NGA data contains the FIPS 10 codes
   ###

   system("wget -N -q $country_nga_url");
   my @in = `cat $country_nga_file`;
   chomp(@in);
   my $in = join("",@in);

   # Clean up some things that could cause problems in parsing:

   strip_tags(\$in,1,"br","p","strong","div");
   $in =~ s,\Q[United States}\E,,;       # A horrible typo in the HTML
   $in =~ s,\&nbsp;, ,g;
   $in =~ s,Other:, ,g;
   $in =~ s,\(see note[^\)]*\), ,g;
   $in =~ s,\[[^\]]*\], ,g;
   $in =~ s,\s+, ,g;

   # Look for a table who's first row has the header:
   #    SHORT FORM NAME
   # and then a table with the header:
   #    Short Form Name

   foreach my $table ("SHORT FORM NAME","Short Form Name") {
      my $found = jump_to_row(\$in,$table);
      if (! $found) {
         die "ERROR [nga]: country code file format changed!\n";
      }

      while (1) {
         my @row = get_row("nga",\$in);
         last  if (! @row);

         my($short,$long,$code) = @row;
         $code  = uc($code);
         next  if ($code eq "N/A");
         next  if (exists $country_nga_ignore{$short});

         if ($code !~ /^[A-Z][A-Z]$/) {
            print "WARNING [nga]: Invalid code: $short => $code\n";
         }

         if ($short eq "None") {
            print "ERROR [nga]: no short definition: $code  $long\n";
            next;
         }
         $long  = ""  if ($long  eq "None");

         if (exists $country_nga_orig{$short}) {
            $short = $country_nga_orig{$short};
         }
         if (exists $country_nga_orig{$long}) {
            $long = $country_nga_orig{$long};
         }

         my($countryID,$i);
         my($err,$c,$ii,$t) = get_countryID("nga",$short,1);
         next  if ($err);

         if ($t ne "new") {
            ($countryID,$i) = ($c,$ii);
         } else {
            ($err,$c,$ii,$t) = get_countryID("nga",$long,1);
            ($countryID,$i) = ($c,-1);
            if ($t eq "new") {
               print "INFO [nga]: new country: $code\t$short\n";
            }
         }

         if ($countryID == -1) {
            # New country
            $countryID = $CountryID++;
            $i = 0;
            $Country{$countryID} = [ $short ];
            $CountryAlias{lc($short)} = [ $countryID, $i ];

         } elsif ($i == -1) {
            # $short is a new alias
            push @{ $Country{$countryID} },$short;
            $i = $#{ $Country{$countryID} };
            $CountryAlias{lc($short)} = [ $countryID, $i ];
         }

         if ($long  &&
             ! exists $CountryAlias{lc($long)}) {
            # $long is a new alias
            push @{ $Country{$countryID} },$long;
            $i = $#{ $Country{$countryID} };
            $CountryAlias{lc($long)} = [ $countryID, $i ];
         }

         $Code2CountryID{"fips"}{$code} = [ $countryID, $i ];
         $CountryID2Code{"fips"}{$countryID} = $code;
      }
   }
}

########################################
sub do_country_iana {

   print "\nINFO [iana]: expect Acension, European, Soviet, Portuguese\n";

   ###
   ### The IANA data contains the domain names
   ###

   system("wget -N -q $country_iana_url");
   my @in = `cat $country_iana_file`;
   chomp(@in);
   my $in = join("",@in);

   # Clean up some things that could cause problems in parsing:

   strip_tags(\$in,1,"a");
   $in =~ s,\&nbsp;, ,g;
   $in =~ s,\(being phased out\), ,g;
   $in =~ s,\s+, ,g;

   # Look for a table who's first row has the header:
   #    Sponsoring Organisation

   my $found = jump_to_row(\$in,"Sponsoring Organisation");
   if (! $found) {
      die "ERROR [iana]: country code file format changed!\n";
   }

   while (1) {
      my @row = get_row("iana",\$in);
      last  if (! @row);

      my($dom,$type,$country) = @row;
      next  unless ($type eq "country-code");

      # *** FIX ***
      # For now, we'll remove foreign character domains.
      next  unless ($dom =~ /^\.[A-Z][A-Z]/);

      $dom     =~ s/^\.//;
      $country =~ s,<br.*,,;
      $country =~ s,\s+$,,;

      if (exists $country_iana_orig{$country}) {
         $country = $country_iana_orig{$country};
      }

      my($err,$countryID,$i,$t) = get_countryID("iana",$country);
      next  if ($err);

      if ($t eq "new") {
         print "INFO [iana]: new country: $dom\t$country\n";
      }

      $Code2CountryID{"dom"}{$dom} = [ $countryID, $i ];
      $CountryID2Code{"dom"}{$countryID} = $dom;
   }
}

########################################
sub do_country_cia {

   print "\nINFO [cia]: expect Antarctica, Bouvet, British, Christmas, Cocos,\n" .
         "            Curacao, France (Metro), French, Heard, Kosovo, Montenegro,\n" .
         "            Netherlands, Saint Barth, Saint Martin, Serbia, Taiwan, US\n";

   ###
   ### The CIA data is used to check:
   ###    alpha-2, alpha-3, numeric, fips 10
   ###

   system("wget -N -q $country_cia_url");
   my @in = `cat $country_cia_file`;
   chomp(@in);
   my $in = join("",@in);
   $in =~ s/\015/ /sg;

   # Clean up some things that could cause problems in parsing:

   strip_tags(\$in,1,"br","p","strong","div","a","b","img");
   $in =~ s,\&nbsp;, ,g;
   $in =~ s,\s+, ,g;

   # Move to a table with "Entity" as one of the entries. This
   # table is followed by the entires.

   my $found = jump_to_entry(\$in,"Entity");
   if (! $found) {
      die "ERROR [cia]: country code file format changed!\n";
   }

   # Each entry is quite complicated. Each is a single table (with a
   # table nested in it) of the form:
   #
   #   <table>
   #      <tr>
   #         <td>COUNTRY</td>
   #         <td>FIPS</td>
   #         <td>
   #            <table>
   #               <tr>
   #                  <td>ALPHA-2</td>
   #                  <td>ALPHA-3</td>
   #                  <td>NUMERIC</td>
   #            </table>
   #         </td>
   #         ...
   #     </tr>
   #     ...
   #  </table>
   #
   # After all of the "A" entries, a special table containing the headers
   # is given again.

   while (1) {

      #
      # Read the next entry
      #

      my($country,$fips,$alpha2,$alpha3,$num,$stanag,$dom);

      my $found   = jump_to_table(\$in);
      if (! $found) {
         print "ERROR [cia]: malformed file\n";
         last;
      }
      $country = get_entry(\$in);
      if (! $country) {
         # The first entry is empty if it's at the end of the entries
         # for a given letter.
         $found = jump_to_entry(\$in,"Entity");
         if (! $found) {
            die "ERROR [cia]: country code file format changed!\n";
         }
         next;
      }

      if (exists $country_cia_orig{$country}) {
         $country = $country_cia_orig{$country};
      }

      $fips    = uc(get_entry(\$in));
      $found   = jump_to_table(\$in);
      if (! $found) {
         print "ERROR [cia]: malformed file\n";
         last;
      }
      $alpha2  = lc(get_entry(\$in));
      $alpha3  = lc(get_entry(\$in));
      $num     = get_entry(\$in);
      $stanag  = get_entry(\$in);
      $dom     = uc(get_entry(\$in));
      $dom     =~ s/^\.//;

      next  if (exists $country_cia_ignore{$country});

      $alpha2 = $country_cia_codes{"alpha2"}{$country}
        if (exists $country_cia_codes{"alpha2"}{$country});
      $alpha3 = $country_cia_codes{"alpha3"}{$country}
        if (exists $country_cia_codes{"alpha3"}{$country});
      $num = $country_cia_codes{"num"}{$country}
        if (exists $country_cia_codes{"num"}{$country});
      $fips = $country_cia_codes{"fips"}{$country}
        if (exists $country_cia_codes{"fips"}{$country});
      $dom  = $country_cia_codes{"dom"}{$country}
        if (exists $country_cia_codes{"dom"}{$country});

      #
      # Get the countryID if the country or ANY of the codes match.
      #

      my($err,$countryID,$i,$type) = get_countryID("cia",$country,1);
      next  if ($err);

      if ($countryID == -1  &&  $alpha2 ne "-") {
         if (exists $Code2CountryID{"alpha2"}{$alpha2}) {
            ($countryID,$i) = @{ $Code2CountryID{"alpha2"}{$alpha2} };
         }
      }

      if ($countryID == -1  &&  $alpha3 ne "-") {
         if (exists $Code2CountryID{"alpha3"}{$alpha3}) {
            ($countryID,$i) = @{ $Code2CountryID{"alpha3"}{$alpha3} };
         }
      }

      if ($countryID == -1  &&  $num ne "-") {
         if (exists $Code2CountryID{"num"}{$num}) {
            ($countryID,$i) = @{ $Code2CountryID{"num"}{$num} };
         }
      }

      if ($countryID == -1  &&  $fips ne "-") {
         if (exists $Code2CountryID{"fips"}{$fips}) {
            ($countryID,$i) = @{ $Code2CountryID{"fips"}{$fips} };
         }
      }

      if ($countryID == -1  &&  $dom ne "-") {
         if (exists $Code2CountryID{"dom"}{$dom}) {
            ($countryID,$i) = @{ $Code2CountryID{"dom"}{$dom} };
         }
      }

      if ($countryID == -1) {
         $countryID = $CountryID++;
         $i = 0;
         $Country{$countryID} = [ $country ];
         $CountryAlias{lc($country)} = [ $countryID, $i ];
      }

      #
      # Now check that any previously defined values match the
      # CIA data.
      #

      if ($alpha2 ne "-") {
         my $err = check_code("cia","alpha2",$alpha2,$country,$countryID);
         next  if ($err);
      }

      if ($alpha3 ne "-") {
         my $err = check_code("cia","alpha3",$alpha3,$country,$countryID);
         next  if ($err);
      }

      if ($num ne "-") {
         my $err = check_code("cia","num",$num,$country,$countryID);
         next  if ($err);
      }

      if ($fips ne "-") {
         my $err = check_code("cia","fips",$fips,$country,$countryID);
         next  if ($err);
      }

      if ($dom ne "-") {
         my $err = check_code("cia","dom",$dom,$country,$countryID);
         next  if ($err);
      }

      last  if ($country eq $country_cia_last);
   }
}

########################################
sub check_code {
   my($type,$codeset,$code,$country,$countryID) = @_;

   # Check to make sure that the code is defined.

   if (exists $Code2CountryID{$codeset}{$code}) {
      return _check_code_exists($type,$codeset,$code,$country,$countryID);
   } else {
      return _check_code_new($type,$codeset,$code,$country,$countryID);
   }
}

sub _check_code_exists {
   my($type,$codeset,$code,$country,$countryID) = @_;

   # Check the countryID for the code. It must be the same as the one
   # passed in.

   my $old_countryID = $Code2CountryID{$codeset}{$code}[0];
   if ($countryID != $old_countryID) {
      print "ERROR [$type]: countryID mismatch in code: [$codeset, $country, $code, $countryID != $old_countryID ]\n";
      return 1;
   }

   # If the country is defined, it must be the same CountryID. If it
   # is not, create a new alias.

   if (exists $CountryAlias{lc($country)}) {

      my $alt_countryID = $CountryAlias{lc($country)}[0];

      if ($countryID != $alt_countryID) {
         print "ERROR [$type]: countryID mismatch in country: [$codeset, $country, $code, $countryID != $alt_countryID ]\n";
         return 1;
      }

   } else {
      push @{ $Country{$countryID} },$country;
      my $i = $#{ $Country{$countryID} };
      $CountryAlias{lc($country)} = [ $countryID, $i ];
   }
}

# This is a new code.
sub _check_code_new {
   my($type,$codeset,$code,$country,$countryID) = @_;

   print "INFO [$type]: New code: $codeset [$code] => $country\n";

   # If this country name isn't defined, create it.

   my $i;
   if (exists $CountryAlias{lc($country)}) {
      $i = $CountryAlias{lc($country)}[1];
   } else {
      push @{ $Country{$countryID} },$country;
      $i = $#{ $Country{$countryID} };
      $CountryAlias{lc($country)} = [ $countryID, $i ];
   }

   # This country name is the canonical name for the code.

   $CountryID2Code{$codeset}{$countryID} = $code;
   $Code2CountryID{$codeset}{$code} = [ $countryID, $i ];

   return 0;
}

########################################
sub get_countryID {
   my($type,$country,$no_create) = @_;

   my($countryID,$i,$t);
   if (exists $CountryAlias{lc($country)}) {
      # The country is the same name as one previously defined
      ($countryID,$i) = @{ $CountryAlias{lc($country)} };
      $t = "same";

   } elsif (exists $country_alias{$country}) {
      # It's a new alias for an existing country
      my $c = $country_alias{$country};
      if (! exists $CountryAlias{lc($c)}) {
         print "WARNING [$type]: alias referenced before it is defined: $country => $c\n";
         return (1);
      }
      $countryID = $CountryAlias{lc($c)}[0];
      push @{ $Country{$countryID} },$country;
      $i = $#{ $Country{$countryID} };
      $CountryAlias{lc($country)} = [ $countryID, $i ];
      $t = "alias";

   } else {
      # It's a new country.
      if ($no_create) {
         return(0,-1,-1,"new");
      }
      $countryID = $CountryID++;
      $i         = 0;
      $Country{$countryID} = [ $country ];
      $CountryAlias{lc($country)} = [ $countryID, $i ];
      $t = "new";
   }

   return(0,$countryID,$i,$t);
}

############################################################################
# DO_LANGUAGE
############################################################################

sub do_language {
   print "Language codes...\n";

   do_language_iso();
   print_table("language")  if ($DEBUG == 2);

   # Go through all aliases to pick up any that haven't already been
   # added (since some aliases are for human convenience rather than
   # dealing with variations between codesets).
   do_aliases("language");
   print_table("language")  if ($DEBUG);

   write_module("language");
}

########################################
sub do_language_iso {
   ###
   ### The first set we'll do is the ISO codes.
   ###

   system("wget -N -q $language_iso_url");
   open(my $in,'<:encoding(utf8)',$language_iso_file);
   my @in = <$in>;
   close($in);
   chomp(@in);

   # File is a set of lines of fields delimited by "|". Fields are:
   #
   #    alpha3
   #    term
   #    alpha2
   #    English names (semicolon separated list)
   #    French name

   foreach my $line (@in) {
      my($alpha3,$term,$alpha2,$english,$french) = split(/\|/,$line);
      # The first line has some binary characters at the start.
      if (length($alpha3)>3) {
         $alpha3 = substr($alpha3,length($alpha3)-3);
      }

      if (exists $language_iso_orig{$english}) {
         $english = $language_iso_orig{$english};
      }
      my $languageID = $LanguageID++;
      my @language = split(/\s*;\s*/,$english);

      $Language{$languageID}                  = [ @language ];
      for (my $i=0; $i<=$#language; $i++) {
         my $language = $language[$i];
         $LanguageAlias{lc($language)}         = [ $languageID, $i ];
      }

      if ($alpha3) {
         $Code2LanguageID{"alpha3"}{$alpha3}     = [ $languageID, 0 ];
         $LanguageID2Code{"alpha3"}{$languageID} = $alpha3;
      }

      if ($term) {
         $Code2LanguageID{"term"}{$term}         = [ $languageID, 0 ];
         $LanguageID2Code{"term"}{$languageID}   = $term;
      }

      if ($alpha2) {
         $Code2LanguageID{"alpha2"}{$alpha2}     = [ $languageID, 0 ];
         $LanguageID2Code{"alpha2"}{$languageID} = $alpha2;
      }
   }
}

############################################################################
# DO_CURRENCY
############################################################################

sub do_currency {
   print "Currency codes...\n";

   do_currency_iso();
   print_table("currency")  if ($DEBUG == 2);

   # Go through all aliases to pick up any that haven't already been
   # added (since some aliases are for human convenience rather than
   # dealing with variations between codesets).
   do_aliases("currency");
   print_table("currency")  if ($DEBUG);

   write_module("currency");
}

########################################
sub do_currency_iso {
   ###
   ### The first set we'll do is the ISO 4217 codes.
   ###

   system("wget -N -q $currency_iso_url");
   system("xls2csv -x $currency_iso_file -b WINDOWS-1252 -c $currency_csv_file -a UTF-8 -q");
   my @in = `cat $currency_csv_file`;
   shift(@in);
   my $in = Text::CSV::Slurp->load(string => join("",@in));

   foreach my $ele (@$in) {
      my $ent      = $$ele{'ENTITY'};
      my $currency = $$ele{'Currency'};
      my $alpha    = $$ele{'Alphabetic Code'};
      my $num      = $$ele{'Numeric Code'};
      $num         = ""  if ($num eq "Nil");
      $currency    =~ s/\s+$//;

      next  if (! $alpha  &&  ! $num);

      if (exists $currency_iso_orig{$currency}) {
         $currency = $currency_iso_orig{$currency};
      }

      if ($num) {
         if ($num !~ /^\d+$/  ||  length($num) > 3) {
            print "WARNING [iso]: Invalid numeric code: $currency => $num\n";
            next LINE;
         }
      }

      $alpha = uc($alpha);
      if ($alpha  &&  $alpha !~ /^[A-Z][A-Z][A-Z]$/) {
         print "WARNING [iso]: Invalid alpha code: $currency => $alpha\n";
         next LINE;
      }

      next  if (exists $currency_iso_ignore{$alpha});

      # There's a lot of duplication, so make sure that if this code
      # exists, it is consistant.

      my $new = 1;
      my @currencyID;
      if (exists $Code2CurrencyID{"num"}{$num}) {
         $new = 0;
         my($currencyID,$i) = @{ $Code2CurrencyID{"num"}{$num} };
         push(@currencyID,$currencyID);
      }
      if (exists $Code2CurrencyID{"alpha"}{$alpha}) {
         $new = 0;
         my($currencyID,$i) = @{ $Code2CurrencyID{"alpha"}{$alpha} };
         push(@currencyID,$currencyID);
      }

      if ($#currencyID == 1) {
         if ($currencyID[0] != $currencyID[1]) {
            print "WARNING [iso]: CurrencyID mismatch: $currency => $alpha,$num\n";
            next LINE;
         }
      }

      my $currencyID;
      if ($new) {
         $currencyID = $CurrencyID++;
         $Currency{$currencyID} = [ $currency ];
         $CurrencyAlias{lc($currency)} = [ $currencyID, 0 ]
      } else {
         $currencyID = $currencyID[0];
      }

      if ($num) {
         $num = "0$num"  while (length($num) < 3);
         $Code2CurrencyID{"num"}{$num} = [ $currencyID, 0 ];
         $CurrencyID2Code{"num"}{$currencyID} = $num;
      }

      if ($alpha) {
         $Code2CurrencyID{"alpha"}{$alpha} = [ $currencyID, 0 ];
         $CurrencyID2Code{"alpha"}{$currencyID} = $alpha;
      }
   }
}

############################################################################
# DO_SCRIPT
############################################################################

sub do_script {
   print "Script codes...\n";

   do_script_iso();
   print_table("script")  if ($DEBUG == 2);

   # Go through all aliases to pick up any that haven't already been
   # added (since some aliases are for human convenience rather than
   # dealing with variations between codesets).
   do_aliases("script");
   print_table("script")  if ($DEBUG);

   write_module("script");
}

########################################
sub do_script_iso {
   ###
   ### The first set we'll do is the ISO 15924 codes. We
   ### can get a zip file which contains the data.
   ###

   system("wget -N -q $script_iso_url");
   my $zip = Archive::Zip->new($script_iso_file);

   my @members = grep(/^iso15924.*\.txt/,$zip->memberNames());
   if (@members != 1) {
      die "ERROR [iso]: zip file changed format\n";
   }
   my($file) = @members;
   $zip->extractMember($file,$script_iso_tmp);

   #
   # The zip file contains a series of lines in the form:
   #   alpha;numeric;english;...
   # The data is in UTF-8.
   #
   # Every line has an unprintable character at the end.
   #

   my @in = `cat $script_iso_tmp`;
   chomp(@in);
   chop(@in);

   foreach my $line (@in) {
      next  if (! $line  ||  $line =~ /^\043/);
      my($alpha,$num,$script) = split(/;/,$line);
      $alpha = ucfirst(lc($alpha));
      next  if (exists $script_iso_ignore{$alpha});

      if (exists $script_iso_orig{$script}) {
         $script = $script_iso_orig{$script};
      }
      my $scriptID = $ScriptID++;

      $Script{$scriptID}                 = [ $script ];
      $ScriptAlias{lc($script)}          = [ $scriptID, 0 ];

      $Code2ScriptID{"alpha"}{$alpha}    = [ $scriptID, 0 ];
      $ScriptID2Code{"alpha"}{$scriptID} = $alpha;

      $Code2ScriptID{"num"}{$num}        = [ $scriptID, 0 ];
      $ScriptID2Code{"num"}{$scriptID}   = $num;
   }
}

############################################################################
# PRINT_TABLE
############################################################################

sub _type_hashes {
   my($caller) = @_;

   my ($type_alias,$TypeAlias,$Type,$TypeID2Code,$Code2TypeID);

   if      ($caller eq "country") {
      $type_alias   = \%country_alias;
      $TypeAlias    = \%CountryAlias;
      $Type         = \%Country;
      $TypeID2Code  = \%CountryID2Code;
      $Code2TypeID  = \%Code2CountryID;

   } elsif ($caller eq "language") {
      $type_alias   = \%language_alias;
      $TypeAlias    = \%LanguageAlias;
      $Type         = \%Language;
      $TypeID2Code  = \%LanguageID2Code;
      $Code2TypeID  = \%Code2LanguageID;

   } elsif ($caller eq "currency") {
      $type_alias   = \%currency_alias;
      $TypeAlias    = \%CurrencyAlias;
      $Type         = \%Currency;
      $TypeID2Code  = \%CurrencyID2Code;
      $Code2TypeID  = \%Code2CurrencyID;

   } elsif ($caller eq "script") {
      $type_alias   = \%script_alias;
      $TypeAlias    = \%ScriptAlias;
      $Type         = \%Script;
      $TypeID2Code  = \%ScriptID2Code;
      $Code2TypeID  = \%Code2ScriptID;
   }

   return($type_alias,$TypeAlias,$Type,$TypeID2Code,$Code2TypeID);
}

sub print_table {
   my($caller) = @_;

   my ($type_alias,$TypeAlias,$Type,$TypeID2Code,$Code2TypeID)
     = _type_hashes($caller);

   foreach my $typeID (sort keys %$Type) {
      my @type = @{ $$Type{$typeID} };
      my $i = 0;

      my $type = shift(@type);
      if (length($type) < 40) {
         $type .= " "x(40-length($type));
      } else {
         $type = substr($type,0,40);
      }

      print "${typeID}[$i] = $type  ";
      foreach my $codeset (keys %$Code2TypeID) {
         my $field = "";
         if (exists $$TypeID2Code{$codeset}{$typeID}) {
            my $code = $$TypeID2Code{$codeset}{$typeID};
            my($code_id,$code_i) = @{ $$Code2TypeID{$codeset}{$code} };
            $field = "$code [$code_id,$code_i]";
            $field .= " ERR"  if ($code_id != $typeID);
         }
         $field = $field . " "x(18-length($field));
         print $field;
      }
      print "\n";
      foreach $type (@type) {
         $i++;
         if (length($type) > 40) {
            $type = substr($type,0,40);
         }
         print "    [$i] = $type\n";
      }
   }
}

############################################################################
# DO_ALIASES
############################################################################

sub do_aliases {
   my($caller) = @_;

   my ($type_alias,$TypeAlias,$Type,$TypeID2Code,$Code2TypeID)
     = _type_hashes($caller);

   # Add remaining aliases.

   foreach my $alias (keys %$type_alias) {
      my $type = $$type_alias{$alias};

      next  if (exists $$TypeAlias{lc($type)}  &&
                exists $$TypeAlias{lc($alias)});

      if (! exists $$TypeAlias{lc($type)}  &&
          ! exists $$TypeAlias{lc($alias)}) {
         print "WARNING: unused type in alias list: $type\n";
         print "WARNING: unused type in alias list: $alias\n";
         next;
      }

      my ($typeID);
      if (exists $$TypeAlias{lc($type)}) {
         $typeID = $$TypeAlias{lc($type)}[0];
         $type   = $alias;
      } else {
         $typeID = $$TypeAlias{lc($alias)}[0];
      }

      push @{ $$Type{$typeID} },$type;
      my $i = $#{ $$Type{$typeID} };
      $$TypeAlias{lc($type)} = [ $typeID, $i ];
   }
}

############################################################################
# WRITE_MODULE
############################################################################

sub write_module {
   my($type) = @_;

   my($module,%hashes,$id);

   if ($type eq "country") {
      $module = $CountryModule;
      %hashes = ("id2names"  => "Country",
                 "alias2id"  => "CountryAlias",
                 "code2id"   => "Code2CountryID",
                 "id2code"   => "CountryID2Code");
      $id     = $CountryID;
   } elsif ($type eq "language") {
      $module = $LanguageModule;
      %hashes = ("id2names"  => "Language",
                 "alias2id"  => "LanguageAlias",
                 "code2id"   => "Code2LanguageID",
                 "id2code"   => "LanguageID2Code");
      $id     = $LanguageID;
   } elsif ($type eq "currency") {
      $module = $CurrencyModule;
      %hashes = ("id2names"  => "Currency",
                 "alias2id"  => "CurrencyAlias",
                 "code2id"   => "Code2CurrencyID",
                 "id2code"   => "CurrencyID2Code");
      $id     = $CurrencyID;
   } elsif ($type eq "script") {
      $module = $ScriptModule;
      %hashes = ("id2names"  => "Script",
                 "alias2id"  => "ScriptAlias",
                 "code2id"   => "Code2ScriptID",
                 "id2code"   => "ScriptID2Code");
      $id     = $ScriptID;
   }

   my $file = "$ModDir/$module.pm";

   my $out = new IO::File;
   $out->open(">$file");
   my $timestamp   = `date`;
   chomp($timestamp);

   print $out "package Locale::Codes::$module;

# This file was automatically generated.  Any changes to this file will
# be lost the next time 'get_codes' is run.
#    Generated on: $timestamp

$podstr

$hdstr NAME

Locale::Codes::$module - $type codes for the Locale::$module module

$hdstr SYNOPSIS

This module contains data used by the Locale::$module module. It is
not intended to be used directly, and contains no calleable routines.

$hdstr AUTHOR

See Locale::Codes for full author history.

Currently maintained by Sullivan Beck (sbeck\@cpan.org).

$hdstr COPYRIGHT

   Copyright (c) 1997-2001 Canon Research Centre Europe (CRE).
   Copyright (c) 2001-2010 Neil Bowers
   Copyright (c) 2010-2011 Sullivan Beck

This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=cut

use strict;
use warnings;
require 5.002;

our(\$VERSION);
\$VERSION='3.16';

\$Locale::Codes::Data{'$type'}{'id'} = '$id';

";

   foreach my $h qw(id2names alias2id code2id id2code) {
      my $hash = $hashes{$h};
      print $out "\$Locale::Codes::Data{'$type'}{'$h'} = {\n";
      _write_hash($out,$hash);

      print $out "};\n\n";
   }

   print $out "1;\n";

   $out->close();
}

sub _write_hash {
   my($out,$hashname) = @_;

   no strict 'refs';
   my %hash = %$hashname;
   use strict 'refs';
   _write_subhash($out,3,\%hash);
}

sub _write_subhash {
   my($out,$indent,$hashref) = @_;

   my %hash = %$hashref;
   my $ind  = " "x$indent;

   foreach my $key (sort keys %hash) {
      my $val = $hash{$key};
      if (ref($val) eq "HASH") {
         print $out "${ind}q($key) => {\n";
         _write_subhash($out,$indent+3,$val);
         print $out "${ind}   },\n";
      } elsif (ref($val) eq "ARRAY") {
         print $out "${ind}q($key) => [\n";
         _write_sublist($out,$indent+3,$val);
         print $out "${ind}   ],\n";
      } else {
         print $out "${ind}q($key) => q($val),\n";
      }
   }
}

sub _write_sublist {
   my($out,$indent,$listref) = @_;

   my @list = @$listref;
   my $ind  = " "x$indent;

   foreach my $val (@list) {
      if (ref($val) eq "HASH") {
         print $out "${ind}{\n";
         _write_subhash($out,$indent+3,$val);
         print $out "${ind}},\n";
      } elsif (ref($val) eq "ARRAY") {
         print $out "${ind}[\n";
         _write_sublist($out,$indent+3,$val);
         print $out "${ind}],\n";
      } else {
         print $out "${ind}q($val),\n";
      }
   }
}

############################################################################
# DO_CLEAN
############################################################################

sub do_clean {
   print "Cleaning...\n";

   system("rm -f $country_iso_file");
   system("rm -f $country_un_file");
   system("rm -f $country_nga_file");
   system("rm -f $country_cia_file");
   system("rm -f $country_iana_file");
   system("rm -f $language_iso_file");
   system("rm -f $currency_iso_file");
   system("rm -f $currency_csv_file");
   system("rm -f $script_iso_file");
   system("rm -f $script_iso_tmp");
   system("rm -rf __MACOSX");
}

############################################################################
# HTML SCRAPING
############################################################################

sub get_row {
   my($type,$inref) = @_;

   return ()  if ($$inref !~ m,^\s*<tr,);

   if ($$inref !~ s,^(.*?)</tr[^>]*>,,) {
      die "ERROR [$type]: malformed HTML\n";
   }
   my $row = $1;

   if ($row =~ m,<table,) {
      die "ERROR [$type]: embedded table\n";
   }

   my @row;
   while ($row =~ s,(?:.*?)<(td|th)[^>]*>\s*(.*?)\s*</\1[^>]*>,,) {
      my $val = $2;
      push(@row,$val);
   }

   return @row;
}

sub jump_to_row {
   my($inref,$header) = @_;

   if ($$inref =~ s,(.*?)\Q$header\E(.*?)</tr[^>]*>(.*?)(?=<tr),,) {
      return 1;
   } else {
      return 0;
   }
}

sub jump_to_entry {
   my($inref,$value) = @_;

   if ($$inref =~ s,(.*?)(?=<(?:td|th)[^>]*>\s*\Q$value\E\s*),,) {
      return 1;
   } else {
      return 0;
   }
}

sub jump_to_table {
   my($inref) = @_;

   if ($$inref =~ s,(.*?)(?=<table),,) {
      return 1;
   } else {
      return 0;
   }
}

sub get_entry {
   my($inref) = @_;

   if ($$inref =~ s,.*?<td[^>]*>\s*(.*?)\s*</td[^>]*>,,) {
      return $1;
   }
   return "";
}

sub strip_tags {
   my($inref,$close,@tags) = @_;

   foreach my $tag (@tags) {
      if ($close) {
         $$inref =~ s,</?$tag[^>]*>, ,g;
      } else {
         $$inref =~ s,<$tag[^>]*>, ,g;
      }
   }
}

# Local Variables:
# mode: cperl
# indent-tabs-mode: nil
# cperl-indent-level: 3
# cperl-continued-statement-offset: 2
# cperl-continued-brace-offset: 0
# cperl-brace-offset: 0
# cperl-brace-imaginary-offset: 0
# cperl-label-offset: -2
# End:
