#!/usr/bin/perl -w
# Copyright (c) 2010-2011 Sullivan Beck.  All rights reserved.
# This program is free software; you can redistribute it and/or modify it
# under the same terms as Perl itself.

###############################################################################
###############################################################################
# This script is used to automatically generate the Locale::Codes module
# which contain the actual codes.

require 5.000000;
use YAML;
use IO::File;
use strict;
use warnings;
use Archive::Zip;
use Encode;
use Text::CSV::Slurp;

use lib "./internal";

our $VERSION;
$VERSION='3.17';

our $DEBUG;
$DEBUG = 0;

# so the CPAN indexer won't treat this as a POD file
our $podstr = '=pod';
our $hdstr  = '=head1';

###############################################################################
# GLOBAL VARIABLES
###############################################################################

# We need to create the following variables:
#
#  %ID2Names{COUNTRY_ID}         => [ COUNTRY, COUNTRY, ... ]
#                                   A list of all valid country names that
#                                   correspond to a given COUNTRY_ID.
#                                   The names are all real (i.e. correct
#                                   spelling and capitalization).
#  %Alias{ALIAS}                 => [ COUNTRY_ID, I ]
#                                   A hash of all aliases for a country.
#                                   Aliases are all lowercase.
#  %Code2ID{CODESET}{CODE}       => [ COUNTRY_ID, I ]
#                                   In a given CODESET, CODE corresponds to
#                                   the I'th entry list of countries.
#  %ID2Code{CODESET}{COUNTRY_ID} => CODE
#                                   In the given CODESET, the COUNTRY_ID
#                                   corresponds to the given CODE.

our($ModDir,$Module,$ID,%ID2Names,%Alias,%Code2ID,%ID2Code);

$ModDir = "lib/Locale/Codes";

########################################
# COUNTRY

our(%country_alias);

#
# ISO 3166.
#

our($country_iso_url,$country_iso_1st,%country_iso_orig,%country_iso_ignore);

$country_iso_url    = "http://www.iso.org/iso/list-en1-semic-3.txt";
$country_iso_1st    = "AFGHANISTAN";

#
# UN Stats Division
#

our($country_un_url,%country_un_orig,%country_un_ignore);

$country_un_url    = "http://unstats.un.org/unsd/methods/m49/m49alpha.htm";

#
# National Geospatial-Intelligence Agency
#

our($country_nga_url,%country_nga_orig,%country_nga_ignore);

$country_nga_url    = "http://earth-info.nga.mil/gns/html/digraphs.htm";

#
# IANA domains
#

our($country_iana_url,%country_iana_orig,%country_iana_ignore);

$country_iana_url    = "http://www.iana.org/domains/root/db/";

#
# CIA World Factbook
#

our($country_cia_url,$country_cia_last,%country_cia_ignore,%country_cia_orig,%country_cia_codes);

$country_cia_url = "https://www.cia.gov/library/publications/the-world-factbook/appendix/print_appendix-d.html";
$country_cia_last = "Zimbabwe";

require "data.country.pl";

########################################
# LANGUAGE

our(%language_alias);

#
# We'll first read data from the official ISO 639.
#
# Data available consists of the language names and 2-letter and
# 3-letter codes. Language names include non-ASCII characters encoded in
# UTF-8. And (amazingly enough) it's available in a field delimited file!!!
#

our($language_iso2_url,%language_iso2_orig,%language_iso2_ignore);

$language_iso2_url    = "http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt";

our($language_iso5_url,%language_iso5_orig,%language_iso5_ignore);

$language_iso5_url    = "http://www.loc.gov/standards/iso639-5/id.php";

#
# IANA language registration
#

our($language_iana_url,%language_iana_orig,%language_iana_ignore);

$language_iana_url    = "http://www.iana.org/assignments/language-subtag-registry";

require "data.language.pl";

########################################
# CURRENCY

our(%currency_alias);

#
# We'll first read data from the official ISO 4217.
#

our($currency_iso_url,%currency_iso_orig,%currency_iso_ignore);

$currency_iso_url    = "http://www.currency-iso.org/dl_iso_table_a1.xls";

require "data.currency.pl";

########################################
# SCRIPT

our(%script_alias);

#
# We'll first read data from the official ISO 15924.
#
# Data available consists of the script names and 2-letter and
# 3-letter codes. Script names include non-ASCII characters encoded in
# UTF-8. And (amazingly enough) it's available in a field delimited file!!!
#

our($script_iso_url,$script_iso_zip,%script_iso_orig,%script_iso_ignore);

$script_iso_url    = "http://www.unicode.org/iso15924/iso15924.txt.zip";
$script_iso_zip    = qr/^iso15924/;

our($script_iana_url,%script_iana_orig,%script_iana_ignore);

$script_iana_url   = $language_iana_url;

require "data.script.pl";

########################################
# LANGUAGE EXTENSIONS

our(%langext_alias);

#
# IANA language registration
#
# Data available consists of the script names and 2-letter and
# 3-letter codes. Script names include non-ASCII characters encoded in
# UTF-8. And (amazingly enough) it's available in a field delimited file!!!
#

our($langext_iana_url,%langext_iana_orig,%langext_iana_ignore);

$langext_iana_url   = $language_iana_url;

require "data.langext.pl";

########################################
# LANGUAGE VARIATIONS

our(%langvar_alias);

#
# IANA language registration
#
# Data available consists of the script names and 2-letter and
# 3-letter codes. Script names include non-ASCII characters encoded in
# UTF-8. And (amazingly enough) it's available in a field delimited file!!!
#

our($langvar_iana_url,%langvar_iana_orig,%langvar_iana_ignore);

$langvar_iana_url   = $language_iana_url;

require "data.langvar.pl";

# ########################################
# # REGIONS

# our(%region_alias);

# #
# # IANA language registration
# #
# # Data available consists of the script names and 2-letter and
# # 3-letter codes. Script names include non-ASCII characters encoded in
# # UTF-8.
# #

# our($region_iana_url,%region_iana_orig,%region_iana_ignore);

# $region_iana_url   = $language_iana_url;

# require "data.region.pl";

###############################################################################
# HELP
###############################################################################

our($usage);
my $COM = $0;
$COM =~ s/^.*\///;

$usage=
  "usage: $COM OPTIONS
      -h/--help       : Print help.

      -a/--all        : Do all steps

      -c/--country    : Get the country codes
      -l/--language   : Get the language codes
      -r/--currency   : Get the currency codes
      -s/--script     : Get the script codes
      -L/--langext    : Get the language extension codes
      -V/--langvar    : Get the language variation codes
      -C/--clean      : Clean up all temporary files
";

###############################################################################
# PARSE ARGUMENTS
###############################################################################

my $do_all      = 0;
my $do_country  = 0;
my $do_language = 0;
my $do_currency = 0;
my $do_script   = 0;
my $do_langext  = 0;
my $do_langvar  = 0;
my $do_clean    = 0;

while ($_ = shift) {

   (print $usage),   exit  if ($_ eq "-h"   ||  $_ eq "--help");

   $do_all = 1,      next  if ($_ eq "-a"   ||  $_ eq "--all");

   $do_country = 1,  next  if ($_ eq "-c"   ||  $_ eq "--country");
   $do_language = 1, next  if ($_ eq "-l"   ||  $_ eq "--language");
   $do_currency = 1, next  if ($_ eq "-r"   ||  $_ eq "--currency");
   $do_script = 1,   next  if ($_ eq "-s"   ||  $_ eq "--script");
   $do_langext = 1,  next  if ($_ eq "-L"   ||  $_ eq "--langext");
   $do_langvar = 1,  next  if ($_ eq "-V"   ||  $_ eq "--langvar");
   $do_clean = 1,    next  if ($_ eq "-C"   ||  $_ eq "--clean");
}

############################################################################
# MAIN PROGRAM
############################################################################

do_country()    if ($do_all  ||  $do_country);
do_language()   if ($do_all  ||  $do_language);
do_currency()   if ($do_all  ||  $do_currency);
do_script()     if ($do_all  ||  $do_script);
do_langext()    if ($do_all  ||  $do_langext);
do_langvar()    if ($do_all  ||  $do_langvar);
do_clean()      if ($do_all  ||  $do_clean);

############################################################################
# DO_COUNTRY
############################################################################

sub do_country {
   print "Country codes...\n";

   $ID       = "0001";
   $Module   = "Country";
   %ID2Names = ();
   %Alias    = ();
   %Code2ID  = ();
   %ID2Code  = ();

   do_country_iso();
   print_table("country")  if ($DEBUG == 2);

   do_country_un();
   print_table("country")  if ($DEBUG == 2);

   do_country_nga();
   print_table("country")  if ($DEBUG == 2);

   do_country_iana();
   print_table("country")  if ($DEBUG == 2);

   do_country_cia();
   print_table("country")  if ($DEBUG == 2);

   # Go through all aliases to pick up any that haven't already been
   # added (since some aliases are for human convenience rather than
   # dealing with variations between codesets).
   do_aliases("country");
   print_table("country")  if ($DEBUG);

   write_module("country");
}

########################################

#
# ISO 3166-1
#
# The standard contains the alpha-2 codes.
#
# File format:
# =================
#    a long comment^M
#    ^M
#    AFGHANISTAN;AF^M
#    ALBANIA;AL^M
# =================
#
# Country names must be adjusted (since they're all uppercase).  Lines end with
# a windows unprintable character that must be chopped off.  First line is (currently)
# AFGANISTAN.
#

sub do_country_iso {

   my $in = _read_file('url'       => $country_iso_url,
                       'as_list'   => 1,
                       'encoding'  => 'ISO-8859-1',
                       'chop'      => 1,
                      );

   # Make sure that line 2 is blank and line 3 contains the first country.

   if ($$in[1]  ||  $$in[2] !~ /^$country_iso_1st;/) {
      die "ERROR [iso]: country code file format changed!\n";
   }

   shift(@$in);
   shift(@$in);

   foreach my $line (@$in) {
      next  if (! $line);
      if ($line !~ /^(.*);(.*)$/) {
         die "ERROR [iso]: line invalid\n" .
             "             $line\n";
      }
      my($country,$alpha2) = ($1,$2);

      next if ( ($country  &&  exists $country_iso_ignore{'name'}{$country})  ||
                ($alpha2   &&  exists $country_iso_ignore{'alpha2'}{$alpha2}) );

      # Make sure EVERY country is capitalized correctly.

      $alpha2 = lc($alpha2);
      if (exists $country_iso_orig{$country}) {
         $country = $country_iso_orig{$country};
      } else {
         print "WARNING [iso]: unknown country: $country\n";
         next;
      }

      _ascii("iso",$country);

      my $countryID = $ID++;

      $ID2Names{$countryID}          = [ $country ];
      $Alias{lc($country)}           = [ $countryID, 0 ];
      $Code2ID{'alpha2'}{$alpha2}    = [ $countryID, 0 ];
      $ID2Code{'alpha2'}{$countryID} = $alpha2;
   }
}

########################################

#
# UN Stats Division
#
# The UN Stats Division contains some (but not all) of the ISO 3166
# 3-character codes and 3-digit codes. Since they are the maintainers
# of this data, this is an official source.
#
# File format:
# ============
#    <tr>
#      <td align=left valign=top class="theader" width="66">
#        <div align="left"><strong>Numerical<br>code</strong></div>
#      </td>
#      <td valign=top class="theader" width="312">
#        <strong>&nbsp;&nbsp;&nbsp;Country or area name</strong>
#      </td>
#      <td valign=top class="theader" width="121">
#        <strong>ISO ALPHA-3 code</strong>
#      </td>
#    </tr>
#    <tr>
#      <td width="66" align=middle valign=top class="lcont">
#        <p align=left>004 </p>
#      </td>
#      <td width="312" valign=top class="lcont">
#        <p>Afghanistan </p>
#      </td>
#      <td width="121" valign=top class="lcont">
#        <p>AFG </p>
#      </td>
#    </tr>
# ============
#

sub do_country_un {

   print "\nINFO [un]: expect Channel Islands, Sark\n";

   my $in = _read_file('url'        => $country_un_url,
                       'type'       => 'html',
                       'as_list'    => 0,
                       'encoding'   => 'ISO-8859-1',
                       'html_strip' => [ qw(br p strong div) ],
                       'html_repl'  => [ qw(&nbsp;) ],
                      );

   # Look for a table who's first row has the header:
   #    ISO ALPHA-3 code

   my $found = jump_to_row(\$in,"ISO ALPHA-3 code");
   if (! $found) {
      die "ERROR [un]: country code file format changed!\n";
   }

   while (1) {
      my @row = get_row("un",\$in);
      last  if (! @row);

      my($num,$country,$alpha3) = @row;
      next if ( ($country  &&  exists $country_un_ignore{'name'}{$country})  ||
                ($num      &&  exists $country_un_ignore{'num'}{$num})       ||
                ($alpha3   &&  exists $country_un_ignore{'alpha3'}{$alpha3}) );

      $alpha3 = lc($alpha3);

      if ($num) {
         if ($num !~ /^\d+$/  ||  length($num) > 3) {
           print "WARNING [un]: Invalid numeric code: $country => $num\n";
           next;
        }
      }

      if ($alpha3  &&  $alpha3 !~ /^[a-z][a-z][a-z]$/) {
         print "WARNING [un]: Invalid alpha-3 code: $country => $alpha3\n";
         next;
      }

      if (exists $country_un_orig{$country}) {
         $country = $country_un_orig{$country};
      }

      _ascii("un",$country);

      my($err,$countryID,$i,$t) = get_ID("country","un",$country);
      if ($err) {
         print "ERROR [un]: error in country: $country\n";
         next;
      }

      if ($t eq "new") {
         print "INFO [un]: new country: $alpha3/$num\t$country\n";
      }

      if ($num) {
         $num = "0$num"  while (length($num) < 3);
         $Code2ID{"num"}{$num} = [ $countryID, $i ];
         $ID2Code{"num"}{$countryID} = $num;
      }

      if ($alpha3) {
         $Code2ID{"alpha3"}{$alpha3} = [ $countryID, $i ];
         $ID2Code{"alpha3"}{$countryID} = $alpha3;
      }
   }
}

########################################

#
# FIPS 10
#
# The National Geospatial-Intelligence Agency is the official source
# for FIPS 10 codes.
#
# File format:
# ============
#    <tr>
#      <td width="38%" class="style1"><b>SHORT FORM NAME</b></td>
#      <td width="48%" class="style1"><b>LONG FORM NAME</b></td>
#      <td width="14%" class="style1"><b>CODE</b></td>
#    </tr>
#    <tr>
#      <td class="style1">Albania</td>
#      <td class="style1">Republic of Albania </td>
#      <td class="style1">AL</td>
#    </tr>
#    ...
#    <tr>
#      <td class="style1"><b>Short Form Name</b></td>
#      <td class="style1"><b>Long Form Name</b></td>
#      <td class="style1"><b>Code</b></td>
#    </tr>
#    <tr>
#      <td class="style1">American Samoa [United States] </td>
#      <td class="style1">Territory of American Samoa </td>
#      <td class="style1">AQ</td>
#    </tr>
# ============
#

sub do_country_nga {

   print "\nINFO [nga]: expect Serbia, Ashmore, Baker, Bassas, Clipperton,\n" .
           "            Coral, Europa, Gaza, Glorioso, Howland, Jan Mayen,\n" .
           "            Jarvis, Johnston, Juan, Kingman, Midway, Navassa, Netherlands,\n" .
           "            Palmyra, Paracel, Spratly, Svalbard, Tromelin, Wake, West\n";

   my $in = _read_file('url'       => $country_nga_url,
                       'type'      => 'html',
                       'as_list'   => 0,
                       'html_strip' => [ qw(br p strong div) ],
                       'html_repl'  => [ '&nbsp;',
                                         'Other:',
                                         '[United States}',
                                         qr/\(see note[^\)]*\)/,
                                         qr/\[[^\]]*\]/
                                       ],
                      );

   # Look for a table who's first row has the header:
   #    SHORT FORM NAME
   # and then a table with the header:
   #    Short Form Name

   foreach my $table ("SHORT FORM NAME","Short Form Name") {
      my $found = jump_to_row(\$in,$table);
      if (! $found) {
         die "ERROR [nga]: country code file format changed!\n";
      }

      while (1) {
         my @row = get_row("nga",\$in);
         last  if (! @row);

         my($short,$long,$fips) = @row;
         next if ( ($short    &&  exists $country_nga_ignore{'name'}{$short})  ||
                   ($long     &&  exists $country_nga_ignore{'name'}{$long})   ||
                   ($fips     &&  exists $country_nga_ignore{'fips'}{$fips}) );

         $fips  = uc($fips);
         next  if ($fips eq "N/A");

         if ($fips !~ /^[A-Z][A-Z]$/) {
            print "WARNING [nga]: Invalid code: $short => $fips\n";
         }

         if ($short eq "None") {
            print "ERROR [nga]: no short definition: $fips  $long\n";
            next;
         }
         $long  = ""  if ($long  eq "None");

         if (exists $country_nga_orig{$short}) {
            $short = $country_nga_orig{$short};
         }
         if (exists $country_nga_orig{$long}) {
            $long = $country_nga_orig{$long};
         }

         _ascii("nga",$short);
         _ascii("nga",$long);

         my($countryID,$i);
         my($err,$c,$ii,$t) = get_ID("country","nga",$short,1);
         if ($err) {
            print "ERROR [nga]: error in country: $short\n";
            next;
         }

         if ($t ne "new") {
            ($countryID,$i) = ($c,$ii);
         } else {
            ($err,$c,$ii,$t) = get_ID("country","nga",$long,1);
            ($countryID,$i) = ($c,-1);
            if ($t eq "new") {
               print "INFO [nga]: new country: $fips\t$short\n";
            }
         }

         if ($countryID == -1) {
            # New country
            $countryID = $ID++;
            $i = 0;
            $ID2Names{$countryID} = [ $short ];
            $Alias{lc($short)} = [ $countryID, $i ];

         } elsif ($i == -1) {
            # $short is a new alias
            push @{ $ID2Names{$countryID} },$short;
            $i = $#{ $ID2Names{$countryID} };
            $Alias{lc($short)} = [ $countryID, $i ];
         }

         if ($long  &&
             ! exists $Alias{lc($long)}) {
            # $long is a new alias
            push @{ $ID2Names{$countryID} },$long;
            $i = $#{ $ID2Names{$countryID} };
            $Alias{lc($long)} = [ $countryID, $i ];
         }

         $Code2ID{"fips"}{$fips} = [ $countryID, $i ];
         $ID2Code{"fips"}{$countryID} = $fips;
      }
   }
}

########################################

#
# IANA Domain Registry
#
# The IANA domain registry is the official source of domain codes.
#
# File format:
# ============
#    <tr>
#       <th>Domain</th>
#       <th>Type</th>
#       <th>Purpose / Sponsoring Organisation</th>
#    </tr>
#    <tr class="iana-group-1 iana-type-1">
#       <td><a href="/domains/root/db/ad.html">.AD</a></td>
#       <td>country-code</td>
#       <td>Andorra<br/><span class="tld-table-so">Andorra Telecom</span></td>
#    </tr>
# ============
#

sub do_country_iana {

   print "\nINFO [iana]: expect Acension, European, Soviet, Portuguese\n";

   ###
   ### The IANA data contains the domain names
   ###

   my $in = _read_file('url'       => $country_iana_url,
                       'type'      => 'html',
                       'as_list'   => 0,
                       'html_strip' => [ qw(a) ],
                       'html_repl'  => [ '&nbsp;',
                                         '(being phased out)'
                                       ],
                      );

   # Look for a table who's first row has the header:
   #    Sponsoring Organisation

   my $found = jump_to_row(\$in,"Sponsoring Organisation");
   if (! $found) {
      die "ERROR [iana]: country code file format changed!\n";
   }

   while (1) {
      my @row = get_row("iana",\$in);
      last  if (! @row);

      my($dom,$type,$country) = @row;
      next if ( ($country  &&  exists $country_iana_ignore{'name'}{$country})  ||
                ($dom      &&  exists $country_iana_ignore{'dom'}{$dom}) );
      next  unless ($type eq "country-code");

      # *** FIX ***
      # For now, we'll remove foreign character domains.
      next  unless ($dom =~ /^\.[A-Z][A-Z]/);

      $dom     =~ s/^\.//;
      $country =~ s,<br.*,,;
      $country =~ s,\s+$,,;

      if (exists $country_iana_orig{$country}) {
         $country = $country_iana_orig{$country};
      }

      _ascii('iana',$country);

      my($err,$countryID,$i,$t) = get_ID("country","iana",$country);
      if ($err) {
         print "ERROR [iana]: error in country: $country\n";
         next;
      }

      if ($t eq "new") {
         print "INFO [iana]: new country: $dom\t$country\n";
      }

      $Code2ID{"dom"}{$dom} = [ $countryID, $i ];
      $ID2Code{"dom"}{$countryID} = $dom;
   }
}

########################################

#
# The CIA World Factbook (which is assumed to be a reliable source of
# this data) is used to check the alpha-2, alpha-3, numeric, and
# fips-10 codes. It is also used to get the stanag codes.
#
# Due to the complexity of the data, we'll store the last country so
# we know when to stop.
#
# We have to force-override some codes.
#
# File format:
# ============
#    <table width="638" cellpadding="0" cellspacing="0">
#       <tr style="height: 20px;" valign="center" bgcolor="#F8f8e7" class="smalltext">
#          <th align="center"><b>Entity</b></th>
#          <th width="15%" align="center"><b>FIPS 10</b></th>
#          <th width="20%" colspan="3" align="center"><b>ISO 3166</b></th>
#          <th width="10%" align="center"><b>Stanag</b></th>
#          <th width="10%" align="center"><b>Internet</b></th>
#          <th width="28%" align="center"><b>Comment</b></th>
#       </tr>
#    </table>
#
#    <table width="638"  border="0" cellpadding="1" cellspacing="1">
#       <tr>
#          <td width="16%" align="left" valign="middle" class="category_data">
#             <a href="../geos/af.html">Afghanistan</a>
#          </td>
#          <td width="16%" align="center" valign="middle" class="category_data">AF</td>
#          <td valign="middle">
#             <table width="100%" border="0" cellpadding="1" cellspacing="0">
#                <tr class="category_data">
#                   <td width="10%" align="center" valign="middle">AF</td>
#                   <td width="10%" align="center" valign="middle">AFG</td>
#                   <td width="20%" align="center" valign="middle">004</td>
#             </table>
#          </td>
#          <td width="10%" align="center" valign="middle" class="category_data">AFG</td>
#          <td width="10%" align="center" valign="middle" class="category_data">.af</td>
#          <td width="28%" valign="middle" >&nbsp;</td>
#       </tr>
#    </table>
# ============
#
# The first table (containing the headers) is repeated for each letter
# of the alphabet followed by all of the entries for those countries.
#

sub do_country_cia {

   print "\nINFO [cia]: expect Antarctica, Bouvet, British, Christmas, Cocos,\n" .
         "            Curacao, France (Metro), French, Heard, Kosovo, Montenegro,\n" .
         "            Netherlands, Saint Barth, Saint Martin, Serbia, South, Taiwan, US\n";

   my $in = _read_file('url'       => $country_cia_url,
                       'type'      => 'html',
                       'as_list'   => 0,
                       'html_strip' => [ qw(br p strong div a b img) ],
                       'html_repl'  => [ '&nbsp;',
                                         qr/\015/
                                       ],
                      );

   # Move to a table with "Entity" as one of the entries. This
   # table is followed by the entires.

   my $found = jump_to_entry(\$in,"Entity");
   if (! $found) {
      die "ERROR [cia]: country code file format changed!\n";
   }

   # Read each group (those starting with one letter in the alphabet).

   while (1) {

      #
      # Read the next entry
      #

      my($country,$fips,$alpha2,$alpha3,$num,$stanag,$dom);

      my $found   = jump_to_table(\$in);
      if (! $found) {
         print "ERROR [cia]: malformed file\n";
         last;
      }
      $country = get_entry(\$in);
      if (! $country) {
         # The first entry is empty if it's at the end of the entries
         # for a given letter.
         $found = jump_to_entry(\$in,"Entity");
         if (! $found) {
            die "ERROR [cia]: country code file format changed!\n";
         }
         next;
      }

      $fips    = uc(get_entry(\$in));
      $found   = jump_to_table(\$in);
      if (! $found) {
         print "ERROR [cia]: malformed file\n";
         last;
      }
      $alpha2  = lc(get_entry(\$in));
      $alpha3  = lc(get_entry(\$in));
      $num     = get_entry(\$in);
      $stanag  = get_entry(\$in);
      $dom     = uc(get_entry(\$in));
      $dom     =~ s/^\.//;

      next if ( ($country  &&  exists $country_cia_ignore{'name'}{$country})  ||
                ($fips     &&  exists $country_cia_ignore{'fips'}{$fips})     ||
                ($num      &&  exists $country_cia_ignore{'num'}{$num})       ||
                ($stanag   &&  exists $country_cia_ignore{'stanag'}{$stanag}) ||
                ($dom      &&  exists $country_cia_ignore{'dom'}{$dom})       ||
                ($alpha2   &&  exists $country_cia_ignore{'alpha2'}{$alpha2}) ||
                ($alpha3   &&  exists $country_cia_ignore{'alpha3'}{$alpha3}) );

      if (exists $country_cia_orig{$country}) {
         $country = $country_cia_orig{$country};
      }

      _ascii('cia',$country);

      $alpha2 = $country_cia_codes{"alpha2"}{$country}
        if (exists $country_cia_codes{"alpha2"}{$country});
      $alpha3 = $country_cia_codes{"alpha3"}{$country}
        if (exists $country_cia_codes{"alpha3"}{$country});
      $num = $country_cia_codes{"num"}{$country}
        if (exists $country_cia_codes{"num"}{$country});
      $fips = $country_cia_codes{"fips"}{$country}
        if (exists $country_cia_codes{"fips"}{$country});
      $dom  = $country_cia_codes{"dom"}{$country}
        if (exists $country_cia_codes{"dom"}{$country});

      #
      # Get the countryID if the country or ANY of the codes match.
      #

      my($err,$countryID,$i,$type) = get_ID("country","cia",$country,1);
      if ($err) {
         print "ERROR [cia]: error in country: $country\n";
         next;
      }

      if ($countryID == -1  &&  $alpha2 ne "-") {
         if (exists $Code2ID{"alpha2"}{$alpha2}) {
            ($countryID,$i) = @{ $Code2ID{"alpha2"}{$alpha2} };
         }
      }

      if ($countryID == -1  &&  $alpha3 ne "-") {
         if (exists $Code2ID{"alpha3"}{$alpha3}) {
            ($countryID,$i) = @{ $Code2ID{"alpha3"}{$alpha3} };
         }
      }

      if ($countryID == -1  &&  $num ne "-") {
         if (exists $Code2ID{"num"}{$num}) {
            ($countryID,$i) = @{ $Code2ID{"num"}{$num} };
         }
      }

      if ($countryID == -1  &&  $fips ne "-") {
         if (exists $Code2ID{"fips"}{$fips}) {
            ($countryID,$i) = @{ $Code2ID{"fips"}{$fips} };
         }
      }

      if ($countryID == -1  &&  $dom ne "-") {
         if (exists $Code2ID{"dom"}{$dom}) {
            ($countryID,$i) = @{ $Code2ID{"dom"}{$dom} };
         }
      }

      if ($countryID == -1) {
         $countryID = $ID++;
         $i = 0;
         $ID2Names{$countryID} = [ $country ];
         $Alias{lc($country)} = [ $countryID, $i ];
      }

      #
      # Now check that any previously defined values match the
      # CIA data.
      #

      if ($alpha2 ne "-") {
         my $err = check_code("cia","alpha2",$alpha2,$country,$countryID);
         if ($err) {
            print "ERROR [cia]: error in alpha2: $country:$alpha2\n";
            next;
         }
      }

      if ($alpha3 ne "-") {
         my $err = check_code("cia","alpha3",$alpha3,$country,$countryID);
         if ($err) {
            print "ERROR [cia]: error in alpha3: $country:$alpha3\n";
            next;
         }
      }

      if ($num ne "-") {
         my $err = check_code("cia","num",$num,$country,$countryID);
         if ($err) {
            print "ERROR [cia]: error in num: $country:$num\n";
            next;
         }
      }

      if ($fips ne "-") {
         my $err = check_code("cia","fips",$fips,$country,$countryID);
         if ($err) {
            print "ERROR [cia]: error in fips: $country:$fips\n";
            next;
         }
      }

      if ($dom ne "-") {
         my $err = check_code("cia","dom",$dom,$country,$countryID);
         if ($err) {
            print "ERROR [cia]: error in dom: $country:$dom\n";
            next;
         }
      }

      last  if ($country eq $country_cia_last);
   }
}

############################################################################
# DO_LANGUAGE
############################################################################

sub do_language {
   print "Language codes...\n";

   $ID       = "0001";
   $Module   = "Language";
   %ID2Names = ();
   %Alias    = ();
   %Code2ID  = ();
   %ID2Code  = ();

   do_language_iso2();
   print_table("language")  if ($DEBUG == 2);

   do_language_iso5();
   print_table("language")  if ($DEBUG == 2);

   do_language_iana();
   print_table("language")  if ($DEBUG == 2);

   # Go through all aliases to pick up any that haven't already been
   # added (since some aliases are for human convenience rather than
   # dealing with variations between codesets).
   do_aliases("language");
   print_table("language")  if ($DEBUG);

   write_module("language");
}

########################################
sub do_language_iso2 {
   ###
   ### The first set we'll do is the ISO codes.
   ###

   my $in = _read_file('url'       => $language_iso2_url,
                       'as_list'   => 1,
                       'encoding'  => 'UTF-8',
                      );

   # File is a set of lines of fields delimited by "|". Fields are:
   #
   #    alpha3
   #    term
   #    alpha2
   #    English names (semicolon separated list)
   #    French name

   foreach my $line (@$in) {
      my($alpha3,$term,$alpha2,$language,$french) = split(/\|/,$line);
      next if ( ($language  &&  exists $language_iso2_ignore{'name'}{$language})  ||
                ($term      &&  exists $language_iso2_ignore{'term'}{$term})      ||
                ($alpha2    &&  exists $language_iso2_ignore{'alpha2'}{$alpha2})  ||
                ($alpha3    &&  exists $language_iso2_ignore{'alpha3'}{$alpha3}) );

      # The first line has some binary characters at the start.
      if (length($alpha3)>3) {
         $alpha3 = substr($alpha3,length($alpha3)-3);
      }

      if (exists $language_iso2_orig{$language}) {
         $language = $language_iso2_orig{$language};
      }
      my $languageID = $ID++;
      my @language = split(/\s*;\s*/,$language);

      _ascii('iso',$language);

      $ID2Names{$languageID}                  = [ @language ];
      for (my $i=0; $i<=$#language; $i++) {
         my $language = $language[$i];
         $Alias{lc($language)}         = [ $languageID, $i ];
      }

      if ($alpha3) {
         $Code2ID{"alpha3"}{$alpha3}     = [ $languageID, 0 ];
         $ID2Code{"alpha3"}{$languageID} = $alpha3;
      }

      if ($term) {
         $Code2ID{"term"}{$term}         = [ $languageID, 0 ];
         $ID2Code{"term"}{$languageID}   = $term;
      }

      if ($alpha2) {
         $Code2ID{"alpha2"}{$alpha2}     = [ $languageID, 0 ];
         $ID2Code{"alpha2"}{$languageID} = $alpha2;
      }
   }
}

########################################
sub do_language_iso5 {
   ###
   ### The first set we'll do is the ISO codes.
   ###

   my $in = _read_file('url'       => $language_iso5_url,
                       'as_list'   => 0,
                      );

   # Look for a table who's first row has the header:
   #    Identifier

   my $found = jump_to_row(\$in,'Identifier');
   if (! $found) {
      die "ERROR [iso5]: language code file format changed!\n";
   }

   while (1) {
      my @row = get_row("iso5",\$in);
      last  if (! @row);

      my($alpha3,$language) = @row;
      next  if (! $language);
      next if ( ($language  &&  exists $language_iso5_ignore{'name'}{$language})  ||
                ($alpha3    &&  exists $language_iso5_ignore{'alpha3'}{$alpha3}) );

      if (exists $language_iso5_orig{$language}) {
         $language = $language_iso5_orig{$language};
      }
      _ascii('iso',$language);

      if ($alpha3  &&  $alpha3 !~ /^[a-z][a-z][a-z]$/) {
         print "WARNING [iso5]: Invalid alpha-3 code: $language => $alpha3\n";
         next;
      }

      my $languageID;
      if (exists $Code2ID{'alpha3'}{$alpha3}) {
         $languageID = $Code2ID{'alpha3'}{$alpha3}[0];

         my @lang    = @{ $ID2Names{$languageID} };
         my %lang    = map { $_,1 } @lang;
         if (! exists $lang{$language}) {
            push( @{ $ID2Names{$languageID} },$language );
         }

      } else {
         $languageID = $ID++;

         $ID2Names{$languageID} = [ $language ];
         $Alias{lc($language)}  = [ $languageID, 0 ];
         $Code2ID{"alpha3"}{$alpha3}     = [ $languageID, 0 ];
         $ID2Code{"alpha3"}{$languageID} = $alpha3;
      }
   }
}

########################################
sub do_language_iana {

   print "\nINFO [iana]: expect Serbo...\n";

   ###
   ### The IANA language registration data is used to check:
   ###    alpha-2, alpha-3
   ###

   my $in = _read_file('url'       => $language_iana_url,
                       'as_list'   => 1,
                      );

   shift(@$in)  until ($$in[0] eq '%%');

   # Each entry is of the form:
   #   %%
   #   Type: language
   #   Subtag: aa
   #   Description: Afar
   #   Description: Afar 2
   #   Added: 2005-10-16
   #   Deprecated: 2009-01-01
   #
   # Ignore them if they're deprecated.  We're only doing type 'language' here.

 ENTRY: while (@$in) {

      #
      # Read the next entry
      #

      my ($skip,%entry) = _iana_entry($in,'language');
      next ENTRY  if ($skip);

      my(@language,$language,$code,$alpha2,$alpha3);

      $code     = $entry{'Subtag'};
      next ENTRY if ($code &&  exists $language_iana_ignore{'code'}{$code});

      foreach my $language (@{ $entry{'Description'} }) {
         next ENTRY  if ($language  &&  exists $language_iana_ignore{'name'}{$language});

         if (exists $language_iana_orig{$language}) {
            $language = $language_iana_orig{$language};
         }

         _ascii('iana',$language);
         push(@language,$language);
      }

      my %language;
      my @tmp;
      foreach my $language (@language) {
         next  if (exists $language{$language});
         push(@tmp,$language);
         $language{$language} = 1;
      }
      @language = @tmp;

      if (length($code) == 2) {
         $alpha2 = lc($code);
      } elsif (length($code) == 3) {
         $alpha3 = lc($code);
      } else {
         print "WARNING [language_iana]: unknown entry: $language => $code\n";
         next ENTRY;
      }

      #
      # Get the ID if the language or ANY of the codes match.
      #

      my($err,$languageID,$i,$t,%languageID);
      foreach my $language (@language) {

         ($err,$languageID,$i,$t) = get_ID("language","iana",$language,1);
         if ($err) {
            print "ERROR [iana]: problem in language: $language\n";
            next ENTRY;
         }

         $languageID{$languageID} = 1;
      }
      delete $languageID{'-1'};
      my @languageID = keys %languageID;
      if (! @languageID) {
         $languageID = -1;

      } elsif (@languageID == 1) {
         $languageID = $languageID[0];

      } else {
         print "ERROR [iana]: problem in languageID: $language[0]: @languageID\n";
         next ENTRY;
      }

      if ($languageID == -1  &&  $alpha2) {
         if (exists $Code2ID{"alpha2"}{$alpha2}) {
            ($languageID,$i) = @{ $Code2ID{"alpha2"}{$alpha2} };
         }
      }

      if ($languageID == -1  &&  $alpha3) {
         if (exists $Code2ID{"alpha3"}{$alpha3}) {
            ($languageID,$i) = @{ $Code2ID{"alpha3"}{$alpha3} };
         }
      }

      if ($languageID == -1) {
         $languageID = $ID++;
         $i = 0;
         $ID2Names{$languageID} = [ @language ];
         foreach my $language (@language) {
            $Alias{lc($language)}  = [ $languageID, $i ];
         }
      }

      #
      # Now check that any previously defined values match.
      #

      if ($alpha2) {
         my $err = check_code("iana","alpha2",$alpha2,$language[0],$languageID);
         next  if ($err);
      }

      if ($alpha3) {
         my $err = check_code("iana","alpha3",$alpha3,$language[0],$languageID,"noprint");
         next  if ($err);
      }
   }
}

########################################

# Read the next entry from the IANA file
sub _iana_entry {
   my ($in,@type) = @_;
   my %type       = map { $_,1 } @type;

   my %entry;
   shift(@$in);
   while (@$in  &&  $$in[0] ne '%%') {
      my $line      = shift(@$in);
      $line         =~ /^(.*?):\s*(.*)$/;
      my($key,$val) = ($1,$2);
      if ($key eq 'Description') {
         if (exists $entry{$key}) {
            push( @{ $entry{$key} },$val );
         } else {
            $entry{$key} = [ $val ];
         }
      } else {
         $entry{$key} = $val;
      }
   }

   return (1)  if (exists $entry{'Deprecated'}  ||
                   ! exists $entry{'Type'}      ||
                   ! exists $type{ $entry{'Type'} });
   return (0,%entry);
}

############################################################################
# DO_CURRENCY
############################################################################

sub do_currency {
   print "Currency codes...\n";

   $ID       = "0001";
   $Module   = "Currency";
   %ID2Names = ();
   %Alias    = ();
   %Code2ID  = ();
   %ID2Code  = ();

   do_currency_iso();
   print_table("currency")  if ($DEBUG == 2);

   # Go through all aliases to pick up any that haven't already been
   # added (since some aliases are for human convenience rather than
   # dealing with variations between codesets).
   do_aliases("currency");
   print_table("currency")  if ($DEBUG);

   write_module("currency");
}

########################################
sub do_currency_iso {
   ###
   ### The first set we'll do is the ISO 4217 codes.
   ###

   my $in = _read_file('url'       => $currency_iso_url,
                       'as_list'   => 1,
                       'type'      => 'xls',
                       'encoding'  => 'UTF-8',
                      );

   foreach my $ele (@$in) {
      my $ent      = $$ele{'ENTITY'};
      my $currency = $$ele{'Currency'};
      my $alpha    = $$ele{'Alphabetic Code'};
      my $num      = $$ele{'Numeric Code'};
      $num         = ""  if ($num eq "Nil");
      $currency    =~ s/\s+$//;

      next  if (! $alpha  &&  ! $num);

      next if ( ($currency  &&  exists $currency_iso_ignore{'name'}{$currency})  ||
                ($num       &&  exists $currency_iso_ignore{'num'}{$num})      ||
                ($alpha     &&  exists $currency_iso_ignore{'alpha'}{$alpha}) );

      if (exists $currency_iso_orig{$currency}) {
         $currency = $currency_iso_orig{$currency};
      }

      _ascii('iso',$currency);

      if ($num) {
         if ($num !~ /^\d+$/  ||  length($num) > 3) {
            print "WARNING [iso]: Invalid numeric code: $currency => $num\n";
            next LINE;
         }
      }

      $alpha = uc($alpha);
      if ($alpha  &&  $alpha !~ /^[A-Z][A-Z][A-Z]$/) {
         print "WARNING [iso]: Invalid alpha code: $currency => $alpha\n";
         next LINE;
      }

      # There's a lot of duplication, so make sure that if this code
      # exists, it is consistant.

      my $new = 1;
      my @currencyID;
      if (exists $Code2ID{"num"}{$num}) {
         $new = 0;
         my($currencyID,$i) = @{ $Code2ID{"num"}{$num} };
         push(@currencyID,$currencyID);
      }
      if (exists $Code2ID{"alpha"}{$alpha}) {
         $new = 0;
         my($currencyID,$i) = @{ $Code2ID{"alpha"}{$alpha} };
         push(@currencyID,$currencyID);
      }

      if ($#currencyID == 1) {
         if ($currencyID[0] != $currencyID[1]) {
            print "WARNING [iso]: ID mismatch: $currency => $alpha,$num\n";
            next LINE;
         }
      }

      my $currencyID;
      if ($new) {
         $currencyID            = $ID++;
         $ID2Names{$currencyID} = [ $currency ];
         $Alias{lc($currency)}  = [ $currencyID, 0 ]
      } else {
         $currencyID            = $currencyID[0];
      }

      if ($num) {
         $num                   = "0$num"  while (length($num) < 3);
         $Code2ID{"num"}{$num}  = [ $currencyID, 0 ];
         $ID2Code{"num"}{$currencyID} = $num;
      }

      if ($alpha) {
         $Code2ID{"alpha"}{$alpha} = [ $currencyID, 0 ];
         $ID2Code{"alpha"}{$currencyID} = $alpha;
      }
   }
}

############################################################################
# DO_SCRIPT
############################################################################

sub do_script {
   print "Script codes...\n";

   $ID       = "0001";
   $Module   = "Script";
   %ID2Names = ();
   %Alias    = ();
   %Code2ID  = ();
   %ID2Code  = ();

   do_script_iso();
   print_table("script")  if ($DEBUG == 2);

   do_script_iana();
   print_table("script")  if ($DEBUG == 2);

   # Go through all aliases to pick up any that haven't already been
   # added (since some aliases are for human convenience rather than
   # dealing with variations between codesets).
   do_aliases("script");
   print_table("script")  if ($DEBUG);

   write_module("script");
}

########################################
sub do_script_iso {
   ###
   ### The first set we'll do is the ISO 15924 codes. We
   ### can get a zip file which contains the data.
   ###

   my $in = _read_file('url'       => $script_iso_url,
                       'as_list'   => 1,
                       'type'      => 'zip',
                       'file'      => $script_iso_zip,
                       'chop'      => 1,
                  );

   #
   # The zip file contains a series of lines in the form:
   #   alpha;numeric;english;...
   # The data is in UTF-8.
   #
   # Every line has an unprintable character at the end.
   #

   foreach my $line (@$in) {
      next  if (! $line  ||  $line =~ /^\043/);
      my($alpha,$num,$script) = split(/;/,$line);
      next if ( ($script  &&  exists $script_iso_ignore{'name'}{$script})  ||
                ($num     &&  exists $script_iso_ignore{'num'}{$num})      ||
                ($alpha   &&  exists $script_iso_ignore{'alpha'}{$alpha}) );

      $alpha = ucfirst(lc($alpha));

      if (exists $script_iso_orig{$script}) {
         $script = $script_iso_orig{$script};
      }

      _ascii('iso',$script);

      my $scriptID = $ID++;

      $ID2Names{$scriptID}         = [ $script ];
      $Alias{lc($script)}          = [ $scriptID, 0 ];

      $Code2ID{"alpha"}{$alpha}    = [ $scriptID, 0 ];
      $ID2Code{"alpha"}{$scriptID} = $alpha;

      $Code2ID{"num"}{$num}        = [ $scriptID, 0 ];
      $ID2Code{"num"}{$scriptID}   = $num;
   }
}

########################################
sub do_script_iana {

#   print "\nINFO [iana]: expect Serbo...\n";

   ###
   ### The IANA script registration data is used to check:
   ###    alpha
   ###

   my $in = _read_file('url'       => $script_iana_url,
                       'as_list'   => 1,
                      );

   shift(@$in)  until ($$in[0] eq '%%');

   # Each entry is of the form:
   #   %%
   #   Type: script
   #   Subtag: Elba
   #   Description: Elbasan
   #   Added: 2005-10-16
   #   Deprecated: 2009-01-01
   #
   # Ignore them if they're deprecated.  We're only doing type 'script' here.

 ENTRY: while (@$in) {

      #
      # Read the next entry
      #

      my ($skip,%entry) = _iana_entry($in,'script');
      next ENTRY  if ($skip);

      my(@script,$script,$alpha);

      $alpha  = $entry{'Subtag'};
      next ENTRY if ($alpha &&  exists $script_iana_ignore{'alpha'}{$alpha});

      foreach my $script (@{ $entry{'Description'} }) {
         next ENTRY  if ($script  &&  exists $script_iana_ignore{'name'}{$script});

         if (exists $script_iana_orig{$script}) {
            $script = $script_iana_orig{$script};
         }

         _ascii('iana',$script);
         push(@script,$script);
      }

      my %script;
      my @tmp;
      foreach my $script (@script) {
         next  if (exists $script{$script});
         push(@tmp,$script);
         $script{$script} = 1;
      }
      @script = @tmp;

      #
      # Get the ID if the script or ANY of the codes match.
      #

      my($err,$scriptID,$i,$t,%scriptID);
      foreach my $script (@script) {

         ($err,$scriptID,$i,$t) = get_ID("script","iana",$script,1);
         if ($err) {
            print "ERROR [iana]: problem in script: $script\n";
            next ENTRY;
         }

         $scriptID{$scriptID} = 1;
      }
      delete $scriptID{'-1'};
      my @scriptID = keys %scriptID;
      if (! @scriptID) {
         $scriptID = -1;

      } elsif (@scriptID == 1) {
         $scriptID = $scriptID[0];

      } else {
         print "ERROR [iana]: problem in scriptID: $script[0]: @scriptID\n";
         next ENTRY;
      }

      if ($scriptID == -1  &&  $alpha) {
         if (exists $Code2ID{"alpha"}{$alpha}) {
            ($scriptID,$i) = @{ $Code2ID{"alpha"}{$alpha} };
         }
      }

      if ($scriptID == -1) {
         $scriptID = $ID++;
         $i = 0;
         $ID2Names{$scriptID} = [ @script ];
         foreach my $script (@script) {
            $Alias{lc($script)}  = [ $scriptID, $i ];
         }
      }

      #
      # Now check that any previously defined values match.
      #

      if ($alpha) {
         my $err = check_code("iana","alpha",$alpha,$script[0],$scriptID);
         next  if ($err);
      }
   }
}

############################################################################
# DO_LANGEXT
############################################################################

sub do_langext {
   print "LangExt codes...\n";

   $ID       = "0001";
   $Module   = "LangExt";
   %ID2Names = ();
   %Alias    = ();
   %Code2ID  = ();
   %ID2Code  = ();

   do_langext_iana();
   print_table("langext")  if ($DEBUG == 2);

   # Go through all aliases to pick up any that haven't already been
   # added (since some aliases are for human convenience rather than
   # dealing with variations between codesets).
   do_aliases("langext");
   print_table("langext")  if ($DEBUG);

   write_module("langext");
}

########################################
sub do_langext_iana {

#   print "\nINFO [iana]: expect Serbo...\n";

   ###
   ### The IANA langext registration data is used to check:
   ###    alpha
   ###

   my $in = _read_file('url'       => $langext_iana_url,
                       'as_list'   => 1,
                      );


   shift(@$in)  until ($$in[0] eq '%%');

   # Each entry is of the form:
   #   %%
   #   Type: extlang
   #   Subtag: aao
   #   Description: Algerian Saharan Arabic
   #   Prefix: ar
   #   Added: 2005-10-16
   #   Deprecated: 2009-01-01
   #
   # Ignore them if they're deprecated.  We're only doing type 'extlang' here.

 ENTRY: while (@$in) {

      #
      # Read the next entry
      #

      my ($skip,%entry) = _iana_entry($in,'extlang');
      next ENTRY  if ($skip);

      my(@langext,$langext,$alpha);

      $alpha  = $entry{'Subtag'};
      next ENTRY if ($alpha &&  exists $langext_iana_ignore{'alpha'}{$alpha});

      foreach my $langext (@{ $entry{'Description'} }) {
         next ENTRY  if ($langext  &&  exists $langext_iana_ignore{'name'}{$langext});

         if (exists $langext_iana_orig{$langext}) {
            $langext = $langext_iana_orig{$langext};
         }

         _ascii('iana',$langext);
         push(@langext,$langext);
      }

      my %langext;
      my @tmp;
      foreach my $langext (@langext) {
         next  if (exists $langext{$langext});
         push(@tmp,$langext);
         $langext{$langext} = 1;
      }
      @langext = @tmp;

      #
      # Get the ID if the langext or ANY of the codes match.
      #

      my($err,$langextID,$i,$t,%langextID);
      foreach my $langext (@langext) {

         ($err,$langextID,$i,$t) = get_ID("langext","iana",$langext,1);
         if ($err) {
            print "ERROR [iana]: problem in langext: $langext\n";
            next ENTRY;
         }

         $langextID{$langextID} = 1;
      }
      delete $langextID{'-1'};
      my @langextID = keys %langextID;
      if (! @langextID) {
         $langextID = -1;

      } elsif (@langextID == 1) {
         $langextID = $langextID[0];

      } else {
         print "ERROR [iana]: problem in langextID: $langext[0]: @langextID\n";
         next ENTRY;
      }

      if ($langextID == -1  &&  $alpha) {
         if (exists $Code2ID{"alpha"}{$alpha}) {
            ($langextID,$i) = @{ $Code2ID{"alpha"}{$alpha} };
         }
      }

      if ($langextID == -1) {
         $langextID = $ID++;
         $i = 0;
         $ID2Names{$langextID} = [ @langext ];
         foreach my $langext (@langext) {
            $Alias{lc($langext)}  = [ $langextID, $i ];
         }
      }

      if ($alpha) {
         $Code2ID{"alpha"}{$alpha}     = [ $langextID, 0 ];
         $ID2Code{"alpha"}{$langextID} = $alpha;
      }
   }
}

############################################################################
# DO_LANGVAR
############################################################################

sub do_langvar {
   print "LangVar codes...\n";

   $ID       = "0001";
   $Module   = "LangVar";
   %ID2Names = ();
   %Alias    = ();
   %Code2ID  = ();
   %ID2Code  = ();

   do_langvar_iana();
   print_table("langvar")  if ($DEBUG == 2);

   # Go through all aliases to pick up any that haven't already been
   # added (since some aliases are for human convenience rather than
   # dealing with variations between codesets).
   do_aliases("langvar");
   print_table("langvar")  if ($DEBUG);

   write_module("langvar");
}

########################################
sub do_langvar_iana {

#   print "\nINFO [iana]: expect Serbo...\n";

   ###
   ### The IANA langvar registration data is used to check:
   ###    alpha
   ###

   my $in = _read_file('url'       => $langvar_iana_url,
                       'as_list'   => 1,
                      );


   shift(@$in)  until ($$in[0] eq '%%');

   # Each entry is of the form:
   #   %%
   #   Type: variant
   #   Subtag: 1901
   #   Description: Traditional German orthography
   #   Added: 2005-10-16
   #   Prefix: de
   #   Deprecated: 2009-01-01
   #
   # Ignore them if they're deprecated.  We're only doing type 'extlang' here.

 ENTRY: while (@$in) {

      #
      # Read the next entry
      #

      my ($skip,%entry) = _iana_entry($in,'variant');
      next ENTRY  if ($skip);

      my(@langvar,$langvar,$alpha);

      $alpha  = $entry{'Subtag'};
      next ENTRY if ($alpha &&  exists $langvar_iana_ignore{'alpha'}{$alpha});

      foreach my $langvar (@{ $entry{'Description'} }) {
         next ENTRY  if ($langvar  &&  exists $langvar_iana_ignore{'name'}{$langvar});

         if (exists $langvar_iana_orig{$langvar}) {
            $langvar = $langvar_iana_orig{$langvar};
         }

         _ascii('iana',$langvar);
         push(@langvar,$langvar);
      }

      my %langvar;
      my @tmp;
      foreach my $langvar (@langvar) {
         next  if (exists $langvar{$langvar});
         push(@tmp,$langvar);
         $langvar{$langvar} = 1;
      }
      @langvar = @tmp;

      #
      # Get the ID if the langvar or ANY of the codes match.
      #

      my($err,$langvarID,$i,$t,%langvarID);
      foreach my $langvar (@langvar) {

         ($err,$langvarID,$i,$t) = get_ID("langvar","iana",$langvar,1);
         if ($err) {
            print "ERROR [iana]: problem in langvar: $langvar\n";
            next ENTRY;
         }

         $langvarID{$langvarID} = 1;
      }
      delete $langvarID{'-1'};
      my @langvarID = keys %langvarID;
      if (! @langvarID) {
         $langvarID = -1;

      } elsif (@langvarID == 1) {
         $langvarID = $langvarID[0];

      } else {
         print "ERROR [iana]: problem in langvarID: $langvar[0]: @langvarID\n";
         next ENTRY;
      }

      if ($langvarID == -1  &&  $alpha) {
         if (exists $Code2ID{"alpha"}{$alpha}) {
            ($langvarID,$i) = @{ $Code2ID{"alpha"}{$alpha} };
         }
      }

      if ($langvarID == -1) {
         $langvarID = $ID++;
         $i = 0;
         $ID2Names{$langvarID} = [ @langvar ];
         foreach my $langvar (@langvar) {
            $Alias{lc($langvar)}  = [ $langvarID, $i ];
         }
      }

      if ($alpha) {
         $Code2ID{"alpha"}{$alpha}     = [ $langvarID, 0 ];
         $ID2Code{"alpha"}{$langvarID} = $alpha;
      }
   }
}

############################################################################
# PRINT_TABLE
############################################################################

sub _type_hashes {
   my($caller) = @_;

   my ($type_alias);

   if      ($caller eq "country") {
      $type_alias   = \%country_alias;

   } elsif ($caller eq "language") {
      $type_alias   = \%language_alias;

   } elsif ($caller eq "currency") {
      $type_alias   = \%currency_alias;

   } elsif ($caller eq "script") {
      $type_alias   = \%script_alias;

   } elsif ($caller eq "langext") {
      $type_alias   = \%langext_alias;

   } elsif ($caller eq "langvar") {
      $type_alias   = \%langvar_alias;
   }

   return($type_alias);
}

sub print_table {
   my($caller) = @_;

   foreach my $typeID (sort keys %ID2Names) {
      my @type = @{ $ID2Names{$typeID} };
      my $i = 0;

      my $type = shift(@type);
      if (length($type) < 40) {
         $type .= " "x(40-length($type));
      } else {
         $type = substr($type,0,40);
      }

      print "${typeID}[$i] = $type  ";
      foreach my $codeset (keys %Code2ID) {
         my $field = "";

         if (exists $ID2Code{$codeset}{$typeID}) {
            my $code = $ID2Code{$codeset}{$typeID};
            my($code_id,$code_i) = @{ $Code2ID{$codeset}{$code} };
            $field = "$code [$code_id,$code_i]";
            $field .= " ERR"  if ($code_id != $typeID);
         }
         $field = $field . " "x(18-length($field));
         print $field;
      }
      print "\n";
      foreach $type (@type) {
         $i++;
         if (length($type) > 40) {
            $type = substr($type,0,40);
         }
         print "    [$i] = $type\n";
      }
   }
}

############################################################################
# CHECK CODES
############################################################################

sub check_code {
   my($type,$codeset,$code,$name,$currID,$noprint) = @_;

   # Check to make sure that the code is defined.

   if (exists $Code2ID{$codeset}{$code}) {
      return _check_code_exists($type,$codeset,$code,$name,$currID);
   } else {
      return _check_code_new($type,$codeset,$code,$name,$currID,$noprint);
   }
}

sub _check_code_exists {
   my($type,$codeset,$code,$name,$currID) = @_;

   # Check the currID for the code. It must be the same as the one
   # passed in.

   my $oldID = $Code2ID{$codeset}{$code}[0];
   if ($currID != $oldID) {
      print "ERROR [$type]: ID mismatch in code: [$codeset, $name, $code, $currID != $oldID ]\n";
      return 1;
   }

   # If the name is defined, it must be the same ID. If it is not,
   # create a new alias.

   if (exists $Alias{lc($name)}) {

      my $altID = $Alias{lc($name)}[0];

      if ($currID != $altID) {
         print "ERROR [$type]: ID mismatch: [$codeset, $name, $code, $currID != $altID ]\n";
         return 1;
      }

   } else {
      push @{ $ID2Names{$currID} },$name;
      my $i = $#{ $ID2Names{$currID} };
      $Alias{lc($name)} = [ $currID, $i ];
   }

   return 0;
}

# This is a new code.
sub _check_code_new {
   my($type,$codeset,$code,$name,$newID,$noprint) = @_;

   print "INFO [$type]: New code: $codeset [$code] => $name\n"  unless ($noprint);

   # If this code's name isn't defined, create it.

   my $i;
   if (exists $Alias{lc($name)}) {
      $i = $Alias{lc($name)}[1];
   } else {
      push @{ $ID2Names{$newID} },$name;
      $i = $#{ $ID2Names{$newID} };
      $Alias{lc($name)} = [ $newID, $i ];
   }

   # This name is the canonical name for the code.

   $ID2Code{$codeset}{$newID} = $code;
   $Code2ID{$codeset}{$code}  = [ $newID, $i ];

   return 0;
}

########################################
sub get_ID {
   my($op,$type,$name,$no_create) = @_;
   my $type_alias = _type_hashes($op);

   my($currID,$i,$t);
   if (exists $Alias{lc($name)}) {
      # The element is the same name as one previously defined
      ($currID,$i) = @{ $Alias{lc($name)} };
      $t = "same";

   } elsif (exists $$type_alias{$name}) {
      # It's a new alias for an existing element
      my $c = $$type_alias{$name};
      if (! exists $Alias{lc($c)}) {
         print "WARNING [$op,$type]: alias referenced before it is defined: $name => $c\n";
         return (1);
      }
      $currID = $Alias{lc($c)}[0];
      push @{ $ID2Names{$currID} },$name;
      $i = $#{ $ID2Names{$currID} };
      $Alias{lc($name)} = [ $currID, $i ];
      $t = "alias";

   } else {
      # It's a new element.
      if ($no_create) {
         return(0,-1,-1,"new");
      }
      $currID    = $ID++;
      $i         = 0;
      $ID2Names{$currID} = [ $name ];
      $Alias{lc($name)} = [ $currID, $i ];
      $t = "new";
   }

   return(0,$currID,$i,$t);
}

########################################

sub _ascii {
   my($type,$val) = @_;

   if ($val !~ /^[[:ascii:]]*$/) {
      my $tmp = $val;
      $tmp =~ s/[[:ascii:]]//g;
      print "NON-ASCII [$type]: '$val' [$tmp]\n";
   }
}

############################################################################
# DO_ALIASES
############################################################################

sub do_aliases {
   my($caller) = @_;

   my ($type_alias) = _type_hashes($caller);

   # Add remaining aliases.

   foreach my $alias (keys %$type_alias) {
      my $type = $$type_alias{$alias};

      next  if (exists $Alias{lc($type)}  &&
                exists $Alias{lc($alias)});

      if (! exists $Alias{lc($type)}  &&
          ! exists $Alias{lc($alias)}) {
         print "WARNING: unused type in alias list: $type\n";
         print "WARNING: unused type in alias list: $alias\n";
         next;
      }

      my ($typeID);
      if (exists $Alias{lc($type)}) {
         $typeID = $Alias{lc($type)}[0];
         $type   = $alias;
      } else {
         $typeID = $Alias{lc($alias)}[0];
      }

      push @{ $ID2Names{$typeID} },$type;
      my $i = $#{ $ID2Names{$typeID} };
      $Alias{lc($type)} = [ $typeID, $i ];
   }
}

############################################################################
# WRITE_MODULE
############################################################################

sub write_module {
   my($type) = @_;

   my(%hashes) = ("id2names"  => "ID2Names",
                  "alias2id"  => "Alias",
                  "code2id"   => "Code2ID",
                  "id2code"   => "ID2Code");

   my $file = "$ModDir/${Module}_Codes.pm";

   my $out = new IO::File;
   $out->open(">$file");
   my $timestamp   = `date`;
   chomp($timestamp);

   print $out "package Locale::Codes::${Module}_Codes;

# This file was automatically generated.  Any changes to this file will
# be lost the next time 'get_codes' is run.
#    Generated on: $timestamp

$podstr

$hdstr NAME

Locale::Codes::${Module}_Codes - $type codes for the Locale::Codes::$Module module

$hdstr SYNOPSIS

This module contains data used by the Locale::Codes::$Module module. It is
not intended to be used directly, and contains no calleable routines.

$hdstr AUTHOR

See Locale::Codes for full author history.

Currently maintained by Sullivan Beck (sbeck\@cpan.org).

$hdstr COPYRIGHT

   Copyright (c) 1997-2001 Canon Research Centre Europe (CRE).
   Copyright (c) 2001-2010 Neil Bowers
   Copyright (c) 2010-2011 Sullivan Beck

This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=cut

use strict;
use warnings;
require 5.002;

our(\$VERSION);
\$VERSION='3.17';

\$Locale::Codes::Data{'$type'}{'id'} = '$ID';

";

   foreach my $h qw(id2names alias2id code2id id2code) {
      my $hash = $hashes{$h};
      print $out "\$Locale::Codes::Data{'$type'}{'$h'} = {\n";
      _write_hash($out,$hash);

      print $out "};\n\n";
   }

   print $out "1;\n";

   $out->close();
}

sub _write_hash {
   my($out,$hashname) = @_;

   no strict 'refs';
   my %hash = %$hashname;
   use strict 'refs';
   _write_subhash($out,3,\%hash);
}

sub _write_subhash {
   my($out,$indent,$hashref) = @_;

   my %hash = %$hashref;
   my $ind  = " "x$indent;

   foreach my $key (sort keys %hash) {
      my $val = $hash{$key};
      if (ref($val) eq "HASH") {
         print $out "${ind}q($key) => {\n";
         _write_subhash($out,$indent+3,$val);
         print $out "${ind}   },\n";
      } elsif (ref($val) eq "ARRAY") {
         print $out "${ind}q($key) => [\n";
         _write_sublist($out,$indent+3,$val);
         print $out "${ind}   ],\n";
      } else {
         print $out "${ind}q($key) => q($val),\n";
      }
   }
}

sub _write_sublist {
   my($out,$indent,$listref) = @_;

   my @list = @$listref;
   my $ind  = " "x$indent;

   foreach my $val (@list) {
      if (ref($val) eq "HASH") {
         print $out "${ind}{\n";
         _write_subhash($out,$indent+3,$val);
         print $out "${ind}},\n";
      } elsif (ref($val) eq "ARRAY") {
         print $out "${ind}[\n";
         _write_sublist($out,$indent+3,$val);
         print $out "${ind}],\n";
      } else {
         print $out "${ind}q($val),\n";
      }
   }
}

############################################################################
# DO_CLEAN
############################################################################

sub do_clean {
   print "Cleaning...\n";
   system("rm -f _country*");
   system("rm -f _language*");
   system("rm -f _script*");
   system("rm -f _currency*");
   system("rm -f _langext*");
   system("rm -f _langvar*");
}

############################################################################
# HANDLE CODESET
############################################################################

sub _read_file {
   my(%opts) = @_;

   #
   # Get the URL
   #

   my $file  = (caller(1))[3];
   $file     =~ s/main::do//;
   my $type  = $opts{'type'};
   $type     = 'text'  if (! $type);
   my $file2 = '';

   if ($type eq 'html') {
      $file .= ".htm";
   } elsif ($type eq 'xls') {
      $file .= ".xls";
   } elsif ($type eq 'zip') {
      $file2 = "$file.txt";
      $file .= ".zip";
   } else {
      $file .= ".txt";
   }

   my $url  = $opts{'url'};
   system("wget -N -q -O $file $url");

   #
   # Read the local file
   #

   my(@in);
   if ($type eq 'xls') {
      #
      # Read an XLS file
      #
      my $csv = $file;
      $csv    =~ s/.xls/.csv/;
      my $cmd = "xls2csv -x $file -b WINDOWS-1252 -c $csv -q";
      $cmd   .= " -a $opts{encoding}"  if ($opts{'encoding'});
      system($cmd);
      @in = `cat $csv`;
      shift(@in);
      my $in = Text::CSV::Slurp->load(string => join("",@in));
      @in = @$in;
      $opts{'as_list'} = 1;   # required

   } elsif ($type eq 'zip') {
      #
      # Read one file in a zip file
      #
      my $zip  = Archive::Zip->new($file);
      my @file = grep /$opts{'file'}/,$zip->memberNames();
      my $flag = $zip->extractMember($file[0],$file2);
      if (! defined($flag)) {
         die "ERROR [iso]: zip file changed format\n";
      }

      @in = `cat $file2`;

   } elsif ($opts{'encoding'}) {
      #
      # Read an encoded text file
      #
      open(my $in,"<:encoding($opts{encoding})",$file);
      @in = <$in>;
      close($in);

   } else {
      #
      # Read an ASCII text file
      #
      @in = `cat $file`;
   }
   chomp(@in);
   chop(@in)   if ($opts{'chop'});

   #
   # If it was encoded, make sure it's in UTF-8
   #

   if ($opts{'encoding'}  &&  $opts{'encoding'} ne 'UTF-8') {
      my $in = join("\n",@in);
      $in    = encode('UTF-8',$in);
      @in = split("\n",$in);
   }

   #
   # Strip out some problem strings.
   #

   if ($opts{'html_strip'}  ||  $opts{'html_repl'}) {
      my $in = join("\n",@in);
      strip_tags(\$in,@{ $opts{'html_strip'} })  if ($opts{'html_strip'});
      if ($opts{'html_repl'}) {
         foreach my $repl (@{ $opts{'html_repl'} }) {
            if (ref($repl)) {
               $in =~ s/$repl/ /sg;
            } else {
               $in =~ s/\Q$repl\E/ /sg;
            }
         }
         $in =~ s/\s+/ /sg;
      }
      @in = split("\n",$in);
   }


   #
   # Return the contents of the file as a list or a string.
   #

   if ($opts{'as_list'}) {
      return \@in;
   } else {
      return join(" ",@in);
   }
}

############################################################################
# HTML SCRAPING
############################################################################

sub get_row {
   my($type,$inref) = @_;

   return ()  if ($$inref !~ m,^\s*<tr,);

   if ($$inref !~ s,^(.*?)</tr[^>]*>,,) {
      die "ERROR [$type]: malformed HTML\n";
   }
   my $row = $1;

   if ($row =~ m,<table,) {
      die "ERROR [$type]: embedded table\n";
   }

   my @row;
   while ($row =~ s,(?:.*?)<(td|th)[^>]*>\s*(.*?)\s*</\1[^>]*>,,) {
      my $val = $2;
      push(@row,$val);
   }

   return @row;
}

sub jump_to_row {
   my($inref,$header) = @_;

   if ($$inref =~ s,(.*?)\Q$header\E(.*?)</tr[^>]*>(.*?)(?=<tr),,) {
      return 1;
   } else {
      return 0;
   }
}

sub jump_to_entry {
   my($inref,$value) = @_;

   if ($$inref =~ s,(.*?)(?=<(?:td|th)[^>]*>\s*\Q$value\E\s*),,) {
      return 1;
   } else {
      return 0;
   }
}

sub jump_to_table {
   my($inref) = @_;

   if ($$inref =~ s,(.*?)(?=<table),,) {
      return 1;
   } else {
      return 0;
   }
}

sub get_entry {
   my($inref) = @_;

   if ($$inref =~ s,.*?<td[^>]*>\s*(.*?)\s*</td[^>]*>,,) {
      return $1;
   }
   return "";
}

sub strip_tags {
   my($inref,@tags) = @_;

   foreach my $tag (@tags) {
      $$inref =~ s,</?$tag[^>]*>, ,g;
   }
}

# Local Variables:
# mode: cperl
# indent-tabs-mode: nil
# cperl-indent-level: 3
# cperl-continued-statement-offset: 2
# cperl-continued-brace-offset: 0
# cperl-brace-offset: 0
# cperl-brace-imaginary-offset: 0
# cperl-label-offset: -2
# End:
