#!/usr/bin/perl
# Script to build the LaTeX::Encode::EncodingTable module
# This script is not meant to be installed.

use strict;
use warnings;

use charnames qw();
use HTML::Entities qw(%char2entity);
use Pod::LaTeX;


# Hash of characters for which we specify explicit encodings
#
# Keys are single characters, values are the LaTeX markup to emit for them.
# Characters listed here are never overridden by the encodings gathered
# from Pod::LaTeX or by the generated accent encodings.  Values that are a
# bare command sequence ending in a letter are wrapped in braces when the
# table is written out.

my %explicit_encoding

    = ( # LaTeX special characters

        '\\'       => '\\textbackslash',    # command character
        '{'        => '\\{',                # begin group
        '}'        => '\\}',                # end group
        '%'        => '\\%',                # comment
        '#'        => '\\#',                # command parameter
        '$'        => '\\$',                # introduces math mode
        '_'        => '\\_',                # subscript
        '^'        => '\\^{ }',             # superscript
        '&'        => '\\&',                # tabbing character
        '~'        => '\\texttildelow',     # non-breaking space

        # Characters that are not set as themselves
        #
        # NOTE(review): \textacutedbl is a double acute accent and
        # \textlangle/\textrangle are angle brackets; \textquotedbl,
        # \textless and \textgreater may be what is wanted - confirm
        # against the hand-maintained table before changing.

        '"'        => '\\textacutedbl',
        '<'        => '\\textlangle',
        '>'        => '\\textrangle',


        # Other characters with explicit encodings

        chr(0x00b8) => '{\\c{}}',               # CEDILLA
        chr(0x0131) => '{\\i}',                 # LATIN SMALL LETTER DOTLESS I
        chr(0x0e3f) => '\\textbaht',            # THAI CURRENCY SYMBOL BAHT
        chr(0x2002) => '\\phantom{N}',          # EN SPACE
        chr(0x2004) => '\\hspace{.333333em}',   # THREE-PER-EM SPACE
        chr(0x2005) => '\\hspace{.25em}',       # FOUR-PER-EM SPACE
        chr(0x2006) => '\\hspace{.166666em}',   # SIX-PER-EM SPACE
        chr(0x2016) => '\\textbardbl',          # DOUBLE VERTICAL LINE
        chr(0x20a4) => '\\textlira',            # LIRA SIGN (U+20A3 is the French franc sign)
        chr(0x20a6) => '\\textnaira',           # NAIRA SIGN
        chr(0x20a9) => '\\textwon',             # WON SIGN
        chr(0x20ab) => '\\textdong',            # DONG SIGN
    );


# Map of character (not character code) to LaTeX encoding.  It starts as a
# copy of the explicit encodings and is built up by the loops below before
# being output as the encoding table.

my %char_encoding = %explicit_encoding;



# Formats for building accented characters
#
# Keys are the accent part of a Unicode character name (the text following
# "WITH" in names such as "LATIN SMALL LETTER C WITH CARON"); values are
# sprintf formats that take the base letter as their only argument.

my %accent_format = ( ACUTE        => '\\\'%s',
                      BREVE        => '\\u{%s}',
                      CARON        => '\\v{%s}',
                      CEDILLA      => '\\c{%s}',
                      CIRCUMFLEX   => '\\^%s',
                      'DOT ABOVE'  => '\\.{%s}',
                      GRAVE        => '\\`%s',
                      TILDE        => '\\~%s',
                      DIAERESIS    => '\\"%s',     # Unicode names spell it "DIAERESIS"
    );


# Comments to intersperse in the encoding table
#
# Keys are code point thresholds; the value is the section heading printed
# before the first table entry whose code point reaches the threshold.
# Headings are Unicode block names and, apart from the first, thresholds are
# the first code point of the block (the first deliberately starts at
# U+00A0, skipping the C1 control characters).

my %comments   = ( 0x00a0 => 'C1 Controls and Latin-1 Supplement',
                   0x0100 => 'Latin Extended-A',
                   0x02b0 => 'Spacing Modifier Letters',
                   0x0370 => 'Greek and Coptic',
                   0x2000 => 'General Punctuation',
                   0x20a0 => 'Currency Symbols',
                   0x2200 => 'Mathematical Operators',
    );



# Generated module header and footer (POD is included as the data of this script) 

# Lines that open the generated module, up to and including the start of
# the %latex_encoding hash; each line is newline-terminated when joined.

my @header_lines = (
    '# LaTeX::Encode character encoding table',
    '# $Id: $',
    '# Warning this module was automatically generated',
    '',
    'package LaTeX::Encode::EncodingTable;',
    '',
    'use strict;',
    'use warnings;',
    '',
    "use parent 'Exporter';",
    '',
    'our @EXPORT = qw(%latex_encoding $encoded_char_re);',
    '',
    'our $encoded_char_re;',
    '',
    'our %latex_encoding = (',
);

my $file_header = join('', map { "$_\n" } @header_lines);

# Text that closes the generated module: the end of the %latex_encoding
# hash, the code that compiles $encoded_char_re from the hash keys, the
# module's true value and the __END__ marker that precedes the POD.
#
# These are single-quoted strings, so each '\\' below is one backslash in
# the generated code.

my $file_footer = join("\n", ( ');',
                               '',
                               'sub _compile_encoding_regexp {',
                               '    $encoded_char_re = join(\'\', sort keys %latex_encoding);',
                               # The '$' in the generated character class is
                               # written as '\$' so that it can never be taken
                               # as the start of a variable (such as $\) when
                               # the generated substitution is compiled.
                               '    $encoded_char_re =~ s{ ([#\\$\\[\\]\\\\]) }{\\\\$1}gx;',
                               '    $encoded_char_re = eval "qr{[$encoded_char_re]}x";',
                               '    return;',
                               '}',
                               '',
                               '_compile_encoding_regexp();',
                               '',
                               '',
                               '1;',
                               '',
                               '__END__',
                               '',
                       ) );




# Print out the head of the file (the generated module is written to
# standard output)

print $file_header;


# Gather the encodings defined in Pod::LaTeX
#
# Every character known to HTML::Entities is looked up, by HTML entity
# name, in Pod::LaTeX's table of escapes.  Characters with an explicit
# encoding are left alone, characters with no Pod::LaTeX encoding are
# skipped silently, and characters whose encoding is the character itself
# are skipped with a diagnostic on STDERR.

for my $char (sort keys %char2entity) {
    next if exists $explicit_encoding{$char};

    # Reduce '&name;' to 'name', the form used for the Pod::LaTeX keys
    my $entity = $char2entity{$char};
    (my $entity_name = $entity) =~ s/^&(.*);$/$1/;

    my $latex_enc = $Pod::LaTeX::HTML_Escapes{$entity_name};
    next if !defined $latex_enc;

    if ($latex_enc eq $char) {
        my $code  = ord $char;
        my $shown = ($code >= 0x20 && $code < 256) ? " '$char'" : "";
        printf(STDERR "ignoring character 0x%x%s - encoding equals character\n",
               $code, $shown);
        next;
    }

    $char_encoding{$char} = $latex_enc;
}


# Generate encodings for recognized accented characters
#
# Code points 0x100 to 0x200 that do not already have an encoding are
# examined by Unicode name; a name of the form
# "LATIN CAPITAL|SMALL LETTER x WITH accent" whose accent has a known format
# gets an encoding built from that format and the base letter.

for my $code (0x100 .. 0x200) {
    my $char = chr $code;
    next if exists $char_encoding{$char};

    my $name = charnames::viacode($code);
    next unless $name =~ /^LATIN (CAPITAL|SMALL) LETTER ([A-Z]) WITH ([\w ]+)$/;

    my ($case, $letter, $accent) = ($1, $2, $3);
    next unless exists $accent_format{$accent};

    # Character names are upper case, so small letters are lower-cased here
    my $base = $case eq 'SMALL' ? lc $letter : $letter;
    $char_encoding{$char} = sprintf($accent_format{$accent}, $base);
}


# Output the encoding table, one entry per character in character order

for my $char (sort { $a cmp $b } keys %char_encoding) {
    my $code     = ord $char;
    my $entity   = $char2entity{$char};
    my $encoding = $char_encoding{$char};

    # Start a new section of the table when a section heading is due

    my $heading = comment_for_code($code);
    print "\n    # $heading\n\n" if $heading;

    # Escape backslashes and single quotes so that the encoding can be
    # written as a Perl single-quoted string, then wrap an encoding that
    # is a command sequence ending in a letter (optionally followed by
    # an empty group) in braces.

    $encoding =~ s{\\}{\\\\}g;
    $encoding =~ s{'}{\\'}g;
    $encoding =~ s/^(\\.*[a-z])(?:\{\})?$/{$1}/i;

    # Each entry is commented with the Unicode character name and, where
    # there is one, the HTML entity.

    my $key_text    = sprintf("chr(0x%04x)", $code);
    my $value_text  = sprintf("'%s',", $encoding || '<undef>');
    my $entity_note = $entity ? " ($entity)" : '';

    my $entry = sprintf("    %-11s => %-26s # %-44s%s",
                        $key_text,
                        $value_text,
                        charnames::viacode($code),
                        $entity_note);

    $entry =~ s/\s*$//;
    print "$entry\n";
}


# Print out the tail of the file - the footer and then the POD taken from the data section
# of this file
#
# Each data line has its leading '#' and at most one following space or tab
# removed.  Only horizontal whitespace is stripped: \s would also match the
# newline of a bare '#' line and so delete the blank line that separates
# POD paragraphs.

print $file_footer;

while (my $line = <DATA>) {
    $line =~ s/^#[ \t]?//;
    print $line;
}

exit(0);



# Return the section heading that is due before the table entry for $code.
#
# Every threshold in %comments that $code has reached is removed from the
# hash, so each heading is returned at most once, and the heading for the
# highest such threshold is returned.  Sections that contain no characters
# are thereby skipped instead of having their heading attached to a later
# section's entries.  Thresholds are compared numerically.
#
# Returns nothing (undef in scalar context) when no heading is due.

sub comment_for_code {
    my ($code) = @_;

    my $comment;
    foreach my $key (sort { $a <=> $b } keys %comments) {
        last if $key > $code;
        $comment = delete $comments{$key};
    }

    return unless defined $comment;
    return $comment;
}
        



=head1 NAME

build-character-table

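=head1 SYNOPSIS

    perl build-character-table > EncodingTable.pm

=head1 DESCRIPTION

This is a script to rebuild the C<LaTeX::Encode::EncodingTable> module.
The source of the generated module is written to standard output;
diagnostics about characters that are ignored are written to standard
error.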


=head1 AUTHOR

Andrew Ford E<lt>a.ford@ford-mason.co.ukE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2007 Andrew Ford.  All Rights Reserved.

This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=cut

__DATA__

# =head1 NAME
# 
# LaTeX::Encode::EncodingTable - character encoding table for LaTeX::Encode
# 
# =head1 SYNOPSIS
# 
# This module is not intended to be used except by LaTeX::Encode
# 
# =head1 DESCRIPTION
# 
# This module contains the C<%latex_encoding> table, which is used in
# the C<LaTeX::Encode> module in the C<latex_encode()> function.  The
# table is maintained by hand, but there is a script
# C<build-character-table> in the C<LaTeX-Encode> distribution, that
# will build a version of the table based on information in the
# C<HTML::Entities> and C<Pod::LaTeX> modules.  Differences between the
# generated and the maintained versions are integrated manually.
# 
# =head1 SUBROUTINES/METHODS
# 
# Not applicable.
# 
# =head1 DIAGNOSTICS
# 
# Not applicable.
# 
# =head1 CONFIGURATION AND ENVIRONMENT
# 
# Not applicable.
# 
# =head1 DEPENDENCIES
# 
# The C<HTML::Entities> and C<Pod::LaTeX> modules were used for building
# the encoding table in C<LaTeX::Encode::EncodingTable>, but this is not
# rebuilt at installation time.
# 
# 
# =head1 INCOMPATIBILITIES
# 
# Not applicable.
# 
# =head1 BUGS AND LIMITATIONS
# 
# Not all LaTeX special characters are included in the encoding tables
# (more may be added when I track down the definitions).
# 
# =head1 AUTHOR
# 
# Andrew Ford E<lt>a.ford@ford-mason.co.ukE<gt>
# 
# =head1 LICENSE AND COPYRIGHT
# 
# Copyright (C) 2007 Andrew Ford.  All Rights Reserved.
# 
# This module is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
# 
# This software is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# 
# =head1 SEE ALSO
# 
# L<HTML::Entities>, L<Pod::LaTeX>
# 
# =cut
# 
## Local Variables:
## mode: perl
## perl-indent-level: 4
## indent-tabs-mode: nil
## End:
##
## vim: expandtab shiftwidth=4:
