package ShiftJIS::CP932::Correct;

use strict;
use vars qw($VERSION $PACKAGE @ISA @EXPORT @EXPORT_OK);
use vars qw(%CorrCP932);
use Carp;
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(correct_cp932);

# Module version and fully qualified package name.
# $PACKAGE is used as the prefix of this module's error messages.
$VERSION = '0.03';
$PACKAGE = 'ShiftJIS::CP932::Correct';

# sprintf template for the argument-count error; %s receives the name of
# the function that was called with the wrong number of arguments.
my $Err_too = "$PACKAGE: Too many arguments for %s";

# Regex source for one Shift_JIS "token": a lead byte (0x81-0x9F or
# 0xE0-0xFC) followed by any byte, or otherwise any single byte.
# correct_cp932() uses it to split the input into characters.
# It is kept as a single-quoted string, so the \x escapes are interpreted
# by the regex engine when the string is interpolated into a pattern.
my $Schar = '(?:[\x81-\x9F\xE0-\xFC][\x00-\xFF]|[\x00-\xFF])';

# Regex source matching exactly one code point that this module treats as
# defined in CP-932.  correct_cp932() anchors it at both ends and drops
# every token that does not match.
# The groupings below follow the character sets listed in the POD
# (NOTE(review): the set names are taken from the module documentation,
# not verified against the Microsoft table here).
#
# Single bytes 0x00-0x7F and 0xA1-0xDF (191 values; the JIS X 0201 set).
my $CP932 = '(?:[\x00-\x7f\xa1-\xdf]|'
# Double-byte cells with lead bytes 0x81-0x84, 0x88-0x9F and 0xE0-0xEA
# (presumably the JIS X 0208 area); only the listed trail bytes are accepted.
        . '\x81[\x40-\x7e\x80-\xac\xb8-\xbf\xc8-\xce\xda-\xe8\xf0-\xf7\xfc]|'
        . '\x82[\x4f-\x58\x60-\x79\x81-\x9a\x9f-\xf1]|'
        . '\x83[\x40-\x7e\x80-\x96\x9f-\xb6\xbf-\xd6]|'
        . '\x84[\x40-\x60\x70-\x7e\x80-\x91\x9f-\xbe]|'
        . '\x88[\x9f-\xfc]|\x98[\x40-\x72\x9f-\xfc]|\xea[\x40-\x7e\x80-\xa4]|'
        . '[\x89-\x97\x99-\x9f\xe0-\xe9][\x40-\x7e\x80-\xfc]|'
# Lead byte 0x87: NEC special characters (the POD cites 0x8790 as one).
        . '\x87[\x40-\x5d\x5f-\x75\x7e\x80-\x9c]|'
# Lead bytes 0xED and 0xEE (presumably the NEC-selected IBM extended
# characters); note that 0xEEED and 0xEEEE are excluded.
        . '\xed[\x40-\x7e\x80-\xfc]|\xee[\x40-\x7e\x80-\xec\xef-\xfc]|'
# Lead bytes 0xFA-0xFC (presumably the IBM extended characters); 0xFC is
# only accepted with trail bytes 0x40-0x4B.
        . '[\xfa\xfb][\x40-\x7e\x80-\xfc]|\xfc[\x40-\x4b])';

# Correction table used by correct_cp932(): maps a less preferred CP-932
# double-byte code (key) to the preferred code (value).  Both keys and
# values are raw two-byte strings.  Characters that are not keys of this
# table are passed through unchanged by correct_cp932().
# According to the POD, the data follows Microsoft PRB Q170559.
%CorrCP932 = (
# Lead byte 0x87 (NEC special characters per the POD) -> lead byte 0x81.
"\x87\x90" => "\x81\xe0",
"\x87\x91" => "\x81\xdf",
"\x87\x92" => "\x81\xe7",
"\x87\x95" => "\x81\xe3",
"\x87\x96" => "\x81\xdb",
"\x87\x97" => "\x81\xda",
"\x87\x9a" => "\x81\xe6",
"\x87\x9b" => "\x81\xbf",
"\x87\x9c" => "\x81\xbe",
# 0xED40-0xEDFC -> 0xFA5C-0xFB5B, in order (trail byte 0x7F is never used
# on either side, so both sequences skip it).
"\xed\x40" => "\xfa\x5c",
"\xed\x41" => "\xfa\x5d",
"\xed\x42" => "\xfa\x5e",
"\xed\x43" => "\xfa\x5f",
"\xed\x44" => "\xfa\x60",
"\xed\x45" => "\xfa\x61",
"\xed\x46" => "\xfa\x62",
"\xed\x47" => "\xfa\x63",
"\xed\x48" => "\xfa\x64",
"\xed\x49" => "\xfa\x65",
"\xed\x4a" => "\xfa\x66",
"\xed\x4b" => "\xfa\x67",
"\xed\x4c" => "\xfa\x68",
"\xed\x4d" => "\xfa\x69",
"\xed\x4e" => "\xfa\x6a",
"\xed\x4f" => "\xfa\x6b",
"\xed\x50" => "\xfa\x6c",
"\xed\x51" => "\xfa\x6d",
"\xed\x52" => "\xfa\x6e",
"\xed\x53" => "\xfa\x6f",
"\xed\x54" => "\xfa\x70",
"\xed\x55" => "\xfa\x71",
"\xed\x56" => "\xfa\x72",
"\xed\x57" => "\xfa\x73",
"\xed\x58" => "\xfa\x74",
"\xed\x59" => "\xfa\x75",
"\xed\x5a" => "\xfa\x76",
"\xed\x5b" => "\xfa\x77",
"\xed\x5c" => "\xfa\x78",
"\xed\x5d" => "\xfa\x79",
"\xed\x5e" => "\xfa\x7a",
"\xed\x5f" => "\xfa\x7b",
"\xed\x60" => "\xfa\x7c",
"\xed\x61" => "\xfa\x7d",
"\xed\x62" => "\xfa\x7e",
"\xed\x63" => "\xfa\x80",
"\xed\x64" => "\xfa\x81",
"\xed\x65" => "\xfa\x82",
"\xed\x66" => "\xfa\x83",
"\xed\x67" => "\xfa\x84",
"\xed\x68" => "\xfa\x85",
"\xed\x69" => "\xfa\x86",
"\xed\x6a" => "\xfa\x87",
"\xed\x6b" => "\xfa\x88",
"\xed\x6c" => "\xfa\x89",
"\xed\x6d" => "\xfa\x8a",
"\xed\x6e" => "\xfa\x8b",
"\xed\x6f" => "\xfa\x8c",
"\xed\x70" => "\xfa\x8d",
"\xed\x71" => "\xfa\x8e",
"\xed\x72" => "\xfa\x8f",
"\xed\x73" => "\xfa\x90",
"\xed\x74" => "\xfa\x91",
"\xed\x75" => "\xfa\x92",
"\xed\x76" => "\xfa\x93",
"\xed\x77" => "\xfa\x94",
"\xed\x78" => "\xfa\x95",
"\xed\x79" => "\xfa\x96",
"\xed\x7a" => "\xfa\x97",
"\xed\x7b" => "\xfa\x98",
"\xed\x7c" => "\xfa\x99",
"\xed\x7d" => "\xfa\x9a",
"\xed\x7e" => "\xfa\x9b",
"\xed\x80" => "\xfa\x9c",
"\xed\x81" => "\xfa\x9d",
"\xed\x82" => "\xfa\x9e",
"\xed\x83" => "\xfa\x9f",
"\xed\x84" => "\xfa\xa0",
"\xed\x85" => "\xfa\xa1",
"\xed\x86" => "\xfa\xa2",
"\xed\x87" => "\xfa\xa3",
"\xed\x88" => "\xfa\xa4",
"\xed\x89" => "\xfa\xa5",
"\xed\x8a" => "\xfa\xa6",
"\xed\x8b" => "\xfa\xa7",
"\xed\x8c" => "\xfa\xa8",
"\xed\x8d" => "\xfa\xa9",
"\xed\x8e" => "\xfa\xaa",
"\xed\x8f" => "\xfa\xab",
"\xed\x90" => "\xfa\xac",
"\xed\x91" => "\xfa\xad",
"\xed\x92" => "\xfa\xae",
"\xed\x93" => "\xfa\xaf",
"\xed\x94" => "\xfa\xb0",
"\xed\x95" => "\xfa\xb1",
"\xed\x96" => "\xfa\xb2",
"\xed\x97" => "\xfa\xb3",
"\xed\x98" => "\xfa\xb4",
"\xed\x99" => "\xfa\xb5",
"\xed\x9a" => "\xfa\xb6",
"\xed\x9b" => "\xfa\xb7",
"\xed\x9c" => "\xfa\xb8",
"\xed\x9d" => "\xfa\xb9",
"\xed\x9e" => "\xfa\xba",
"\xed\x9f" => "\xfa\xbb",
"\xed\xa0" => "\xfa\xbc",
"\xed\xa1" => "\xfa\xbd",
"\xed\xa2" => "\xfa\xbe",
"\xed\xa3" => "\xfa\xbf",
"\xed\xa4" => "\xfa\xc0",
"\xed\xa5" => "\xfa\xc1",
"\xed\xa6" => "\xfa\xc2",
"\xed\xa7" => "\xfa\xc3",
"\xed\xa8" => "\xfa\xc4",
"\xed\xa9" => "\xfa\xc5",
"\xed\xaa" => "\xfa\xc6",
"\xed\xab" => "\xfa\xc7",
"\xed\xac" => "\xfa\xc8",
"\xed\xad" => "\xfa\xc9",
"\xed\xae" => "\xfa\xca",
"\xed\xaf" => "\xfa\xcb",
"\xed\xb0" => "\xfa\xcc",
"\xed\xb1" => "\xfa\xcd",
"\xed\xb2" => "\xfa\xce",
"\xed\xb3" => "\xfa\xcf",
"\xed\xb4" => "\xfa\xd0",
"\xed\xb5" => "\xfa\xd1",
"\xed\xb6" => "\xfa\xd2",
"\xed\xb7" => "\xfa\xd3",
"\xed\xb8" => "\xfa\xd4",
"\xed\xb9" => "\xfa\xd5",
"\xed\xba" => "\xfa\xd6",
"\xed\xbb" => "\xfa\xd7",
"\xed\xbc" => "\xfa\xd8",
"\xed\xbd" => "\xfa\xd9",
"\xed\xbe" => "\xfa\xda",
"\xed\xbf" => "\xfa\xdb",
"\xed\xc0" => "\xfa\xdc",
"\xed\xc1" => "\xfa\xdd",
"\xed\xc2" => "\xfa\xde",
"\xed\xc3" => "\xfa\xdf",
"\xed\xc4" => "\xfa\xe0",
"\xed\xc5" => "\xfa\xe1",
"\xed\xc6" => "\xfa\xe2",
"\xed\xc7" => "\xfa\xe3",
"\xed\xc8" => "\xfa\xe4",
"\xed\xc9" => "\xfa\xe5",
"\xed\xca" => "\xfa\xe6",
"\xed\xcb" => "\xfa\xe7",
"\xed\xcc" => "\xfa\xe8",
"\xed\xcd" => "\xfa\xe9",
"\xed\xce" => "\xfa\xea",
"\xed\xcf" => "\xfa\xeb",
"\xed\xd0" => "\xfa\xec",
"\xed\xd1" => "\xfa\xed",
"\xed\xd2" => "\xfa\xee",
"\xed\xd3" => "\xfa\xef",
"\xed\xd4" => "\xfa\xf0",
"\xed\xd5" => "\xfa\xf1",
"\xed\xd6" => "\xfa\xf2",
"\xed\xd7" => "\xfa\xf3",
"\xed\xd8" => "\xfa\xf4",
"\xed\xd9" => "\xfa\xf5",
"\xed\xda" => "\xfa\xf6",
"\xed\xdb" => "\xfa\xf7",
"\xed\xdc" => "\xfa\xf8",
"\xed\xdd" => "\xfa\xf9",
"\xed\xde" => "\xfa\xfa",
"\xed\xdf" => "\xfa\xfb",
"\xed\xe0" => "\xfa\xfc",
"\xed\xe1" => "\xfb\x40",
"\xed\xe2" => "\xfb\x41",
"\xed\xe3" => "\xfb\x42",
"\xed\xe4" => "\xfb\x43",
"\xed\xe5" => "\xfb\x44",
"\xed\xe6" => "\xfb\x45",
"\xed\xe7" => "\xfb\x46",
"\xed\xe8" => "\xfb\x47",
"\xed\xe9" => "\xfb\x48",
"\xed\xea" => "\xfb\x49",
"\xed\xeb" => "\xfb\x4a",
"\xed\xec" => "\xfb\x4b",
"\xed\xed" => "\xfb\x4c",
"\xed\xee" => "\xfb\x4d",
"\xed\xef" => "\xfb\x4e",
"\xed\xf0" => "\xfb\x4f",
"\xed\xf1" => "\xfb\x50",
"\xed\xf2" => "\xfb\x51",
"\xed\xf3" => "\xfb\x52",
"\xed\xf4" => "\xfb\x53",
"\xed\xf5" => "\xfb\x54",
"\xed\xf6" => "\xfb\x55",
"\xed\xf7" => "\xfb\x56",
"\xed\xf8" => "\xfb\x57",
"\xed\xf9" => "\xfb\x58",
"\xed\xfa" => "\xfb\x59",
"\xed\xfb" => "\xfb\x5a",
"\xed\xfc" => "\xfb\x5b",
# 0xEE40-0xEEEC -> 0xFB5C-0xFC4B, continuing the same sequence.
"\xee\x40" => "\xfb\x5c",
"\xee\x41" => "\xfb\x5d",
"\xee\x42" => "\xfb\x5e",
"\xee\x43" => "\xfb\x5f",
"\xee\x44" => "\xfb\x60",
"\xee\x45" => "\xfb\x61",
"\xee\x46" => "\xfb\x62",
"\xee\x47" => "\xfb\x63",
"\xee\x48" => "\xfb\x64",
"\xee\x49" => "\xfb\x65",
"\xee\x4a" => "\xfb\x66",
"\xee\x4b" => "\xfb\x67",
"\xee\x4c" => "\xfb\x68",
"\xee\x4d" => "\xfb\x69",
"\xee\x4e" => "\xfb\x6a",
"\xee\x4f" => "\xfb\x6b",
"\xee\x50" => "\xfb\x6c",
"\xee\x51" => "\xfb\x6d",
"\xee\x52" => "\xfb\x6e",
"\xee\x53" => "\xfb\x6f",
"\xee\x54" => "\xfb\x70",
"\xee\x55" => "\xfb\x71",
"\xee\x56" => "\xfb\x72",
"\xee\x57" => "\xfb\x73",
"\xee\x58" => "\xfb\x74",
"\xee\x59" => "\xfb\x75",
"\xee\x5a" => "\xfb\x76",
"\xee\x5b" => "\xfb\x77",
"\xee\x5c" => "\xfb\x78",
"\xee\x5d" => "\xfb\x79",
"\xee\x5e" => "\xfb\x7a",
"\xee\x5f" => "\xfb\x7b",
"\xee\x60" => "\xfb\x7c",
"\xee\x61" => "\xfb\x7d",
"\xee\x62" => "\xfb\x7e",
"\xee\x63" => "\xfb\x80",
"\xee\x64" => "\xfb\x81",
"\xee\x65" => "\xfb\x82",
"\xee\x66" => "\xfb\x83",
"\xee\x67" => "\xfb\x84",
"\xee\x68" => "\xfb\x85",
"\xee\x69" => "\xfb\x86",
"\xee\x6a" => "\xfb\x87",
"\xee\x6b" => "\xfb\x88",
"\xee\x6c" => "\xfb\x89",
"\xee\x6d" => "\xfb\x8a",
"\xee\x6e" => "\xfb\x8b",
"\xee\x6f" => "\xfb\x8c",
"\xee\x70" => "\xfb\x8d",
"\xee\x71" => "\xfb\x8e",
"\xee\x72" => "\xfb\x8f",
"\xee\x73" => "\xfb\x90",
"\xee\x74" => "\xfb\x91",
"\xee\x75" => "\xfb\x92",
"\xee\x76" => "\xfb\x93",
"\xee\x77" => "\xfb\x94",
"\xee\x78" => "\xfb\x95",
"\xee\x79" => "\xfb\x96",
"\xee\x7a" => "\xfb\x97",
"\xee\x7b" => "\xfb\x98",
"\xee\x7c" => "\xfb\x99",
"\xee\x7d" => "\xfb\x9a",
"\xee\x7e" => "\xfb\x9b",
"\xee\x80" => "\xfb\x9c",
"\xee\x81" => "\xfb\x9d",
"\xee\x82" => "\xfb\x9e",
"\xee\x83" => "\xfb\x9f",
"\xee\x84" => "\xfb\xa0",
"\xee\x85" => "\xfb\xa1",
"\xee\x86" => "\xfb\xa2",
"\xee\x87" => "\xfb\xa3",
"\xee\x88" => "\xfb\xa4",
"\xee\x89" => "\xfb\xa5",
"\xee\x8a" => "\xfb\xa6",
"\xee\x8b" => "\xfb\xa7",
"\xee\x8c" => "\xfb\xa8",
"\xee\x8d" => "\xfb\xa9",
"\xee\x8e" => "\xfb\xaa",
"\xee\x8f" => "\xfb\xab",
"\xee\x90" => "\xfb\xac",
"\xee\x91" => "\xfb\xad",
"\xee\x92" => "\xfb\xae",
"\xee\x93" => "\xfb\xaf",
"\xee\x94" => "\xfb\xb0",
"\xee\x95" => "\xfb\xb1",
"\xee\x96" => "\xfb\xb2",
"\xee\x97" => "\xfb\xb3",
"\xee\x98" => "\xfb\xb4",
"\xee\x99" => "\xfb\xb5",
"\xee\x9a" => "\xfb\xb6",
"\xee\x9b" => "\xfb\xb7",
"\xee\x9c" => "\xfb\xb8",
"\xee\x9d" => "\xfb\xb9",
"\xee\x9e" => "\xfb\xba",
"\xee\x9f" => "\xfb\xbb",
"\xee\xa0" => "\xfb\xbc",
"\xee\xa1" => "\xfb\xbd",
"\xee\xa2" => "\xfb\xbe",
"\xee\xa3" => "\xfb\xbf",
"\xee\xa4" => "\xfb\xc0",
"\xee\xa5" => "\xfb\xc1",
"\xee\xa6" => "\xfb\xc2",
"\xee\xa7" => "\xfb\xc3",
"\xee\xa8" => "\xfb\xc4",
"\xee\xa9" => "\xfb\xc5",
"\xee\xaa" => "\xfb\xc6",
"\xee\xab" => "\xfb\xc7",
"\xee\xac" => "\xfb\xc8",
"\xee\xad" => "\xfb\xc9",
"\xee\xae" => "\xfb\xca",
"\xee\xaf" => "\xfb\xcb",
"\xee\xb0" => "\xfb\xcc",
"\xee\xb1" => "\xfb\xcd",
"\xee\xb2" => "\xfb\xce",
"\xee\xb3" => "\xfb\xcf",
"\xee\xb4" => "\xfb\xd0",
"\xee\xb5" => "\xfb\xd1",
"\xee\xb6" => "\xfb\xd2",
"\xee\xb7" => "\xfb\xd3",
"\xee\xb8" => "\xfb\xd4",
"\xee\xb9" => "\xfb\xd5",
"\xee\xba" => "\xfb\xd6",
"\xee\xbb" => "\xfb\xd7",
"\xee\xbc" => "\xfb\xd8",
"\xee\xbd" => "\xfb\xd9",
"\xee\xbe" => "\xfb\xda",
"\xee\xbf" => "\xfb\xdb",
"\xee\xc0" => "\xfb\xdc",
"\xee\xc1" => "\xfb\xdd",
"\xee\xc2" => "\xfb\xde",
"\xee\xc3" => "\xfb\xdf",
"\xee\xc4" => "\xfb\xe0",
"\xee\xc5" => "\xfb\xe1",
"\xee\xc6" => "\xfb\xe2",
"\xee\xc7" => "\xfb\xe3",
"\xee\xc8" => "\xfb\xe4",
"\xee\xc9" => "\xfb\xe5",
"\xee\xca" => "\xfb\xe6",
"\xee\xcb" => "\xfb\xe7",
"\xee\xcc" => "\xfb\xe8",
"\xee\xcd" => "\xfb\xe9",
"\xee\xce" => "\xfb\xea",
"\xee\xcf" => "\xfb\xeb",
"\xee\xd0" => "\xfb\xec",
"\xee\xd1" => "\xfb\xed",
"\xee\xd2" => "\xfb\xee",
"\xee\xd3" => "\xfb\xef",
"\xee\xd4" => "\xfb\xf0",
"\xee\xd5" => "\xfb\xf1",
"\xee\xd6" => "\xfb\xf2",
"\xee\xd7" => "\xfb\xf3",
"\xee\xd8" => "\xfb\xf4",
"\xee\xd9" => "\xfb\xf5",
"\xee\xda" => "\xfb\xf6",
"\xee\xdb" => "\xfb\xf7",
"\xee\xdc" => "\xfb\xf8",
"\xee\xdd" => "\xfb\xf9",
"\xee\xde" => "\xfb\xfa",
"\xee\xdf" => "\xfb\xfb",
"\xee\xe0" => "\xfb\xfc",
"\xee\xe1" => "\xfc\x40",
"\xee\xe2" => "\xfc\x41",
"\xee\xe3" => "\xfc\x42",
"\xee\xe4" => "\xfc\x43",
"\xee\xe5" => "\xfc\x44",
"\xee\xe6" => "\xfc\x45",
"\xee\xe7" => "\xfc\x46",
"\xee\xe8" => "\xfc\x47",
"\xee\xe9" => "\xfc\x48",
"\xee\xea" => "\xfc\x49",
"\xee\xeb" => "\xfc\x4a",
"\xee\xec" => "\xfc\x4b",
# 0xEEEF-0xEEFC -> 0xFA40-0xFA49 and 0xFA55-0xFA57, except 0xEEF9 which
# goes to 0x81CA (0xEEED and 0xEEEE have no entry).
"\xee\xef" => "\xfa\x40",
"\xee\xf0" => "\xfa\x41",
"\xee\xf1" => "\xfa\x42",
"\xee\xf2" => "\xfa\x43",
"\xee\xf3" => "\xfa\x44",
"\xee\xf4" => "\xfa\x45",
"\xee\xf5" => "\xfa\x46",
"\xee\xf6" => "\xfa\x47",
"\xee\xf7" => "\xfa\x48",
"\xee\xf8" => "\xfa\x49",
"\xee\xf9" => "\x81\xca",
"\xee\xfa" => "\xfa\x55",
"\xee\xfb" => "\xfa\x56",
"\xee\xfc" => "\xfa\x57",
# 0xFA4A-0xFA5B entries that are themselves duplicates: they are mapped to
# codes with lead byte 0x87 or 0x81.  0xFA55-0xFA57 have no entry and so
# are left unchanged.
"\xfa\x4a" => "\x87\x54",
"\xfa\x4b" => "\x87\x55",
"\xfa\x4c" => "\x87\x56",
"\xfa\x4d" => "\x87\x57",
"\xfa\x4e" => "\x87\x58",
"\xfa\x4f" => "\x87\x59",
"\xfa\x50" => "\x87\x5a",
"\xfa\x51" => "\x87\x5b",
"\xfa\x52" => "\x87\x5c",
"\xfa\x53" => "\x87\x5d",
"\xfa\x54" => "\x81\xca",
"\xfa\x58" => "\x87\x8a",
"\xfa\x59" => "\x87\x82",
"\xfa\x5a" => "\x87\x84",
"\xfa\x5b" => "\x81\xe6",
);

# correct_cp932(STRING)
#
# Returns a copy of STRING, a CP-932 byte string, in which
#   * every character that is a key of %CorrCP932 is replaced by the
#     preferred code stored as its value,
#   * every token that is not a defined CP-932 character (does not match
#     $CP932) is deleted, and
#   * all other characters are copied unchanged.
#
# Always returns a defined string for a defined argument: an empty input,
# or an input made up only of undefined code points, yields '' (it used to
# yield undef).  An undef argument is passed through as undef, as before.
#
# Croaks unless called with exactly one argument.
sub correct_cp932 {
  if (@_ != 1) {
    croak sprintf $Err_too, 'correct_cp932' if @_ > 1;
    # Zero arguments used to be reported as "Too many arguments".
    croak "$PACKAGE: Not enough arguments for correct_cp932";
  }
  my $str = shift;
  return $str unless defined $str;

  # Start from '' so that nothing-to-output cases return an empty string.
  my $result = '';

  # Split into Shift_JIS tokens first (lead byte + any byte, or one byte),
  # so that an invalid trail byte is discarded together with its lead byte.
  foreach my $char ($str =~ /$Schar/go) {
    # \A...\z requires the whole token to be a defined CP-932 character.
    next unless $char =~ /\A$CP932\z/o;
    $result .= defined $CorrCP932{$char} ? $CorrCP932{$char} : $char;
  }
  return $result;
}

1;
__END__

=head1 NAME

ShiftJIS::CP932::Correct - Corrects a string in the CP-932 encoding
 (Shift_JIS supported by MS).

=head1 SYNOPSIS

  use ShiftJIS::CP932::Correct;

  $corrected_cp932 = correct_cp932($cp932_string);

=head1 DESCRIPTION

The Microsoft Code Page 932 (CP-932) table comprises 7915 characters:

  JIS X 0201-1976 single-byte characters (191 characters),
  JIS X 0208-1990 double-byte characters (6879 characters),
  NEC special characters (83 characters from SJIS row 13),
  NEC-selected IBM extended characters (374 characters from SJIS row 89 to 92),
  and IBM extended characters (388 characters from SJIS row 115 to 119).

It contains duplicates that cannot be round-trip mapped
to and from Unicode. These duplicates are due to the characters
defined by the vendors NEC and IBM.

For example, there are two characters mapped to U+2252,
namely, 0x81e0 (JIS X 0208) and 0x8790 (NEC special character).

So some programs converting Unicode to CP-932 may carelessly
convert U+2252 to 0x8790 rather than to 0x81e0.

Such behavior is undesirable,
since NEC special characters (or other vendor-defined characters)
are less widely compatible.

This module corrects (or normalizes) such a 'wrong'
(though certainly legal) CP-932 string.

This module uses the map provided in Microsoft PRB: Conversion Problem
Between Shift-JIS and Unicode (Article ID: Q170559).

=over 4

=item C<correct_cp932(STRING)>

Corrects a CP-932 string; namely, converts the less preferred code points
of duplicates (doubly-defined characters) to the preferred ones.

Does not affect characters that can
be round-trip mapped to Unicode. Any undefined characters are deleted.

For example, converts C<\x87\x90> to C<\x81\xe0>.

=back

=head1 AUTHOR

Tomoyuki SADAHIRO

  bqw10602@nifty.com
  http://homepage1.nifty.com/nomenclator/perl/

  Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.

This program is free software; you can redistribute it and/or 
modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item 1

Microsoft PRB: Conversion Problem Between Shift-JIS and Unicode
(Article ID: Q170559)

=back

=cut
