/*  GNU Moe - My Own Editor
    Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
    2014 Antonio Diaz Diaz.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include <cctype>
#include <string>

#include "encoding.h"
#include "iso_8859.h"


namespace Encoding {

// Returns the value (0 to 63) of character 'ch' in the Base64 alphabet
// (RFC 3548), or -1 if 'ch' does not belong to the alphabet (the padding
// character '=' is not part of it).
// The value is the position of 'ch' in a literal copy of the alphabet,
// so the result does not depend on the execution character set.
//
int base64_value( const unsigned char ch )
  {
  static const char alphabet[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

  // Compare against the 64 alphabet characters only, never against the
  // terminating 0 of the literal.
  for( int value = 0; value < 64; ++value )
    if( ch == static_cast< unsigned char >( alphabet[value] ) ) return value;
  return -1;
  }


// Maps the UCS code 'code' to a single byte.
// Codes 0 to 255 are returned unchanged. Some codes above 255 are
// replaced by a fixed byte or by a similar looking ASCII character.
// Returns -1 if 'code' is negative or has no single byte replacement.
//
int map_to_byte( const int code )
  {
  struct Entry { int ucs; int byte; };
  static const Entry exact[] =
    {
    { 0x0160, 0xA6 },	// latin capital letter s with caron
    { 0x0161, 0xA8 },	// latin small letter s with caron
    { 0x0178, 0xBE },	// latin capital letter y with diaeresis
    { 0x017D, 0xB4 },	// latin capital letter z with caron
    { 0x017E, 0xB8 },	// latin small letter z with caron
    { 0x2022, 0xB7 },	// bullet
    { 0x2024, '.' },
    { 0x2035, '`' },
    { 0x2039, '<' },
    { 0x203A, '>' },
    { 0x2044, '/' },
    { 0x204A, '&' },
    { 0x204B, 0xB6 },	// reversed pilcrow sign
    { 0x204E, '*' },
    { 0x204F, ';' },
    { 0x2052, '%' },
    { 0x2053, '~' },
    { 0x20AC, 0xA4 }	// euro sign
    };
  const int entries = sizeof exact / sizeof exact[0];

  if( code < 0 ) return -1;
  if( code <= 0xFF ) return code;

  // One to one replacements.
  for( int k = 0; k < entries; ++k )
    if( exact[k].ucs == code ) return exact[k].byte;

  // Groups of codes sharing the same replacement character.
  const bool to_space =
    ( code >= 0x2000 && code <= 0x200B ) || code == 0x202F || code == 0x205F;
  const bool to_hyphen = ( code >= 0x2010 && code <= 0x2013 ) || code == 0x2043;
  const bool to_apostrophe =
    ( code >= 0x2018 && code <= 0x201B ) || code == 0x2032;
  const bool to_quote =
    ( code >= 0x201C && code <= 0x201F ) || code == 0x2033 || code == 0x2036;

  if( to_space ) return ' ';
  if( to_hyphen ) return '-';
  if( to_apostrophe ) return '\'';
  if( to_quote ) return '"';
  return -1;
  }


// Maps the UCS code 'code' to a replacement made of several ASCII
// characters. Returns a pointer to a constant string, or 0 if 'code'
// has no such replacement.
//
const char * map_to_string( const int code )
  {
  struct Entry { int ucs; const char * text; };
  static const Entry table[] =
    {
    { 0x0152, "OE" },
    { 0x0153, "oe" },
    { 0x2014, "--" },
    { 0x2015, "--" },
    { 0x2025, ".." },
    { 0x2026, "..." },
    { 0x2034, "'''" },
    { 0x2037, "```" },
    { 0x203C, "!!" },
    { 0x2047, "??" },
    { 0x2048, "?!" },
    { 0x2049, "!?" },
    { 0x2057, "''''" }
    };
  const int entries = sizeof table / sizeof table[0];

  for( int k = 0; k < entries; ++k )
    if( table[k].ucs == code ) return table[k].text;
  return 0;
  }


// Encodes the UCS code 'code' as a 0-terminated UTF-8 sequence of 1 to
// 6 bytes. Returns a pointer to a static buffer that is overwritten by
// every call. The returned string is empty if 'code' is invalid
// (negative or larger than 0x7FFFFFFF) and also if 'code' is 0.
//
const char * ucs_to_utf8( const int code )
  {
  static char buf[7];			// up to 6 bytes plus terminating 0

  if( code < 0 || code > 0x7FFFFFFF ) { buf[0] = 0; return buf; } // invalid
  if( code < 0x80 ) { buf[0] = code; buf[1] = 0; return buf; }	  // ascii

  // limit[k] is the first code too large for a sequence of k + 2 bytes.
  static const int limit[4] = { 0x800, 0x10000, 0x200000, 0x4000000 };
  // prefix[k] is the marker of the first byte of a k + 2 byte sequence:
  // 110X XXXX, 1110 XXXX, 1111 0XXX, 1111 10XX, 1111 110X
  static const int prefix[5] = { 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

  int k = 0;
  while( k < 4 && code >= limit[k] ) ++k;
  const int len = k + 2;

  // Fill the continuation bytes from last to first, 6 bits at a time;
  // the bits left over go into the first byte.
  int rest = code;
  buf[len] = 0;
  for( int pos = len - 1; pos > 0; --pos, rest >>= 6 )
    buf[pos] = 0x80 | ( rest & 0x3F );		// 10XX XXXX
  buf[0] = prefix[k] | rest;
  return buf;
  }

} // end namespace Encoding


/* Encode the bytes of input string 'in' as base64 into output string
   'out', using the RFC 3548 alphabet and '=' padding. No line breaks
   are inserted. Any previous contents of 'out' are discarded.
*/
void Encoding::base64_encode( const std::string & in, std::string & out )
  {
  static const unsigned char b64str[65] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  typedef std::string::size_type size_type;

  out.clear();
  // The size is read after clearing 'out' so that passing the same
  // string as 'in' and 'out' produces an empty result, never stale reads.
  const size_type size = in.size();
  // Every group of up to 3 input bytes produces exactly 4 characters.
  out.reserve( ( size / 3 + ( size % 3 != 0 ) ) * 4 );

  // The index has the size type of the string; a narrower index would
  // wrap around on very large inputs and the loop would never finish.
  for( size_type i = 0; i < size; i += 3 )
    {
    const bool s1 = ( size - i > 1 );		// group has a second byte
    const bool s2 = ( size - i > 2 );		// group has a third byte
    const unsigned char c0 = in[i];
    const unsigned char c1 = s1 ? in[i+1] : 0;
    const unsigned char c2 = s2 ? in[i+2] : 0;
    out += static_cast< char >( b64str[(c0 >> 2) & 0x3f] );
    out += static_cast< char >( b64str[((c0 << 4) + (c1 >> 4)) & 0x3f] );
    // Missing input bytes are signaled with '=' padding.
    out += s1 ? static_cast< char >( b64str[((c1 << 2) + (c2 >> 6)) & 0x3f] )
              : '=';
    out += s2 ? static_cast< char >( b64str[c2 & 0x3f] ) : '=';
    }
  }


/* Decode base64 encoded input string 'in' to output string 'out'.
   Return true if the whole input was valid base64 data, made of
   complete groups of 4 characters with '=' padding accepted only at
   the end of the last group.
   Decoding stops at the first group containing an invalid character
   (or at a trailing incomplete group) and false is returned; 'out' may
   then also hold the bytes decoded from the beginning of that group.
   If 'idxp' is not null, *idxp receives the index where decoding
   stopped: the start of the offending group, or the size of 'in' on
   success. Line terminators are invalid characters, so they must be
   removed from the input before calling this function.
*/
bool Encoding::base64_decode( const std::string & in, std::string & out,
                              int * const idxp )
  {
  out.clear();

  unsigned group = 0;			// index of the current group
  while( group + 3 < in.size() )
    {
    const bool last_group = ( group + 4 == in.size() );
    const char c2 = in[group+2];
    const char c3 = in[group+3];

    // The first two characters always carry data.
    const int v0 = base64_value( in[group] );
    const int v1 = base64_value( in[group+1] );
    if( v0 < 0 || v1 < 0 ) break;
    out += ( v0 << 2 ) | ( v1 >> 4 );

    if( c2 == '=' )			// "xx==", one byte in the group
      {
      if( !last_group || c3 != '=' ) break;
      group += 4; continue;
      }
    const int v2 = base64_value( c2 );
    if( v2 < 0 ) break;
    out += ( ( v1 << 4 ) & 0xf0 ) | ( v2 >> 2 );

    if( c3 == '=' )			// "xxx=", two bytes in the group
      {
      if( !last_group ) break;
      group += 4; continue;
      }
    const int v3 = base64_value( c3 );
    if( v3 < 0 ) break;
    out += ( ( v2 << 6 ) & 0xc0 ) | v3;
    group += 4;
    }

  if( idxp ) *idxp = group;
  return ( group == in.size() );
  }


/* Decode quoted-printable encoded input string 'in' to output string 'out'.
   Return true if decoding was successful, i.e. if the input was valid
   quoted-printable data.
   A '=' followed by a newline, or by a carriage return and a newline,
   is a soft line break and is removed. A '=' that is the very last
   character of the input is silently dropped. Any other '=' must be
   followed by two hexadecimal digits accepted by ISO_8859::xtoi;
   lower case letters are rejected.
   As soon as an invalid sequence is found, decoding is stopped and
   false is returned. If 'idxp' is not null, *idxp receives the index
   where decoding stopped: the '=' starting the invalid sequence, or
   the size of 'in' on success.
*/
bool Encoding::quoted_printable_decode( const std::string & in,
                                        std::string & out, int * const idxp )
  {
  out.clear();

  unsigned pos = 0;
  while( pos < in.size() )
    {
    const unsigned char cur = in[pos];
    if( cur != '=' ) { out += cur; ++pos; continue; }	// literal character

    if( pos + 1 >= in.size() ) { ++pos; continue; }	// trailing '='

    const unsigned char first = in[pos+1];
    if( first == '\n' ) { pos += 2; continue; }		// soft break "=\n"

    if( pos + 2 >= in.size() ) break;			// truncated sequence
    const unsigned char second = in[pos+2];
    if( first == '\r' )
      {
      if( second != '\n' ) break;
      pos += 3; continue;				// soft break "=\r\n"
      }

    const int high = ISO_8859::xtoi( first );
    const int low = ISO_8859::xtoi( second );
    const bool valid = ( high >= 0 && low >= 0 &&
                         !std::islower( first ) && !std::islower( second ) );
    if( !valid ) break;
    out += ( high << 4 ) + low;
    pos += 3;
    }

  if( idxp ) *idxp = pos;
  return ( pos == in.size() );
  }


// Returns the letter located 13 positions away from 'ch' in the 26
// letter alphabet, preserving case. Any character that is not one of
// the 52 listed letters is returned unchanged.
// Letters are found by position in a literal alphabet, so the result
// does not depend on the execution character set.
//
unsigned char Encoding::rot13( const unsigned char ch )
  {
  static const char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  static const char lower[] = "abcdefghijklmnopqrstuvwxyz";

  for( int k = 0; k < 26; ++k )
    {
    const int rotated = ( k + 13 ) % 26;
    if( ch == static_cast< unsigned char >( upper[k] ) ) return upper[rotated];
    if( ch == static_cast< unsigned char >( lower[k] ) ) return lower[rotated];
    }
  return ch;
  }


// Rotates 'ch' by 47 positions inside the range of codes 33 to 126:
// codes 33 to 79 are moved up by 47 and codes 80 to 126 are moved down
// by 47. Codes outside of the range are returned unchanged.
//
unsigned char Encoding::rot47( const unsigned char ch )
  {
  if( ch < 33 || ch > 126 ) return ch;
  return ( ch <= 79 ) ? ch + 47 : ch - 47;
  }


/* Encode 8-bit (ISO-8859) input string 'in' to UTF-8 output string 'out'.
   Return false if no encoding is needed, i.e. if the whole input is
   already accepted by utf8_to_ucs as valid UTF-8 data; 'out' is left
   empty in that case.
   Otherwise return true after encoding every byte of 'in' as the UCS
   code equal to the value of the byte.
*/
bool Encoding::utf8_encode( const std::string & in, std::string & out )
  {
  out.clear();

  // First pass: find out whether 'in' is already valid UTF-8.
  unsigned pos = 0;
  while( pos < in.size() )
    {
    int seq_len;
    if( utf8_to_ucs( in, pos, &seq_len ) < 0 ) break;
    pos += seq_len;
    }
  if( pos >= in.size() ) return false;		// nothing to do

  // Second pass: encode the whole input, byte by byte.
  for( unsigned k = 0; k < in.size(); ++k )
    out += ucs_to_utf8( static_cast< unsigned char >( in[k] ) );
  return true;
  }


// Decodes the UTF-8 character (of 1 to 6 bytes) starting at index 'i'
// of 'seq'.
// Returns the corresponding UCS code and, if 'lenp' is not null, stores
// in *lenp the number of bytes read.
// Returns -1, leaving *lenp untouched, if 'i' is out of range, if the
// sequence is invalid or truncated, or if the code is not encoded in
// the minimum possible number of bytes.
//
int Encoding::utf8_to_ucs( const std::string & seq, const unsigned i,
                           int * const lenp )
  {
  if( i >= seq.size() ) return -1;

  const unsigned char lead = seq[i];
  if( lead < 0x80 )					// plain ascii
    {
    if( lenp ) *lenp = 1;
    return lead;
    }
  if( lead < 0xC0 || lead > 0xFD ) return -1;		// invalid byte

  // Every 1 bit following the initial "11" of the first byte adds one
  // byte to the sequence and removes one payload bit from the first byte.
  int total = 2;
  unsigned char payload = 0x1F;
  for( unsigned char probe = 0x20; lead & probe; probe >>= 1 )
    { ++total; payload >>= 1; }
  int ucs = lead & payload;

  // Reading seq[seq.size()] through a const string yields 0, which is
  // not a continuation byte, so a truncated sequence is rejected here
  // before any byte beyond that position is read.
  for( int k = 1; k < total; ++k )
    {
    const unsigned char cont = seq[i+k];
    if( ( cont & 0xC0 ) != 0x80 ) return -1;		// invalid byte
    ucs = ( ucs << 6 ) | ( cont & 0x3F );
    }

  // Smallest code that really needs 'total' bytes:
  // 0x80, 0x800, 0x10000, 0x200000, 0x4000000
  const int min_code =
    ( total == 2 ) ? 0x80 : ( 0x800 << ( ( total - 3 ) * 5 ) );
  if( ucs < min_code ) return -1;			// no minimum length

  if( lenp ) *lenp = total;
  return ucs;
  }


/* Decode UTF-8 encoded input string 'in' to output string 'out'.
   Every character is replaced by the byte given by map_to_byte or,
   failing that, by the string given by map_to_string.
   Return 0 if decoding was successful, i.e. if the input was valid
   UTF-8 data made only of characters having a replacement.
   As soon as a character is found that can't be decoded, decoding is
   stopped, the index of that character is stored in *idxp (if 'idxp'
   is not null), and:
     if invalid UTF-8 data is found then -1 is returned,
     else the UCS code of the first character out of range is returned.
   *idxp is not modified on success.
*/
int Encoding::utf8_decode( const std::string & in, std::string & out,
                           int * const idxp )
  {
  out.clear();

  unsigned pos = 0;
  while( pos < in.size() )
    {
    int seq_len;
    const int ucs = utf8_to_ucs( in, pos, &seq_len );
    if( ucs >= 0 )
      {
      const int byte = map_to_byte( ucs );
      if( byte >= 0 ) { out += byte; pos += seq_len; continue; }
      const char * const text = map_to_string( ucs );
      if( text ) { out += text; pos += seq_len; continue; }
      }
    // Either invalid UTF-8 data or a character without replacement.
    if( idxp ) *idxp = pos;
    return ( ucs < 0 ) ? -1 : ucs;
    }
  return 0;
  }
