/* Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#include "udm_config.h"

#include <stdio.h>
#include <string.h>
#include "udm_uniconv.h"
#include "udm_sgml.h"

/* UTF8 RFC 2279 */

/*
 * Decode one character from the UTF-8 input [s, e) into *pwc.
 *
 * Handles the 1- to 6-byte encodings of RFC 2279.  When conv->flags has
 * UDM_RECODE_HTML_SPECIAL set and the input starts with '&', a reference
 * terminated by ';' is decoded as one character: "&#NNN;" is read as
 * decimal, "&#xHHH;" / "&#XHHH;" as hexadecimal, and anything else is
 * handed to UdmSgmlToUni().  A reference that does not yield a non-zero
 * code is not consumed; the '&' is returned as an ordinary character.
 *
 * conv - conversion state; icodes and ocodes are reset to 1, and icodes is
 *        set to the number of input bytes consumed when that is more than 1.
 * cs   - not used by this function.
 * pwc  - receives the decoded code point.
 * s    - first input byte; it is read before any length check, so the
 *        caller must supply at least one byte.
 * e    - one past the last input byte.
 *
 * Returns the number of input bytes consumed on success,
 * UDM_CHARSET_TOOFEW(0) when a multi-byte sequence is cut off by e, or one
 * of the UDM_CHARSET_ILSEQ* codes when the sequence is malformed.
 */
__C_LINK int __UDMCALL
udm_mb_wc_utf8 (UDM_CONV *conv, UDM_CHARSET *cs,int *pwc,
          const unsigned char *s,
          const unsigned char *e)
{
  unsigned char c = s[0];
  int n=e-s;
  const unsigned char *p;

  conv->icodes = conv->ocodes = 1;

  if (c < 0x80) {
    if (c == '&' && (conv->flags & UDM_RECODE_HTML_SPECIAL)) {
      /*
       * Find the terminating ';' without reading past e.  Stopping at a
       * NUL as well keeps the sscanf()/UdmSgmlToUni() calls below, which
       * work on C strings, inside the scanned range.
       */
      for (p = s + 1; p < e && *p != '\0' && *p != ';'; p++)
        ;
      if (p < e && *p == ';') {
        if (s[1] == '#') {
          /* s[2] is in range here: s[1] is '#', so the ';' is at s+2 or later. */
          if (s[2] == 'x' || s[2] == 'X') {
            unsigned int hex = 0;
            /* A failed match must not leave a stale value in *pwc. */
            if (sscanf((const char *)s + 3, "%x;", &hex) != 1) hex = 0;
            *pwc = (int) hex;
          } else {
            int dec = 0;
            if (sscanf((const char *)s + 2, "%d;", &dec) != 1) dec = 0;
            *pwc = dec;
          }
        } else {
          *pwc = UdmSgmlToUni((const char *)s + 1);
        }
        /* Consume the whole reference, including '&' and ';'. */
        if (*pwc) return conv->icodes = (p - s + 1);
      }
    }
    *pwc = c;
    return 1;
  } else if (c < 0xc2) {
    /* Continuation byte or overlong 2-byte lead (0xc0, 0xc1). */
    return UDM_CHARSET_ILSEQ;
  } else if (c < 0xe0) {
    if (n < 2) return UDM_CHARSET_TOOFEW(0);
    if (!((s[1] ^ 0x80) < 0x40)) return UDM_CHARSET_ILSEQ2;
    *pwc = ((unsigned int) (c & 0x1f) << 6) | (unsigned int) (s[1] ^ 0x80);
    return conv->icodes = 2;
  } else if (c < 0xf0) {
    if (n < 3) return UDM_CHARSET_TOOFEW(0);
    /* The last clause rejects overlong forms (0xe0 followed by < 0xa0). */
    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (c >= 0xe1 || s[1] >= 0xa0)))
      return UDM_CHARSET_ILSEQ3;
    *pwc = ((unsigned int) (c & 0x0f) << 12)
      | ((unsigned int) (s[1] ^ 0x80) << 6)
      | (unsigned int) (s[2] ^ 0x80);
    return conv->icodes = 3;
  } else if (c < 0xf8 && sizeof(unsigned int)*8 >= 32) {
    if (n < 4) return UDM_CHARSET_TOOFEW(0);
    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (c >= 0xf1 || s[1] >= 0x90)))
      return UDM_CHARSET_ILSEQ4;
    *pwc = ((unsigned int) (c & 0x07) << 18)
      | ((unsigned int) (s[1] ^ 0x80) << 12)
      | ((unsigned int) (s[2] ^ 0x80) << 6)
      | (unsigned int) (s[3] ^ 0x80);
    return conv->icodes = 4;
  } else if (c < 0xfc && sizeof(unsigned int)*8 >= 32) {
    if (n < 5) return UDM_CHARSET_TOOFEW(0);
    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 && (c >= 0xf9 || s[1] >= 0x88)))
      return UDM_CHARSET_ILSEQ5;
    *pwc = ((unsigned int) (c & 0x03) << 24)
      | ((unsigned int) (s[1] ^ 0x80) << 18)
      | ((unsigned int) (s[2] ^ 0x80) << 12)
      | ((unsigned int) (s[3] ^ 0x80) << 6)
      | (unsigned int) (s[4] ^ 0x80);
    return conv->icodes = 5;
  } else if (c < 0xfe && sizeof(unsigned int)*8 >= 32) {
    if (n < 6) return UDM_CHARSET_TOOFEW(0);
    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
      && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
      && (s[5] ^ 0x80) < 0x40
      && (c >= 0xfd || s[1] >= 0x84)))
      return UDM_CHARSET_ILSEQ6;
    *pwc = ((unsigned int) (c & 0x01) << 30)
      | ((unsigned int) (s[1] ^ 0x80) << 24)
      | ((unsigned int) (s[2] ^ 0x80) << 18)
      | ((unsigned int) (s[3] ^ 0x80) << 12)
      | ((unsigned int) (s[4] ^ 0x80) << 6)
      | (unsigned int) (s[5] ^ 0x80);
    return conv->icodes = 6;
  } else
    return UDM_CHARSET_ILSEQ;
}



/*
 * Encode the code point *wc as UTF-8 (RFC 2279, 1 to 6 bytes) into [r, e).
 *
 * conv - conversion state; icodes and ocodes are reset to 1, and ocodes is
 *        set to the number of bytes written for multi-byte output.
 * cs   - not used by this function.
 * wc   - code point to encode; it is read but never modified.
 * r    - output position.
 * e    - one past the last writable output byte.
 *
 * Returns the number of bytes written on success.  Returns
 * UDM_CHARSET_ILUNI, writing nothing, when *wc is negative or above
 * 0x7fffffff, or when conv->flags has UDM_RECODE_HTML_SPECIAL set and *wc
 * is one of '"', '&', '<', '>'.  Returns UDM_CHARSET_TOOSMALL, writing
 * nothing, when the encoded form does not fit before e.
 */
__C_LINK int __UDMCALL
udm_wc_mb_utf8(UDM_CONV *conv, UDM_CHARSET *cs, int *wc, unsigned char *r, unsigned char *e)
{
  /* Lead-byte prefix for a sequence of (index + 2) bytes. */
  static const unsigned char lead_mark[5] = { 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  unsigned int code;
  int count;
  int i;

  conv->icodes = conv->ocodes = 1;

  /* A negative value is not a code point; do not emit its low byte as ASCII. */
  if (*wc < 0)
    return UDM_CHARSET_ILUNI;

  /* Work on a copy so the caller's value survives the shifting below. */
  code = (unsigned int) *wc;

  if (code < 0x80) {
    if ((conv->flags & UDM_RECODE_HTML_SPECIAL) &&
        (code == '"' || code == '&' || code == '<' || code == '>'))
      return UDM_CHARSET_ILUNI;
    count = 1;
  }
  else if (code < 0x800)
    count = 2;
  else if (code < 0x10000)
    count = 3;
  else if (code < 0x200000)
    count = 4;
  else if (code < 0x4000000)
    count = 5;
  else if (code <= 0x7fffffff)
    count = 6;
  else
    return UDM_CHARSET_ILUNI;

  /* Compare distances rather than forming r+count, which may point past e. */
  if (e - r < count)
    return UDM_CHARSET_TOOSMALL;

  if (count == 1) {
    r[0] = (unsigned char) code;
    return 1;
  }

  /* Emit continuation bytes from last to first, six payload bits each. */
  for (i = count - 1; i > 0; i--) {
    r[i] = (unsigned char) (0x80 | (code & 0x3f));
    code >>= 6;
  }
  /* The bits left over fit below the length prefix of the lead byte. */
  r[0] = (unsigned char) (lead_mark[count - 2] | code);

  return conv->ocodes = count;
}

