/* Time-stamp: <2007-03-02 18:25:02 poser>
 *
 * Convert text containing various 7-bit ASCII escapes to UTF-8 Unicode.
 *
 * Copyright (C) 2005-2007 William J. Poser (billposer@alum.mit.edu)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 * or go to the web page:  http://www.gnu.org/licenses/gpl.txt.
 */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#ifdef HAVE_LOCALE_H
#include <locale.h>
#endif
#ifdef HAVE_LIBINTL_H
#include <libintl.h>
#define _(String) gettext(String)
#else
#define _(x) (x)
#endif
#include "unicode.h"
#include "enttbl.h"
#include "exitcode.h"
#include "formats.h"

/*
 * Build stamp printed by ShowVersion. When the compiler supplies
 * __DATE__ and __TIME__, HAVE_DATE_TIME is defined and compdate holds
 * "Compiled <date> <time>"; otherwise compdate is the empty string.
 */
#if defined(__DATE__) && defined(__TIME__)
#define HAVE_DATE_TIME
char compdate[]= "Compiled " __DATE__ " " __TIME__ ;
#else
char compdate[]= "";
#endif

/* Version string; PACKAGE_VERSION is not defined in this file (presumably it comes from config.h). */
char version[]=PACKAGE_VERSION;
/* Program name used as a prefix in diagnostics and in the usage and version messages. */
char pgname[]="ascii2uni";

/* Message catalogue directory handed to bindtextdomain unless the build defines its own. */
#ifndef LOCALEDIR 
#define LOCALEDIR "/usr/local/share/locale"
#endif

/* Size used for the input line buffer and the entity-name buffer in main. */
#define LBUFSIZE 10240

/*
 * Write the program name and version, the compilation stamp (when the
 * compiler provided one), the copyright and licence notice, and the
 * bug-report address to stderr.
 */
void
ShowVersion(void){
  FILE *out = stderr;

  fprintf(out,"\n%s  %s\n",pgname,version);
#ifdef HAVE_DATE_TIME
  fputs(compdate,out);
  fputc('\n',out);
#endif
  fputs("Copyright (C) 2004-2007 William J. Poser\n",out);
  fputs("This program is free software; you can redistribute\n"
	"it and/or modify it under the terms of version 2 of\n"
	"the GNU General Public License as published by the\n"
	"Free Software Foundation.\n",out);
  fputs("Report bugs to: billposer@alum.mit.edu.\n",out);
}

/*
 * Write a short description of the program and the list of command
 * line flags to stderr. Every message goes through _() so that it can
 * be translated.
 */
void
ShowUsage(void){
  FILE *out = stderr;

  fputs(_("This program is a filter which converts 7-bit ASCII text\n\
containing various representations for non-ASCII characters\nto UTF-8 Unicode.\n"),out);
  fprintf(out,_("Usage: %s [flags]\n"),pgname);
  fputs(_("       -a <format specification>.\n"),out);
  fputs(_("       -h Print this usage message.\n"),out);
  fputs(_("       -L List format codes.\n"),out);
  fputs(_("       -p Input consists of pure escapes except for non-null whitespace.\n"),out);
  fputs(_("       -q Quiet - don't chat.\n"),out);
  fputs(_("       -v Print version information.\n"),out);
  fputs(_("       -8 Convert only tokens above the ASCII range.\n"),out);
  fputs(_("       -Z <format> Convert input using the supplied format.\n"),out);
  fputs(_("Report bugs to: billposer@alum.mit.edu\n"),out);
}


/*
 * Copy the characters from Start through End (inclusive) into strptr
 * and null-terminate the copy. Nothing but the terminator is written
 * when Start lies beyond End. The caller must supply a buffer with room
 * for (End - Start + 1) characters plus the terminator.
 *
 * Returns strptr.
 */
char *
ExtractSubstring(char *strptr, char* Start, char* End) {
  char *dst = strptr;
  char *src = Start;

  while (src <= End) {
    *dst++ = *src++;
  }
  *dst = '\0';
  return strptr;
}


/* The length of the longest character entity */
#define MAXENTLEN 8

/*
 * Locale-independent replacement for the library's isxdigit, which
 * seemed not to be working.
 *
 * Returns 1 if c is one of 0-9, a-f or A-F, and 0 for anything else.
 */
int myisxdigit (int c) {
  if ((c >= '0') && (c <= '9')) return 1;
  if ((c >= 'a') && (c <= 'f')) return 1;
  if ((c >= 'A') && (c <= 'F')) return 1;
  return 0;
}

/*
 * scanf formats used to recognize each supported escape notation. The
 * order must match the format type codes used to index the table (STDX,
 * HTMLX, CHENT, ...), which are not defined in this file.
 *
 * The length modifiers have to agree with the objects main scans into:
 *   - numeric escapes are stored in a long-sized code point, so they
 *     use the 'l' modifier;
 *   - the three-byte formats are stored in unsigned char variables, so
 *     they use the 'hh' modifier;
 *   - the character entity format stores the entity name as a string.
 */
static char *Formats [] = {
"&#x%lX;", 	/* HTMLX */
"&#%ld;", 	/* HTMLD */
"\\#x%lX;", 	/* SGMLX */
"\\#%ld;",	/* SGMLD */
"\\u%8lX",	/* BSLU */
"\\x%lX",	/* BSLX */
"0x%4lX",	/* STDX */
"#x%4lX",	/* CLSX */
"%lX", 		/* RAWX */
"\\x{%lX}",	/* BSLXB */
"<U%lX>",	/* ABUX */
"U%lX",		/* JUUX */
"u%lX",		/* JuUX */
"U+%lX",	/* UPLX */
"X\'%lX\'",	/* XQ */
"\\u%8ld",	/* BSLUD */
"v%ld",		/* PERLV */
"$%4lX",	/* DOLLAR */
"16#%4lX",	/* PSPT */
"#16r%4lX",	/* CLR */
"16#%4lX#",	/* ADA */
"\\%3hho\\%3hho\\%3hho",	/* BYTEO */
"\\d%3hhu\\d%3hhu\\d%3hhu",	/* BYTED */
"\\x%2hhx\\x%2hhx\\x%2hhx",	/* BYTEX */
"&%[abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789];", /* CHENT */
"=%2lX", 		/* UTF-8  - Ifmt */
"%%%2lX", 		/* UTF-8  - Jfmt */
"\\%3lo" 		/* UTF-8  - Kfmt */
};


/*
 * Build a scanf format consisting of base followed by "%n", so that a
 * scan reports how many input characters it consumed. When
 * dropSemicolon is non-zero and base ends in ';', that final semicolon
 * is left out, which yields a format accepting an entity that lacks
 * its terminator.
 *
 * The result is allocated with malloc. Exits with OUTOFMEMORY if the
 * allocation fails.
 */
static char *
MakeScanFormat(const char *base, int dropSemicolon)
{
  size_t len = strlen(base);
  char *out;

  if (dropSemicolon && (len > 0) && (base[len-1] == ';')) len--;
  out = malloc(len + 3);	/* room for "%n" and the terminating null */
  if (out == NULL) {
    fprintf(stderr,_("%s: failed to allocate storage for a scan format.\n"),pgname);
    exit(OUTOFMEMORY);
  }
  memcpy(out, base, len);
  memcpy(out + len, "%n", 3);
  return out;
}

/*
 * Try to read one numeric escape at the start of s.
 *
 * strict is the complete format followed by %n. loose, if not NULL, is
 * the same format without its closing semicolon and is tried only when
 * strict does not match completely.
 *
 * Returns the number of characters consumed, or 0 if no escape starts
 * at s. A scan counts only if it reached the final %n: sscanf returns
 * EOF when the input runs out, and returns a positive count without
 * storing %n when a trailing literal fails to match, and neither of
 * those is a usable match. On success *val holds the value and
 * *unterminated is 1 if only the loose format matched.
 */
static int
ScanNumeric(const char *s, const char *strict, const char *loose,
	    unsigned long *val, int *unterminated)
{
  int used = 0;

  *unterminated = 0;
  *val = 0UL;
  if ((sscanf(s, strict, val, &used) > 0) && (used > 0)) return used;
  if (loose == NULL) return 0;
  used = 0;
  *val = 0UL;
  if ((sscanf(s, loose, val, &used) > 0) && (used > 0)) {
    *unterminated = 1;
    return used;
  }
  return 0;
}

/*
 * Same as ScanNumeric, but for a character entity: the converted item
 * is the entity name, stored in name. The caller must make sure name
 * is large enough for any name that can occur in s.
 */
static int
ScanEntity(const char *s, const char *strict, const char *loose,
	   char *name, int *unterminated)
{
  int used = 0;

  *unterminated = 0;
  name[0] = '\0';
  if ((sscanf(s, strict, name, &used) > 0) && (used > 0)) return used;
  if (loose == NULL) return 0;
  used = 0;
  name[0] = '\0';
  if ((sscanf(s, loose, name, &used) > 0) && (used > 0)) {
    *unterminated = 1;
    return used;
  }
  return 0;
}

/*
 * Report on stderr that the entity occupying first..last (inclusive)
 * was accepted although it lacks its final semicolon. The text must be
 * part of one input line, so it fits in a buffer of LBUFSIZE+1.
 */
static void
WarnUnterminatedEntity(char *first, char *last, long token)
{
  char text[LBUFSIZE+1];

  fprintf(stderr,_("The HTML entity %1$s at token %2$lu lacks the requisite final semicolon.\n"),
	  ExtractSubstring(text,first,last),(unsigned long)token);
}

/*
 * Filter standard input to standard output, replacing escapes in the
 * selected notation by the characters they denote.
 *
 * Flags (see ShowUsage): -a selects a built-in format, -Z supplies a
 * scanf format directly, -p selects "pure" mode in which the input is
 * nothing but whitespace-separated escapes, -q suppresses the summary,
 * -L lists the format codes, -h and -v print usage and version.
 *
 * In pure mode ill-formed input is fatal (exit BADRECORD). Otherwise
 * the input is processed a line at a time and any text that is not a
 * complete escape is copied through unchanged; in particular
 *   - an escape that lacks a closing literal required by its format is
 *     copied through, except that HTML entities lacking only the final
 *     semicolon are converted, reported, and counted;
 *   - a character entity that is terminated but unknown is replaced by
 *     the Unicode replacement character; an unterminated unknown name
 *     (as in "AT&T") is copied through;
 *   - a three-byte escape is converted only when all three bytes are
 *     present;
 *   - empty input lines are preserved.
 *
 * Unless -q was given, a summary of the tokens converted is written to
 * stderr at the end. Exits with SUCCESS on normal completion.
 */
int main (int ac, char *av[])
{
  char *SplitFormat = "\\%1[uU]%lX%n"; /* This is for BMPSplit */

  char *fmt = Formats[STDX];		/* Default is plain hex format */
  char *afmt;			/* fmt followed by %n */
  char *lfmt = NULL;		/* afmt without closing semicolon; HTML formats only */
  char *aHfmt = NULL;		/* Hex and decimal HTML formats for AllHTMLP */
  char *lHfmt = NULL;
  char *aDfmt = NULL;
  char *lDfmt = NULL;
  char cbuf[5];

  UTF32 num;
  unsigned long value;		/* Scan target matching the %l conversions */
  int oc;			/* Command line option flag */
  int Converted;
  long TokenNumber;
  long ReplacedNumber;
  short BMPSplitP = 0; 
  int VerboseP = 1;
  int UTF8ValueP = 0;		/* Are incoming values UTF-8? */
  short AllHTMLP = 0; 		/* Translate all three kinds of HTML escape */
  int PureP = 0;
  int Word_Length;
  int NConsumed;
  int Unterminated;
  int LineNo;
  char *str;
  char *iptr;
  int eof;
  char SplitStr[3];
  char enam[LBUFSIZE];
  long MicrosoftStyle = 0L;
  unsigned char b1;		/* Used for byte-wise encoding */
  unsigned char b2;
  unsigned char b3;
  int FType = STDX;		/* Must agree with the default fmt */
  int UTF8Type;			/* Not used - for compatibility with uni2ascii */
  short UseEntitiesP;	/* Not used - for compatibility with uni2ascii */
  size_t len;
  int HadNewline;

  char lbuf [LBUFSIZE+1];

  extern int optind;
  extern int opterr;
  extern int optopt;
  extern void putu8 (unsigned long);
  extern char * Get_Word(FILE *, int *, int *);

  extern int CountSlots(char *);
  extern void ListFormatArguments(short);
  extern void SetFormat(char *, int *, short *,int *, short *, short *);

  opterr = 0;

#ifdef HAVE_SETLOCALE
   setlocale(LC_ALL,"");
#endif
#ifdef HAVE_LIBINTL_H
   bindtextdomain (PACKAGE,LOCALEDIR);
   textdomain (PACKAGE);
#endif

  /* Handle command line arguments */

   while( (oc = getopt(ac,av,":Z:a:hLpqv")) != EOF){
     switch(oc){
     case 'a':
       SetFormat(optarg,&FType,&UseEntitiesP, &UTF8Type, &BMPSplitP,&AllHTMLP);
       if(FType == FMT_UNKNOWN) {
	 fprintf(stderr,"Format specification %s not recognized.\n",optarg);
	 exit(BADOPTIONARG);
       }
       fmt = Formats[FType];
       if((FType == IFMT) || (FType == JFMT) || (FType == KFMT)) UTF8ValueP =1;
       if(FType == JFMT) {cbuf[0] = '0'; cbuf[1] = 'x';}
       if(FType == KFMT) {cbuf[0] = '\\';}
       break;
    case 'L':
      ListFormatArguments(0);
      exit(INFO);
     case 'Z':
       fmt = optarg;
       if(CountSlots(fmt) > 1) {
	 fprintf(stderr,"You may not supply a format with more than one empty slot.\n");
	 exit(BADOPTIONARG);
       }
       break;
     case 'p':
       PureP = 1;
       break;
     case 'q':
       VerboseP = 0;
       break;
     case 'h':
       ShowUsage();
       exit(INFO);
       break; 			/* NOTREACHED */
     case 'v':
       ShowVersion();
       exit(INFO);
       break; 			/* NOTREACHED */
     case ':':
       fprintf(stderr,_("%s: missing argument to option flag %c.\n"),pgname,optopt);
       exit(BADOPTIONARG);
     default:
       fprintf(stderr,_("%1$s: invalid option flag %2$c\n"),pgname,optopt);
       ShowVersion();
       ShowUsage();
       exit(INFO);
     }
   }

   if( (FType == RAWX) && (!PureP) ) {
     fprintf(stderr,_("It isn't possible reliably to parse raw hex unicode out of ASCII text.\n"));
     exit(BADOPTION);
   }

   if(AllHTMLP && PureP) {
     fprintf(stderr,_("Conversion of all three HTMl formats is not supported in pure mode.\n"));
     exit(BADOPTION);
   }

   /*
    * The scan formats are sized from the format itself, since a format
    * supplied with -Z may be of any length.
    */
   if(AllHTMLP) {
     aDfmt = MakeScanFormat(Formats[HTMLD],0);
     lDfmt = MakeScanFormat(Formats[HTMLD],1);
     aHfmt = MakeScanFormat(Formats[HTMLX],0);
     lHfmt = MakeScanFormat(Formats[HTMLX],1);
   }

   afmt = MakeScanFormat(fmt,0);	/* Add %n for NConsumed */
   if( (FType == HTMLD) || (FType == HTMLX) || (FType == CHENT)) {
     lfmt = MakeScanFormat(fmt,1);
   }
   ReplacedNumber = 0L;
   TokenNumber = 0L;

   /*
    * This is the case in which the input consists entirely of escapes
    * except for arbitrary (but non-null) amounts of intervening whitespace.
    */

   if(PureP) {
     while(1){
       str = Get_Word(stdin,&Word_Length,&eof);
       if(eof) break; 
       if(Word_Length == 0) continue;
       TokenNumber++;
       if(str == NULL){
	 fprintf(stderr,_("%1$s: failed to allocate storage for input token %2$ld.\n"),
		 pgname,TokenNumber);
	 exit(OUTOFMEMORY);
       }
       /* Stays 0 unless the scan reaches the %n that ends afmt */
       NConsumed = 0;
       if(FType == CHENT) {
	 /* The name is stored without a bound, so it must be known to fit */
	 Converted = 0;
	 if(strlen(str) < sizeof(enam)) {
	   Converted = sscanf(str,afmt,enam,&NConsumed);
	 }
	 if(Converted > 0) {
	   num = LookupCodeForEntity(enam);
	   if(!num) {
	     num = UNI_REPLACEMENT_CHAR;
	     fprintf(stderr,"ascii2uni: unknown HTML character entity \"&%s;\"\n",
		     enam);
	     ReplacedNumber++;
	     Converted = (-1);
	   }
	   else Converted = 1;
	 }
       }
       else if( (BYTEO == FType) || (BYTED == FType) || (BYTEH == FType)) {
	 b1 = b2 = b3 = 0;
	 Converted = sscanf(str,afmt,&b1,&b2,&b3,&NConsumed);
	 switch(Converted)
	   {
	   case 3:
	     num = (((b1 * 256) + b2) * 256) + b3;
	     break;
	   case 2:
	     num = (b1 * 256) + b2;
	     break;
	   case 1:
	     num = b1;
	     break;
	   default:
	     break;
	     /* This case is handled below */
	 }
       }
       else {
	 value = 0UL;
	 Converted = sscanf(str,afmt,&value,&NConsumed);
	 num = (UTF32) value;
       }

       if(Converted < 1) {
	 fprintf(stderr,_("Ill-formed input %1$s at token %2$lu\n"),str,TokenNumber);
	 exit(BADRECORD); 
       }
       /* NOTE(review): no scan above yields more than 3 conversions, so this branch looks unreachable */
       else if(Converted > 3) {
	 fprintf(stderr,_("The character encoded as %1$s at token %2$lu is outside the Unicode range.\n\tEmitting Unicode replacement character.\n"),
		 str,TokenNumber);
	 putu8(UNI_REPLACEMENT_CHAR);
       } 
       else {
	 if (UTF8ValueP) putchar((int)num);
	 else putu8(num);
	 if( (FType == HTMLD) || (FType == HTMLX) || (FType == CHENT)) {
	   /* NConsumed is still 0 if the scan stopped before the final semicolon */
	   if( (NConsumed == 0) || (*(str+NConsumed-1) != ';')) {
	     MicrosoftStyle++;
	     fprintf(stderr,_("The HTML entity %1$s at token %2$lu lacks the requisite final semicolon.\n"),str,TokenNumber);
	   }
	 }
       }
       free((void *)str);
     }
     goto done;
   } /* End of PureP */

   /* This is the case in which the Unicode escapes are embedded in ASCII text */

   LineNo = 0;
   while(fgets(lbuf,LBUFSIZE,stdin) != NULL) {
     LineNo++;
     len = strlen(lbuf);
     HadNewline = ((len > 0) && (lbuf[len-1] == '\n'));
     if(HadNewline) lbuf[--len] = '\0';
     if(len == 0) {
       if(HadNewline) putchar('\n');	/* Keep empty lines */
       continue;
     }
     iptr = lbuf;
     if(FType == JFMT) {
       /*
	* A percent sign followed by two hex digits is one byte. Any
	* other percent sign is copied like an ordinary character. This
	* loop consumes the whole line, so the general loop below does
	* nothing for this format.
	*/
       while(*iptr) {
	 if( (*iptr == '%') && myisxdigit(iptr[1]) && myisxdigit(iptr[2])) {
	   cbuf[2] = iptr[1];
	   cbuf[3] = iptr[2];
	   cbuf[4] = '\0';
	   num = (unsigned char)strtoul(cbuf,NULL,16);
	   putchar((int)num);
	   TokenNumber++;
	   iptr += 3;
	 }
	 else putchar(*iptr++);
       }
     } /* End of special case for J format */

     while (*iptr) {
       if(BMPSplitP) {
	 NConsumed = 0;
	 value = 0UL;
	 /* Both the u/U selector and the number must be present */
	 if( (sscanf(iptr,SplitFormat,SplitStr,&value,&NConsumed) == 2) && (NConsumed > 0)) {
	   num = (UTF32) value;
	   if( (num <= 0xFFFF) && (SplitStr[0] == 'U')) {
	     fprintf(stderr,_("Warning: the code \\U%1$08lX at line %2$d falls within the BMP.\n"),
		     (unsigned long)num,LineNo);
	   }
	   if( (num > 0xFFFF) && (SplitStr[0] == 'u')) {
	     fprintf(stderr,_("Warning: the code \\u%1$08lX at line %2$d falls outside the BMP.\n"),
		     (unsigned long)num,LineNo);
	   }
	   putu8(num);
	   iptr+=NConsumed;
	   TokenNumber++;
	 }
	 else putchar(*iptr++);
       }
       else if (FType == CHENT) {
	 if (AllHTMLP){
	   NConsumed = ScanNumeric(iptr,aHfmt,lHfmt,&value,&Unterminated);
	   if(NConsumed == 0) {
	     NConsumed = ScanNumeric(iptr,aDfmt,lDfmt,&value,&Unterminated);
	   }
	   if(NConsumed > 0) {
	     num = (UTF32) value;
	     putu8(num);
	     iptr+=NConsumed;
	     if(Unterminated) {
	       MicrosoftStyle++;
	       WarnUnterminatedEntity(iptr-NConsumed,iptr-1,TokenNumber);
	     }
	     TokenNumber++;
	     continue;
	   }
	 }
	 /* enam cannot overflow here: the name is part of a line of fewer than LBUFSIZE characters */
	 NConsumed = ScanEntity(iptr,afmt,lfmt,enam,&Unterminated);
	 if(NConsumed > 0) {
	   if( (num = LookupCodeForEntity(enam))) {
	     putu8(num);
	     iptr+=NConsumed;
	     if(Unterminated) {
	       MicrosoftStyle++;
	       WarnUnterminatedEntity(iptr-NConsumed,iptr-1,TokenNumber);
	     }
	     TokenNumber++;
	   }
	   else if(!Unterminated) {
	     fprintf(stderr,"ascii2uni: unknown HTML character entity \"&%s;\" at line %d\n",
		     enam,LineNo);
	     putu8(UNI_REPLACEMENT_CHAR);
	     iptr+=NConsumed;
	     ReplacedNumber++;
	   }
	   else putchar(*iptr++);	/* An ampersand in ordinary text */
	 }
	 else putchar(*iptr++);
       } /* End of Qfmt case */
       else if( (BYTEO == FType) || (BYTED == FType) || (BYTEH == FType)) {
	 NConsumed = 0;
	 Converted=sscanf(iptr,afmt,&b1,&b2,&b3,&NConsumed);
	 /*
	  * Only a complete three-byte escape tells us how much input
	  * it occupied, so anything less is copied as ordinary text.
	  */
	 if( (Converted == 3) && (NConsumed > 0)) {
	   num = (((b1 * 256) + b2) * 256) + b3;
	   putu8(num);
	   iptr+=NConsumed;
	   TokenNumber++;
	 }
	 else putchar(*iptr++);
       }
       else {			/* Default - not BMPSplitP, Q, or byte format */
	 NConsumed = ScanNumeric(iptr,afmt,lfmt,&value,&Unterminated);
	 if(NConsumed > 0) {
	   num = (UTF32) value;
	   if (UTF8ValueP) putchar((int)num);
	   else putu8(num);
	   iptr+=NConsumed;
	   /* Only the HTML formats have a loose variant, so only they can be unterminated */
	   if(Unterminated) {
	     MicrosoftStyle++;
	     WarnUnterminatedEntity(iptr-NConsumed,iptr-1,TokenNumber);
	   }
	   TokenNumber++;
	 }
	 else putchar(*iptr++);
       }
     } /* Loop over current line */
     putchar('\n');
   } /* Loop over input lines */

done:
   if(VerboseP) {
     if (TokenNumber == 1)  fprintf(stderr,_("%ld token converted\n"),TokenNumber);
     else fprintf(stderr,_("%ld tokens converted\n"),TokenNumber);
     if (ReplacedNumber) {
       if (ReplacedNumber == 1) fprintf(stderr,
		_("%ld token replaced with Unicode Replacement Character\n"),ReplacedNumber);
       else fprintf(stderr,_("%ld tokens replaced with Unicode Replacement Character\n"),ReplacedNumber);
     }
     if(MicrosoftStyle) {
       fprintf(stderr,
	       _("%ld Microsoft-style (lacking final semi-colon)\n"),MicrosoftStyle);
     }
   }
   exit(SUCCESS);
}

