/*
 * A C++ scanner. Uses the longest match construction.
 * << <= <<= >> >= >>= are left out since angle brackets are used in templates.
 */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
 * Token identifiers passed to Scanner::token(). Single-character symbols are
 * reported using their own character code, so the named tokens presumably
 * start at 256 to stay clear of that range.
 *
 * NOTE(review): TK_ShiftLeft, TK_ShiftRight, TK_Whitespace and TK_Comment are
 * not emitted by any scanner rule in this file.
 */
#define TK_Dlit 256
#define TK_Slit 257
#define TK_Float 258
#define TK_Id 259
#define TK_NameSep 260
#define TK_Arrow 261
#define TK_PlusPlus 262
#define TK_MinusMinus 263
#define TK_ArrowStar 264
#define TK_DotStar 265
#define TK_ShiftLeft 266
#define TK_ShiftRight 267
#define TK_IntegerDecimal 268
#define TK_IntegerOctal 269
#define TK_IntegerHex 270
#define TK_EqualsEquals 271
#define TK_NotEquals 272
#define TK_AndAnd 273
#define TK_OrOr 274
#define TK_MultAssign 275
#define TK_DivAssign 276
#define TK_PercentAssign 277
#define TK_PlusAssign 278
#define TK_MinusAssign 279
#define TK_AmpAssign 280
#define TK_CaretAssign 281
#define TK_BarAssign 282
#define TK_DotDotDot 283
#define TK_Whitespace 284
#define TK_Comment 285

/* Size in bytes of the input buffer. It bounds how long a single token may
 * be: main() gives up with "TOKEN TOO BIG" when a token does not fit. */
#define BUFSIZE 128

/* Character fed to the scanner after the end of input to flush out the last
 * token. The scanner must treat it as whitespace; 0 qualifies because the
 * whitespace rule accepts every character outside 33..126.
 * NOTE(review): main() feeds its own zero-valued char rather than referencing
 * this macro. */

#define LAST_CHAR 0

/* Input buffer: main() reads stdin into it and the scanner's token pointers
 * refer into it. */
char buf[BUFSIZE];
/* Line and column counters, both starting at 1. Scanner::token() advances
 * them over the text of every token it reports. */
int line = 1, col = 1;

/*
 * State for the Ragel-generated scanner together with the entry points that
 * drive it. The data members act, tokstart, tokend and cs are referenced by
 * name from the generated code, so their names must stay as they are.
 */
struct Scanner
{
	/* Identity of the last pattern matched; maintained by generated code. */
	int act;
	/* Not referenced by any member function in this file. */
	int have;
	/* Start of the token in progress; main() tests it against 0 to see
	 * whether a token is pending. */
	char *tokstart;
	/* End of the most recent match (one past its last character). */
	char *tokend;
	/* Current state of the machine. */
	int cs;

	/* Prepare the machine for scanning. Always returns 1. */
	int init( );

	/* Report one token: tok is its id, its text is [tokstart, tokend). */
	void token( int tok );

	/* Scan len bytes starting at data. Returns -1 on error, 0 otherwise. */
	int execute( char *data, int len );
};

%%{
	machine Scanner;

	# Building blocks for floating literals: a fractional constant, an
	# exponent and the optional type suffix.
	fract_const = digit* '.' digit+ | digit+ '.';
	exponent = [eE] [+\-]? digit+;
	float_suffix = [flFL];

	# Body of a C comment, entered from main once the opening delimiter has
	# been scanned. It consumes input up to the closing delimiter and then
	# jumps back to main. The 0 and 1 embeddings are priorities.
	# NOTE(review): presumably they make the closing delimiter win over the
	# any* loop -- confirm against the Ragel guide.
	c_comment := 
		any* @0 '*/' @1
		@{ fgoto main; };

	# The scanner proper. Each pattern with an action reports its token
	# through token(); patterns without an action are matched and dropped.
	main := |*

	# Single and double quoted literals, with an optional L prefix. A
	# backslash escapes the character that follows it; an unescaped newline
	# is not allowed inside the literal.
	( 'L'? "'" ( [^'\\\n] | /\\./ )* "'" ) 
		{token( TK_Slit );};
	( 'L'? '"' ( [^"\\\n] | /\\./ )* '"' ) 
		{token( TK_Dlit );};

	# Identifiers
	( [a-zA-Z_] [a-zA-Z0-9_]* ) 
		{token( TK_Id );};

	# Floating literals: either a fractional constant with optional exponent,
	# or plain digits with a mandatory exponent.
	( fract_const exponent? float_suffix? | digit+ exponent float_suffix? ) 
		{token( TK_Float );};
	
	# Integer decimal. Shares its leading digits with the float pattern.
	( ( '0' | [1-9] [0-9]* ) [ulUL]{0,3} ) 
		{token( TK_IntegerDecimal );};

	# Integer octal. Shares its leading digits with the float pattern.
	# NOTE(review): the digit class is 0-9, so 8 and 9 are accepted here.
	( '0' [0-9]+ [ulUL]{0,2} ) 
		{token( TK_IntegerOctal );};

	# Integer hex.
	# NOTE(review): only a lower case x introduces a hex literal.
	( '0' ( 'x' [0-9a-fA-F]+ [ulUL]{0,2} ) ) 
		{token( TK_IntegerHex );};

	# Multi-character operators. They share a first character with the
	# single char symbols below.
	'::' {token( TK_NameSep );};
	'==' {token( TK_EqualsEquals );};
	'!=' {token( TK_NotEquals );};
	'&&' {token( TK_AndAnd );};
	'||' {token( TK_OrOr );};
	'*=' {token( TK_MultAssign );};
	'/=' {token( TK_DivAssign );};
	'%=' {token( TK_PercentAssign );};
	'+=' {token( TK_PlusAssign );};
	'-=' {token( TK_MinusAssign );};
	'&=' {token( TK_AmpAssign );};
	'^=' {token( TK_CaretAssign );};
	'|=' {token( TK_BarAssign );};
	'++' {token( TK_PlusPlus );};
	'--' {token( TK_MinusMinus );};
	'->' {token( TK_Arrow );};
	'->*' {token( TK_ArrowStar );};
	'.*' {token( TK_DotStar );};

	# Three char compounds.
	'...' {token( TK_DotDotDot );};

	# Single char symbols. The token id is the character itself. Underscore
	# and both quote characters are excluded because they begin identifiers
	# and literals.
	( punct - [_"'] ) {token( tokstart[0] );};

	# Comments and whitespace. None of these call token().
	# A C comment hands control to the c_comment machine.
	'/*' { fgoto c_comment; };
	# A line comment runs up to and including the newline.
	# NOTE(review): a line comment that ends the input without a newline
	# never completes this pattern.
	'//' [^\n]* '\n';
	# Whitespace is any run of characters outside 33..126, which also covers
	# the zero character used to flush the last token.
	( any - 33..126 )+;

	*|;
}%%

/* Emit the static data for the machine. NOTE(review): per the Ragel guide
 * the nofinal option omits the first-final state constant -- confirm. */
%% write data nofinal;

/*
 * Report a single token.
 *
 * Writes "<id> " followed by the token text [tokstart, tokend) and a newline
 * to stdout, then advances the global line/col counters over that text.
 *
 * tok: token id, either one of the TK_ constants or a character code.
 */
void Scanner::token( int tok )
{
	const char *text = tokstart;
	const int count = tokend - tokstart;

	/* Dump the token id and its text. */
	printf( "<%i> ", tok );
	int pos = 0;
	while ( pos < count )
		putc( text[pos++], stdout );
	putc( '\n', stdout );

	/* Track the position in the input. This is kept as a separate pass so
	 * the routine still does some work when the output above is commented
	 * out for performance testing. */
	for ( pos = 0; pos < count; ++pos ) {
		if ( text[pos] != '\n' ) {
			col += 1;
			continue;
		}
		line += 1;
		col = 1;
	}
}

/*
 * Initialise the scanner. The code Ragel substitutes for the write init
 * directive sets up the machine state held in this object; call it once
 * before the first execute().
 *
 * Always returns 1.
 */
int Scanner::init( )
{
	%% write init;
	return 1;
}

/*
 * Run the scanner over one chunk of input, calling token() for every token
 * that completes within it.
 *
 * data: first byte to scan.
 * len:  number of bytes available at data.
 *
 * Returns -1 if the machine is in its error state afterwards, 0 otherwise.
 * Machine state lives in the Scanner members, so scanning can continue with
 * a later call; main() keeps tokstart/tokend valid between calls.
 */
int Scanner::execute( char *data, int len )
{
	/* p and pe are the variables the generated code reads: the current
	 * position and the end of the available input. */
	char *p = data;
	char *pe = data + len;

	%% write exec;

	if ( cs == Scanner_error )
		return -1;
	return 0;
}

/*
 * Tokenise stdin and print every token through Scanner::token().
 *
 * Input is read into buf in chunks. When a chunk ends in the middle of a
 * token, the partial token is moved to the front of buf and the next chunk
 * is read in behind it, so a token is always contiguous in memory. After the
 * last chunk one extra flush character is scanned, from the same buffer, so
 * that the final token is reported.
 *
 * Exits with status 1 and a message on stderr if the scanner fails
 * ("PARSE ERROR") or a single token does not fit in buf ("TOKEN TOO BIG").
 * Returns 0 otherwise.
 */
int main()
{
	/* Character scanned after the real input to flush the final token. It
	 * has to be one the scanner treats as whitespace. */
	const char eof = 0;

	Scanner scanner;
	scanner.init();

	bool sentLastChar = false;
	int have = 0;
	while ( true ) {
		char *data = buf + have;
		int space = BUFSIZE - have;
		if ( space == 0 ) {
			/* The pending token already occupies the whole buffer. Checking
			 * the free space, rather than where the token starts, avoids
			 * rejecting a short input that merely begins with its last
			 * token. */
			fprintf(stderr, "TOKEN TOO BIG\n" );
			exit(1);
		}

		int len = fread( data, 1, space, stdin );
		if ( len == 0 ) {
			/* End of input. Append the flush character to the buffer so
			 * that tokstart and tokend keep pointing into the same array
			 * when the last token is reported. */
			data[0] = eof;
			len = 1;
			sentLastChar = true;
		}

		int rtn = scanner.execute( data, len );
		if ( rtn < 0 ) {
			/* Machine failed before finding a token. */
			fprintf(stderr, "PARSE ERROR\n" );
			exit(1);
		}

		if ( sentLastChar )
			break;

		if ( scanner.tokstart == 0 ) {
			/* No token in progress, the whole buffer can be reused. */
			have = 0;
		}
		else {
			/* A token is in progress. Shift it to the front of the buffer
			 * and read more behind it. */
			have = data + len - scanner.tokstart;
			memmove( buf, scanner.tokstart, have );
			scanner.tokend -= (scanner.tokstart-buf);
			scanner.tokstart = buf;
		}
	}

	return 0;
}
