/***************************************************************************
 *  libbab.cc -- BabyTrans ( Babylon Translator front-end for GTK )        *
 *                                                                         *
 *  Copyright (C) 1999-2002  Frederic Jolliton -- <babytrans@tuxee.net>    *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <iostream>
#include <iomanip>
#include <fstream>
#include <string>
#include <list>
#include <cctype>
#include <cerrno>
#include <cstdlib>
#include <cstring>

/* Message translation helpers.
 *
 * _( str )  : when ENABLE_NLS is defined, 'str' is looked up through
 *             gettext(); otherwise the macro expands to 'str' unchanged.
 * N_( str ) : always expands to 'str' unchanged (presumably meant to mark
 *             a string for later translation -- it is not used in the
 *             visible part of this file).
 */
#ifdef ENABLE_NLS
#include <libintl.h>
#define _(str) gettext(str)
#else
#define _(str) (str)
#endif
#define N_(str) (str)

#include "libbab.h"

namespace {

/* Babylon::translate() does not look up words of this length or longer. */
const size_t		MAX_WORD_LENGTH = 63 ;

/* Number of distinct values returned by charIndex(); used as the base
 * when wordIndex() combines the leading characters of a word. */
const long			CHAR_INDEX_BASE = 28 ;

/* Number of leading characters of a word that contribute to its index. */
const size_t		INDEXED_PREFIX_LENGTH = 3 ;

/* Map a character to its rank in the dictionary index.
 *
 * Assume that characters 'a' to 'z' (as in the English alphabet)
 * are consecutive in the charset of the C++ compiler/host
 * operating system.
 *
 * The conversion is done as follows:
 *
 * - The quote character (') is converted to 1.
 *
 * - Characters in 'a'..'z' (respectively 'A'..'Z')
 *   are converted to the range [2..27].
 *
 * - Every other character is converted to 0. This includes the '_'
 *   that Babylon::translate() appends to words shorter than
 *   3 characters.
 */
int
charIndex( char c ) {
	if ( c == '\'' )
		return 1 ;
	if ( c >= 'a' && c <= 'z' )
		return c - 'a' + 2 ;
	if ( c >= 'A' && c <= 'Z' )
		return c - 'A' + 2 ;
	return 0 ;
}

/* Compute the index value of a word from its first 3 characters.
 *
 * The characters are treated as the digits of a base 28 number, the
 * first character being the most significant one. A word shorter than
 * 3 characters is handled as if the missing characters had rank 0.
 *
 * Some examples:
 *
 * ""    ->                                    0
 * "a"   ->  2 * 28^2                     =  1568
 * "aaa" ->  2 * 28^2 +  2 * 28 +  2      =  1626
 * "abc" ->  2 * 28^2 +  3 * 28 +  4      =  1656
 * "i'm" -> 10 * 28^2 +  1 * 28 + 14      =  7882
 * "zzz" -> 27 * 28^2 + 27 * 28 + 27      = 21951
 */
long
wordIndex( const std::string& word )
{
	long				idx = 0 ;
	/* Weight of the current character: 28^2, then 28, then 1 */
	long				weight = CHAR_INDEX_BASE * CHAR_INDEX_BASE ;

	for ( size_t i = 0 ;
		  i < INDEXED_PREFIX_LENGTH && i < word.size() ;
		  ++ i ) {
		idx += charIndex( word[ i ] ) * weight ;
		weight /= CHAR_INDEX_BASE ;
	}
	return idx ;
}

} // anonymous namespace

/* Convert a value used by Babylon dictionaries to a character.
 *
 * Values 0..25 give 'a'..'z', 27 gives a space and 30 gives the quote
 * character; 26, 28, 29 and 31 give '*'.
 * Any value outside [0..31] gives '?'.
 */
char
Babylon::babToChar( int c )
{
	static const char	conv[] = "abcdefghijklmnopqrstuvwxyz* **'*" ;
	/* sizeof also counts the terminating NUL, which is not an entry
	 * of the table */
	const int			count = static_cast< int >( sizeof( conv ) ) - 1 ;

	if ( c < 0 || c >= count ) {
		return '?' ;
	}
	return conv[ c ] ;
}

/* Open a dictionary.
 *
 * 'language' is the file name of the definition file. It is opened as
 *
 *   [myPath] + '/' + [language]
 *
 * The word index file is then searched for in the same directory
 * under the names "english.dic", "English.dic" and "ENGLISH.DIC".
 *
 * Any dictionary opened before is closed first.
 *
 * Return true if both files could be opened. Otherwise an error
 * message is stored with setError(), no file is left open and false
 * is returned.
 */
bool
Babylon::open( const std::string& language )
{
	/* Close any previously open dictionary */
	close() ;

	/* Build the name before opening so that nothing runs between
	 * the failing open() and the moment errno is read */
	const std::string	defName = myPath + '/' + language ;

	/* Same precaution as for the index file below: drop any error
	 * flag left on the stream before trying to open it */
	myFileDef.clear() ;
	myFileDef.open( defName.c_str() ) ;

	if ( ! myFileDef.is_open() ) {
		const int			err = errno ;
		setError( defName + ":\n" + std::strerror( err ) ) ;
		myIsOk = false ;
		return myIsOk ;
	}

	/* Some file names to try for finding the word index file */
	static const char* const	fileList[] = {
		"english.dic" ,
		"English.dic" ,
		"ENGLISH.DIC"
	} ;
	const size_t		fileCount = sizeof( fileList ) / sizeof( fileList[ 0 ] ) ;

	/* errno of the last failed attempt */
	int					err = 0 ;
	for ( size_t i = 0 ; i < fileCount ; ++ i ) {
		const std::string	idxName = myPath + '/' + fileList[ i ] ;
		myFileIdx.clear() ;
		myFileIdx.open( idxName.c_str() ) ;
		if ( myFileIdx.is_open() ) break ;
		err = errno ;
	}

	if ( ! myFileIdx.is_open() ) {
		/* Note that the error message is based on the first file name */
		setError( myPath + '/' + fileList[ 0 ] + ":\n"
				   + std::strerror( err ) ) ;
		/* Do not keep the definition file open when the dictionary
		 * is unusable; close() also resets myIsOk to false */
		close() ;
		return myIsOk ;
	}

	myIsOk = true ;
	return myIsOk ;
}

/* Close the currently opened dictionary, if any.
 *
 * Both streams are left closed and with their error flags cleared,
 * and the object is marked as not ready (myIsOk is false).
 */
void
Babylon::close()
{
	if ( myFileDef.is_open() ) {
		myFileDef.close() ;
	}
	/* Clear the flags even when the stream was not open: a failed
	 * open() sets failbit, and older standard libraries do not reset
	 * it when a later open() succeeds. */
	myFileDef.clear() ;

	if ( myFileIdx.is_open() ) {
		myFileIdx.close() ;
	}
	myFileIdx.clear() ;

	myIsOk = false ;
}

/* Destructor: close the dictionary files that may still be open */
Babylon::~Babylon()
{
	this->close() ;
}

/* Read (and decode !) a word of 'length' character */
std::string
Babylon::readWord( unsigned int length )
{
	/* Some frequent occurence of pair (or 3-tuple) of letters
	 * are coded as integer between 0 and 31 and index the following
	 * table */
	static char*		compactTable[] = {
		"<0>" , "ion" , "ies" , "ing" , "ous" , "ses" ,		   /*  0 ..  5 */
		"al" , "an" ,										   /*  6 ..  7 */
		"at" , "ed" , "en" , "er" , "es" , "ia" , "ic" , "ie", /*  8 .. 15 */
		"in" , "io" , "is" , "it" , "le" , "ly" , "ne" , "on", /* 16 .. 23 */
		"or" , "ra" , "se" , "ss" , "st" , "te" , "ti" , "th"  /* 24 .. 31 */
	} ;

	std::string			word ;

	/* Reserve enough place to avoid reallocation. Optionnal, just an optimization. */
	word.reserve( length ) ;

	/* While character remain to be read.. */
	while ( word.size() < length ) {
		u16					data ;
		data = readU16( myFileDef ) ;
		if ( ( data & 0x8000U ) == 0 ) {
			/* 3 character are encoded in lower 15 bits of data
			 * (from bit 0 to 14):
			 *
			 * 15 14     10         5         0
			 *  | |       |         |         |
			 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
			 * |0 c c c c c b b b b b a a a a a|
			 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
			 */
			word += babToChar( ( data       ) & 0x1F ) ;
			word += babToChar( ( data >> 5  ) & 0x1F ) ;
			word += babToChar( ( data >> 10 ) & 0x1F ) ;
		} else {
			/* Two value are encoded in lower 15 bits of data
			 * (from bit 0 to 6, and to bit 8 to 14):
			 *
			 * 15 14          8   6           0
			 *  | |           |   |           |
			 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
			 * |1 b b b b b b b . a a a a a a a|
			 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
			 *
			 * That is, two values between 0 and 127.
			 *
			 * For each of them, 3 possibles cases:
			 *
			 * - Value in range [0..5]:
			 *
			 *   This is a compressed 3 characters sequence
			 *   (see above, compactTable)
			 *
			 * - Value in range [6..31]:
			 *
			 *   This is a compressed 2 characters sequence
			 *   (see above, compactTable)
			 *
			 * - Value in range [32..127]:
			 *
			 *   This is the corresponding character in
			 *   ASCII charset with value as character code.
			 *
			 * In the following code:
			 *
			 * lsb = value a
			 * msb = value b
			 *
			 */
			int				 	lsb = (int)( ( data      ) & 0x7F ) ;
			int				 	msb = (int)( ( data >> 8 ) & 0x7F ) ;

			/* Process the first value */
			if ( lsb >= 32 ) {
				word += (char)lsb ;
			} else {
				word += compactTable[ lsb ] ;
			}
			/* If we have reached the end of word, stop here and
			 * do not process the other value
			 */
			if ( word.size() >= length )
				break ;

			/* Process the second value */
			if ( msb >= 32 ) {
				word += (char)msb ;
			} else {
				word += compactTable[ msb ] ;
			}
		}
	}
	return word ;
}

namespace {

/* A two bytes sequence (after XORing) found in encoded definitions,
 * and the text it expands to. */
struct PackedSequence {
	int					lead ;		/* first byte of the sequence */
	int					follow ;	/* second byte of the sequence */
	const char*			text ;		/* text added to the definition */
	int					extra ;		/* characters counted in addition
									 * to the length of 'text' */
} ;

/*   Sequence   |  Expands to:
 *   -----------+------------
 *   0x7d 0xef  |  ",  "
 *   0x7f 0xef  |  ". " (but note that this counts as 3 characters !)
 *   0xfb 0xff  |  ".." (same remark as above, counts as 3 characters !)
 *   0xff 0xf3  |  "..;"
 *   0xff 0xef  |  ".. "
 *   0xff 0xff  |  "..."
 */
const PackedSequence	packedSequences[] = {
	{ 0x7d , 0xef , ",  " , 0 } ,
	{ 0x7f , 0xef , ". "  , 1 } ,
	{ 0xfb , 0xff , ".."  , 1 } ,
	{ 0xff , 0xf3 , "..;" , 0 } ,
	{ 0xff , 0xef , ".. " , 0 } ,
	{ 0xff , 0xff , "..." , 0 }
} ;

/* Tell whether 'ch' is the first byte of one of the sequences above */
bool
startsPackedSequence( int ch )
{
	return ch == 0x7d || ch == 0x7f || ch == 0xfb || ch == 0xff ;
}

/* Return the sequence made of the bytes 'lead' and 'follow',
 * or 0 if there is no such sequence */
const PackedSequence*
findPackedSequence( int lead , int follow )
{
	const size_t		count =
		sizeof( packedSequences ) / sizeof( packedSequences[ 0 ] ) ;
	for ( size_t i = 0 ; i < count ; ++ i ) {
		if ( packedSequences[ i ].lead == lead
			 && packedSequences[ i ].follow == follow )
			return &packedSequences[ i ] ;
	}
	return 0 ;
}

} // anonymous namespace

/* Read the entry located at 'pos' in the definition file and return
 * it as an Item.
 *
 * The entry is read in this order:
 *
 * - 7 bytes of attributes, stored in 'attrib' as 14 upper case
 *   hexadecimal digits
 * - 1 byte giving the length of the word, then the word itself
 *   (decoded by readWord) stored in 'word'
 * - 1 byte giving the length of the definition, then the encoded
 *   text, stored decoded in 'definition'
 *
 * The text of the definition is encoded as follows:
 *
 * - one character out of two is XORed with 0x80
 *   (most significant bit inverted), starting with the second one
 *
 * - null characters (after XORing) are skipped
 *
 * - some sequences of 2 bytes (after XORing) expand to common
 *   strings, see packedSequences
 *
 * NOTE(review): two oddities of the decoding are kept as they are,
 * because changing them alters how the length is counted:
 *
 * - when the first byte of a sequence is followed by a byte that does
 *   not complete it but is itself the first byte of a sequence, that
 *   second byte is output and also kept as a pending first byte, so
 *   it ends up twice in the text;
 *
 * - when the first byte of a sequence is followed by a null
 *   character, the pending byte is dropped.
 */
Babylon::Item
Babylon::makeDefinition( size_t pos )
{
	static const char	hexDigits[] = "0123456789ABCDEF" ;

	Item				entry ;

	myFileDef.seekg( pos ) ;

	/* There are seven bytes of attributes */
	for ( int remaining = 7 ; remaining > 0 ; -- remaining ) {
		const u8			value = readU8( myFileDef ) ;
		entry.attrib += hexDigits[ value / 16 ] ;
		entry.attrib += hexDigits[ value % 16 ] ;
	}

	const size_t		wordLength = readU8( myFileDef ) ;
	entry.word = readWord( wordLength ) ;

	const size_t		definitionLength = readU8( myFileDef ) ;

	std::string			text ;

	/* Alternates between 0x00 and 0x80 at each character read.
	 * It is XORed with the character. */
	int					mask = 0 ;

	/* Last character read. It becomes the 'previous' character of
	 * the next iteration; 0 means that there is none, or that it was
	 * consumed as part of a sequence. */
	int					current = 0 ;

	/* Characters counted in the length without being in 'text' */
	int					extra = 0 ;

	while ( text.size() + extra < definitionLength ) {
		const int			previous = current ;

		/* Read the next character and invert its 8th bit one time
		 * out of two */
		current = static_cast< unsigned char >( readU8( myFileDef ) ) ^ mask ;
		mask ^= 0x80 ;

		/* Skip null character */
		if ( current == 0 )
			continue ;

		if ( ! startsPackedSequence( previous ) ) {
			/* A possible first byte of a sequence is not output now:
			 * wait for the next character to see if it is a known
			 * sequence. Any other character is simply added. */
			if ( ! startsPackedSequence( current ) )
				text += static_cast< char >( current ) ;
			continue ;
		}

		const PackedSequence*	sequence = findPackedSequence( previous , current ) ;
		if ( sequence != 0 ) {
			text += sequence->text ;
			extra += sequence->extra ;
			current = 0 ;
		} else {
			/* Not a sequence: output the previous and the
			 * current character */
			text += static_cast< char >( previous ) ;
			text += static_cast< char >( current ) ;
		}
	}

	entry.definition = text ;
	return entry ;
}

/* The main function that do the main work
 *
 * Return true if at least one word was found.
 */
bool
Babylon::translatePriv( const std::string& word ,
						std::list< Babylon::Item >& lst )
{
	/* 1) Compute main index .. */

	long				mainPosition = 100 + 4 * wordIndex( word ) ;

	/* 2) .. then seek to both file */

	myFileIdx.seekg( mainPosition ) ;
	myFileDef.seekg( mainPosition ) ;

	/* 3) Read corresponding position */

	u32					positionIdx = readU32( myFileIdx ) ;
	u32					positionDef = readU32( myFileDef ) ;

	/* Test in next position is identical.
	 * In this case, that mean that there is no word
	 * than begin with the same 3 letters in the dictionnary
	 */
	if ( positionDef == readU32( myFileDef ) ) {
		return false ;
	}

	/* Seek in index file to point at the beginning of the list
	 * of word that start with the same 3 letters as the searched word
	 */
	myFileIdx.seekg( positionIdx ) ;

	/* There is 2 bytes at this position that give respectively:
	 *
	 * - the size of the first word of this list (the shorter)
	 * - the size of the latest word of this list (the longer)
	 *
	 */
	size_t				minSize = readU8( myFileIdx ) ;
	size_t				maxSize = readU8( myFileIdx ) ;

	/* ..so we can test if the length of word searched is in
	 * this range
	 */
	if ( ( word.size() < minSize ) || ( word.size() > maxSize ) ) {
		return false ;
	}

	/* The following value are 16 bits values. Each of them
	 * give the count of word of size in the range given
	 * by the previous value (minimum size and maximum size
	 * of word begginning with the first 3 letters of the searched
	 * word.)
	 *
	 * There is ( max_size - min_size + 1 ) 16 bits values
	 * (one for each possible size.)
	 *
	 * After this table, we found each word (except that the 3
	 * first characters that are not stored, obviously since
	 * the index to them is unique and correspond to only
	 * one 3 sequence character that we know. Oops, I hope
	 * you understand me. FIXME)
	 *
	 * Given that, we compute how many character we have
	 * to skip to point at the first word of the same size
	 * as the searched word.
	 *
	 * And at the same time, we compute how many word we
	 * have skipped. This is necessary because we have to
	 * index a table (indexed by the position of the word
	 * if found in the word list.)
	 */

	/* Number of byte to skip */
	size_t				byteToSkip = 0 ;
	/* Position in the table of the current word */
	int					wordPosition = 0 ;
	/* Number of word for the current size */
	int			 		wordCount ;

	/* Loop to process all word of size less than the searched word */
	for ( size_t size = minSize ; size < word.size() ; ++ size ) {
		wordCount = (unsigned int)readU16( myFileIdx ) ;
		wordPosition += wordCount ;
		/* We skip ( size - 3 ) * wordCount character,
		 * because for a size 'size' there is 'wordCount' word,
		 * and since the first 3 character are not
		 * stored, this give ( size - 3 ) * wordCount total
		 * characters.
		 */
		byteToSkip += ( size - 3 ) * wordCount ;
	}
	/* Now read the number of word of the same size as the searched word */
	wordCount = (unsigned int)readU16( myFileIdx ) ;

	/* Skip the rest of the table */
	myFileIdx.seekg( ( maxSize - word.size() ) * 2 , std::ifstream::cur ) ;

	/* Go to the first word in the list */
	myFileIdx.seekg( byteToSkip , std::ifstream::cur ) ;

	bool				wordFound = false ;

	/* Now we read each word in the list to find a match */
	for ( int size = 0 ; size < (int)wordCount ; ++ size ) {
		std::string::size_type	j ;
		for ( j = 3 ; j < word.size() && word[ j ] == myFileIdx.get() ; ++ j ) {
			/* ... */
		}

		if ( j == word.size() ) {
			/* We found the word */
			wordFound = true ;

			/* Jump into the table in index file */
			myFileDef.seekg( positionDef + 4 * wordPosition ) ;

			/* Read the position of the definition in the definition file */
			u32					def = readU32( myFileDef ) ;

			/* If position is superior or equal to 2^24 (or say
			 * otherwise, if a bit in range [24..31] is set) then
			 * this is a reference to another position in another table
			 *
			 * In this case the value is composed of two 16 bits
			 * value.
			 *
			 * The higher 16 bits are the table index (same as
			 * the word computed index), and the lower 16 bits
			 * are the index in this table.
			 */
			if ( def & 0xff000000UL ) {
				mainPosition = 100 + 4 * ( ( def >> 16 ) & 0xFFFF ) ;
				myFileDef.seekg( mainPosition ) ;
				positionDef = readU32( myFileDef ) ;
				myFileDef.seekg( positionDef + 4 * (def & 0xffff) ) ;
				def = readU32( myFileDef ) ;
			}
			/* Now, we have the position into the definition file
			 * where we can extract the definition for the searched word.
			 */
			Item				current = makeDefinition( def ) ;

			/* We append it to the list provided by the caller */
			lst.push_back( current ) ;

			/* Note: it is possible to found several definition, so
			 * we continue to iterate
			 */
		} else {
			/* Skip the word that we have began(?) to read */
			myFileIdx.seekg( word.size() - j - 1 , std::ifstream::cur ) ;
		}
		++ wordPosition ;
	}

	return wordFound ;
}

/* Search the translation of a word and append the definitions found
 * to a provided list.
 *
 * The word is what follows the leading blanks (space, tab, newline)
 * of 'word', made only of alphabetic characters and quotes. It must
 * be followed by a blank or by the end of the string.
 *
 * Return value:
 *
 *   false if the dictionary is not open or if the word is invalid;
 *   an error message is then stored with setError().
 *
 *   true otherwise, even when no definition was found: the caller
 *   has to look at 'lst' to know. A word of MAX_WORD_LENGTH
 *   characters or more is not searched at all; 'lst' is cleared and
 *   true is returned.
 */
bool
Babylon::translate( const std::string& word , std::list< Babylon::Item >& lst )
{
	/* Check that dictionary files are open and ready */
	if ( ! myIsOk ) {
		setError( _( "libbab incorrectly initialized" ) ) ;
		return false ;
	}

	/* 'Trim' the word to search */

	/* Skip initial blank characters */
	/* FIXME: What we really want is to find the first non blank character as
	 * defined in the C++ standard (isspace is a good start) */
	const std::string::size_type	beg = word.find_first_not_of( " \t\n" ) ;
	if ( beg == std::string::npos ) {
		setError( _( "Invalid word" ) ) ;
		return false ;
	}

	/* Keep only alphabetic and quote characters. The character is
	 * converted to unsigned char because isalpha() is not defined for
	 * negative values. */
	std::string::size_type	end = beg ;
	while ( end < word.size()
			&& ( std::isalpha( static_cast< unsigned char >( word[ end ] ) )
				 || word[ end ] == '\'' ) ) {
		++ end ;
	}

	/* Check that the word is not followed by a non-blank character */
	if ( end != word.size()
		 && word[ end ] != ' '
		 && word[ end ] != '\t'
		 && word[ end ] != '\n' ) {
		setError( _( "Invalid word" ) ) ;
		return false ;
	}

	/* Extract the word */
	std::string			trimedWord = word.substr( beg , end - beg ) ;

	/* Check if maximum word size is not exceeded */
	if  ( trimedWord.size() >= MAX_WORD_LENGTH ) {
		lst.clear() ;
		return true ;
	}

	/* Lower the word (same remark as above about unsigned char) */
	for ( size_t i = 0 ; i < trimedWord.size() ; ++ i ) {
		const unsigned char	ch = static_cast< unsigned char >( trimedWord[ i ] ) ;
		trimedWord[ i ] = static_cast< char >( std::tolower( ch ) ) ;
	}

	/* As a special case, a word whose size is less than 3
	 * must be padded with enough '_' to be 3 characters long.
	 */
	while ( trimedWord.size() < 3 )
		trimedWord += '_' ;

	/* Call the real function to search the word and fill the list
	 * with all found definitions. Its result is deliberately not
	 * returned: false is reserved for errors.
	 */
	translatePriv( trimedWord , lst ) ;
	return true ;
}

#ifdef STANDALONE
/*
 * You can compile this file with -DSTANDALONE as parameter to gcc
 * to obtain a binary to test this library.
 *
 * Simply provide a word as first argument and it will
 * output all definitions on standard output.
 *
 * The dictionary "EngtoFre.dic" is read from
 * /usr/local/share/babytrans.
 *
 * ------------------------------------------------------
 * g++ -O2 -Wall -DSTANDALONE libbab.cc -o babytrans-text
 * ------------------------------------------------------
 */
int main( int argc , char * argv[] )
{
	if ( argc != 2 ) {
		std::cerr << "Usage: " << argv[ 0 ] << " word\n" ;
		return EXIT_FAILURE ;
	}

	Babylon			 	bab ;
	bab.setPath( "/usr/local/share/babytrans" ) ;
	if ( ! bab.open( "EngtoFre.dic" ) ) {
		std::cerr << "Error while opening dictionnary.\n" ;
		return EXIT_FAILURE ;
	}
	Babylon::ContainerType
						lst ;

	const std::string	word = argv[ 1 ] ;

	/* translate() returns true as soon as the word is valid, even
	 * when nothing was found, so the list has to be checked too */
	if ( ! bab.translate( word , lst ) || lst.empty() ) {
		std::cerr << "** Definition not found for word `" << word << "'\n" ;
		return EXIT_FAILURE ;
	}
	for ( Babylon::ContainerType::const_iterator
		  it = lst.begin() ;
		  it != lst.end() ;
		  ++ it ) {
		std::cout << "- " << it->word ;
#ifdef WANT_ATTRIB
		std::cout << "(" << it->attrib << ")" ;
#endif
		std::cout << ":\n" << it->definition << std::endl ;
	}

	return EXIT_SUCCESS ;
}
#endif // STANDALONE
