#line 1 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
/* -*- c -*-
|| This file is part of Pike. For copyright information see COPYRIGHT.
|| Pike is distributed under GPL, LGPL and MPL. See the file COPYING
|| for more information.
|| $Id: unicode_module.cmod,v 1.12 2004/04/11 20:44:28 per Exp $
*/

#include "global.h"
#include "stralloc.h"
#include "global.h"
RCSID("$Id: unicode_module.cmod,v 1.12 2004/04/11 20:44:28 per Exp $");
#include "pike_macros.h"
#include "interpret.h"
#include "program.h"
#include "program_id.h"
#include "object.h"
#include "operators.h"
#include "module_support.h"
#include "array.h"

#include "config.h"
#include "normalize.h"
#include "split.h"
#include "buffer.h"

/*! @module Unicode
 */

/*
 * Build an array of strings from the word table w and push it on the
 * Pike stack.  Each entry of w gives a start offset and a length into
 * the character data held by data.
 *
 * Both data and w are freed before returning, so the caller must not
 * touch them afterwards.  Nothing is popped from the stack here.
 */
static void push_words( struct buffer *data, struct words *w )
{
  int *chars = data->data;
  struct array *result = allocate_array( w->size );
  unsigned int n = 0;

  while( n < w->size )
  {
    /* Tag the slot first, then fill in the freshly made string. */
    result->item[n].type = PIKE_T_STRING;
    result->item[n].u.string =
      make_shared_binary_string2( &chars[ w->words[n].start ],
                                  w->words[n].size );
    n++;
  }

  /* Every element is a string. */
  result->type_field = BIT_STRING;
  push_array( result );

  uc_buffer_free( data );
  uc_words_free( w );
}

/*
 * Variant of push_words() that reads the characters straight out of the
 * pike string ds instead of a struct buffer.
 *
 * The top element of the Pike stack is popped and replaced by the new
 * array; the word strings are created before that pop takes place.
 * w is freed before returning; ds itself is not freed explicitly here.
 */
static void push_words0( struct pike_string *ds, struct words *w )
{
  struct array *result = allocate_array( w->size );
  unsigned char *bytes = ds->str;
  unsigned int n = 0;

  while( n < w->size )
  {
    /* Tag the slot first, then fill in the freshly made string. */
    result->item[n].type = PIKE_T_STRING;
    result->item[n].u.string =
      make_shared_binary_string( &bytes[ w->words[n].start ],
                                 w->words[n].size );
    n++;
  }

  /* Every element is a string. */
  result->type_field = BIT_STRING;

  pop_stack();
  push_array( result );

  uc_words_free( w );
}


/*! @decl array(string) split_words(string input)
 *! Splits the input string into an array of words, on the boundaries between
 *! the different kinds of word characters as defined by @[is_wordchar]. The
 *! result is an array of words, with the non-word characters between them
 *! thrown away.
 */
#define f_split_words_defined
ptrdiff_t f_split_words_fun_num = 0;
/*
 * Implementation of Unicode.split_words(): expects exactly one string
 * argument on the Pike stack and replaces it with an array of words.
 *
 * The argument is popped only after the word boundaries have been
 * computed; push_words() then frees both the buffer and the word table.
 */
void f_split_words(INT32 args)
{
#line 73 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  struct pike_string *input;
#line 73 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  if( args != 1 ) { wrong_number_of_args_error("split_words",args,1); }
#line 73 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  if( Pike_sp[-1].type != PIKE_T_STRING ) { SIMPLE_BAD_ARG_ERROR("split_words",1,"string"); }
#line 73 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  debug_malloc_pass(input = Pike_sp[-1].u.string);
#line 75 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  {
    struct buffer *buf = uc_buffer_from_pikestring( input );
    struct words *found = unicode_split_words_buffer( buf );

    pop_n_elems( args );
    push_words( buf, found );
  }

}
/*! @decl array(string) split_words_and_normalize(string input)
 *! A less wasteful equivalent of @[split_words](@[normalize](@[input])).
 */
#define f_split_words_and_normalize_defined
ptrdiff_t f_split_words_and_normalize_fun_num = 0;
/*
 * Implementation of Unicode.split_words_and_normalize(): expects exactly
 * one string argument on the Pike stack and replaces it with an array of
 * words.
 *
 * Fast path: for a string with size_shift 0, unicode_split_words_pikestr0()
 * is tried first and, if it returns a word table, the words are cut
 * directly out of the input string.
 * NOTE(review): presumably that helper returns 0 when the string cannot
 * be split without decomposing it first -- confirm in split.c.
 *
 * Slow path: the string is copied into a buffer, decomposed with
 * COMPAT_BIT and then split.
 */
void f_split_words_and_normalize(INT32 args)
{
#line 87 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  struct pike_string *input;
#line 87 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  if( args != 1 ) { wrong_number_of_args_error("split_words_and_normalize",args,1); }
#line 87 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  if( Pike_sp[-1].type != PIKE_T_STRING ) { SIMPLE_BAD_ARG_ERROR("split_words_and_normalize",1,"string"); }
#line 87 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  debug_malloc_pass(input = Pike_sp[-1].u.string);
#line 89 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  {
    struct words *found = 0;

    /* The shortcut is only attempted for size_shift 0 strings. */
    if( input->size_shift == 0 )
      found = unicode_split_words_pikestr0( input );

    if( found )
    {
      /* push_words0() pops the argument itself. */
      push_words0( input, found );
      return;
    }
    else
    {
      struct buffer *buf = uc_buffer_from_pikestring( input );

      pop_n_elems( args );
      buf = unicode_decompose_buffer( buf, COMPAT_BIT );
      found = unicode_split_words_buffer( buf );
      push_words( buf, found );
    }
  }

}
/*! @decl string normalize( string data, string method );
 *!
 *! Normalize the given unicode string according to the specified method.
 *! 
 *! The methods are:
 *!
 *!  NFC, NFD, NFKC and NFKD.
 *!  
 *! The methods are described in detail in the UAX #15 document, which
 *! can currently be found at
 *! http://www.unicode.org/unicode/reports/tr15/tr15-21.html
 *!
 *! A short description:
 *! 
 *! C and D specify whether to decompose (D) complex characters into
 *! their parts, or compose (C) single characters into complex ones.
 *!
 *! K specifies whether to do a canonical or a compatibility
 *! conversion. When K is present, compatibility transformations are
 *! performed as well as the canonical transformations.
 *!
 *! @i{In the following text, 'X' denotes the single character 'X', even
 *!  if there is more than one character inside the quotation marks.
 *!  The reason is that it's somewhat hard to describe unicode in
 *!  iso-8859-1.@}
 *!
 *! The Unicode Standard defines two equivalences between characters:
 *! canonical equivalence and compatibility equivalence. Canonical
 *! equivalence is a basic equivalency between characters or
 *! sequences of characters. 
 *!
 *! 'A with ring above' (U+00C5) and 'A' followed by 'combining ring
 *! above' (U+030A) are canonically equivalent.
 *!
 *! For round-trip compatibility with existing standards, Unicode has
 *! encoded many entities that are really variants of existing nominal
 *! characters. The visual representations of these characters are
 *! typically a subset of the possible visual representations of the
 *! nominal character. These are given compatibility decompositions in
 *! the standard. Because the characters are visually distinguished,
 *! replacing a character by a compatibility equivalent may lose
 *! formatting information unless supplemented by markup or styling.
 *!
 *! Examples of compatibility equivalences:
 *! @ul
 *!   @item
 *!     Font variants (thin, italic, extra wide characters etc)
 *!   @item
 *!     Circled and squared characters
 *!   @item
 *!     super/subscript ('superscript two' (U+00B2) -> '2')
 *!   @item
 *!     Fractions       ('vulgar fraction one half' (U+00BD) -> '1/2')
 *!   @item
 *!     Other composed characters ('fi' -> 'f' 'i',  'kg' -> 'k' 'g')
 *! @endul
 *!
 */
#define f_normalize_defined
ptrdiff_t f_normalize_fun_num = 0;
/*
 * Implementation of Unicode.normalize(): expects exactly two string
 * arguments on the Pike stack, the text and the method name, and
 * replaces them with the string returned by unicode_normalize().
 *
 * Every 'K' in the method name sets bit 0 (value 1) of the mode and every
 * 'C' sets bit 1 (value 2).  All other characters are ignored, so "NFD"
 * yields mode 0.
 *
 * NOTE(review): the method name is scanned through flags->str without
 * looking at flags->size_shift -- presumably method names are always
 * 8-bit strings; confirm.
 */
void f_normalize(INT32 args)
{
#line 164 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  struct pike_string *s;
#line 164 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  struct pike_string *flags;
#line 164 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  if( args != 2 ) { wrong_number_of_args_error("normalize",args,2); }
#line 164 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  if( Pike_sp[-2].type != PIKE_T_STRING ) { SIMPLE_BAD_ARG_ERROR("normalize",1,"string"); }
#line 164 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  debug_malloc_pass(s = Pike_sp[-2].u.string);
#line 164 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  if( Pike_sp[-1].type != PIKE_T_STRING ) { SIMPLE_BAD_ARG_ERROR("normalize",2,"string"); }
#line 164 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  debug_malloc_pass(flags = Pike_sp[-1].u.string);
#line 166 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  {
    int mode = 0;
    int pos;

    for( pos = 0; pos < flags->len; pos++ )
    {
      int ch = flags->str[ pos ];

      if( ch == 'K' )
        mode |= 1;
      else if( ch == 'C' )
        mode |= 2;
    }

    {
      /* Compute the result before the arguments are released. */
      struct pike_string *normalized = unicode_normalize( s, mode );

      pop_n_elems(2);
      push_string(normalized);
      return;
    }
#line 177 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  }

}
/*! @decl int is_wordchar(int c)
 *! Returns whether a unicode character @[c] is a word, part of a word or not.
 *! @returns
 *!   @int
 *!     @value 2
 *!       The character is an ideograph (a CJK single-word character)
 *!     @value 1
 *!       The character is a letter, number or non-spacing mark, as defined by
 *!       its unicode (general category) specification
 *!     @value 0
 *!       Any other character (such as symbols, punctuation and separators)
 *!   @endint
 */
#define f_is_wordchar_defined
ptrdiff_t f_is_wordchar_fun_num = 0;
/*
 * Implementation of Unicode.is_wordchar(): expects exactly one integer
 * argument on the Pike stack and replaces it with the integer returned
 * by unicode_is_wordchar() for that value.
 */
void f_is_wordchar(INT32 args)
{
#line 192 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  INT_TYPE c;
#line 192 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  if( args != 1 ) { wrong_number_of_args_error("is_wordchar",args,1); }
#line 192 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  if( Pike_sp[-1].type != PIKE_T_INT ) { SIMPLE_BAD_ARG_ERROR("is_wordchar",1,"int"); }
  c = Pike_sp[-1].u.integer;
  {
    {
      INT_TYPE kind = unicode_is_wordchar( c );

      pop_stack();
      push_int(kind);
      return;
    }
#line 196 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
  }

}
/*! @endmodule
 */

#line 201 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
PIKE_MODULE_INIT
{
  /*
   * Module start-up: register each function whose f_*_defined macro is
   * set, store the value returned by ADD_FUNCTION2 in the matching
   * *_fun_num variable, and finally call unicode_normalize_init().
   * All functions are registered with OPT_TRY_OPTIMIZE.
   */
  
  /* split_words: string -> array(string) */
#ifdef f_split_words_defined
  f_split_words_fun_num =
#line 73 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
    ADD_FUNCTION2("split_words", f_split_words, tFunc(tString,tArr(tString)), 0, OPT_TRY_OPTIMIZE);

#endif /* f_split_words_defined */

  /* split_words_and_normalize: string -> array(string) */
#ifdef f_split_words_and_normalize_defined
  f_split_words_and_normalize_fun_num =
#line 87 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
    ADD_FUNCTION2("split_words_and_normalize", f_split_words_and_normalize, tFunc(tString,tArr(tString)), 0, OPT_TRY_OPTIMIZE);

#endif /* f_split_words_and_normalize_defined */

  /* normalize: string, string -> string */
#ifdef f_normalize_defined
  f_normalize_fun_num =
#line 164 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
    ADD_FUNCTION2("normalize", f_normalize, tFunc(tString tString,tString), 0, OPT_TRY_OPTIMIZE);

#endif /* f_normalize_defined */

  /*
   * is_wordchar: the argument and return types are given as raw encoded
   * type strings.
   * NOTE(review): the bytes look like an int type with the range
   * 0x80000000..0x7fffffff spelled out -- confirm against the type
   * encoding used by the precompiler.
   */
#ifdef f_is_wordchar_defined
  f_is_wordchar_fun_num =
#line 192 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
    ADD_FUNCTION2("is_wordchar", f_is_wordchar, tFunc("\10\200\0\0\0\177\377\377\377","\10\200\0\0\0\177\377\377\377"), 0, OPT_TRY_OPTIMIZE);

#endif /* f_is_wordchar_defined */
#line 204 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
unicode_normalize_init();
}

PIKE_MODULE_EXIT
{
  /* Module shutdown hook: intentionally empty, no teardown is done here. */
  
#line 210 "/tmp/pikedeb.470504af38/7.6/src/post_modules/Unicode/unicode_module.cmod"
}

