/*
 *
 *   (C) Copyright IBM Corp. 2002, 2003
 *
 *   This program is free software;  you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
 *   the GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program;  if not, write to the Free Software
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * 	Module: ece.c
 */
#include <ece.h>
#include <sys/poll.h>
#include "eceinternal.h"

/*
 * Select how heartbeat input is obtained.  The HA_USE_FD branches in this
 * file use llc_ops->inputfd(), the HA_USE_IPC branches use
 * llc_ops->ipcchan().  HA_MAJOR.HA_MINOR below 1.2 selects the fd variant.
 */
#if (HA_MAJOR < 1) || ((HA_MAJOR == 1) && (HA_MINOR < 2))
#define HA_USE_FD
#else
#define HA_USE_IPC
#endif

/* NOTE:  ENGINE IS THE SLAVE AND DAEMON IS THE MASTER */
/* NOTE any variable that starts with gl_ is a global variable */


/* NOTE: the following data structure caches the crcs of the last
 * MAX_MEMB_HISTORY membership instances this node has seen.
 * It is needed to filter out messages sent by nodes in other
 * partitions, whose membership instance we have never seen.
 */
#define MAX_MEMB_HISTORY 4  /* we will track past 4 membership history through crc */
typedef struct  memb_crc_s {
	uint	memb_n_entry;/* total number of entries */
	uint	memb_next; /* next entry to be filled */
	u_int32_t*   memb_crc; /* array of cached crcs; store_crc() fills it
				  circularly (index wraps at MAX_MEMB_HISTORY) */
} memb_crc_t;


static memb_crc_t 		*gl_memb_crc = NULL; /* crc history, allocated lazily by store_crc() */
static ece_event_t 		*gl_ece_ev=NULL; /* latest full membership event (see store_membership()) */
static int 			gl_ece_ev_size=0; /* size in bytes of *gl_ece_ev */
static oc_ev_membership_t 	*gl_ece_oc=NULL; /* copy of the latest oc membership, adjusted for new clients */
static size_t 			gl_ece_oc_size; /* size in bytes of *gl_ece_oc */
static size_t 			gl_ece_quorum=0; /* quorum flag of the latest membership */
static gboolean			gl_memb_init = FALSE; /* TRUE once store_membership() has run */

static ece_mode_t 		gl_ece_mode; /* MASTER or SLAVE, set by global_init() */

static u_int32_t 		gl_ece_cor;  /* correlator to the client if
						the client does not give one */
static u_int32_t		gl_ece_ackval = 0; /* used to match acknowledgements
						      with the corresponding
						      sent message, before
						      unblocking the ece_send_msg()
						    */

static GSList 			*gl_ece_rlist=NULL; /* list of registercb_t registrations */
static gboolean 		gl_rlist_in_use=FALSE; /* guard it by lock; TRUE while
							   deliver_event() walks gl_ece_rlist */
static pthread_cond_t  		gl_rlist_cond =PTHREAD_COND_INITIALIZER; /* signalled when
							   gl_rlist_in_use drops to FALSE */

static pthread_mutex_t 		gl_ece_mutex; /* protects the membership state and gl_ece_rlist */
static pthread_t		gl_ece_thread,  /* the core ece thread */
				gl_ece_cb_thread; /* the thread to deliver event */

static GMainLoop       		*gl_ece_mainloop=NULL;
static GSList			*gl_ece_source=NULL; /* note: this linked list
					      need not be guarded by a lock
				      	      because it is never used
					      simultaneously by two threads
					      */

static IPC_Channel 		*gl_ece_peer=NULL; /* the peer's connection channel */

static ll_cluster_t		*gl_hb_handle=NULL; /* heartbeat client handle (see hb_register()) */
static oc_ev_t 			*gl_ccm_handle=NULL; /* ccm event handle (see ccm_register()) */


engine_functions_t 	*gl_ece_engine_funcs=NULL;/* the helper functions provided
						   by evms */
						   by evms */



/* A membership notification as handed around inside this file. */
typedef struct {
	gboolean quorum; /* quorum flag delivered with the membership */
	size_t  size; /* size in bytes of *oc */
	const 	oc_ev_membership_t * oc; /* membership data; not owned by this struct */
} memb_event_t;

/* One queued callback invocation (see enqueue_event() / free_event()). */
typedef struct event_s {
	ece_callback_class_t 	class; /* CALLBACK_MEMBERSHIP or CALLBACK_MESSAGE */
	size_t			size;		/* size passed to the callback */
	void			*data; /* ece_event_t or ece_msg_t, freed by free_event() */
} event_t;

/* Queue of event_t protected by its own mutex/condition pair. */
typedef struct eventq_s {
	GSList 	*head; /* first queued event, NULL when empty */
	pthread_mutex_t mutex; /* protects head */
	pthread_cond_t  cond; /* signalled by enqueue() after an append */
} eventq_t;

/* The single event queue drained by deliver_event(). */
static eventq_t gl_ev_q = {NULL,
	PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_COND_INITIALIZER};


/* Forward declarations of static functions that are used before their
 * definition further down in this file. */
static void source_delete_all(void);
static void mainloop_cleanup(void);
static void cleanup_membership(void);
static inline void cleanup_messaging(void);
static void   unregister_all(void);
static void deliver_memb(memb_event_t *);
static int hb_acknowledge(const char *, const char *,
		const char *, const char *);
static void cleanup_eventq(void);

/* Defined outside this file; called from ccm_register().
 * NOTE(review): presumably exported by the ccm client library - confirm. */
extern void oc_ev_special(const oc_ev_t *, oc_ev_class_t , int );



/*
 * Reset every piece of module state and record the operating mode.
 *
 * mode: value stored into gl_ece_mode once the old state has been torn down.
 */
static void
global_init(ece_mode_t mode)
{
	LOG_ENTRY();

	cleanup_membership();
	cleanup_messaging();
	cleanup_eventq();
	unregister_all();
	source_delete_all();
	/* mainloop_cleanup() switches on the *previous* gl_ece_mode, so it
	 * has to run before the assignment below. */
	mainloop_cleanup();
	frag_cleanup();
	condition_init();

	gl_ece_mode = mode;
	gl_ece_peer = NULL;

	LOG_EXIT_VOID();
}

/*
 * Tear down all module state by re-running global_init() with mode 0.
 * Guaranteed to be called only after the other thread has terminated.
 */
static void
global_cleanup(void)
{
	LOG_ENTRY();

	global_init(0);

	LOG_EXIT_VOID();
}



/* BEGIN  of source related functions */

/* Destructor signature for a tracked source; the return value is ignored
 * by source_delete(). */
typedef gboolean (*destroy_t)(gpointer data);
/* One entry of gl_ece_source: a source plus the function that tears it down. */
typedef struct source_s {
	destroy_t sr_func; /* function that destroys this source */
	gpointer  sr_source; /* opaque handle passed to sr_func */
} source_t;

/*
 * g_slist_foreach() callback: invoke the destroy hook of one tracked
 * source and release its bookkeeping record.  data2 is unused.
 */
static void
source_delete(gpointer data, gpointer data2)
{
	source_t *entry = (source_t *)data;
	destroy_t destroy = entry->sr_func;

	LOG_ENTRY();

	destroy(entry->sr_source);
	g_free(entry);

	LOG_EXIT_VOID();
}

/*
 * Remember a source together with the function that destroys it, so that
 * source_delete_all() can tear it down later.
 */
static void
source_add(destroy_t func, gpointer source)
{
	source_t *entry;

	LOG_ENTRY();

	entry = g_malloc(sizeof(source_t));
	entry->sr_source = source;
	entry->sr_func = func;
	gl_ece_source = g_slist_append(gl_ece_source, entry);

	LOG_EXIT_VOID();
}

static void
source_delete_all(void)
{
	LOG_ENTRY();
  	g_slist_foreach(gl_ece_source,
				source_delete, NULL);
  	g_slist_free(gl_ece_source);
	gl_ece_source = NULL;
	LOG_EXIT_VOID();
}

/* END  of source related functions */

static void
mainloop_cleanup(void)
{
	LOG_ENTRY();
	switch(gl_ece_mode) {
	case MASTER:
		if(gl_ece_mainloop) {
			g_main_quit(gl_ece_mainloop);
			g_main_destroy(gl_ece_mainloop);
		}
		gl_ece_mainloop = NULL;
		break;

	case SLAVE:
		if(GLIB_CHECK_VERSION(2,0,0)) {
			if(gl_ece_mainloop) {
				g_main_quit(gl_ece_mainloop);
				g_main_destroy(gl_ece_mainloop);
			}
			gl_ece_mainloop = NULL;
		} else {
			ece_main_destroy();
		}
		break;
	}

	LOG_EXIT_VOID();
}




/*  		START OF FUNCTION			    */
/* set of functions that deal with the last membership info */


/*
 * Release the crc history (the array and its descriptor), if any.
 */
static void
cleanup_memb_crc()
{
	LOG_ENTRY();

	if (gl_memb_crc != NULL) {
		/* g_free() accepts NULL, so the array needs no separate check */
		g_free(gl_memb_crc->memb_crc);
		g_free(gl_memb_crc);
		gl_memb_crc = NULL;
	}

	LOG_EXIT_VOID();
}

/*
 * Compute the crc that identifies a membership instance.  The crc covers
 * the entry count, the transition id and the name of every node, in that
 * order.
 */
static u_int32_t
get_crc(ece_event_t *ece_ev)
{
	int count = ece_ev->num_entries;
	int trans = ece_ev->transid;
	int idx;
	u_int32_t sum;

	LOG_ENTRY();

	sum = CALC_CRC(0xffffffff, (void *)&count, sizeof(int));
	sum = CALC_CRC(sum, (void *)&trans, sizeof(int));

	idx = 0;
	while (idx < count) {
		sum = CALC_CRC(sum, (void *)&(ece_ev->node[idx]),
				strlen((char *)&(ece_ev->node[idx])));
		idx++;
	}

	LOG_EXIT_INT(sum);
	return sum;
}


/*
 * Record a crc in the history, overwriting the oldest slot once
 * MAX_MEMB_HISTORY entries have been stored.  The history is allocated
 * on first use.
 */
static void
store_crc(u_int32_t crc)
{
	int slot;

	LOG_ENTRY();

	if (!gl_memb_crc) {
		gl_memb_crc = g_malloc(sizeof(memb_crc_t));
		gl_memb_crc->memb_crc =
			(u_int32_t *)g_malloc0(MAX_MEMB_HISTORY*sizeof(u_int32_t));
		gl_memb_crc->memb_next = 0;
		gl_memb_crc->memb_n_entry = MAX_MEMB_HISTORY;
	}

	slot = gl_memb_crc->memb_next;
	gl_memb_crc->memb_crc[slot] = crc;
	gl_memb_crc->memb_next = (slot+1)%MAX_MEMB_HISTORY;

	LOG_EXIT_VOID();
}

/*
 * Return the most recently stored crc, i.e. the slot just before
 * memb_next.  The history must already exist; gl_memb_crc is
 * dereferenced without a check.
 */
static u_int32_t
get_curr_crc()
{
	int latest;
	u_int32_t value;

	LOG_ENTRY();

	latest = (gl_memb_crc->memb_next+
		MAX_MEMB_HISTORY-1)%MAX_MEMB_HISTORY;
	value = gl_memb_crc->memb_crc[latest];

	LOG_EXIT_INT(value);
	return value;
}

/* check if the given crc matches the ones that are
 * cached
 */
static gboolean
memb_check_crc(u_int32_t crc)
{
	int i;
	LOG_ENTRY();
	if(!gl_memb_crc) {
		LOG_EXIT_BOOL(FALSE);
		return FALSE;
	}
	for(i=0; i < MAX_MEMB_HISTORY; i++) {
		if(gl_memb_crc->memb_crc[i]==crc){
			LOG_EXIT_BOOL(TRUE);
			return TRUE;
		}
	}
	LOG_EXIT_BOOL(FALSE);
	return FALSE;
}


/*
 * Copy the current membership event into the caller's buffer.
 *
 * outev->num_entries must hold the capacity (in entries) on entry.
 * Returns 0 on success, EAGAIN when no membership is known yet
 * (num_entries is set to 0), or ENOSPC when the buffer is too small
 * (num_entries is set to the required count).
 */
static int
get_membership(ece_event_t *outev)
{
	int needed;

	LOG_ENTRY();

	if (!gl_ece_ev) {
            	LOG_WARNING("Error: Membership not initialized yet.  Try again.\n");
		outev->num_entries = 0;
		LOG_EXIT_INT(EAGAIN);
		return EAGAIN;
	}

	needed = gl_ece_ev->num_entries;
	if (outev->num_entries < needed) {
		outev->num_entries = needed;
		LOG_EXIT_INT(ENOSPC);
		return ENOSPC;
	}

	memcpy(outev, gl_ece_ev, gl_ece_ev_size);

	LOG_EXIT_INT(0);
	return 0;
}


/*
 * Cache the membership carried by eventdata as the module's current
 * membership: the ece event form (gl_ece_ev), a copy of the oc form
 * adjusted for late-joining clients (gl_ece_oc), the crc of the
 * membership and the quorum flag.
 */
static void
store_membership(memb_event_t *eventdata)
{
	ece_event_t *ece_ev;
	int size;

	LOG_ENTRY();

	/* validate the input before anything dereferences it; the original
	 * asserted only after eventdata->oc had already been used */
	ECE_ASSERT(eventdata->oc && eventdata->size>=0);

	ece_ev = create_ece_ev(eventdata->quorum, eventdata->oc,
			MEMBERSHIP, &size);
	if(gl_ece_ev) {
		delete_ece_ev(gl_ece_ev);
	}
	gl_ece_ev = ece_ev;
	gl_ece_ev_size = size;

	/* note this datastructure is used to send
	 * current membership information to new connecting
	 * clients. Hence the lost members and new members
	 * do not make any sense to the new clients,
	 * since they won't have the initial membership
	 * to which these correspond. Hence
	 * zero out the lost members, and make the
	 * new members the same as the current membership */
	g_free(gl_ece_oc);
	gl_ece_oc = g_memdup(eventdata->oc, eventdata->size);
	gl_ece_oc->m_n_out=0;
	gl_ece_oc->m_n_in= gl_ece_oc->m_n_member;
	gl_ece_oc->m_in_idx = gl_ece_oc->m_memb_idx;
	gl_ece_oc_size =  eventdata->size;


	/* store the crc of this membership */
	store_crc(get_crc(ece_ev));

	gl_ece_quorum = eventdata->quorum;
	gl_memb_init = TRUE;

	LOG_EXIT_VOID();
}

/*
 * Forget the cached membership: free both cached forms, reset sizes,
 * quorum and the "initialized" flag, and drop the crc history.
 */
static void
cleanup_membership(void)
{
	LOG_ENTRY();

	g_free(gl_ece_ev);
	gl_ece_ev = NULL;
	gl_ece_ev_size = 0;

	g_free(gl_ece_oc);
	gl_ece_oc = NULL;
	gl_ece_oc_size = 0;

	gl_ece_quorum = 0;
	gl_memb_init = FALSE;

	cleanup_memb_crc();

	LOG_EXIT_VOID();
}


/*
 * Reset the messaging counters: the default correlator restarts at 1
 * and the acknowledgement value at 0.
 */
static inline void
cleanup_messaging(void)
{
	LOG_ENTRY();

	gl_ece_ackval = 0;
	gl_ece_cor = 1;

	LOG_EXIT_VOID();
}


/* BEGIN OF REGISTRATION RELATION FUNCTIONS */

/* registration tracking data structure: one entry of gl_ece_rlist */
typedef struct registercb_s {
	IPC_Channel 		*reg_chanl;   /* channel; set to NULL by ece_register_callback() */
	ece_callback_type_t 	reg_type;     /* DELTAS or FULL_MEMBERSHIP (see deliver()) */
	ece_cb_t	 	reg_callback; /* function invoked for each delivered event */
} registercb_t;


/*
 * g_slist_find_custom() comparator: a is a registercb_t, b the callback
 * being searched for.  Returns 0 when the registration uses that
 * callback and 1 otherwise.
 */
static gint
cb_find_func(gconstpointer a, gconstpointer b)
{
	const registercb_t *registercb = (const registercb_t *)a;
	const ece_cb_t ece_cb = (const ece_cb_t)b;
	gint mismatch;

	LOG_ENTRY();

	mismatch = (registercb->reg_callback == ece_cb) ? 0 : 1;

	LOG_EXIT_INT(mismatch);
	return mismatch;
}


/*
 * Release the memory of one registration instance.  Doubles as a
 * g_slist_foreach() callback; userdata is expected to be NULL.
 */
static void
register_cleanup(gpointer data, gpointer userdata)
{
	LOG_ENTRY();

	/* only the registration record itself is owned here */
	g_free(data);

	/* callers never pass a user argument */
	ECE_ASSERT(userdata==NULL);

	LOG_EXIT_VOID();
}


/*
 * Remove the registration that uses callback cb.
 *
 * Waits until the delivery thread is not walking the registration list.
 * Returns 0 on success or EINVAL when cb is not registered.
 */
static int
ece_unregister_callback(ece_cb_t cb)
{
	registercb_t *victim = NULL;
	GSList *node;
	gboolean found;

	LOG_ENTRY();

   	pthread_mutex_lock(&gl_ece_mutex);
	while (gl_rlist_in_use) {
		pthread_cond_wait(&gl_rlist_cond, &gl_ece_mutex);
	}
	node = g_slist_find_custom(gl_ece_rlist, cb, cb_find_func);
	found = (node != NULL);
	if (found) {
		victim = g_slist_nth_data(node, 0);
		gl_ece_rlist = g_slist_remove(gl_ece_rlist, victim);
	}
   	pthread_mutex_unlock(&gl_ece_mutex);

	if (!found) {
		LOG_EXIT_INT(EINVAL);
		return EINVAL;
	}

	/* the record is off the list now; free it outside the lock */
	ECE_ASSERT(victim && victim->reg_callback == cb);
	register_cleanup(victim, NULL);

	LOG_EXIT_INT(0);
	return 0;
}

/*
 * Register callback cb for events of the given type.
 *
 * If a membership is already known, cb is first invoked once with a
 * private snapshot of it (class CALLBACK_MEMBERSHIP).  The registration
 * is then appended to gl_ece_rlist once the delivery thread is not
 * walking that list.  Always returns 0.
 */
static int
ece_register_callback(ece_callback_type_t type, ece_cb_t cb)
{
	registercb_t *registercb;
	ece_event_t *ece_ev=NULL;
	size_t		size=0;

	LOG_ENTRY();
	/* open a new registration instance */
	registercb = (registercb_t *)g_malloc(sizeof(registercb_t));

	registercb->reg_type = type;
	registercb->reg_callback =   cb;
	registercb->reg_chanl =  NULL;

	/*
	 * hold a thread lock, because this can interfere
	 * with the ece thread; the snapshot is taken under the
	 * lock so the callback can run without it
	 */
   	pthread_mutex_lock(&gl_ece_mutex);
	if(gl_memb_init){
		ece_ev = g_malloc(gl_ece_ev_size);
		size = gl_ece_ev_size;
		memcpy(ece_ev, gl_ece_ev, gl_ece_ev_size);
	}
   	pthread_mutex_unlock(&gl_ece_mutex);

	if(ece_ev) {
		registercb->reg_callback(CALLBACK_MEMBERSHIP,
				size, ece_ev);
		/* callbacks do not own the event data (deliver_event()
		 * frees it after the callbacks return), so the snapshot
		 * must be released here; it used to be leaked */
		g_free(ece_ev);
	}

   	pthread_mutex_lock(&gl_ece_mutex);
	while(gl_rlist_in_use){
		pthread_cond_wait(&gl_rlist_cond, &gl_ece_mutex);
	}
	gl_ece_rlist = g_slist_append(gl_ece_rlist, registercb);
   	pthread_mutex_unlock(&gl_ece_mutex);

	LOG_EXIT_INT(0);
	return 0;
}

static void
unregister_all(void)
{
	LOG_ENTRY();
	g_slist_foreach(gl_ece_rlist, register_cleanup, NULL);
	g_slist_free(gl_ece_rlist);
	gl_ece_rlist = NULL;
	LOG_EXIT_VOID();
}


/* END OF REGISTRATION RELATION FUNCTIONS */

/*  		BEGIN OF FUNCTIONS 				
 * that distribute the membership events to the registered
 * clients instances
 */

/*
 * Hand an acknowledgement to condition_check(): orig is the acknowledging
 * node, ackstr and retstr are the decimal ack value and return code.
 */
static inline void
process_ack(const char *orig, const char *ackstr, const char *retstr)
{
	int ack;
	int status;

	LOG_ENTRY();
	LOG_DEBUG("%s %s %s\n", orig, ackstr, retstr);

	ack = atoi(ackstr);
	status = atoi(retstr);
	condition_check(orig, ack, status);

	LOG_EXIT_VOID();
}

/*
 * Process the status of the ece on another node of the cluster.
 *
 * orig:   node the status message came from
 * status: LEAVESTATUS, JOINSTATUS or ECE_JOINSTATUS
 *
 * A leave marks the node's ece as down, fails pending acknowledgements
 * from it and drops its message fragments.  A join marks it as up; the
 * master additionally answers a join from another node with
 * ECE_JOINSTATUS so that the joiner learns about this ece.
 */
static void
process_ece_status(const char *orig, const char *status)
{
	struct ha_msg *h_msg;

	LOG_ENTRY();

	LOG_DEBUG("%s %s\n", orig, status);

	if(strcmp(status, LEAVESTATUS) == 0) {
		llm_set_ece_status(orig, FALSE);
		process_ack(orig, "0", "112" /* EHOSTDOWN */);
		 /* clean any message fragments
		  * from that node
		  */
		frag_clean(orig);
		LOG_EXIT_VOID();
		return;
	} else if(strcmp(status, JOINSTATUS) == 0) {
		llm_set_ece_status(orig, TRUE);
		if(strcmp(orig, llm_getmynodeid())==0){
			/* our own join needs no reply */
			LOG_EXIT_VOID();
			return;
		}
	} else if(strcmp(status, ECE_JOINSTATUS) == 0) {
		llm_set_ece_status(orig, TRUE);
		LOG_EXIT_VOID();
		return;
	}

	if(gl_ece_mode == SLAVE) {
		LOG_EXIT_VOID();
		return;
	}
	/* when ece registers with heartbeat, other ece's running
	 * on other nodes of the cluster receive the join message.
	 * however the joining ece is not told about the other eces
	 * that have already registered with heartbeat. This is a bug
	 * in heartbeat daemon. The following workaround fixes it.
	 *
	 * this patch sends a message back to the newly joined ece,
	 * informing it about our presence.
	 */
	if ((h_msg=ha_msg_new(0)) == NULL) {
		LOG_SERIOUS("Cannot respond to a join message\n");
		/* keep the entry/exit trace balanced on this path too */
		LOG_EXIT_VOID();
		return;
	}
	if ((ha_msg_add(h_msg, F_TYPE, T_APICLISTAT) == HA_FAIL)
	||(ha_msg_add(h_msg, F_STATUS, ECE_JOINSTATUS) == HA_FAIL)) {
		LOG_SERIOUS("Cannot respond to a join message\n");
	} else {
		hb_send_to_node(h_msg, orig);
	}

	ha_msg_del(h_msg);

	LOG_EXIT_VOID();
}

/*
 * A client died on the given node: discard the message fragments
 * collected from that node so far.
 */
static void
process_client_dead(const char *node)
{
	LOG_ENTRY();

	LOG_DEBUG("Dead node: %s\n", node);
	frag_clean(node);

	LOG_EXIT_VOID();
}

/*
 * recover from the nodes that left the membership.
 */
static void
process_leave_nodes(const oc_ev_membership_t *oc)
{
	int i,	size;
	ece_event_t *ece_ev;

	LOG_ENTRY();
	ece_ev = create_ece_ev(0/*dont care*/, oc, DELTA_LEAVE, &size);
	for(i=0; i< ece_ev->num_entries;  i++) {
		process_ack((char *)(ece_ev->node+i), "0", "112" /* EHOSTDOWN */);
		/* also cleanup all fragments received from that node */
		frag_clean((char *)(ece_ev->node+i));
	}
	delete_ece_ev(ece_ev);
	LOG_EXIT_VOID();
}


/*
 * Publish a new membership.  The master first forwards it to the slave
 * as a membership string; then, under gl_ece_mutex, the membership is
 * cached, the membership events are queued for the registered callbacks
 * and the departed nodes are recovered from.
 */
static void
reg_deliver_all_memb(gboolean quorum, size_t size, const oc_ev_membership_t *oc)
{
	memb_event_t notice;

	LOG_ENTRY();

	notice.oc     = oc;
	notice.size   = size;
	notice.quorum = quorum;

	if (gl_ece_mode == MASTER) {
		/* create the membership message and send it to the slave */
		char *wire = oc2membstr(quorum, oc, size);

		peer_send(wire, strlen(wire)+1, gl_ece_peer);
	}

	/*
	 * the ece thread and the registration functions touch the same
	 * state, hence the lock around the whole update
	 */
   	pthread_mutex_lock(&gl_ece_mutex);
	store_membership(&notice);
	/* place the membership events into the delivery queue */
  	deliver_memb(&notice);
	process_leave_nodes(oc);
   	pthread_mutex_unlock(&gl_ece_mutex);

	LOG_EXIT_VOID();
}


/*
 * Thread body: launch the failover script DEACTIVATE_PATH in a child
 * process and wait for it to exit.  arg is unused; always returns NULL.
 */
static void *
recovery(void * arg)
{
	char * argv[4];
	pid_t pid;

	LOG_ENTRY();

	pid = fork();
	switch(pid) {
		/* error */
	        case -1:
			LOG_SERIOUS("Cannot run %s.  "
				    "fork() failed with error code %d: %s.\n",
				    DEACTIVATE_PATH, errno, strerror(errno));
			break;

                /* child */
	        case 0:
			argv[0] = DEACTIVATE_PATH;
			argv[1] = "dontcare";
			argv[2] = "startspecial";
			argv[3] = NULL;

			/* execvp() returns only when it failed */
			execvp(argv[0], argv);
			LOG_SERIOUS("Cannot run %s. "
				    "execvp() failed with error code %d: %s.\n",
				    DEACTIVATE_PATH, errno, strerror(errno));
			/* report the failure through the exit status
			 * (127 is the shell convention for "could not
			 * execute") instead of claiming success */
			_exit(127);

                /* parent */
		default:
			/* Wait for the child to exit and cleanup after it.
			 * A signal may interrupt the wait; retry so the
			 * child is always reaped. */
			while(waitpid(pid, NULL, 0) == -1 && errno == EINTR) {
				continue;
			}
			break;
	}

	LOG_EXIT_VOID();
	return NULL;
}

/* Spawn the recovery script.  ECE takes the responsibility of deactivating
 * shared volumes on quorum loss on behalf of the cluster manager.
 *
 * The script is run only if it is a regular file, executable by its
 * owner, not writable by group or others, and owned by uid 0 / gid 0.
 */
static void
spawn_recovery(void)
{
	struct stat buf;
	pthread_t tid;
	int rc;

	LOG_ENTRY();

	/* Check if the deactivate script exists and if it does, check if it is
	 * executable and owned by root and writable only by root.  Note we
	 * don't want to introduce insecurity in our system.
	 */
	if(stat(DEACTIVATE_PATH, &buf) ||
		!S_ISREG(buf.st_mode) ||
		!(buf.st_mode&(S_IXUSR)) ||
		(buf.st_mode&(S_IWGRP|S_IWOTH))  ||
		(buf.st_uid != 0) ||
		(buf.st_gid != 0)) {
		LOG_SERIOUS("Cannot run %s.  Check the permissions on the script.\n", DEACTIVATE_PATH);
		LOG_EXIT_VOID();
		return;
	}

	/* Launch the recovery process on another thread so that this thread
	 * can resume handling events.
	 */
	rc = pthread_create(&tid, NULL, recovery, NULL);
	if(rc != 0) {
		/* tid is undefined when creation fails; it must not be
		 * handed to pthread_detach() */
		LOG_SERIOUS("Cannot run %s.  "
			    "pthread_create() failed with error code %d: %s.\n",
			    DEACTIVATE_PATH, rc, strerror(rc));
		LOG_EXIT_VOID();
		return;
	}
	pthread_detach(tid);

	LOG_EXIT_VOID();
}


/*
 * Append an event to the tail of queue q and signal its condition so a
 * thread blocked in dequeue() can pick it up.
 */
static void
enqueue(eventq_t *q, void *event)
{
	pthread_mutex_t *lock = &(q->mutex);

	LOG_ENTRY();

	pthread_mutex_lock(lock);
	q->head = g_slist_append(q->head, event);
	pthread_cond_signal(&(q->cond));
	pthread_mutex_unlock(lock);

	LOG_EXIT_VOID();
}

/*
 * Remove and return the first event of queue q, blocking while the
 * queue is empty.
 */
static void *
dequeue(eventq_t *q)
{
	pthread_mutex_t *lock = &(q->mutex);
	void *first;

	LOG_ENTRY();

	pthread_mutex_lock(lock);
	while (q->head == NULL) {
		pthread_cond_wait(&(q->cond), lock);
	}
	first = g_slist_nth_data(q->head, 0);
	q->head = g_slist_remove(q->head, first);
	pthread_mutex_unlock(lock);

	LOG_EXIT_PTR(first);
	return first;
}

/*
 * Wrap (class, size, data) into a freshly allocated event_t and queue
 * it on gl_ev_q.  free_event() later releases the wrapper and data.
 */
static void
enqueue_event(ece_callback_class_t class, size_t size, void *data)
{
	event_t *wrapped = (event_t *)g_malloc(sizeof(event_t));

	LOG_ENTRY();

	wrapped->data = data;
	wrapped->size = size;
	wrapped->class = class;
	enqueue(&gl_ev_q, (void *)wrapped);

	LOG_EXIT_VOID();
}


/*
 * g_slist_foreach() callback: offer one event (userdata) to one
 * registration (data).
 *
 * Messages go to every registration.  Membership events are filtered by
 * registration type: DELTAS registrations get every event whose type is
 * not MEMBERSHIP, FULL_MEMBERSHIP registrations get only MEMBERSHIP
 * events.  Must be called while the registration list is protected
 * against modification.
 */
static void
deliver(gpointer data, gpointer userdata)
{
	registercb_t *registercb = (registercb_t *)data;
	event_t   *event = (event_t *)userdata;
	ece_event_t *ece_ev = (ece_event_t *)event->data;
	gboolean wanted = FALSE;

	LOG_ENTRY();

	if (event->class == CALLBACK_MESSAGE) {
		wanted = TRUE;
	} else if (event->class == CALLBACK_MEMBERSHIP) {
		if (registercb->reg_type == DELTAS) {
			wanted = (ece_ev->type != MEMBERSHIP);
		} else if (registercb->reg_type == FULL_MEMBERSHIP) {
			wanted = (ece_ev->type == MEMBERSHIP);
		}
	}

	if (wanted) {
		registercb->reg_callback(event->class,
			event->size, event->data);
	}

	LOG_EXIT_VOID();
}

/*
 * Release a queued event.  A message event also owns the payload its
 * ece_msg_t points to; both event classes own event->data and the
 * wrapper.  An event of any other class is left untouched.
 */
static void
free_event(event_t *event)
{
	gboolean is_message = (event->class == CALLBACK_MESSAGE);
	gboolean is_membership = (event->class == CALLBACK_MEMBERSHIP);

	LOG_ENTRY();

	if (is_message) {
		ece_msg_t *ecemsg = (ece_msg_t *)event->data;

		g_free(ecemsg->msg);
	}
	if (is_message || is_membership) {
		g_free(event->data);
		g_free(event);
	}

	LOG_EXIT_VOID();
}

/*
 * Empty gl_ev_q, freeing every pending event, and re-create its mutex
 * and condition variable.  No locking is done here.
 */
static void
cleanup_eventq()
{
	event_t *pending;

	LOG_ENTRY();

	for (;;) {
		pending = g_slist_nth_data(gl_ev_q.head, 0);
		if (!pending) {
			break;
		}
		gl_ev_q.head = g_slist_remove(gl_ev_q.head, pending);
		free_event(pending);
	}

	pthread_mutex_destroy(&(gl_ev_q.mutex));
	pthread_mutex_init(&(gl_ev_q.mutex), NULL);
	pthread_cond_destroy(&(gl_ev_q.cond));
	pthread_cond_init(&(gl_ev_q.cond), NULL);

	LOG_EXIT_VOID();
}

/*
 * Thread body of the delivery thread: take events off gl_ev_q one at a
 * time and offer each to every registered callback.  The loop ends when
 * dequeue() returns NULL.  arg is unused; always returns NULL.
 *
 * While the registration list is being walked, gl_rlist_in_use is TRUE
 * and the register/unregister functions wait on gl_rlist_cond, so the
 * callbacks run without gl_ece_mutex held.
 */
static void *
deliver_event(void *arg)
{
	event_t *event;
	LOG_ENTRY();
	while((event = (event_t *)dequeue(&gl_ev_q))){
		ece_event_t *ece_ev = (ece_event_t *)event->data;

		switch (event->class) {
			case CALLBACK_MEMBERSHIP:
				LOG_DEBUG("Deliver membership.\n");
				break;
			case CALLBACK_MESSAGE:
				LOG_DEBUG("Deliver message.\n");
				break;
		}
		pthread_mutex_lock(&gl_ece_mutex);
		gl_rlist_in_use=TRUE;
		pthread_mutex_unlock(&gl_ece_mutex);

		g_slist_foreach(gl_ece_rlist, deliver,
				(gpointer)event);


		pthread_mutex_lock(&gl_ece_mutex);
		gl_rlist_in_use=FALSE;
		/* several threads may be waiting to register or unregister;
		 * wake them all, since a single signal would leave the
		 * others blocked until the next event is delivered */
		pthread_cond_broadcast(&gl_rlist_cond);
		pthread_mutex_unlock(&gl_ece_mutex);

		/* check if a recovery has to be spawned; ece_ev is only
		 * meaningful (and only read) for membership events */
		if(gl_ece_mode==MASTER 		 	 &&
		    event->class == CALLBACK_MEMBERSHIP  &&
		    ece_ev->type == MEMBERSHIP) {
			spawn_recovery();
		}

		free_event(event);
	}
	LOG_EXIT_PTR(NULL);
	return NULL;
}


/*
 * Build an ece_msg_t from the textual fields of a received message and
 * queue it for delivery as a CALLBACK_MESSAGE event.
 *
 * orig:     name of the sending node
 * corr_str: correlator, as a decimal string
 * cmd_str:  command, as a decimal string
 * data_str: payload in base64; it is decoded to binary before delivery
 */
static void
fill_and_deliver_ece_msg(const char *orig,
		const char *corr_str,
		const char *cmd_str,
		const char *data_str)
{
	ece_msg_t *ecemsg;
	int 	len, outbytes;
	void 	*payload;

	LOG_ENTRY();

	/* the node name, including its terminator, must fit the id field */
	len = strlen(orig)+1;
	ECE_ASSERT(len<sizeof(ece_nodeid_t));

	ecemsg = (ece_msg_t *)g_malloc0(sizeof(ece_msg_t));
	memcpy(&(ecemsg->node), orig, len);
	ecemsg->corrolator = atoi(corr_str);
	ecemsg->cmd = atoi(cmd_str);

	/* decode the payload into a buffer sized for the worst case */
	len = strlen(data_str);
	outbytes = B64_maxbytelen(len);
	payload = (void *)g_malloc0(outbytes);
	ecemsg->size = base64_to_binary(data_str, len,
			payload, outbytes);
	ecemsg->msg = payload;

	enqueue_event(CALLBACK_MESSAGE, sizeof(ece_msg_t),
			(void*)ecemsg);

	LOG_EXIT_VOID();
}



/*
 * Queue the callback events for a new membership: the join delta, the
 * leave delta and the full membership, in that order.  A form for which
 * create_ece_ev() returns NULL is skipped.  Must be called holding the
 * global lock.
 */
static void
deliver_memb(memb_event_t *eventdata)
{
	const oc_ev_membership_t *oc = eventdata->oc;
	gboolean quorum = eventdata->quorum;
	ece_event_t *form;
	int 	size;

	LOG_ENTRY();

	form = create_ece_ev(quorum, oc, DELTA_JOIN, &size);
	if (form != NULL) {
		enqueue_event(CALLBACK_MEMBERSHIP, size, (void *)form);
	}

	form = create_ece_ev(quorum, oc, DELTA_LEAVE, &size);
	if (form != NULL) {
		enqueue_event(CALLBACK_MEMBERSHIP, size, (void *)form);
	}

	form = create_ece_ev(quorum, oc, MEMBERSHIP, &size);
	if (form != NULL) {
		enqueue_event(CALLBACK_MEMBERSHIP, size, (void *)form);
	}

	LOG_EXIT_VOID();
}


/*  		END OF FUNCTIONS 				
 * that distribute the membership events to the registered
 * clients instances
 */



/*  CCM REGISTER FUNCTIONS BEGIN */


/*
 * ccm membership callback.  A new membership is published with quorum,
 * an invalid membership without quorum; other events are ignored.  The
 * event is always acknowledged with oc_ev_callback_done().
 */
static void
ccm_events(oc_ed_t event, void *cookie,
		size_t size, const void *data)
{
	const oc_ev_membership_t *membership = (const oc_ev_membership_t *)data;

	LOG_ENTRY();

	if (event == OC_EV_MS_NEW_MEMBERSHIP) {
		reg_deliver_all_memb(1, size, membership);
	} else if (event == OC_EV_MS_INVALID) {
		reg_deliver_all_memb(0, size, membership);
	}

	oc_ev_callback_done(cookie);

	LOG_EXIT_VOID();
}



/*
 * Connect to the ccm daemon: register, install ccm_events() for the
 * membership class and activate the handle.
 *
 * Returns the file descriptor reported by oc_ev_activate(), or -1 on
 * failure.  After a failed callback installation or activation the
 * handle is unregistered again.
 */
static int
ccm_register(void)
{
	int my_ev_fd;

	LOG_ENTRY();

	if (oc_ev_register(&gl_ccm_handle)) {
		LOG_EXIT_INT(-1);
		return -1;
	}

	if (oc_ev_set_callback(gl_ccm_handle,
			OC_EV_MEMB_CLASS,
			ccm_events, NULL)) {
		goto undo_register;
	}

	oc_ev_special(gl_ccm_handle, OC_EV_MEMB_CLASS, 0/*don't care*/);

	if (oc_ev_activate(gl_ccm_handle, &my_ev_fd)) {
		goto undo_register;
	}

	LOG_EXIT_INT(my_ev_fd);
	return my_ev_fd;

undo_register:
	oc_ev_unregister(gl_ccm_handle);
	LOG_EXIT_INT(-1);
	return -1;
}

/*
 * Disconnect from the ccm daemon; returns what oc_ev_unregister() returns.
 */
static inline int
ccm_unregister(void)
{
	int status;

	LOG_ENTRY();

	status = oc_ev_unregister(gl_ccm_handle);

	LOG_EXIT_INT(status);
	return status;
}

/*
 * Input callback for the ccm descriptor: let the ccm library handle the
 * pending event.  Returns FALSE when oc_ev_handle_event() reports a
 * failure and TRUE otherwise.  fd and user_data are unused.
 */
static gboolean
ccm_input_dispatch(int fd, gpointer user_data)
{
	gboolean keep_going;

	LOG_ENTRY();

	keep_going = oc_ev_handle_event(gl_ccm_handle) ? FALSE : TRUE;

	LOG_EXIT_BOOL(keep_going);
	return keep_going;
}

/*
 * Destroy callback for the ccm input source: disconnect from ccm.
 * user_data is unused.
 */
static void
ccm_input_destroy(gpointer user_data)
{
	LOG_ENTRY();

	ccm_unregister();

	LOG_EXIT_VOID();
}
/*  CCM REGISTER FUNCTIONS END */








/*  HEARTBEAT REGISTER FUNCTIONS BEGIN */
/*
 * Drop the low level membership, sign off from heartbeat and forget the
 * handle.  Returns the result of the signoff call.  gl_hb_handle is
 * dereferenced without a check, so it must be set on entry.
 */
static int
hb_unregister(void)
{
	int status;

	LOG_ENTRY();

	llm_cleanup();
	status = gl_hb_handle->llc_ops->signoff(gl_hb_handle);
	gl_hb_handle = NULL;

	LOG_EXIT_INT(status);
	return status;
}


/*
 * Register with heartbeat as client "evms" and build the low level
 * membership from heartbeat's node list ("normal" nodes only).
 *
 * Returns heartbeat's IPC channel (HA_USE_IPC) or input file descriptor
 * (HA_USE_FD).  On failure NULL (IPC) or -1 (fd) is returned; if the
 * failure happened after the handle was created, the client is signed
 * off and the low level membership is cleaned up.
 */
#ifdef HA_USE_IPC
static IPC_Channel *
hb_register(void)
#else
static int
hb_register(void)
#endif
{
	const char *node;
	const char *hname;
#ifdef HA_USE_IPC
	IPC_Channel * hb_ch = NULL;
#else
	int           hb_fd = -1;
#endif

	/* register with heartbeat; the failure value stays in
	 * hb_ch / hb_fd until the very last step succeeds
	 */
	LOG_ENTRY();
	gl_hb_handle = ll_cluster_new("heartbeat");

	if(!gl_hb_handle) {
		goto exit;
	}

	if (gl_hb_handle->llc_ops->signon(gl_hb_handle, "evms")!= HA_OK) {
		goto signoff;
	}

	if((hname = gl_hb_handle->llc_ops->get_mynodeid(gl_hb_handle)) == NULL) {
		goto signoff;
	}

        if (gl_hb_handle->llc_ops->init_nodewalk(gl_hb_handle) != HA_OK) {
		goto signoff;
	}
	llm_init();
	while((node = gl_hb_handle->llc_ops->nextnode(gl_hb_handle))!= NULL) {
		/* ignore non normal nodes */
		if(strcmp(gl_hb_handle->llc_ops->node_type(gl_hb_handle, node),
				"normal") != 0) {
			if(strcmp(node,hname) == 0) {
				/* the two literals are concatenated; the
				 * separating space was missing */
				LOG_SERIOUS("heartbeat inconsistency "
					"running on a ping node\n");
				goto signoff;
			}
		       	continue;
		}

		/* add the node to the low level membership list */
		llm_add(node,strlen(node));
	}
	llm_end(hname);
        if (gl_hb_handle->llc_ops->end_nodewalk(gl_hb_handle) != HA_OK) {
		goto signoff;
	}

#ifdef HA_USE_IPC
	hb_ch = gl_hb_handle->llc_ops->ipcchan(gl_hb_handle);
#else
	hb_fd = gl_hb_handle->llc_ops->inputfd(gl_hb_handle);
#endif

signoff:
	/* reached on success as well; only undo the signon when no
	 * channel / descriptor was obtained */
#ifdef HA_USE_IPC
	if (hb_ch == NULL) {
#else
	if (hb_fd == -1) {
#endif
		gl_hb_handle->llc_ops->signoff(gl_hb_handle);
		llm_cleanup();
	}
exit:
#ifdef HA_USE_IPC
	LOG_EXIT_PTR(hb_ch);
	return hb_ch;
#else
	LOG_EXIT_INT(hb_fd);
	return hb_fd;
#endif
}


static gboolean
handle_msg(struct ha_msg *h_msg)
{
	const char *type, *cmd_str, *cntl_str,
		 *orig, *corr_str, *ret_str;
	const char	*str;
	char 	 	*data_str;
	int ret, ackval, flen, tlen;
	gboolean memb_context_ok;
	u_int32_t crc;
	ece_mode_t who;
	gboolean mcast;

	LOG_ENTRY();
	type = ha_msg_value(h_msg, F_TYPE);
	ECE_ASSERT(type);

	if(strcmp(type, ECE_ACKMSG)==0){
		const char *ackstr, *retstr;
		const char *who_str = ha_msg_value(h_msg, ECE_WHO);
		if(strcmp(who_str, ECE_MASTER)==0) {
			/*this acknowledgement is for the master */
			ECE_ASSERT(orig = ha_msg_value(h_msg, F_ORIG));
			ECE_ASSERT(ackstr = ha_msg_value(h_msg, ECE_ACK));
			ECE_ASSERT(retstr = ha_msg_value(h_msg, ECE_RET));
			msg_track(TRUE, NULL, ackstr, TRUE, DFR);
			process_ack(orig, ackstr, retstr);
		} else {
			/*this acknowledgement is for the slave */
			ECE_ASSERT(ackstr = ha_msg_value(h_msg, ECE_ACK));
			msg_track(TRUE, NULL, ackstr, TRUE, DRTS);
			str = msg2string(h_msg);
			peer_send((void *)str, strlen(str)+1, gl_ece_peer);
		}
		LOG_EXIT_BOOL(TRUE);
		return TRUE;
	}

	/* a new daemon has joined/left the cluster */
	if(strcmp(type, T_APICLISTAT)==0) {
		const char *orig = ha_msg_value(h_msg, F_ORIG);
		const char *status = ha_msg_value(h_msg, F_STATUS);
		str = msg2string(h_msg);
		peer_send((void *)str, strlen(str)+1, gl_ece_peer);
		process_ece_status(orig, status);
		LOG_EXIT_BOOL(TRUE);
		return TRUE;
	}

	/* client died on some node */
	if(strcmp(type, ECE_CLIENTDEAD)==0) {
		const char *orig = ha_msg_value(h_msg, F_ORIG);
		process_client_dead(orig);
		LOG_EXIT_BOOL(TRUE);
		return TRUE;
	}

	/* if it is not a message of type ECE_MSG then reject it */
	if(strcmp(type, ECE_MSG)) {
		LOG_EXIT_BOOL(TRUE);
		return TRUE;
	}


	/* who sent this message */
        ECE_ASSERT(orig = ha_msg_value(h_msg, F_ORIG));
        ECE_ASSERT(corr_str = ha_msg_value(h_msg, ECE_CORRELATOR));
        ECE_ASSERT(cmd_str = ha_msg_value(h_msg, ECE_CMD));
        ECE_ASSERT(cntl_str = ha_msg_value(h_msg, ECE_CNTL));

	sscanf(cntl_str,"%d,%d,%d,%d,%d,%d",
		&ackval,
		&crc,
		&flen,
		&tlen,
		(int *)&who,
		&mcast);


	/* if the message has been sent from a different partitioned
	 * membership return error.
	 * Eg. if A, B, C, D  partition into {A,B} and {C,D} and
	 * if C sends a message which reaches B. B has to reject
	 * the message.
	 *
	 * NOTE: since memb_check_crc() is called in the context of
	 * the ece's thread, and since we are not modifying the crc
	 * cache, it is safe to access it without locking.
	 */
	if((memb_context_ok = memb_check_crc((u_int32_t)crc))) {
		ret = 0;
		/* send the message to the slave only if the
		 * message has been sent by the master */
		if(who==MASTER) {
			const char *seqno_str;
			int  seqno;

			msg_track(FALSE, h_msg, NULL, FALSE, DRTS);
			str = msg2string(h_msg);
			ret = peer_send((void *)str, strlen(str)+1, gl_ece_peer);
			ret_str = ret?"-1":"0";

			/* acknowledge only the last fragment */
        		ECE_ASSERT(seqno_str = ha_msg_value(h_msg, ECE_SEQNO));
			seqno = atoi(seqno_str);
			if((tlen-1)/flen > seqno) {
				LOG_EXIT_BOOL(TRUE);
				return TRUE;
			}

		} else {
			/* agregate all the fragments and acknowledge only
			 *  the last fragment
			 */
			msg_track(FALSE, h_msg, NULL, FALSE, DFR);
			if(frag_assemble(h_msg, &data_str)){
				ret_str = "0";
			} else {
				LOG_EXIT_BOOL(TRUE);
				return TRUE;
			}
		}
	} else {
		/* message is received in the wrong membership context */
		ret_str = "-1";
	}

	/* acknowledge only point-to-point  messages */
	if(!mcast){
		char *ack_str;
		/* I am any way the master. We acknowledge the
		 * receipt of the message irrepective of
		 * the destination. (implies we proxy respond
		 * on behalf of  the slave)
		 */
		ack_str = g_strdup_printf("%d", ackval);
		hb_acknowledge(orig,
			who==MASTER?ECE_MASTER:ECE_SLAVE,
			ack_str, ret_str);
		g_free(ack_str);
	}

	/* deliver the message to local registeration instances */
	/* only if the membership context is ok and the messsage */
	/* has been sent by some slave			 	*/
	if(memb_context_ok && who==SLAVE) {
		fill_and_deliver_ece_msg(orig, corr_str,
				cmd_str, data_str);
		g_free(data_str);
	}
	LOG_EXIT_BOOL(TRUE);
	return TRUE;
}

/*
 * Input callback for the heartbeat channel / descriptor: read one
 * pending message, if any, and process it.  Returns TRUE when nothing
 * was pending, otherwise the result of handle_msg().  The first and the
 * user_data parameter are unused.
 */
#ifdef HA_USE_IPC
static gboolean
hb_input_dispatch(IPC_Channel * ch, gpointer user_data)
#else
static gboolean
hb_input_dispatch(int fd, gpointer user_data)
#endif
{
	struct ha_msg *incoming;
	gboolean keep_going = TRUE;

	LOG_ENTRY();

	incoming = gl_hb_handle->llc_ops->readmsg(gl_hb_handle, 0);
	if (incoming != NULL) {
		keep_going = handle_msg(incoming);
		ha_msg_del(incoming);
	}

	LOG_EXIT_BOOL(keep_going);
	return keep_going;
}

/*
 * Destroy callback for the heartbeat input source: sign off from
 * heartbeat.  user_data is unused.
 */
static void
hb_input_destroy(gpointer user_data)
{
	LOG_ENTRY();

	hb_unregister();

	LOG_EXIT_VOID();
}

/*
 * Send message m to the whole cluster through heartbeat.  Does nothing
 * when this ece is not registered with heartbeat.  A failed send is
 * retried after a short sleep; the eleventh consecutive failure trips
 * an assertion.
 */
void
hb_send_to_cluster(struct ha_msg *m)
{
	int count = 0;
	LOG_ENTRY();
	if(!gl_hb_handle) {
		LOG_EXIT_VOID();
		return;
	}
	while(gl_hb_handle->llc_ops->sendclustermsg(gl_hb_handle, m)==HA_FAIL){
		/* no recovery action if
		 * send failure prolongs.
		 */
		LOG_WARNING("heartbeat channel blocked");
		/* the counter is maintained outside the assertion so that
		 * the assertion has no side effect; it saturates at the
		 * limit instead of growing without bound */
		ECE_ASSERT(count != 10);
		if(count < 10) {
			count++;
		}
		cl_shortsleep();
	}
	LOG_EXIT_VOID();
}


/*
 * Send message m to a single node through heartbeat.  Does nothing when
 * this ece is not registered with heartbeat.  A failed send is retried
 * after a short sleep; the eleventh consecutive failure trips an
 * assertion.
 */
void
hb_send_to_node(struct ha_msg *m, const char *node)
{
	int count = 0;
	LOG_ENTRY();

	if(!gl_hb_handle) {
		LOG_EXIT_VOID();
		return;
	}
	while(gl_hb_handle->llc_ops->sendnodemsg(gl_hb_handle,
			m, node)==HA_FAIL){
		LOG_WARNING("heartbeat channel blocked");
		/* no recovery action if
		 * send failure prolongs.
		 *
		 * the counter is maintained outside the assertion so that
		 * the assertion has no side effect; it saturates at the
		 * limit instead of growing without bound
		 */
		ECE_ASSERT(count != 10);
		if(count < 10) {
			count++;
		}
		cl_shortsleep();
	}
	LOG_EXIT_VOID();
}

/*
 * Send an acknowledgement message to the sender of a message.
 *
 * orig:   node to acknowledge
 * whostr: ECE_MASTER or ECE_SLAVE, the kind of sender being answered
 * ackstr: ack value of the acknowledged message, as a string
 * retstr: result code to report, as a string
 *
 * An acknowledgement addressed to this node is fed straight back into
 * handle_msg(); otherwise it goes out through heartbeat.
 * Returns 0, or EAGAIN when the message cannot be built.
 */
static int
hb_acknowledge(const char *orig,
		const char *whostr,
		const char *ackstr,
		const char *retstr)
{
	struct ha_msg *ack;
	int status = 0;

	LOG_ENTRY();

	ack = ha_msg_new(0);
	if (ack == NULL) {
		LOG_CRITICAL("Cannot create a message\n");
		LOG_EXIT_INT(EAGAIN);
		return EAGAIN;
	}

	if ((ha_msg_add(ack, F_TYPE, ECE_ACKMSG) == HA_FAIL)
		||(ha_msg_add(ack, ECE_ACK, ackstr) == HA_FAIL)
		||(ha_msg_add(ack, ECE_RET, retstr) == HA_FAIL)
		||(ha_msg_add(ack, ECE_WHO, whostr) == HA_FAIL)) {
		LOG_CRITICAL("Cannot fill the message\n");
		status = EAGAIN;
	} else if (strcmp(orig, llm_getmynodeid()) == 0) {
		/* short circuit the acknowledgement */
		ha_msg_add(ack, F_ORIG, orig);
		handle_msg(ack);
	} else {
		msg_track(TRUE, NULL, ackstr, TRUE, DLTR);
		hb_send_to_node(ack, orig);
	}

	ha_msg_del(ack);

	LOG_EXIT_INT(status);
	return status;
}



/*  HEARTBEAT REGISTER FUNCTIONS END */


/* BEGIN OF SLAVE CHANNEL RELATED FUNCTIONS */
/*
 * Handshake with a slave that just connected on 'newclient'.
 *
 * If a slave is already attached (gl_ece_peer is set) the new channel
 * is destroyed and -1 is returned.  Otherwise the slave is sent, in
 * order: a "welcome" string, the low level membership
 * (llm_llm2str()) and, if a membership has already been received,
 * the current membership (oc2membstr()).
 *
 * The channel is recorded in gl_ece_peer only when the whole handshake
 * succeeded, so that a failed handshake does not block later slaves.
 *
 * Returns 0 on success, -1 when the slave is rejected, ENOMEM when the
 * welcome string cannot be allocated, or the non-zero result of the
 * failing peer_send().
 *
 * NOTE(review): on the failure paths 'newclient' is not destroyed here;
 * confirm who owns the channel in that case.
 */
static int
ms_ch_init(IPC_Channel *newclient)
{
	static const char welcome[] = "welcome";
	char *str;
	int ret=0;

	LOG_ENTRY();
	if(gl_ece_peer) {
		/* reject the new slave */
		newclient->ops->destroy(newclient);
		LOG_EXIT_INT(-1);
		return -1;
 	}

	/*
	 * Acknowledge the client with a welcome message.
	 *
	 * The welcome string must be ha_malloc()ed memory since
	 * peer_send() will ha_free() it.  ha_malloc() and
	 * g_malloc() are NOT the same.
	 */
	str = ha_malloc(sizeof(welcome));
	if (!str) {
		/* report the failure instead of pretending success */
		LOG_CRITICAL("Cannot allocate the welcome message\n");
		ret = ENOMEM;
		goto end;
	}
	strcpy(str, welcome);
	if((ret = peer_send(str, strlen(str)+1, newclient))!=0) {
		goto end;
	}

	/* send the initial lowlevel membership to the slave */
	str = llm_llm2str();
	if((ret = peer_send(str, strlen(str)+1, newclient))!=0) {
		goto end;
	}

	/* follow up with the current membership, if one is known */
	pthread_mutex_lock(&gl_ece_mutex);
	if(gl_memb_init){
		str = oc2membstr(gl_ece_quorum, gl_ece_oc,
				gl_ece_oc_size);
		ret = peer_send(str, strlen(str)+1,
				newclient);
	}
	pthread_mutex_unlock(&gl_ece_mutex);

	/* only a fully greeted slave becomes our peer */
	if(ret == 0) {
		gl_ece_peer = newclient;
	}
end:
	LOG_EXIT_INT(ret);
	return ret;
}





/*
 * Mainloop input callback for the channel to the slave.
 *
 * Receives one message from the slave and, if it is an ECE_MSG,
 * forwards it: to the node named in ECEi_TOWHOM when present (handled
 * locally when that node is this one), otherwise to the whole cluster.
 * Messages of any other type are dropped.
 *
 * Returns FALSE when the channel is broken, TRUE otherwise.
 * user_data is unused.
 */
static gboolean
ms_input_dispatch(IPC_Channel *ms_ch,
	              gpointer	user_data)
{
	IPC_Message *ipcmsg;
	struct ha_msg *h_msg;
	const char *type;
	const char *towhom_str;
	int status;

	LOG_ENTRY();
	/* receive the slave's message, and process it */
	status = peer_recv(&ipcmsg, ms_ch, 0);
	if(status == IPC_BROKEN || status == IPC_FAIL) {
		/* a plain failure is probably a false invocation:
		 * retry later.  A broken channel ends the source.
		 */
		gboolean keep = (status != IPC_BROKEN);
		LOG_EXIT_BOOL(keep);
		return keep;
	}

	h_msg = string2msg(ipcmsg->msg_body, ipcmsg->msg_len);
	ECE_ASSERT(h_msg);
	peer_recv_done(ipcmsg);

	ECE_ASSERT(type = ha_msg_value(h_msg, F_TYPE));
	if(strcmp(type, ECE_MSG) != 0) {
		/* nothing to forward for other message types */
		ha_msg_del(h_msg);
		LOG_EXIT_BOOL(TRUE);
		return TRUE;
	}

	towhom_str = ha_msg_value(h_msg, ECEi_TOWHOM);
	if(!towhom_str) {
		/* no destination given: send the message to all */
		hb_send_to_cluster(h_msg);
	} else {
		ECE_ASSERT(llm_is_present(towhom_str));
		if(strcmp(towhom_str, llm_getmynodeid()) != 0) {
			/* destined to one particular remote node */
			msg_track(FALSE, h_msg, NULL, FALSE, DSTR);
			hb_send_to_node(h_msg, towhom_str);
		} else {
			/* destined to ourselves: short circuit */
			ha_msg_add(h_msg,F_ORIG,towhom_str);
			handle_msg(h_msg);
		}
	}

	ha_msg_del(h_msg);
	LOG_EXIT_BOOL(TRUE);
	return TRUE;
}



/*
 * Destroy callback for the slave channel source.
 *
 * Forgets the peer (gl_ece_peer) and multicasts an ECE_CLIENTDEAD
 * message to the cluster.  user_data is unused.
 */
static void
ms_input_destroy(gpointer user_data)
{
	struct ha_msg *notice;

	LOG_ENTRY();
	gl_ece_peer = NULL;

	/* multicast to all daemons informing the death of the client */
	notice = ha_msg_new(0);
	if (notice == NULL) {
		LOG_CRITICAL("Error: Internal resource allocation problem. Try again.\n");
		LOG_EXIT_VOID();
		return;
	}

	if (ha_msg_add(notice, F_TYPE, ECE_CLIENTDEAD) != HA_FAIL) {
		hb_send_to_cluster(notice);
	} else {
		LOG_SERIOUS("Cannot respond to a join message\n");
	}
	ha_msg_del(notice);

	LOG_EXIT_VOID();
}



/* END OF SLAVE CHANNEL RELATED FUNCTIONS */

/*  WAITCHANNEL REGISTER FUNCTIONS BEGIN */

/*
 * Create the wait connection on which the master listens for a slave.
 *
 * The connection is a domain socket at ECEFIFO with mode "700".
 * Returns whatever ipc_wait_conn_constructor() returned; ece_init()
 * treats NULL as failure.
 */
static IPC_WaitConnection *
wait_channel_init(void)
{
	IPC_WaitConnection *listener;
	GHashTable *params;

	params = g_hash_table_new(g_str_hash,g_str_equal);

	LOG_ENTRY();
	g_hash_table_insert(params, IPC_PATH_ATTR, ECEFIFO);
	g_hash_table_insert(params, IPC_MODE_ATTR, "700");

	listener = ipc_wait_conn_constructor(IPC_DOMAIN_SOCKET, params);

	/* the attribute table is only needed during construction */
	g_hash_table_destroy(params);

	LOG_EXIT_PTR(listener);
	return listener;
}

/*
 * Mainloop callback invoked with the channel of a newly connected
 * slave.
 *
 * Runs the handshake (ms_ch_init()); when it succeeds the channel is
 * added to the mainloop with ms_input_dispatch()/ms_input_destroy()
 * and the source is recorded with source_add().  Always returns TRUE.
 * user_data is unused.
 */
static gboolean
waitch_input_dispatch(IPC_Channel *newclient,
		gpointer user_data)
{
	LOG_ENTRY();

	/* a non-zero result means the slave was rejected or the
	 * handshake failed; nothing is registered in that case
	 */
	if(ms_ch_init(newclient) == 0) {
		GCHSource *slave_src;

		slave_src = G_main_add_IPC_Channel(G_PRIORITY_LOW,
			newclient, FALSE, ms_input_dispatch,
			newclient, ms_input_destroy);

		source_add((destroy_t)G_main_del_IPC_Channel, slave_src);
	}

	LOG_EXIT_BOOL(TRUE);
	return TRUE;
}

/*
 * Destroy callback registered for the wait connection source in
 * ece_init().  Nothing to release here; it only logs entry and exit.
 * user_data is unused.
 */
static inline void
waitch_input_destroy(gpointer user_data)
{
	LOG_ENTRY();
	LOG_EXIT_VOID();
}

/*  WAITCHANNEL REGISTER FUNCTIONS END */

/*
 * Periodic timeout callback installed in ece_init() with a period of
 * ECE_TIMEOUT.  user_data is unused.  Always returns TRUE.
 */
static gboolean
timeout_dispatch(gpointer user_data)
{
	LOG_EXTRA_ENTRY();
	/* check if the main thread is waiting on any
	 * messages to be sent
	 */
	frag_send_check();
	/* explicit cancellation point; ece_cleanup() cancels this thread */
	pthread_testcancel();
	LOG_EXTRA_EXIT_BOOL(TRUE);
	return TRUE;
}



/*   MASTER IPC  REGISTER FUNCTIONS BEGIN  	  */
/*   THESE FUNCTIONS ARE CALLED ONLY ON THE SLAVE */

/*
 * Receive the low level membership from the master on 'ch' and load
 * it with llm_init()/llm_str2llm().
 *
 * Returns TRUE when a message was received (peer_recv() == IPC_OK),
 * FALSE otherwise.
 */
static gboolean
sm_recv_llm(IPC_Channel *ch)
{
	struct IPC_MESSAGE *llm_msg;
	gboolean received;

	LOG_ENTRY();

	received = (peer_recv(&llm_msg, ch, 1) == IPC_OK);
	if(received) {
		llm_init();
		llm_str2llm(llm_msg->msg_body, llm_msg->msg_len);
		peer_recv_done(llm_msg);
	}

	LOG_EXIT_BOOL(received);
	return received;
}

/* note sm stands for  'on Slave connection to Master' */
/*
 * Mainloop input callback on the slave for the channel to the master.
 *
 * Receives one message and dispatches on its F_TYPE:
 *   ECE_MSG       - track it, reassemble fragments and deliver the
 *                   complete message to the local registrations;
 *   ECE_ACKMSG    - track it and process the acknowledgement;
 *   T_APICLISTAT  - update the ECE status of the originating node;
 *   ECEi_MEM      - convert and deliver a membership event.
 * Other types are ignored.
 *
 * Returns FALSE when the channel is broken or the received data is
 * inconsistent (body is not exactly one NUL terminated string, cannot
 * be parsed, or carries no type); TRUE otherwise.  The received IPC
 * message is released on every path.  user_data is unused.
 */
static gboolean
sm_input_dispatch(IPC_Channel *sm_ch, gpointer user_data)
{
	IPC_Message *ipcmsg;
	int 	ret;
	char 	*msg_str;
	int	msg_size;
	struct ha_msg *h_msg;
	const	char *type, *cmd_str, *orig, *corr_str,
		*ack_str, *who_str, *ret_str;
	const	char *nul;
	char  	*data_str;

	LOG_ENTRY();
	ECE_ASSERT(sm_ch);

	ret = peer_recv(&ipcmsg, sm_ch, 0);
	if(ret == IPC_BROKEN ){
		condition_check(llm_getmynodeid(), 0, ENOLINK);
		LOG_EXIT_BOOL(FALSE);
		return FALSE;
	}
	if(ret == IPC_FAIL) {
		/* retry  later, probably a false invocation */
		LOG_EXIT_BOOL(TRUE);
		return TRUE;
	}

	msg_str = (char *)ipcmsg->msg_body;
	msg_size = ipcmsg->msg_len;
	if(msg_size == 0) {
		peer_recv_done(ipcmsg);
		LOG_EXIT_BOOL(TRUE);
		return TRUE;
	}

	/* The body must be a single string whose terminating NUL is its
	 * last byte.  Search only inside the received buffer so that an
	 * unterminated body cannot make us read past its end.
	 */
	nul = (msg_size < 0) ? NULL
		: memchr(msg_str, '\0', (size_t)msg_size);
	if(nul != msg_str + msg_size - 1) {
		peer_recv_done(ipcmsg);
		goto inconsistent;
	}

	h_msg = string2msg(msg_str, msg_size);

	peer_recv_done(ipcmsg);

	if(!h_msg) {
		goto inconsistent;
	}
	type = ha_msg_value(h_msg, F_TYPE);
	if(!type) {
		ha_msg_del(h_msg);
		goto inconsistent;
	}

	/* check if we received a membership or messaging event
	 */
	if(strcmp(type, ECE_MSG)==0) {
		ECE_ASSERT(cmd_str = ha_msg_value(h_msg, ECE_CMD));
		ECE_ASSERT(orig = ha_msg_value(h_msg, F_ORIG));
		ECE_ASSERT(corr_str = ha_msg_value(h_msg, ECE_CORRELATOR));
		/* NOTE: we shall not check for the membership crc on
		 * message because the master would have already
		 * filtered it out.
		 */
		msg_track(FALSE, h_msg, NULL, FALSE, SFL);
		if(frag_assemble(h_msg, &data_str)){
			fill_and_deliver_ece_msg(orig, corr_str,
					cmd_str, data_str);
			g_free(data_str);
		}
	} else if(strcmp(type, ECE_ACKMSG)==0) {
		ECE_ASSERT(who_str = ha_msg_value(h_msg, ECE_WHO));
		ECE_ASSERT(strcmp(who_str, ECE_SLAVE)==0);
		ECE_ASSERT(orig = ha_msg_value(h_msg, F_ORIG));
		ECE_ASSERT(ack_str = ha_msg_value(h_msg, ECE_ACK));
		ECE_ASSERT(ret_str = ha_msg_value(h_msg, ECE_RET));
		msg_track(TRUE, NULL, ack_str, TRUE, SFL);
		process_ack(orig, ack_str, ret_str);
	} else if(strcmp(type, T_APICLISTAT)==0) {
		const char *status;

		orig = ha_msg_value(h_msg, F_ORIG);
		status = ha_msg_value(h_msg, F_STATUS);
		process_ece_status(orig, status);
	} else if(strcmp(type, ECEi_MEM)==0) {
		oc_ev_membership_t *oc;
		gboolean 	   quorum;
		int size = membstr2oc(h_msg, &oc, &quorum);
		reg_deliver_all_memb(quorum, size, oc);
		g_free(oc);
	}
	ha_msg_del(h_msg);

	LOG_EXIT_BOOL(TRUE);
	return TRUE;

inconsistent:
	/* everything acquired so far has been released by the caller
	 * of this label
	 */
	LOG_SERIOUS("Internal inconsistency detected. "
			"ECE service stopped\n");
	LOG_EXIT_BOOL(FALSE);
	return FALSE;
}



/*
 * Destroy callback registered together with sm_input_dispatch() in
 * ece_init(): forgets the channel to the master by clearing
 * gl_ece_peer.  user_data is unused.
 */
static void
sm_input_destroy(gpointer user_data)
{
	LOG_ENTRY();
	gl_ece_peer = NULL;
	LOG_EXIT_VOID();
}

static IPC_Channel*
sm_cnct(void)
{
	IPC_Channel *sm_ch;

	GHashTable * attrs = g_hash_table_new(g_str_hash,g_str_equal);
	g_hash_table_insert(attrs, IPC_PATH_ATTR, ECEFIFO);
	sm_ch = ipc_channel_constructor(IPC_DOMAIN_SOCKET, attrs);

	/* connect to master ece */
	LOG_ENTRY();
	if(sm_ch ) {
		struct IPC_MESSAGE *msg;
		if (sm_ch->ops->initiate_connection(sm_ch) != IPC_OK){
        		LOG_SERIOUS("Error: Daemon may not be running\n");
			sm_ch->ops->destroy(sm_ch);
			sm_ch = NULL;
		} else if(peer_recv(&msg, sm_ch, 1) != IPC_OK){
        		LOG_SERIOUS("Error: Daemon rejected the connection.  There must be some other client already connected.");
			sm_ch->ops->destroy(sm_ch);
			sm_ch = NULL;
		} else {
			peer_recv_done(msg);
			/* recieve and initialize the low level membership */
			if(sm_recv_llm(sm_ch) == FALSE) {
				LOG_SERIOUS("Error: Daemon did not send the llm information.  Probably is misbehaving.");
				sm_ch->ops->destroy(sm_ch);
				sm_ch = NULL;
			}
		}
	}
	g_hash_table_destroy(attrs);
	gl_ece_peer = sm_ch;
	LOG_EXIT_PTR(sm_ch);
	return sm_ch;
}




/*  MASTER IPC  REGISTER FUNCTIONS END */


static void *
start_ece(void *arg)
{
	GMainLoop       *mainloop=(GMainLoop *)arg;
	LOG_ENTRY();
	if (pthread_create(&gl_ece_cb_thread, NULL, deliver_event, NULL)) {
            	LOG_SERIOUS("Error: Failed to spawn the callback thread.  Try again.\n");
		LOG_EXIT_PTR(NULL);
		return NULL;
	}
	switch(gl_ece_mode) {
	case MASTER :
		g_main_run(mainloop);
		break;
	case SLAVE:
		if(GLIB_CHECK_VERSION(2,0,0)) {
			g_main_run(mainloop);
		} else {
			ece_main_run();
		}
		break;
	}
	LOG_EXIT_PTR(NULL);
	return NULL;
}

/*
 * Send 'ecemsg' either to every node (node id equal to ECE_ALL_NODES)
 * or to the single node named in ecemsg->node.
 *
 * When the caller left ecemsg->corrolator at 0 a correlator is
 * generated; on success the correlator actually used is written back
 * into the message.  For a unicast that is not a local delivery on the
 * master, the call waits for the receiver's acknowledgement.
 *
 * Returns 0 on success, EAGAIN when the membership is not known yet or
 * the destination's ECE daemon is not running, EINVAL when the
 * destination is not a member, otherwise the result of
 * frag_send_msg_wait() or condition_wait().
 */
static int
ece_send_msg(ece_msg_t *ecemsg)
{
	const ece_nodeid_t everyone = ECE_ALL_NODES;
	char *dest = NULL;
	gboolean to_all;
	gboolean memb_known;
	u_int32_t corr, crc, ackval;
	int rc;

	LOG_ENTRY();

	/* there can be only one send in progress */

	pthread_mutex_lock(&gl_ece_mutex);
	memb_known = gl_memb_init;
	pthread_mutex_unlock(&gl_ece_mutex);
	if(!memb_known) {
        	LOG_WARNING("Error: Membership status not known.  Try again.\n");
		LOG_EXIT_INT(EAGAIN);
		return EAGAIN;
	}

	to_all = (memcmp(&(ecemsg->node), &everyone,
			sizeof(ece_nodeid_t)) == 0);
	if(!to_all) {
		dest = (char *)ecemsg->node.bytes;
		if(!llm_is_present(dest)) {
			LOG_ERROR("Error: %s not a member.\n", dest);
			LOG_EXIT_INT(EINVAL);
			return EINVAL;
		}
		/* the ece daemon on the node is not initialized */
		if(!llm_get_ece_status(dest)) {
        		LOG_WARNING("Error: ECE Daemon on %s is not running yet.\n", dest);
			LOG_EXIT_INT(EAGAIN);
			return EAGAIN;
		}
	}

	/* snapshot the crc and allocate the ack value and correlator
	 * under the lock
	 */
	pthread_mutex_lock(&gl_ece_mutex);
	crc = get_curr_crc();
	ackval = ++gl_ece_ackval;
	corr = ecemsg->corrolator;
	if(corr == 0) {
		corr = gl_ece_cor++;
	}
	pthread_mutex_unlock(&gl_ece_mutex);

	/* this message has to be sent in the context of the
	 * mainloop thread, because the ipc library which
	 * sends this message is not thread safe. So make sure
	 * all our messages are sent only by the mainloop thread
	 */
	rc = frag_send_msg_wait(ecemsg, to_all, dest, corr, crc, ackval);
	if(rc) {
		LOG_EXIT_INT(rc);
		return rc;
	}

	ecemsg->corrolator = corr;
	if(!to_all &&
	   (gl_ece_mode == SLAVE || strcmp(dest, llm_getmynodeid())!=0)){
		/* wait till an acknowledgement from the receiver is
		 * received
		 */
		rc = condition_wait(dest, ackval);
	}

	LOG_EXIT_INT(rc);
	return rc;
}

/* Return the mode stored in gl_ece_mode. */
inline ece_mode_t
ece_get_mode()
{
	return gl_ece_mode;
}


/*
 * Return the IPC channel to the peer (gl_ece_peer); NULL when no peer
 * is currently attached.
 */
inline IPC_Channel *
ece_get_peer()
{
	return gl_ece_peer;
}

/*
 * Plug-in cleanup entry point (cleanup_evms_plugin).
 *
 * Cancels and joins the callback thread and the ECE thread, destroys
 * gl_ece_mutex and calls global_cleanup().
 *
 * NOTE(review): the pthread_join() calls are made inside ECE_ASSERT;
 * if that macro can be compiled out the threads would not be joined.
 * NOTE(review): ece_init() also calls this when the ECE thread could
 * not be created, in which case the thread ids used here were never
 * set by pthread_create() - confirm this is safe.
 */
static void
ece_cleanup(void)
{
	LOG_ENTRY();
	/* Wait a bit to let the other threads settle into a cancellation state. */
	/* (poll() with no descriptors just waits for the timeout) */
	poll(NULL,0,ECE_TIMEOUT);

	/* callback thread, started by start_ece() */
	if(pthread_cancel(gl_ece_cb_thread)==0) {
	 	ECE_ASSERT(pthread_join(gl_ece_cb_thread, NULL)==0);
	}
	/* ECE thread, started by ece_init() */
	if(pthread_cancel(gl_ece_thread)==0) {
		ECE_ASSERT(pthread_join(gl_ece_thread, NULL) == 0);
	}
	pthread_mutex_destroy(&gl_ece_mutex);
	global_cleanup();
	LOG_EXIT_VOID();
}

/*
 * Plug-in setup entry point (setup_evms_plugin).
 *
 * 'func' is the engine function table; it is stored in EngFncs.  The
 * mode is MASTER when func->get_engine_mode() reports ENGINE_DAEMON,
 * SLAVE otherwise.
 *
 *   SLAVE:  connects to the master ECE (sm_cnct()) and registers the
 *           channel and a periodic timeout with the mainloop (or with
 *           the private ece_main_init() loop for glib older than 2.0.0).
 *   MASTER: registers with heartbeat and CCM, opens the wait connection
 *           for the slave and registers all three plus a periodic
 *           timeout with the mainloop.
 *
 * Finally the fragment subsystem and gl_ece_mutex are initialized and
 * the ECE thread (start_ece) is created.
 *
 * Returns 0 on success; EINVAL for a bad 'func'; ENODEV when the
 * master ECE, heartbeat or CCM cannot be reached; EAGAIN when a source,
 * the wait connection or the thread cannot be created.
 */
static int
ece_init( engine_functions_t *func)
{
#ifdef HA_USE_IPC
	IPC_Channel * hb_ch;
#else
	int           hb_fd;
#endif
	int ccm_fd;
	IPC_WaitConnection *wait_ch;
	IPC_Channel 	*sm_ch;
 	guint 		timout;

	ece_mode_t 	mode;

	/* NOTE(review): EngFncs is set, and LOG_ENTRY() runs, before
	 * 'func' is checked for NULL below - confirm the logging macros
	 * do not dereference EngFncs.
	 */
	EngFncs = func;
	LOG_ENTRY();

	if(!func || !func->get_engine_mode) {
            	LOG_ERROR("Error: Invalid Input (Arg #1)\n");
		LOG_EXIT_INT(EINVAL);
		return EINVAL;
	}

	mode = (func->get_engine_mode()==ENGINE_DAEMON)? MASTER: SLAVE;

	global_init(mode);

	/* if called with SLAVE mode, register with the MASTER's ece. */
	/* if called by the MASTER register with the heartbeat. */
	switch(mode) {

		/* sources used by the MASTER case only */
		gpointer 	 hbSource;
		GFDSource	*ccmSource;
		GWCSource	*waitSource;

		case SLAVE:
			if((sm_ch = sm_cnct())==NULL) {
            			LOG_SERIOUS("Error connecting to ECE Daemon\n");
				LOG_EXIT_INT(ENODEV);
				return ENODEV;
			}

			if(GLIB_CHECK_VERSION(2,0,0)) {
				GCHSource	*mstSource;
				mstSource = G_main_add_IPC_Channel(
					G_PRIORITY_LOW, sm_ch, FALSE,
					sm_input_dispatch,
					sm_ch, sm_input_destroy);

				/* NOTE(review): sm_ch is not destroyed
				 * on this failure path.
				 */
				if(mstSource == NULL) {
					LOG_SERIOUS("Error: Internal resource allocation failure.  Try again.\n");
					LOG_EXIT_INT(EAGAIN);
					return EAGAIN;
				}

				gl_ece_mainloop = g_main_new(TRUE);
				source_add((destroy_t)G_main_del_IPC_Channel,
						mstSource);
				timout = Gmain_timeout_add_full(
					G_PRIORITY_LOW, ECE_TIMEOUT,
					timeout_dispatch, NULL, NULL);
				/* the timeout id is passed by value,
				 * cast into the pointer argument
				 */
				source_add((destroy_t)g_source_remove,
					      	(gpointer)timout);
			} else {
				ece_main_init(sm_ch, sm_input_dispatch,
					sm_input_destroy, ECE_TIMEOUT,
					timeout_dispatch);
			}
			break;

		case MASTER:
			/* connect to the heartbeat daemon, for
			 * communication services
			 */
#ifdef HA_USE_IPC
			if ((hb_ch = hb_register()) == NULL){
            			LOG_SERIOUS("Error connecting to heartbeat\n");
				LOG_EXIT_INT(ENODEV);
				return ENODEV;
			}
#else
			if ((hb_fd = hb_register()) == -1){
            			LOG_SERIOUS("Error connecting to heartbeat\n");
				LOG_EXIT_INT(ENODEV);
				return ENODEV;
			}
#endif
			/* connect to the ccm daemon for membership
			 * services
			 */
			if ((ccm_fd = ccm_register()) == -1) {
				hb_unregister();
            			LOG_SERIOUS("Error connecting to CCM\n");
				LOG_EXIT_INT(ENODEV);
				return ENODEV;
			}

		 	/* open up a IPC wait channel listening for
			 * connection requests from SLAVE ece */
			if((wait_ch = wait_channel_init()) == NULL) {
				hb_unregister();
				ccm_unregister();
            			LOG_SERIOUS("Error opening the IPC channel for the slave.  Try again.\n");
				LOG_EXIT_INT(EAGAIN);
				return EAGAIN;
			}

			gl_ece_mainloop = g_main_new(TRUE);
			/* NOTE(review): the sources returned below are
			 * not checked for NULL before source_add().
			 */
#ifdef HA_USE_IPC
			hbSource = G_main_add_IPC_Channel(G_PRIORITY_HIGH,
					hb_ch,
					FALSE, hb_input_dispatch,
					NULL, hb_input_destroy);
#else
			hbSource = G_main_add_fd(G_PRIORITY_HIGH,
					hb_fd,
					FALSE, hb_input_dispatch,
					NULL, hb_input_destroy);
#endif
			ccmSource = G_main_add_fd(G_PRIORITY_HIGH,
					ccm_fd,
					FALSE, ccm_input_dispatch,
					NULL, ccm_input_destroy);

			waitSource = G_main_add_IPC_WaitConnection(
					G_PRIORITY_LOW, wait_ch, NULL,
					FALSE, waitch_input_dispatch,
					wait_ch, waitch_input_destroy);
			/* record each source with the function that
			 * removes it
			 */
#ifdef HA_USE_IPC
			source_add((destroy_t)G_main_del_IPC_Channel, hbSource);
#else
			source_add((destroy_t)G_main_del_fd, hbSource);
#endif
			source_add((destroy_t)G_main_del_fd, ccmSource);
			source_add((destroy_t)
					G_main_del_IPC_WaitConnection,
					waitSource);

			timout = Gmain_timeout_add_full(G_PRIORITY_LOW,
					ECE_TIMEOUT,
					timeout_dispatch, NULL, NULL);
			source_add((destroy_t)g_source_remove,  (gpointer)timout);
			break;
		default:
            		LOG_ERROR("Error: The mode is invalid.  It should be MASTER or SLAVE.\n");
			LOG_EXIT_INT(EINVAL);
			return EINVAL;
	}

	/* initialize the message assembly subsystem. */
	frag_init(llm_getconfignodes());

	/* create a internal thread, and wait on messages in the mainloop. */
   	pthread_mutex_init(&gl_ece_mutex, NULL);
	if (pthread_create(&gl_ece_thread, NULL, start_ece,
				(void *)gl_ece_mainloop)) {
		/* NOTE(review): ece_cleanup() cancels gl_ece_thread and
		 * gl_ece_cb_thread, neither of which exists here.
		 */
		ece_cleanup();
            	LOG_SERIOUS("Error: Failed to spawn a thread, try again\n");
		LOG_EXIT_INT(EAGAIN);
		return EAGAIN;
	}

	LOG_EXIT_INT(0);
	return 0;
}




/*
 * Store the id of the local node in '*nodeid'.
 *
 * The destination is zero filled first, so the stored id is always
 * NUL terminated.  An id that does not fit into ece_nodeid_t
 * (including its terminator) is rejected rather than copied past the
 * end of the destination.
 *
 * Returns 0 on success, EINVAL when 'nodeid' is NULL or the local id
 * is unavailable or too long.
 */
static int
ece_get_my_nodeid(ece_nodeid_t *nodeid)
{
	const char *id = llm_getmynodeid();
	LOG_ENTRY();
	if(!nodeid) {
            	LOG_ERROR("Error: Invalid Input (Arg #1)\n");
		LOG_EXIT_INT(EINVAL);
		return EINVAL;
	}
	/* same size limit as ece_string_to_nodeid() */
	if(!id || strlen(id) >= sizeof(ece_nodeid_t)) {
		LOG_ERROR("Error: Local node id is unavailable or too long\n");
		LOG_EXIT_INT(EINVAL);
		return EINVAL;
	}
	memset(nodeid, 0, sizeof(ece_nodeid_t));
	strcpy((char *)nodeid, id);
	LOG_EXIT_INT(0);
	return 0;
}

/*
 * Return the string representation of the nodeid in 'str', whose
 * capacity is given in '*str_len'.  If '*str_len' is not sufficient
 * (the terminating NUL must fit too), store the required size in
 * '*str_len' and return ENOSPC; the size is checked before 'str', so
 * 'str' may be NULL for such a size query.
 *
 * Returns 0 on success, EINVAL when 'nodeid' or 'str_len' is NULL or
 * when 'str' is NULL although the size is sufficient, ENOSPC as
 * described above.
 */
static int
ece_nodeid_to_string(const ece_nodeid_t *nodeid,
		char *str, uint *str_len)
{
	uint len;
	LOG_ENTRY();
	if(!nodeid) {
            	LOG_ERROR("Error: Invalid Input (Arg #1)\n");
		LOG_EXIT_INT(EINVAL);
		return EINVAL;
	}
	/* the size is both read and written through this pointer */
	if(!str_len) {
            	LOG_ERROR("Error: Invalid Input (Arg #3)\n");
		LOG_EXIT_INT(EINVAL);
		return EINVAL;
	}
	if((len = strlen((char *)nodeid)) >= *str_len){
		*str_len = len+1;
		LOG_EXIT_INT(ENOSPC);
		return ENOSPC;
	}
	if(!str) {
            	LOG_ERROR("Error: Invalid Input (Arg #2)\n");
		LOG_EXIT_INT(EINVAL);
		return EINVAL;
	}
	/* len < *str_len, so the copy is always NUL terminated */
	strncpy(str, (char *)nodeid, *str_len);
	LOG_EXIT_INT(0);
	return 0;
}


/*
 * Return the number of configured nodes in the cluster in '*num'.
 *
 * Returns 0 on success, EINVAL when 'num' is NULL, -1 when the node
 * count is negative (reported as "Not initialized").
 */
static int
ece_get_num_config_nodes(uint *num)
{
	int n = llm_getconfignodes();
	LOG_ENTRY();
	if (!num) {
            	LOG_ERROR("Error: Invalid Input (Arg #1)\n");
		LOG_EXIT_INT(EINVAL);
		return EINVAL;
	}
	if (n>=0) {
		*num = n;
		LOG_EXIT_INT(0);
		return 0;
	}
        LOG_WARNING("Error: Not initialized\n");
	LOG_EXIT_INT(-1);
	return -1;
}

/*
 * Fill 'nodes' with the ids of all configured nodes.
 *
 * '*num_entries' is the capacity of 'nodes' on input.  When it is
 * smaller than the number of configured nodes, the required count is
 * stored in '*num_entries' and ENOSPC is returned.
 *
 * Returns 0 on success, EINVAL on a NULL argument, ENOSPC as above,
 * -1 when the node count is negative (reported as "Not initialized",
 * as in ece_get_num_config_nodes()).
 *
 * NOTE(review): on success '*num_entries' is left untouched - confirm
 * callers do not expect the number of entries filled.
 */
static int
ece_get_all_nodes(unsigned int *num_entries,
		ece_nodeid_t *nodes)
{
	int n = llm_getconfignodes();

	LOG_ENTRY();

	if(!nodes || !num_entries) {
            	LOG_ERROR("Error: Invalid Input\n");
		LOG_EXIT_INT(EINVAL);
		return EINVAL;
	}

	/* a negative count must not take part in the unsigned
	 * comparison below, where it would look like a huge value
	 */
	if (n < 0) {
        	LOG_WARNING("Error: Not initialized\n");
		LOG_EXIT_INT(-1);
		return -1;
	}

	if ((unsigned int)n > *num_entries) {
		*num_entries = n;
		LOG_EXIT_INT(ENOSPC);
		return ENOSPC;
	}
	llm_getallnodes(nodes);
	LOG_EXIT_INT(0);
	return 0;
}

/*
 * Fill 'event' with the current membership by calling
 * get_membership() while holding gl_ece_mutex.
 *
 * Returns the result of get_membership().
 */
static int
ece_get_membership(ece_event_t *event)
{
	int rc;

	LOG_ENTRY();

	pthread_mutex_lock(&gl_ece_mutex);
	rc = get_membership(event);
	pthread_mutex_unlock(&gl_ece_mutex);

	LOG_EXIT_INT(rc);
	return rc;
}


/*
 * Store the cluster id in '*clusterid'.
 *
 * The destination is zero filled first.  A cluster id that does not
 * fit into ece_clusterid_t (including its terminator) is rejected
 * rather than copied past the end of the destination.
 *
 * Returns 0 on success, EINVAL when 'clusterid' is NULL or the cluster
 * id is unavailable or too long.
 */
static int
ece_get_clusterid(ece_clusterid_t *clusterid)
{
	const char *str =  llm_get_clusterid();
	LOG_ENTRY();
	if(!clusterid) {
            	LOG_ERROR("Error: Invalid Input (Arg #1)\n");
		LOG_EXIT_INT(EINVAL);
		return EINVAL;
	}
	if(!str || strlen(str) >= sizeof(ece_clusterid_t)) {
		LOG_ERROR("Error: Cluster id is unavailable or too long\n");
		LOG_EXIT_INT(EINVAL);
		return EINVAL;
	}
	memset(clusterid, 0, sizeof(ece_clusterid_t));
	strcpy((char *)clusterid, str);
	LOG_EXIT_INT(0);
	return 0;
}


/*
 * Convert the string 'str' into the node id '*nodeid'.
 *
 * The node id is zero filled and the string copied into it, so the
 * result is NUL terminated.
 *
 * Returns 0 on success, EINVAL when an argument is NULL or the string
 * (with its terminator) does not fit into ece_nodeid_t.
 */
static int
ece_string_to_nodeid(const char *str,
		ece_nodeid_t *nodeid)
{
	LOG_ENTRY();

	if(str == NULL || nodeid == NULL
	   || !(strlen(str) < sizeof(ece_nodeid_t))) {
            	LOG_ERROR("Error: Invalid Input\n");
		LOG_EXIT_INT(EINVAL);
		return EINVAL;
	}

	/* the zero fill supplies the terminator and the padding */
	memset(nodeid, 0, sizeof(ece_nodeid_t));
	memcpy(nodeid, str, strlen(str));

	LOG_EXIT_INT(0);
	return 0;
}


/*
 * get_plugin_functions entry point.  Not implemented: the arguments
 * are ignored, an error is logged and ENOSYS is returned.
 */
static int
ece_get_plugin_functions(ece_nodeid_t * nodeid,
	function_info_array_t * * actions)
{
	/* TODO: not implemented yet */
	LOG_ENTRY();
        LOG_ERROR("Error: Not supported Yet\n");
	LOG_EXIT_INT(ENOSYS);
	return ENOSYS;
}

/*
 * plugin_function entry point.  Not implemented: the arguments are
 * ignored, an error is logged and ENOSYS is returned.
 */
static int
ece_plugin_function(ece_nodeid_t * nodeid,
		   task_action_t  action,
		   list_anchor_t  objects,
		  option_array_t * options)
{
	/* TODO: not implemented yet */
	LOG_ENTRY();
        LOG_ERROR("Error: Not supported Yet\n");
    	LOG_EXIT_INT(ENOSYS);
	return ENOSYS;
}


/*
 * get_plugin_info entry point.
 *
 * With 'info_name' == NULL an array of six string entries describing
 * the plug-in (short name, long name, type, version, required engine
 * version, required cluster API version) is allocated with
 * ENGINE_ALLOC and returned through '*info_array'.  No named extra
 * information is supported.
 *
 * Returns 0 on success, EFAULT when 'info_array' is NULL, EINVAL when
 * 'info_name' is given, ENOMEM when the array cannot be allocated.
 */
static int
ece_get_plugin_info(char  * info_name,
                 extended_info_array_t * * info_array) {

    extended_info_array_t   * result;
    extended_info_t         * entry;
    char                    version[51] = {0};
    int                     filled = 0;

    LOG_ENTRY();

    /* Parameter check */
    if (info_array == NULL) {
        LOG_EXIT_INT(EFAULT);
        return EFAULT;
    }

    if (info_name != NULL) {
        LOG_ERROR("No support for extra plugin information about \"%s\"\n", info_name);
        LOG_EXIT_INT(EINVAL);
        return EINVAL;
    }

    result = ENGINE_ALLOC(sizeof(extended_info_array_t) +
			sizeof(extended_info_t)*6);
    if (result == NULL) {
        LOG_CRITICAL("Error allocating memory for info array\n");
        LOG_EXIT_INT(ENOMEM);
        return ENOMEM;
    }

    /* short name */
    entry = &result->info[filled++];
    entry->name = EngFncs->engine_strdup("ShortName");
    entry->title = EngFncs->engine_strdup(_("Short Name"));
    entry->desc = EngFncs->engine_strdup(_("A short name given to this plug-in"));
    entry->type = EVMS_Type_String;
    entry->value.s = EngFncs->engine_strdup(ece_plugin_record.short_name);

    /* long name */
    entry = &result->info[filled++];
    entry->name = EngFncs->engine_strdup("LongName");
    entry->title = EngFncs->engine_strdup(_("Long Name"));
    entry->desc = EngFncs->engine_strdup(_("A longer, more descriptive name for this plug-in"));
    entry->type = EVMS_Type_String;
    entry->value.s = EngFncs->engine_strdup(ece_plugin_record.long_name);

    /* plug-in type */
    entry = &result->info[filled++];
    entry->name = EngFncs->engine_strdup("Type");
    entry->title = EngFncs->engine_strdup(_("Plug-in Type"));
    entry->desc =
	EngFncs->engine_strdup(_("There are various types of plug-ins, each responsible for some kind of storage object or logical volume."));
    entry->type = EVMS_Type_String;
    entry->value.s = EngFncs->engine_strdup(_("Cluster Manager"));

    /* plug-in version */
    entry = &result->info[filled++];
    entry->name = EngFncs->engine_strdup("Version");
    entry->title = EngFncs->engine_strdup(_("Plug-in Version"));
    entry->desc =
	EngFncs->engine_strdup(_("This is the version number of the plug-in."));
    entry->type = EVMS_Type_String;
    snprintf(version, 50, "%d.%d.%d", MAJOR_VERSION,
		MINOR_VERSION, PATCH_LEVEL);
    entry->value.s = EngFncs->engine_strdup(version);

    /* required engine services version */
    entry = &result->info[filled++];
    entry->name = EngFncs->engine_strdup("Required_Engine_Version");
    entry->title = EngFncs->engine_strdup(_("Required Engine Services Version"));
    entry->desc =
	EngFncs->engine_strdup(_("This is the version of the Engine services that this plug-in requires.  "
				 "It will not run on older versions of the Engine services."));
    entry->type = EVMS_Type_String;
    snprintf(version, 50, "%d.%d.%d",
	ece_plugin_record.required_engine_api_version.major,
	ece_plugin_record.required_engine_api_version.minor,
	ece_plugin_record.required_engine_api_version.patchlevel);
    entry->value.s = EngFncs->engine_strdup(version);

    /* required cluster plug-in API version */
    entry = &result->info[filled++];
    entry->name = EngFncs->engine_strdup("Required_Cluster_Version");
    entry->title = EngFncs->engine_strdup(_("Required Cluster API Version"));
    entry->desc =
	EngFncs->engine_strdup(_("This is the version of the Engine's cluster plug-in API that this plug-in requires.  "
				 "It will not run on older versions of the Engine's cluster plug-in API."));
    entry->type = EVMS_Type_String;
    snprintf(version, 50, "%d.%d.%d",
	ece_plugin_record.required_plugin_api_version.cluster.major,
	ece_plugin_record.required_plugin_api_version.cluster.minor,
	ece_plugin_record.required_plugin_api_version.cluster.patchlevel);
    entry->value.s = EngFncs->engine_strdup(version);

    result->count = filled;
    *info_array = result;

    LOG_EXIT_INT(0);
    return 0;
}

/* Cluster function table exported through ece_plugin_record. */
static cluster_functions_t ft = {
    .setup_evms_plugin      = ece_init,
    .cleanup_evms_plugin    = ece_cleanup,
    .register_callback      = ece_register_callback,
    .unregister_callback    = ece_unregister_callback,
    .send_msg               = ece_send_msg,
    .get_clusterid          = ece_get_clusterid,
    .get_my_nodeid          = ece_get_my_nodeid,
    .get_num_config_nodes   = ece_get_num_config_nodes,
    .get_all_nodes          = ece_get_all_nodes,
    .get_membership         = ece_get_membership,
    .nodeid_to_string       = ece_nodeid_to_string,
    .string_to_nodeid       = ece_string_to_nodeid,
    .get_plugin_info        = ece_get_plugin_info,
    .get_plugin_functions   = ece_get_plugin_functions,
    .plugin_function        = ece_plugin_function
};


/*
 * Plug-in record describing this cluster manager plug-in: its id,
 * version, the engine and cluster API versions it requires, its names
 * and its function table.
 */
plugin_record_t ece_plugin_record = {

	.id = SetPluginID(EVMS_OEM_IBM,
			EVMS_CLUSTER_MANAGER_INTERFACE_MODULE,
			1),

	.version = {
		.major      = MAJOR_VERSION,
		.minor      = MINOR_VERSION,
		.patchlevel = PATCH_LEVEL
	},

	.required_engine_api_version = {
		.major      = 15,
		.minor      = 0,
		.patchlevel = 0
	},

	.required_plugin_api_version = {
		.cluster = {
			.major      = 1,
			.minor      = 0,
			.patchlevel = 0
		}
	},

	.short_name = "Linux-HA",
	.long_name  = "Linux-HA Cluster Manager",
	.oem_name   = "IBM",

	.functions = { .cluster = &ft },

	.container_functions = NULL
};

/* NULL terminated list of the plug-in records provided by this module. */
plugin_record_t * evms_plugin_records[] = {&ece_plugin_record, NULL};
