/*
 * Copyright (c) 2001-2003 The Trustees of Indiana University.  
 *                         All rights reserved.
 * Copyright (c) 1998-2001 University of Notre Dame. 
 *                         All rights reserved.
 * Copyright (c) 1994-1998 The Ohio State University.  
 *                         All rights reserved.
 * 
 * This file is part of the LAM/MPI software package.  For license
 * information, see the LICENSE file in the top level directory of the
 * LAM/MPI source distribution.
 * 
 * $HEADER$
 *
 * $Id: ssi_crmpi_blcr.c,v 1.5.2.2 2003/09/09 00:56:41 jsquyres Exp $
 *
 *	Function:	- BLCR crmpi module 
 */

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <signal.h>
#include <stdlib.h>
#include <errno.h>
#include <time.h>

#include <lam_config.h>
#include <lamthreads.h>
#include <lammpithreads.h>
#include <typical.h>
#include <sfh.h>
#include <etc_misc.h>

#include <lam-ssi-cr.h>
#include <lam-ssi-crmpi-blcr.h>
#include <lam-ssi-crmpi-blcr-config.h>

#include <libcr.h>

/*
 * Write the NUL-terminated string S to stdout with a bare write(2) call
 * rather than through stdio.  Used by crmpi_signal_callback(), which is
 * invoked in signal context.  Note that S is evaluated twice and that the
 * result of write() is not checked.
 */
#define signal_puts(S)  write(STDOUT_FILENO, (S), strlen(S))

/*
 * local variables
 */

/*
 * Allocated and locked in lam_ssi_crmpi_blcr_init().  The thread callback
 * locks it again after cr_checkpoint() returns and therefore blocks until
 * crmpi_signal_callback() unlocks it.
 */
static cr_spinlock_t *handler_spinlock = NULL;

/*
 * Held by crmpi_thread_callback() while it acquires lam_mpi_mutex.
 * lam_ssi_crmpi_blcr_app_suspend() locks and immediately unlocks it, so the
 * app thread waits there for as long as the thread callback holds it.
 */
static lam_mutex_t yield_to_cr_mutex;

/* Function table handed back to the caller of lam_ssi_crmpi_blcr_init(). */
static lam_ssi_crmpi_actions_t module_actions = {
  lam_ssi_crmpi_blcr_finalize,
  lam_ssi_crmpi_blcr_app_suspend
};


/*
 * local functions
 */

/* Checkpoint callbacks registered in lam_ssi_crmpi_blcr_init(): the first
   with CR_THREAD_CONTEXT, the second with CR_SIGNAL_CONTEXT. */
static int crmpi_thread_callback(void *arg);
static int crmpi_signal_callback(void *arg);

/* Take / drop lam_mpi_mutex on behalf of the thread callback. */
static int acquire_mpi_lock(void);
static int release_mpi_lock(void);

/* Empty SIGUSR1 handler installed in lam_ssi_crmpi_blcr_init(). */
static void signal_handler(int signum);


/*
 * global variables
 */


/*
 * lam_ssi_crmpi_blcr_query
 *
 * Function:	- determine if the module wants to run
 * Accepts:	- priority: passed through to
 *		  lam_ssi_cr_base_check_priority() (expected to fill it in)
 *		- thread_min, thread_max: both set to MPI_THREAD_SERIALIZED
 * Returns:	- 0 when the module wants to be considered, LAMERROR when
 *		  the priority check fails
 */
int
lam_ssi_crmpi_blcr_query(int *priority, int *thread_min, int *thread_max)
{
  /* The priority check gates everything else; bail out if it fails. */

  if (lam_ssi_cr_base_check_priority("blcr", 50, LAM_SSI_CRMPI_BLCR_DEFAULT,
                                      priority) < 0) {
    return LAMERROR;
  }

  /* Diagnostic output, only at verbosity 5 and above. */

  if (lam_ssi_cr_verbose >= 5) {
    lam_debug(lam_ssi_cr_did, "blcr: module initializing");
    lam_debug(lam_ssi_cr_did, "blcr:verbose: %d", lam_ssi_cr_verbose);
    lam_debug(lam_ssi_cr_did, "blcr:priority: %d", *priority);
  }

  /* This module supports exactly one thread level. */

  *thread_max = MPI_THREAD_SERIALIZED;
  *thread_min = MPI_THREAD_SERIALIZED;

  /* Zero tells the caller that we want to be considered. */

  return 0;
}


/*
 *	lam_ssi_crmpi_blcr_init
 *
 *	Function:	- primary initialization of CRMPI subsystem
 *	Returns:	- pointer to the module's action table, or NULL if
 *			  cr_init(), the spinlock allocation, or installing
 *			  the SIGUSR1 handler fails
 */
const lam_ssi_crmpi_actions_t *
lam_ssi_crmpi_blcr_init(void)
{
  cr_callback_id_t cr_thread_callback_id, cr_signal_callback_id;
  void *cr_thread_callback_arg = NULL, *cr_signal_callback_arg = NULL;
  struct sigaction sa;

  if (lam_ssi_cr_verbose >= 10)
    lam_debug(lam_ssi_cr_did, " initializing");

  if (cr_init() < 0) {
    if (lam_ssi_cr_verbose >= 0)
      lam_debug(lam_ssi_cr_did, "cr_init failed...");

    /* Do a show_help_file here to tell the user that cr_init
       failed */

    show_help_file("lam-ssi-crmpi-blcr-helpfile", "init", "cr-init-fail",
                   NULL);
    return NULL;
  }

  /* Initialize yield_to_cr_mutex. */

  lam_mutex_init(&yield_to_cr_mutex);

  /* Allocate the spinlock and leave it locked: the thread callback blocks
     on it until the signal callback unlocks it.  Do not touch the lock if
     the allocation failed. */

  handler_spinlock = malloc(sizeof(*handler_spinlock));
  if (handler_spinlock == NULL) {
    if (lam_ssi_cr_verbose >= 0)
      lam_debug(lam_ssi_cr_did, "malloc of handler spinlock failed...");
    lam_mutex_destroy(&yield_to_cr_mutex);
    return NULL;
  }
  cr_spinlock_init(handler_spinlock);
  cr_spinlock_lock(handler_spinlock);

  /* 
   * register a handler for SIGUSR1 -- the signal that will be sent to the main
   * application thread to interrupt blocking operations (read/write), if any.
   * SIG_IGN doesn't cause these operations to be interrupted.
   *
   * This is done *before* the checkpoint callbacks are registered: once the
   * thread callback is registered it may send SIGUSR1 (see
   * acquire_mpi_lock()), and without a handler in place the default
   * disposition of SIGUSR1 would terminate the process.
   *
   * XXX: SS: need to document the fact that we do not check that SIGUSR1 is 
   * used elsewhere in LAM. And make this a run-time parameter.
   */
  sa.sa_handler = signal_handler;
  sigemptyset(&(sa.sa_mask));
  sa.sa_flags = 0;
  if (sigaction(SIGUSR1, &sa, (struct sigaction *) 0) != 0) {
    if (lam_ssi_cr_verbose >= 0)
      lam_debug(lam_ssi_cr_did, "sigaction(SIGUSR1) failed...");
    free(handler_spinlock);
    handler_spinlock = NULL;
    lam_mutex_destroy(&yield_to_cr_mutex);
    return NULL;
  }

  /* Register the checkpoint handler callbacks */
  /* NOTE(review): the returned callback ids are stored but not checked;
     confirm against the libcr documentation how registration failure is
     reported before adding a check here. */

  cr_thread_callback_id = cr_register_callback(crmpi_thread_callback,
                                               cr_thread_callback_arg, 
                                               CR_THREAD_CONTEXT);
  cr_signal_callback_id = cr_register_callback(crmpi_signal_callback,
                                               cr_signal_callback_arg, 
                                               CR_SIGNAL_CONTEXT);
  (void) cr_thread_callback_id;
  (void) cr_signal_callback_id;

  /* XXX: Add in env var for the file directory here */
  /* XXX: print out file directory here */

  /* Return the struct with the function pointers in it for all the
     APIs */

  return (&module_actions);
}


/*
 *	lam_ssi_crmpi_blcr_finalize
 *
 *	Function:	- crmpi cleanup: release the handler spinlock and
 *			  destroy yield_to_cr_mutex
 *	Returns:	- 0 (always)
 */
int
lam_ssi_crmpi_blcr_finalize(void)
{
  /* free(NULL) is a no-op, so no guard is needed.  Reset the pointer so
     that a repeated call cannot free the same block twice and nothing is
     left pointing at released memory. */
  free(handler_spinlock);
  handler_spinlock = NULL;

  lam_mutex_destroy(&yield_to_cr_mutex);

  return 0;
}


/*
 *      lam_ssi_crmpi_blcr_app_suspend
 *
 *      Function:       - function for app thread to yield to the crmpi thread
 *      Returns:        - nothing
 */
void
lam_ssi_crmpi_blcr_app_suspend(void)
{
  /* Drop the MPI lock so that acquire_mpi_lock(), running in the thread
     callback, can take it. */
  lam_mutex_unlock(&lam_mpi_mutex); 

  /* crmpi_thread_callback() holds yield_to_cr_mutex from before it starts
     trying for lam_mpi_mutex until it has it.  Locking and immediately
     unlocking here makes the app thread wait for that to finish. */
  lam_mutex_lock(&yield_to_cr_mutex); 
  lam_mutex_unlock(&yield_to_cr_mutex); 

  /* Take the MPI lock back.  If the thread callback owns it, this blocks
     until the callback calls release_mpi_lock(). */
  lam_mutex_lock(&lam_mpi_mutex); 

  return;
}


/*
 * Asynchronous handler thread.  This does most of the checkpoint/restore 
 * functionality.  
 *
 * Accepts:	- arg: unused
 * Returns:	- 0 on success, LAMERROR if release_mpi_lock() fails.  A
 *		  failure to acquire the MPI lock, to prepare for the
 *		  checkpoint, or to restart/continue afterwards ends the
 *		  process via kexit(1).
 */
static int
crmpi_thread_callback(void *arg)
{
  int rc;

  (void) arg;

  if (lam_ssi_cr_verbose >= 30)
    lam_debug(lam_ssi_cr_did, "Inside cr_handler_app_async of %d\n", getpid());

  /* grab the yield_to_cr_mutex first */
  lam_mutex_lock(&yield_to_cr_mutex);

  /* update handler thread state */
  lam_ssi_crmpi_base_handler_state = LAM_SSI_CRMPI_BASE_HANDLER_STATE_WAITING;
  
  /* acquire mpi lock */
  if (acquire_mpi_lock() < 0) {
    show_help_file("lam-ssi-crmpi-blcr-helpfile", "cr-thread", 
                   "acquire-lock-fail", NULL);
    kexit(1);
  }

  /* Now set the state to indicate that we are running. */
  lam_ssi_crmpi_base_handler_state = LAM_SSI_CRMPI_BASE_HANDLER_STATE_RUNNING;

  /* release the yield_to_cr_mutex */
  lam_mutex_unlock(&yield_to_cr_mutex);
  
  /* prepare all the modules for checkpoint -- for now, rpi and coll */
  if (lam_ssi_crmpi_base_checkpoint() < 0) {
    lam_debug(lam_ssi_cr_did, "APP_ASYNC%d: prepare for checkpoint failed.",
              getpid());
    show_help_file("lam-ssi-crmpi-blcr-helpfile", "cr-thread",
                   "chkpt-prepare-fail", NULL);
    kexit(1);
  }

  /* call into the kernel to drop the context-file */
  if (lam_ssi_cr_verbose >= 30)
    lam_debug(lam_ssi_cr_did, "APP_ASYNC%d: calling cr_checkpoint", getpid());

  rc = cr_checkpoint(0);

  if (lam_ssi_cr_verbose >= 30)
    lam_debug(lam_ssi_cr_did, "APP_ASYNC%d: back from cr_checkpoint, rc=%d", 
              getpid(), rc);

  /* 
   * We need to handle FAILURE and CONTINUE in the same way. If checkpoint
   * failed, we will still need to go about our business as if nothing
   * happened. 
   *
   * Common to all, we need to wait for the lam_register_pid() to occur in
   * the signal callback.  We need to wait for the app_thread to register its
   * pid in the cache from signal_handler context, before we can attach to
   * the lamd.  The wait is this lock: the spinlock was left locked by
   * lam_ssi_crmpi_blcr_init() and is unlocked by crmpi_signal_callback().
   */
  cr_spinlock_lock(handler_spinlock);
  
  /* First case: RESTART */
  if (rc > 0) {
    if (lam_ssi_crmpi_base_restart() < 0) {
      lam_debug(lam_ssi_cr_did, "APP_ASYNC%d: restart failed.", getpid());
      show_help_file("lam-ssi-crmpi-blcr-helpfile", "cr-thread",
                     "chkpt-restart-fail", NULL);
      kexit(1);
    }
  }

  /* Second case: FAILURE/CONTINUE */
  else {
    if (lam_ssi_crmpi_base_continue() < 0) {
      /* Name the operation that actually failed; this branch used to
         report "restart failed." as well. */
      lam_debug(lam_ssi_cr_did, "APP_ASYNC%d: continue failed.", getpid());
      show_help_file("lam-ssi-crmpi-blcr-helpfile", "cr-thread",
                     "chkpt-cont-fail", NULL);
      kexit(1);
    }
  }

  /* set the handler_state back to IDLE */
  lam_ssi_crmpi_base_handler_state = LAM_SSI_CRMPI_BASE_HANDLER_STATE_IDLE;

  /*
   * At the end of it all, unlock lam_mpi_mutex, and the rest, so that the
   * app_thread can resume.
   */
  if (release_mpi_lock() < 0)
    return LAMERROR;
  
  if (lam_ssi_cr_verbose >= 30) {
    lam_debug(lam_ssi_cr_did, "cr_thread %d unlocked lam_mpi_mutex", getpid());
    lam_debug(lam_ssi_cr_did, "Leaving cr_handler_app_async of %d", getpid());
  }

  return 0;
}


/*
 * Synchronous handler.  Will be invoked in signal context.
 *
 * Accepts:	- arg: unused
 * Returns:	- 0 (always)
 */
static int
crmpi_signal_callback(void *arg)
{
  /* Decide once whether to trace, so that pid[] is read only on a path
     that also filled it in. */
  const int trace = (lam_ssi_cr_verbose >= 30);

  /* Room for any int rendered as decimal text; starts out as "". */
  char pid[32] = "";

  (void) arg;

  if (trace) {
    sfh_itoa(getpid(), pid);
    signal_puts(pid);
    signal_puts(": Inside cr_handler_app_sync\n");
  }

  /*
   * call lam_reset_pid_cache() so that the new pid can be registered at
   * restart
   */
  lam_reset_pid_cache();

  /* The result is deliberately ignored; see below. */
  (void) cr_checkpoint(0);

  /* 
   * We need to handle FAILURE, CONTINUE and RESTART in the same way. If
   * checkpoint failed, we will still need to go about our business as if
   * nothing happened. 
   *
   * And in all the cases, we just need to register the pid in the cache. In
   * case of FAILURE/CONTINUE, we are already attached to the lamd. So no
   * problems there. And in the case of RESTART, after we register the pid,
   * the cr_thread will attach to the lamd with kenter/kinit and proceed with
   * what it needs to do.
   * NOTE: we cannot call kinit/kenter from here because these *might not*
   * continue to be signal-safe in the future.
   */

  /* NOTE(review): pid[] was formatted before cr_checkpoint(); if the pid
     differs after a restart, the two traces below show the old value. */
  if (trace) {
    signal_puts(pid);
    signal_puts("APP_SYNC: cr_checkpoint returned...\n");  
  }

  lam_register_pid(getpid());

  /* We're all done here.  Let the async handler proceed. */
  cr_spinlock_unlock(handler_spinlock);

  if (trace) {
    signal_puts(pid);
    signal_puts(": Leaving cr_handler_app_sync\n");
  }

  return 0;
}


/*
 * Take lam_mpi_mutex on behalf of the checkpoint thread callback.
 *
 * Returns:	- 0 once the mutex is held, LAMERROR if SIGUSR1 could not be
 *		  sent
 */
static int
acquire_mpi_lock(void)
{
  int ret;
  struct timespec tv;

  /* 
   * Send signal to interrupt a blocked read/write in the app, if any. Once the
   * signal is sent, acquire the lam_mpi_mutex and then proceed.  This is used to
   * ensure that the app thread yields to the cr_handler thread before resuming
   * a blocked read/write. 
   *
   * The following happens in a loop to handle the corner case of an app
   * blocking indefinitely on a read just *after* the signal is delivered, and
   * hence not releasing the lock at all!
   */
  
  if (lam_ssi_cr_verbose >= 30)
    lam_debug(lam_ssi_cr_did, "APP_ASYNC%d: before signalling the app thread", 
              getpid()); 

  while (1) {
    if (lam_ssi_cr_verbose >= 30)
      lam_debug(lam_ssi_cr_did, "APP_ASYNC%d: killing pid %d", 
                getpid(), lam_getpid()); 

    if (kill(lam_getpid(), SIGUSR1)) {
      if (lam_ssi_cr_verbose >= 30)
        lam_debug(lam_ssi_cr_did, "APP_ASYNC%d: kill failed on pid%d. Aborting",
                  getpid(), lam_getpid());
      return LAMERROR;
    }

    ret = lam_mutex_trylock(&lam_mpi_mutex);
    if (ret == 0) {
      if (lam_ssi_cr_verbose >= 30) {
        lam_debug(lam_ssi_cr_did, "cr_thread %d locked lam_mpi_mutex", getpid());
      }
      break;
    }

    if (ret == EBUSY) {
      if (lam_ssi_cr_verbose >= 30)
        lam_debug(lam_ssi_cr_did,
                  "%d: lam_mpi_mutex held by app_thread; try again", 
                  getpid());  
    } else {
      /* Report the code handed back by the trylock itself; errno is not
         what the EBUSY test above looked at. */
      if (lam_ssi_cr_verbose >= 30)
        lam_debug(lam_ssi_cr_did,
                  "APP_ASYNC%d: Bad error %d on lam_mutex_trylock of "
                  "lam_mpi_mutex", getpid(), ret);  
    }

    /* 
     * sleep before we resend the signal. we need to sleep for > 2ms,
     * otherwise kernel might busy wait without yielding.
     *
     * Every failed attempt -- EBUSY included -- must reach this point.
     * Jumping straight back to the top of the loop on EBUSY would resend
     * SIGUSR1 in a tight spin and never give the app thread time to
     * release the lock.
     */
    tv.tv_sec = 0;
    tv.tv_nsec = 2000001;
    nanosleep(&tv, NULL);
  }

  return 0;
}


/*
 * Drop lam_mpi_mutex, the counterpart of acquire_mpi_lock().
 *
 * Returns:	- 0 (always; the result of lam_mutex_unlock() is not examined)
 */
static int
release_mpi_lock(void)
{
  lam_mutex_unlock(&lam_mpi_mutex);
  
  return 0;
}


/*
 * SIGUSR1 handler that intentionally does nothing.  It only has to exist:
 * with a real handler installed, delivery of the signal interrupts a
 * blocking syscall in the app thread (SIG_IGN would not do that).
 */
static void
signal_handler(int signum)
{
  /* The signal number is of no interest here. */
  (void) signum;
}
