#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <inttypes.h>
#include <errno.h>
#include <signal.h>
#include <assert.h>
#include <fcntl.h>
#include <setjmp.h>
#include <pthread.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <uuid/uuid.h>

#include "msr-index.h"
#include "xenner.h"
#include "mm.h"

/* ------------------------------------------------------------------ */

/*
 * Flags written by sighandle() and polled by the main loop.  They are
 * modified from signal context, hence volatile sig_atomic_t.
 */
static volatile sig_atomic_t shutdown;  /* first SIGINT seen: check_events() asks the guest to halt */
static volatile sig_atomic_t termsig;   /* signal number that requests termination (0 = none) */
static volatile sig_atomic_t sigchld;   /* bumped on every SIGCHLD */

/* Signal sets filled in by sigsetup(). */
static sigset_t main_signals;           /* blocked in the vcpu threads */
static sigset_t stop_signals;           /* what a stopped thread waits for (SIGUSR2) */

/*
 * Common handler for all signals installed by sigsetup().
 *
 *   SIGUSR1          park the receiving thread until SIGUSR2 shows up
 *   SIGUSR2, SIGHUP  no-op
 *   SIGCHLD          count it, the main loop checks the counter
 *   SIGINT           first one sets the shutdown flag, any further one
 *                    is treated like SIGTERM
 *   anything else    record the signal number in termsig
 *
 * errno is saved and restored so that code interrupted between a failing
 * call and its errno check is not confused by the handler.
 *
 * NOTE(review): fprintf() and strsignal() are not async-signal-safe; they
 * are kept because the message is part of the existing diagnostics.
 */
static void sighandle(int sig)
{
    int saved_errno = errno;

    switch (sig) {
    case SIGUSR1: /* stop (until USR2) */
        sigwait(&stop_signals, &sig);
        break;
    case SIGUSR2: /* continue */
    case SIGHUP:  /* ignore   */
        break;
    case SIGCHLD:
        sigchld++;
        break;
    case SIGINT:
        if (!shutdown) {
            shutdown++;
            break;
        }
        /* else fall through */
    case SIGTERM:
    case SIGQUIT:
    default:
        fprintf(stderr, "%s: %s\n", __FUNCTION__, strsignal(sig));
        termsig = sig;
    }

    errno = saved_errno;
}

/*
 * Install sighandle() for every signal this file cares about and fill in
 * the two signal sets used elsewhere (main_signals, stop_signals).
 * The xen argument is not used.
 */
static void sigsetup(struct xenvm *xen)
{
    static const int handled[] = {
        SIGHUP, SIGCHLD, SIGINT, SIGTERM, SIGQUIT, SIGUSR1, SIGUSR2,
    };
    static const int for_main[] = {
        SIGINT, SIGCHLD, SIGTERM, SIGQUIT,
    };
    struct sigaction act, old;
    unsigned int n;

    /* one handler for everything, no extra signals masked while it runs */
    memset(&act, 0, sizeof(act));
    act.sa_handler = sighandle;
    sigemptyset(&act.sa_mask);
    for (n = 0; n < sizeof(handled)/sizeof(handled[0]); n++)
        sigaction(handled[n], &act, &old);

    /* signals the vcpu threads block */
    sigemptyset(&main_signals);
    for (n = 0; n < sizeof(for_main)/sizeof(for_main[0]); n++)
        sigaddset(&main_signals, for_main[n]);

    /* signal a stopped thread waits for */
    sigemptyset(&stop_signals);
    sigaddset(&stop_signals, SIGUSR2);
}

/*
 * Send a signal to every vcpu that has a thread.
 *
 * xen    - vm whose vcpu threads get signalled
 * signal - signal number handed to pthread_kill()
 * self   - when zero, the calling thread is skipped
 *
 * Always returns 0; pthread_kill() results are not checked.
 */
int kvmbits_signal_vcpus(struct xenvm *xen, int signal, int self)
{
    pthread_t me = pthread_self();
    int idx;

    for (idx = 0; idx < xen->vcpus; idx++) {
        if (!xen->vcpu[idx].thread)
            continue;               /* vcpu has no thread */
        if (self || !pthread_equal(me, xen->vcpu[idx].thread))
            pthread_kill(xen->vcpu[idx].thread, signal);
    }
    return 0;
}

/*
 * Print the next frame of a tiny ". o O o" spinner to stderr, followed by
 * a carriage return so the next call overwrites it.
 *
 * The frame counter is unsigned: it wraps around cleanly instead of
 * running into signed overflow (and a negative array index) after
 * INT_MAX calls.
 */
void alive(void)
{
    static const char frames[] = ".oOo";
    enum { NFRAMES = sizeof(frames) - 1 };  /* don't count the trailing NUL */
    static unsigned int tick;

    fprintf(stderr, "[%c]\r", frames[tick++ % NFRAMES]);
}

/*
 * Current wall clock time (gettimeofday) in nanoseconds.
 * The gettimeofday() return value is not checked.
 */
uint64_t get_systime(void)
{
    static const uint64_t ns_per_sec  = 1000000000;
    static const uint64_t ns_per_usec = 1000;
    struct timeval now;

    gettimeofday(&now, NULL);
    return now.tv_sec * ns_per_sec + now.tv_usec * ns_per_usec;
}

/*
 * Walk all guest pages and tell the kernel (MADV_DONTNEED) about every
 * page whose m2p entry is INVALID_M2P_ENTRY in either the 32-bit or the
 * 64-bit table.  Always returns 0; madvise() errors are ignored.
 *
 * The byte offset is computed in size_t so it cannot wrap in 32-bit
 * arithmetic once the page index reaches the 4 GB mark
 * (NOTE(review): this matters only if PAGE_SIZE is a 32-bit constant).
 */
static int balloon(struct xenvm *xen)
{
    uint32_t page;

    /* Hmm, keep track of state and signal only changes?  Not sure it is
     * worth the effort given ballooning is an infrequent operation in
     * Xen due to being fully admin-controlled (aka manual) */
    for (page = 0; page < xen->pg_guest; page++) {
        int unmapped = 0;

        if (xen->m2p_32 &&
            xen->m2p_32[page] == (uint32_t)INVALID_M2P_ENTRY)
            unmapped = 1;
        if (xen->m2p_64 &&
            xen->m2p_64[page] == (uint64_t)INVALID_M2P_ENTRY)
            unmapped = 1;
        if (!unmapped)
            continue;

        madvise(xen->memory + (size_t)page * PAGE_SIZE, PAGE_SIZE,
                MADV_DONTNEED);
    }
    return 0;
}

/* ------------------------------------------------------------------ */

/*
 * Refresh the time information the guest sees for one vcpu.
 *
 * Stores the current get_systime() value in vcpu->systime, then - if a
 * shared info page has been configured - writes the time since xen->boot
 * plus the TSC parameters into that vcpu's time record and the boot time
 * into the wallclock fields.  The 32-bit and 64-bit shared info layouts
 * differ, so the pointers are set up per mode first.
 */
void update_time(struct xenvcpu *vcpu)
{
    struct xenvm *xen = vcpu->vm;
    struct vcpu_time_info *xentime;
    struct xenclock *xen_wc;
    uint64_t systime;

    /* systime handed to the guest is relative to vm boot */
    vcpu->systime = get_systime();
    systime = vcpu->systime - vcpu->vm->boot;

    /* no shared info page configured (yet): nothing to publish */
    if (!xen->e.config[EMUDEV_CONF_SHARED_INFO_PFN])
	return;

    if (xen->mode == XENMODE_64) {
	struct shared_info_x86_64 *sh64;
	sh64 = mfn_to_ptr(xen, xen->e.config[EMUDEV_CONF_SHARED_INFO_PFN]);
	xentime = (void*)(&(sh64->vcpu_info[vcpu->id].time));
	xen_wc  = (void*)(&sh64->wc_version);
    } else {
	struct shared_info_x86_32 *sh32;
	sh32 = mfn_to_ptr(xen, xen->e.config[EMUDEV_CONF_SHARED_INFO_PFN]);
	xentime = (void*)(&(sh32->vcpu_info[vcpu->id].time));
	xen_wc  = (void*)(&sh32->wc_version);
    }

    /* version is bumped before and after the update; presumably this lets
     * the guest detect a record that is being rewritten -- TODO confirm
     * against the Xen interface headers */
    xentime->version++;
    xentime->system_time       = systime;
    xentime->tsc_timestamp     = get_msr(vcpu, MSR_IA32_TSC);
    xentime->tsc_to_system_mul = xen->tsc_mul_frac;
    xentime->tsc_shift         = xen->tsc_shift;
    xentime->version++;
    d3printf("%d: systime %" PRId64 " tsc_timestamp %" PRId64 "\n", vcpu->id,
	     xentime->system_time, xentime->tsc_timestamp);

#if 1 /* FIXME: move elsewhere */
    /* wallclock: boot time split into seconds / nanoseconds, written
     * between two version bumps like the time record above */
    xen_wc->wc_version++;
    xen_wc->wc_sec  = xen->boot / 1000000000;
    xen_wc->wc_nsec = xen->boot % 1000000000;
    xen_wc->wc_version++;
#endif
}

/*
 * Raise the irq line mapped to an event channel port.
 *
 * Looks up the pin for port in xen->e.evtchn[]; a pin of -1 means there
 * is nothing to raise.  Always returns 0.  The port is not range-checked
 * here.
 */
int raise_event(struct xenvcpu *vcpu, int port)
{
    struct xenvm *xen = vcpu->vm;
    int pin = xen->e.evtchn[port];

    if (pin == -1)
        return 0;

    kvm_set_irq_level(xen->kvm, pin, 1);
    return 0;
}

/*
 * Register a callback that the main loop invokes whenever fd becomes
 * readable.
 *
 * xen      - vm the callback belongs to (it is appended to xen->callbacks)
 * name     - label stored with the callback; the pointer is kept, not copied
 * fd       - file descriptor to watch; it is switched to non-blocking mode
 * cb_data  - opaque pointer passed back to the callback
 * callback - function called as callback(xen, fd, cb_data)
 *
 * Returns 0 on success, -1 on failure.  The descriptor is prepared before
 * anything is allocated, so a failure leaves no half-registered entry on
 * the callback list and leaks no memory.
 */
int xenner_register_cb(struct xenvm *xen, char *name, int fd, void *cb_data,
                       int (*callback)(struct xenvm *xen, int fd, void *data))
{
    struct xenner_callback *cb;
    int flags;

    /* set nonblocking mode */
    flags = fcntl(fd, F_GETFL, 0);
    if (-1 == flags) {
        perror("fcntl F_GETFL");
        return -1;
    }
    if (0 != fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
        perror("fcntl F_SETFL");
        return -1;
    }

    cb = calloc(1, sizeof(*cb));
    if (!cb) {
        perror("calloc");
        return -1;
    }
    cb->fd       = fd;
    cb->name     = name;
    cb->cb_data  = cb_data;
    cb->callback = callback;
    list_add_tail(&cb->list, &xen->callbacks);
    return 0;
}

/*
 * Per-iteration housekeeping of the main loop.
 *
 * Once the shutdown flag is set, xenstore_shutdown(xen, "halt") is called
 * exactly one time.  Ballooning is compiled in but disabled.
 */
static void check_events(struct xenvm *xen)
{
    static int halt_pending = 1;

    if (shutdown && halt_pending) {
        xenstore_shutdown(xen, "halt");
        halt_pending = 0;
    }

    if (0 /* FIXME */)
        balloon(xen);
}

/* ------------------------------------------------------------------ */

/*
 * Make sure vcpu->regs holds the current general purpose registers.
 * They are fetched from kvm only when the cached copy is not marked
 * up to date.
 */
void need_regs(struct xenvcpu *vcpu)
{
    if (!vcpu->regs_up2date) {
        kvm_get_regs(vcpu->vm->kvm, vcpu->id, &vcpu->regs);
        vcpu->regs_up2date = 1;
    }
}

/*
 * Make sure vcpu->sregs holds the current special registers.
 * They are fetched from kvm only when the cached copy is not marked
 * up to date.
 */
void need_sregs(struct xenvcpu *vcpu)
{
    if (!vcpu->sregs_up2date) {
        kvm_get_sregs(vcpu->vm->kvm, vcpu->id, &vcpu->sregs);
        vcpu->sregs_up2date = 1;
    }
}

/*
 * Mark the cached general purpose registers as modified.  cb_pre_kvm_run()
 * writes dirty registers back to kvm.  The cache must have been filled
 * first (see need_regs()), otherwise the assertion fires.
 */
void flush_regs(struct xenvcpu *vcpu)
{
    assert(vcpu->regs_up2date);
    vcpu->regs_dirty = 1;
}

/*
 * Mark the cached special registers as modified.  cb_pre_kvm_run() writes
 * dirty registers back to kvm.  The cache must have been filled first
 * (see need_sregs()), otherwise the assertion fires.
 */
void flush_sregs(struct xenvcpu *vcpu)
{
    assert(vcpu->sregs_up2date);
    vcpu->sregs_dirty = 1;
}

/*
 * Tear the vm down.
 *
 * Flags the vm as dying (in the vminfo page when one is configured,
 * otherwise by writing "dying" into the core map), optionally prints
 * hypercall statistics, then shuts down xenstore, event channels, qemu
 * and the vmcore in that order.
 *
 * stats - non-zero to print statistics before cleaning up
 */
void xenner_cleanup(struct xenvm *xen, int stats)
{
    /* going down now ... */
    if (xen->e.config[EMUDEV_CONF_VMINFO_PFN]) {
        struct xenner_info *info;

        info = mfn_to_ptr(xen, xen->e.config[EMUDEV_CONF_VMINFO_PFN]);
        info->dying = 1;
    } else if (xen->coremap) {
        /* dying before ELF headers are written */
        strcpy(xen->coremap, "dying");
    }

    if (stats) {
        section_print(xen, __FUNCTION__, "statistics");
        hypercall_stats(xen);
    }

    section_print(xen, __FUNCTION__, "cleaning up");

    xenstore_fini(xen);
    evtchn_fini(xen);
    qemu_cleanup(xen);
    vmcore_fini(xen);
}

/* ------------------------------------------------------------------ */

/*
 * Log a hex dump of a 32-bit stack, eight words per line.
 *
 * name - label used in the header line
 * esp  - address of the first word (used for the address column)
 * ptr  - host pointer to the same memory
 *
 * Dumps at most 512 words and stops as soon as addr_offset() of the
 * current address is zero.
 */
static void print_stack_32(struct xenvm *xen, char *name, uint32_t esp, uint32_t *ptr)
{
    enum { WORDS_MAX = 512, WORDS_PER_LINE = 8 };
    int word;

    logprintf(xen, "--- %s stack trace ---\n", name);
    for (word = 0; word < WORDS_MAX; word++) {
        uint32_t addr = esp + 4*word;
        int column    = word % WORDS_PER_LINE;

        if (addr_offset(addr) == 0)
            break;
        if (column == 0)
            logprintf(xen, "%08" PRIx32 ":", addr);
        logprintf(xen, " %08" PRIx32, ptr[word]);
        if (column == WORDS_PER_LINE - 1)
            logprintf(xen, "\n");
    }
    /* finish a partially filled line */
    if (word % WORDS_PER_LINE != 0)
        logprintf(xen, "\n");
}

/*
 * Log a hex dump of a 64-bit stack, four words per line.
 *
 * name - label used in the header line
 * esp  - address of the first word (used for the address column)
 * ptr  - host pointer to the same memory
 *
 * Dumps at most 256 words and stops as soon as addr_offset() of the
 * current address is zero.
 */
static void print_stack_64(struct xenvm *xen, char *name, uint64_t esp, uint64_t *ptr)
{
    enum { WORDS_MAX = 256, WORDS_PER_LINE = 4 };
    int word;

    logprintf(xen, "--- %s stack trace ---\n", name);
    for (word = 0; word < WORDS_MAX; word++) {
        uint64_t addr = esp + 8*word;
        int column    = word % WORDS_PER_LINE;

        if (addr_offset(addr) == 0)
            break;
        if (column == 0)
            logprintf(xen, "%016" PRIx64 ":", addr);
        logprintf(xen, " %016" PRIx64, ptr[word]);
        if (column == WORDS_PER_LINE - 1)
            logprintf(xen, "\n");
    }
    /* finish a partially filled line */
    if (word % WORDS_PER_LINE != 0)
        logprintf(xen, "\n");
}

/*
 * Dump the guest stack at virtual address rsp, using the word size that
 * matches the vm mode.  Nothing is printed when the address cannot be
 * translated (guest_vaddr_to_ptr() returns NULL).
 */
void print_guest_stack(struct xenvcpu *vcpu, uint64_t rsp)
{
    void *stack = guest_vaddr_to_ptr(vcpu, rsp);

    if (!stack)
        return;

    if (vcpu->vm->mode == XENMODE_64)
        print_stack_64(vcpu->vm, "guest64", rsp, stack);
    else
        print_stack_32(vcpu->vm, "guest32", rsp, stack);
}

/*
 * Dump the emu stack at virtual address rsp, using the word size that
 * matches the vm mode.  Nothing is printed when the address cannot be
 * translated (emu_vaddr_to_ptr() returns NULL).
 */
void print_emu_stack(struct xenvm *xen, uint64_t rsp)
{
    void *stack = emu_vaddr_to_ptr(xen, rsp);

    if (!stack)
        return;

    if (xen->mode == XENMODE_64)
        print_stack_64(xen, "emu64", rsp, stack);
    else
        print_stack_32(xen, "emu32", rsp, stack);
}

/*
 * Log registers, MSRs and stack contents of one vcpu.
 *
 * guest_rsp - guest stack pointer to dump, or 0 to fall back to the
 *             vcpu's current rsp (unless that points into the emu area,
 *             in which case the emu stack is dumped instead)
 */
static void dump_vcpu(struct xenvcpu *vcpu, uint64_t guest_rsp)
{
    struct xenvm *xen = vcpu->vm;
    int on_emu_stack;

    logprintf(xen, "--- registers (vcpu %d)---\n", vcpu->id);
    kvm_show_regs(xen->kvm, vcpu->id);
    print_msrs(vcpu);

    need_regs(vcpu);
    on_emu_stack = (vcpu->regs.rsp > xen->emu_vs &&
                    vcpu->regs.rsp < xen->emu_ve);

    if (on_emu_stack) {
        print_emu_stack(xen, vcpu->regs.rsp);
    } else if (!guest_rsp && vcpu->regs.rsp) {
        /* caller gave no guest stack pointer, use the current one */
        guest_rsp = vcpu->regs.rsp;
    }

    if (guest_rsp)
        print_guest_stack(vcpu, guest_rsp);
}

/*
 * Log the event channel state found in the 64-bit shared info page:
 * first word of the pending and mask bitmaps, then the upcall state of
 * every vcpu.  Does nothing when no shared info page is configured.
 */
static void dump_events_64(struct xenvm *xen)
{
    struct shared_info_x86_64 *shinfo;
    int cpu;

    if (!xen->e.config[EMUDEV_CONF_SHARED_INFO_PFN])
        return;
    shinfo = mfn_to_ptr(xen, xen->e.config[EMUDEV_CONF_SHARED_INFO_PFN]);

    logprintf(xen, "--- irq / event channel status ---\n");
    logprintf(xen, "pending: %016" PRIx64 "\n", shinfo->evtchn_pending[0]);
    logprintf(xen, "mask:    %016" PRIx64 "\n", shinfo->evtchn_mask[0]);

    for (cpu = 0; cpu < xen->vcpus; cpu++)
        logprintf(xen, "vcpu %d: pending %d, mask %d,"
                  " sel %016" PRIx64 " (shared_info)\n", cpu,
                  shinfo->vcpu_info[cpu].evtchn_upcall_pending,
                  shinfo->vcpu_info[cpu].evtchn_upcall_mask,
                  shinfo->vcpu_info[cpu].evtchn_pending_sel);
}

/*
 * Log the event channel state found in the 32-bit shared info page:
 * first word of the pending and mask bitmaps, then the upcall state of
 * every vcpu.  Does nothing when no shared info page is configured.
 */
static void dump_events_32(struct xenvm *xen)
{
    struct shared_info_x86_32 *shinfo;
    int cpu;

    if (!xen->e.config[EMUDEV_CONF_SHARED_INFO_PFN])
        return;
    shinfo = mfn_to_ptr(xen, xen->e.config[EMUDEV_CONF_SHARED_INFO_PFN]);

    logprintf(xen, "--- irq / event channel status ---\n");
    logprintf(xen, "pending: %08" PRIx32 "\n", shinfo->evtchn_pending[0]);
    logprintf(xen, "mask:    %08" PRIx32 "\n", shinfo->evtchn_mask[0]);

    for (cpu = 0; cpu < xen->vcpus; cpu++)
        logprintf(xen, "vcpu %d: pending %d, mask %d,"
                  " sel %08" PRIx32 " (shared_info)\n", cpu,
                  shinfo->vcpu_info[cpu].evtchn_upcall_pending,
                  shinfo->vcpu_info[cpu].evtchn_upcall_mask,
                  shinfo->vcpu_info[cpu].evtchn_pending_sel);
}

/*
 * Fatal error exit: print a banner with the reason, dump vcpu state,
 * clean up and terminate the process with exit status 1.
 *
 * When called on the thread of the given vcpu only that vcpu is dumped
 * (with guest_rsp as stack hint).  From any other thread all vcpus are
 * stopped and dumped, followed by the event channel state.
 *
 * Does not return.
 */
void vm_kill(struct xenvcpu *vcpu, const char *reason, uint64_t guest_rsp)
{
    struct xenvm *xen = vcpu->vm;
    int on_vcpu_thread;
    int idx;

    banner_print(xen, reason);

    on_vcpu_thread = pthread_equal(pthread_self(), vcpu->thread);
    if (on_vcpu_thread) {
        /* dump current vcpu only */
        dump_vcpu(vcpu, guest_rsp);
    } else {
        /* not called from any vcpu thread -- dump all vcpus */
        kvmbits_vcpus_stop(xen);
        for (idx = 0; idx < xen->vcpus; idx++) {
            if (xen->vcpu[idx].thread)
                dump_vcpu(xen->vcpu+idx, 0);
        }
        if (xen->mode == XENMODE_64)
            dump_events_64(xen);
        else
            dump_events_32(xen);
    }

    xenner_cleanup(xen, 1);
    exit(1);
}

/*
 * Execute a command the emu wrote to the emudev command register.
 *
 * cmd - command code (EMUDEV_CMD_*)
 * arg - 16-bit command argument
 *
 * Unknown commands are only logged.  EMUDEV_CMD_GUEST_SHUTDOWN cleans up
 * and exits the process.
 */
static void do_emudev_command(struct xenvm *xen, uint16_t cmd, uint16_t arg)
{
    void *ptr;

    switch (cmd) {
    case EMUDEV_CMD_NOP:
        /* nop vmexit (for time update without pvclock) */
        break;
    case EMUDEV_CMD_WRITE_CHAR:
        fprintf(stderr, "%c", arg);
        break;
    case EMUDEV_CMD_CONFIGURATION_DONE:
        /* emu finished initial EMUDEV_CONF_* setup, process now */
        ptr = mfn_to_ptr(xen, xen->e.config[EMUDEV_CONF_BOOT_CTXT_PFN]);
        if (XENMODE_64 == xen->mode)
            memcpy(ptr, &xen->boot_ctxt.ct64, sizeof(xen->boot_ctxt.ct64));
        else
            memcpy(ptr, &xen->boot_ctxt.ct32, sizeof(xen->boot_ctxt.ct32));
        vmcore_write_headers(xen);
        break;
    case EMUDEV_CMD_EVTCHN_ALLOC:
        if (arg < EMUDEV_CONF_COMMAND_RESULT_COUNT)
            xen->e.result[arg] = evtchn_port(xen, "irqchip");
        break;
    case EMUDEV_CMD_EVTCHN_SEND:
        evtchn_notify(xen, arg);
        break;
    case EMUDEV_CMD_INIT_SECONDARY_VCPU:
    {
        struct xenvcpu *vcpu;
        int i;

        /* The index comes from a config slot written through the emudev
         * registers.  It lands in a signed int, so reject negative values
         * as well as too large ones before indexing xen->vcpu[]. */
        i = xen->e.config[EMUDEV_CONF_NEXT_SECONDARY_VCPU];
        if (i >= 0 && i < VCPUS_MAX) {
            vcpu = xen->vcpu + i;
            need_regs(vcpu);
            need_sregs(vcpu);
            setup_regs(vcpu);
            flush_regs(vcpu);
            flush_sregs(vcpu);
        }
        break;
    }
    case EMUDEV_CMD_GUEST_SHUTDOWN:
        logprintf(xen, "guest requests shutdown, exiting.\n");
        xenner_cleanup(xen, 1);
        exit(0);
    default:
        d0printf("emudev: cmd 0x%04x arg 0x%04x\n", cmd, arg);
    }
}

/* ------------------------------------------------------------------ */
/* functions                                                          */

/*
 * kvm callback: byte read from an I/O port.  No byte-wide ports are
 * implemented, so every read yields 0xff.  Always returns 0.
 */
static int cb_inb(void *opaque, uint16_t addr, uint8_t *value)
{
    struct xenvm *xen = opaque;

    *value = 0xff;
    d2printf("io: %s, addr 0x%" PRIx16 "\n", __FUNCTION__, addr);
    return 0;
}

/*
 * kvm callback: word read from an I/O port.  No word-wide ports are
 * implemented, so every read yields 0xffff.  Always returns 0.
 */
static int cb_inw(void *opaque, uint16_t addr, uint16_t *value)
{
    struct xenvm *xen = opaque;

    *value = 0xffff;
    d2printf("io: %s, addr 0x%" PRIx16 "\n", __FUNCTION__, addr);
    return 0;
}

/*
 * kvm callback: dword read from an I/O port.  Only the emudev
 * configuration value register is implemented; any other port is logged
 * and reads as 0xffffffff.  Always returns 0.
 */
static int cb_inl(void *opaque, uint16_t addr, uint32_t *value)
{
    struct xenvm *xen = opaque;

    if (addr == EMUDEV_REG_CONF_VALUE) {
        *value = emudev_read_value(&xen->e);
    } else {
        d2printf("io: %s, addr 0x%" PRIx16 "\n", __FUNCTION__, addr);
        *value = 0xffffffff;
    }
    return 0;
}

/*
 * kvm callback: byte write to an I/O port.
 *
 * The port based interface on 0xe0 - 0xef is compiled out (the "#if 0"
 * sections); writes to that range are now only logged at debug level 0.
 * Writes to all other ports are logged at debug level 2.
 * Always returns 0.
 */
static int cb_outb(void *opaque, uint16_t addr, uint8_t value, int vcpu_id)
{
    struct xenvm *xen = opaque;
#if 0
    struct xenvcpu *vcpu = xen->vcpu + vcpu_id;
    char *msg;
#endif

    switch (addr) {
#if 0
    /* disabled: old port-based emu interface */
    case 0xe0: /* hypercall */
	do_xen_hypercall(vcpu, value);
	break;

    case 0xe1: /* vmexit nop() */
	break;

    case 0xe2: /* setup secondary vcpu */
	need_regs(vcpu);
	need_sregs(vcpu);
	setup_regs(vcpu);
	flush_regs(vcpu);
	flush_sregs(vcpu);
	break;

    case 0xea: /* emu: print string */
	need_regs(vcpu);
	msg = emu_vaddr_to_ptr(xen, vcpu->regs.rdx);
	conemu_print(vcpu, msg);
	break;

    case 0xeb: /* emu: panic */
	vm_kill(vcpu, "emu: panic()", 0);
	break;

    case 0xec: /* debug trap */
	do_debug_trap(vcpu);
	break;

    case 0xed: /* shutdown */
	logprintf(xen, "guest requests shutdown, exiting.\n");
	xenner_cleanup(xen, 1);
	exit(0);

    case 0xee: /* get evtchn port */
	need_regs(vcpu);
	vcpu->regs.rax = evtchn_port(xen, "irqchip");
	flush_regs(vcpu);
	break;
#else
    /* active: report any use of the retired port range (GNU case range) */
    case 0xe0 ... 0xef:
	d0printf("%s: 0x%02x\n", __FUNCTION__, addr);
	break;
#endif
    default:
	d2printf("io: %s, addr 0x%" PRIx16 " value 0x%" PRIx8 "\n",
		__FUNCTION__, addr, value);
	break;
    }
    return 0;
}

/*
 * kvm callback: word write to an I/O port.  No word-wide ports are
 * implemented; the access is only logged.  Always returns 0.
 */
static int cb_outw(void *opaque, uint16_t addr, uint16_t value)
{
    struct xenvm *xen = opaque;

    d1printf("io: %s, addr 0x%" PRIx16 " value 0x%" PRIx16 "\n", __FUNCTION__, addr, value);

    return 0;
}

/*
 * kvm callback: dword write to an I/O port.  Dispatches the three emudev
 * registers:
 *
 *   EMUDEV_REG_CONF_ENTRY  select a configuration entry
 *   EMUDEV_REG_CONF_VALUE  write the selected entry's value
 *   EMUDEV_REG_COMMAND     run a command; the upper 16 bits are the
 *                          command code, the lower 16 bits the argument
 *
 * Writes to other ports are only logged.  Always returns 0.
 */
static int cb_outl(void *opaque, uint16_t addr, uint32_t value)
{
    struct xenvm *xen = opaque;

    switch (addr) {
    case EMUDEV_REG_CONF_ENTRY:
        emudev_write_entry(&xen->e, value);
        return 0;
    case EMUDEV_REG_CONF_VALUE:
        emudev_write_value(&xen->e, value);
        return 0;
    case EMUDEV_REG_COMMAND:
        do_emudev_command(xen, value >> 16, value & 0xffff);
        return 0;
    default:
        break;
    }

    d1printf("io: %s, addr 0x%" PRIx16 " value 0x%" PRIx32 "\n",
             __FUNCTION__, addr, value);
    return 0;
}

/*
 * kvm "debug" callback: nothing to do, the call is only logged at debug
 * level 3.  Always returns 0.
 */
static int cb_debug(void *opaque, int vcpu_id)
{
    struct xenvm *xen = opaque;
    
    d3printf("callback: %s/%d\n", __FUNCTION__, vcpu_id);
    return 0;
}

/*
 * kvm "halt" callback.  It is not expected to be invoked any more, so a
 * call is treated as fatal: the vm is killed, reported against vcpu 0
 * regardless of vcpu_id.
 */
static int cb_halt(void *opaque, int vcpu_id)
{
    struct xenvm *xen = opaque;

    /* should not be called any more */
    vm_kill(&xen->vcpu[0], __FUNCTION__, 0);
    return 0;
}

/*
 * kvm "shutdown" callback: log the call, then kill the vm on behalf of
 * the vcpu that triggered it.
 */
static int cb_shutdown(void *opaque, int vcpu_id)
{
    struct xenvm *xen = opaque;

    d3printf("callback: %s/%d\n", __FUNCTION__, vcpu_id);
    vm_kill(&xen->vcpu[vcpu_id], "cb_shutdown", 0);
    return 0;
}

/*
 * kvm "io_window" callback: nothing to do, the call is only logged at
 * debug level 2.  Always returns 0.
 */
static int cb_io_window(void *opaque)
{
    struct xenvm *xen = opaque;

    d2printf("callback: %s\n", __FUNCTION__);
    return 0;
}

/*
 * kvm "try_push_interrupts" callback.  It is not expected to be invoked
 * any more, so a call is treated as fatal: the vm is killed, reported
 * against vcpu 0.
 */
static int cb_try_push_interrupts(void *opaque)
{
    struct xenvm *xen = opaque;

    /* should not be called any more */
    vm_kill(&xen->vcpu[0], __FUNCTION__, 0);
    return 0;
}

/*
 * kvm "post_kvm_run" callback: take the big lock.  The matching unlock is
 * at the end of cb_pre_kvm_run(), so the lock is held for everything that
 * happens between the two callbacks.
 */
static void cb_post_kvm_run(void *opaque, int vcpu_id)
{
    struct xenvm *xen = opaque;

    pthread_mutex_lock(&xen->biglock);
    d3printf("callback: %s/%d\n", __FUNCTION__, vcpu_id);
}

/*
 * kvm "pre_kvm_run" callback.
 *
 * Refreshes the guest visible time unless the kvm clocksource feature is
 * enabled, writes modified cached registers back to kvm, invalidates the
 * register caches and finally drops the big lock that cb_post_kvm_run()
 * (or vcpu_thread() on first entry) took.  Always returns 0.
 */
static int cb_pre_kvm_run(void *opaque, int vcpu_id)
{
    struct xenvm *xen = opaque;
    struct xenvcpu *vcpu = &xen->vcpu[vcpu_id];

    d3printf("callback: %s/%d\n", __FUNCTION__, vcpu_id);

    if ((xen->features & (1 << KVM_FEATURE_CLOCKSOURCE)) == 0)
        update_time(vcpu);

    /* write back whatever was modified ... */
    if (vcpu->regs_dirty) {
        kvm_set_regs(xen->kvm, vcpu->id, &vcpu->regs);
        vcpu->regs_dirty = 0;
    }
    if (vcpu->sregs_dirty) {
        kvm_set_sregs(xen->kvm, vcpu->id, &vcpu->sregs);
        vcpu->sregs_dirty = 0;
    }

    /* ... and forget the cached copies, the guest is about to run */
    vcpu->sregs_up2date = 0;
    vcpu->regs_up2date  = 0;

    pthread_mutex_unlock(&xen->biglock);
    return 0;
}

/* ------------------------------------------------------------------ */

/*
 * Paravirtual kvm features xenner knows about.  A feature bit is set in
 * xen->features only when the host reports the capability AND the entry
 * is allowed (see kvmbits_features_check(); kvmbits_features_enable()
 * flips "allowed" by name).
 */
static struct kvm_para_features {
    int  cap;       /* KVM_CAP_* to query via kvm_check_extension() */
    int  feature;   /* KVM_FEATURE_* bit number for xen->features   */
    int  allowed;   /* non-zero: may be used when supported         */
    char *name;     /* name used for lookup and in log messages     */
} para_features[] = {
    {
        .cap     = KVM_CAP_CLOCKSOURCE,
        .feature = KVM_FEATURE_CLOCKSOURCE,
        .allowed = 0, /* needs host patches */
        .name    = "clocksource",
    },
    {
        .cap     = KVM_CAP_NOP_IO_DELAY,
        .feature = KVM_FEATURE_NOP_IO_DELAY,
        .allowed = 0, /* unused */
        .name    = "nop-iodelay",
    },
    {
        .cap     = KVM_CAP_PV_MMU,
        .feature = KVM_FEATURE_MMU_OP,
        .allowed = 0, /* unused */
        .name    = "mmu-op",
    },
#ifdef KVM_CAP_CR3_CACHE
    {
        .cap     = KVM_CAP_CR3_CACHE,
        .feature = KVM_FEATURE_CR3_CACHE,
        .allowed = 0,
        .name    = "cr3-cache",
    },
#endif
};

/*
 * Allow or forbid the use of a paravirtual kvm feature.
 *
 * name  - feature name as listed in para_features[]
 * state - new value of the entry's "allowed" flag
 *
 * Returns 0 when the feature was found, -1 for an unknown name or a NULL
 * name.
 */
int kvmbits_features_enable(char *name, int state)
{
    size_t i;

    if (!name)
        return -1;

    for (i = 0; i < sizeof(para_features)/sizeof(para_features[0]); i++) {
        if (0 == strcmp(name, para_features[i].name)) {
            para_features[i].allowed = state;
            return 0;
        }
    }
    return -1;
}

int kvmbits_features_check(struct xenvm *xen)
{
    char *state;
    int i;

    /* check minimum requirement */
    if (!kvm_check_extension(xen->kvm, KVM_CAP_SET_TSS_ADDR)) {
	logprintf(xen, "Error: KVM_CAP_SET_TSS_ADDR is not available.\n");
	logprintf(xen, "Most likely your kvm version is too old.\n");
	logprintf(xen, "Minimum requirement is kvm-49 (or linux 2.6.25).\n");
	return -1;
    }

    /* build feature bitmap */
    for (i = 0; i < sizeof(para_features)/sizeof(para_features[0]); i++) {
	if (!kvm_check_extension(xen->kvm, para_features[i].cap)) {
	    state = "not supported";
	} else if (!para_features[i].allowed) {
	    state = "not enabled";
	} else {
	    state = "ok";
	    xen->features |= (1 << para_features[i].feature);
	}
	logprintf(xen, "kvm capability %s: %s\n", para_features[i].name, state);
    }
    return 0;
}

/* ------------------------------------------------------------------ */

/*
 * Callback table handed to kvm_init() in kvmbits_init(); the opaque
 * pointer every callback receives is the struct xenvm.
 */
static struct kvm_callbacks xenner_cb = {
    /* port I/O */
    .inb                 = cb_inb,
    .inw                 = cb_inw,
    .inl                 = cb_inl,
    .outb                = cb_outb,
    .outw                = cb_outw,
    .outl                = cb_outl,
    /* vcpu events */
    .debug               = cb_debug,
    .halt                = cb_halt,
    .shutdown            = cb_shutdown,
    .io_window           = cb_io_window,
    .try_push_interrupts = cb_try_push_interrupts,
    /* big lock handling around guest execution */
    .post_kvm_run        = cb_post_kvm_run,
    .pre_kvm_run         = cb_pre_kvm_run,
};

/*
 * Bring up kvm and everything needed before the guest can run: feature
 * check, vm core, guest memory, in-kernel irqchip, vcpus, signal
 * handlers, event channels, xenstore, console, domain builder, cpuid and
 * TSC calibration.  xen->boot is set to the current time at the end.
 *
 * Returns 0 on success.  On any failure the kvm context is finalized, the
 * core file is unlinked if one was opened, and -1 is returned.
 */
int kvmbits_init(struct xenvm *xen)
{
    int i;
    
    /* init kvm */
    xen->kvm = kvm_init(&xenner_cb, xen);
    if (!xen->kvm) {
	logprintf(xen, "kvm_init failed\n");
	goto err;
    }

    if (kvmbits_features_check(xen) < 0)
	goto err;
    
    /* create vm */
    xen->pg_total &= ~((1<<10)-1);  /* round to 4MB border */
    if (vmcore_init(xen) < 0)
	goto err;
    if (kvm_create_vm(xen->kvm) < 0) {
	logprintf(xen, "kvm_create_vm failed\n");
	goto err;
    }

    /* the TSS address is placed right behind the vm memory */
    if (kvm_set_tss_addr(xen->kvm, xen->pg_total * PAGE_SIZE) < 0) {
	logprintf(xen, "kvm_set_tss_addr failed (%s)\n", strerror(errno));
	goto err;
    }

    /* xen->memory becomes guest physical memory starting at address 0 */
    if (kvm_register_userspace_phys_mem(xen->kvm, 0, xen->memory,
					xen->pg_total * PAGE_SIZE, 1) < 0) {
	logprintf(xen, "kvm_register_userspace_phys_mem failed\n");
	goto err;
    }

    /* an in-kernel irqchip is required; success is verified by asking kvm */
    kvm_create_irqchip(xen->kvm);
    if (!kvm_irqchip_in_kernel(xen->kvm)) {
	logprintf(xen, "kvm_create_irqchip failed\n");
	goto err;
    }

    /* create the vcpus and prime their register caches */
    for (i = 0; i < xen->vcpus; i++) {
	if (kvm_create_vcpu(xen->kvm, i) < 0) {
	    logprintf(xen, "kvm_create_vcpu failed\n");
	    goto err;
	}
	xen->vcpu[i].id = i;
	xen->vcpu[i].vm = xen;
	need_regs(xen->vcpu + i);
	need_sregs(xen->vcpu + i);
    }

    banner_print(xen, "setup");
    sigsetup(xen);
    if (0 != evtchn_init(xen))
	goto err;
    if (0 != xenstore_init_early(xen))
	goto err;
    console_init(xen);

    if (0 != domain_builder(xen)) {
	logprintf(xen, "domain builder failed\n");
	goto err;
    }

    xenstore_init_late(xen);
    setup_cpuid(xen);

    /* boot timestamp is the reference for the guest's system time */
    tsc_calibrate(xen, &xen->tsc_shift, &xen->tsc_mul_frac);
    xen->boot = get_systime();

    return 0;

err:
    /* common failure path: release kvm, remove the core file */
    if (xen->kvm)
	kvm_finalize(xen->kvm);
    if (-1 != xen->corefd)
	unlink(xen->corefile);
    return -1;
}

/* ------------------------------------------------------------------ */

/*
 * Thread function of a vcpu (arg is the struct xenvcpu).
 *
 * Blocks the signals meant for the main thread, records the kernel thread
 * id, optionally parks itself via SIGUSR1 when the vm is to start
 * stopped, then enters kvm_run() with the big lock held (the kvm run
 * callbacks drop and re-take it around guest execution).
 *
 * The mask is changed with pthread_sigmask(): the effect of sigprocmask()
 * in a multithreaded process is unspecified by POSIX.
 */
static void *vcpu_thread(void *arg)
{
    struct xenvcpu *vcpu = arg;
    struct xenvm *xen = vcpu->vm;

    pthread_sigmask(SIG_BLOCK, &main_signals, NULL);
    vcpu->thread_id = syscall(SYS_gettid);
    if (xen->start_stopped)
        pthread_kill(pthread_self(), SIGUSR1);

    pthread_mutex_lock(&xen->biglock);
    d1printf("%s/%d: start\n", __FUNCTION__, vcpu->id);

    kvm_run(xen->kvm, vcpu->id);

    d1printf("%s/%d: stop\n", __FUNCTION__, vcpu->id);
    pthread_mutex_unlock(&xen->biglock);
    return NULL;
}

/*
 * Reap every child process that has terminated and log how it ended.
 * When the qemu-dm child is among them, xen->qemu_pid is reset to 0.
 *
 * The loop ends when waitpid() reports an error (typically: no children
 * left) or returns 0, which with WNOHANG means that children exist but
 * none of them has changed state.  Without the latter check the loop
 * would spin forever on a stale status as long as any child is alive.
 */
static void collect_zombies(struct xenvm *xen)
{
    int pid, status;

    for (;;) {
        /* per child, so one qemu-dm exit doesn't relabel later children */
        char *name = "unknown";

        pid = waitpid(-1, &status, WNOHANG);
        if (pid <= 0)
            return;

        if (pid == xen->qemu_pid) {
            xen->qemu_pid = 0;
            name = "qemu-dm";
        }

        if (WIFEXITED(status)) {
            d1printf("child %d (%s) exited: %d\n",
                     pid, name, WEXITSTATUS(status));
        } else if (WIFSIGNALED(status)) {
            d1printf("child %d (%s) signaled: %s\n",
                     pid, name, strsignal(WTERMSIG(status)));
        } else {
            d1printf("child %d (%s) is gone for unknown reasons\n",
                     pid, name);
        }
    }
}

/*
 * Main loop of the control thread.
 *
 * Starts one thread per vcpu, then loops forever: react to termination
 * requests and SIGCHLD, run check_events(), wait in select() for any of
 * the registered callback descriptors and dispatch the callbacks of the
 * descriptors that became readable.  The big lock is held all the time
 * except while blocked in select().
 *
 * The loop is only left through vm_kill() (or an exit() elsewhere), so
 * the function does not return in practice.
 *
 * NOTE(review): a signal arriving between the flag checks and select() is
 * only noticed after select() wakes up for another reason.
 */
int kvmbits_mainloop(struct xenvm *xen)
{
    struct xenvcpu *vcpu;
    struct list_head *item;
    struct xenner_callback *cb;
    fd_set rd;
    int rc, max, i;

    pthread_mutex_lock(&xen->biglock);

    for (i = 0; i < xen->vcpus; i++) {
        vcpu = xen->vcpu + i;
        setup_regs(vcpu);
        flush_regs(vcpu);
        flush_sregs(vcpu);
        pthread_create(&vcpu->thread, NULL, vcpu_thread, vcpu);
    }
    vcpu = xen->vcpu + 0;

    for (;;) {
        if (termsig)
            vm_kill(vcpu, "termsig", 0);
        if (sigchld) {
            /* Clear the flag before reaping: a SIGCHLD that arrives while
             * we collect sets it again and triggers another pass, while a
             * flag that is never cleared would make us poll waitpid() on
             * every single iteration. */
            sigchld = 0;
            collect_zombies(xen);
        }
        check_events(xen);

        /* watch every registered callback descriptor */
        FD_ZERO(&rd);
        max = 0;
        list_for_each(item, &xen->callbacks) {
            cb = list_entry(item, struct xenner_callback, list);
            FD_SET(cb->fd,&rd);
            if (max < cb->fd)
                max = cb->fd;
        }

        /* let the vcpu threads work while we sleep */
        pthread_mutex_unlock(&xen->biglock);
        rc = select(max+1, &rd, NULL, NULL, NULL);
        pthread_mutex_lock(&xen->biglock);

        if (-1 == rc) {
            if (EINTR == errno) {
                d1printf("%s: select: EINTR (got signal)\n", __FUNCTION__);
                continue;
            } else {
                d0printf("%s: select: %s\n", __FUNCTION__, strerror(errno));
                vm_kill(vcpu, "select error", 0);
            }
        }
        list_for_each(item, &xen->callbacks) {
            cb = list_entry(item, struct xenner_callback, list);
            if (FD_ISSET(cb->fd,&rd))
                cb->callback(xen, cb->fd, cb->cb_data);
        }
    }

    pthread_mutex_unlock(&xen->biglock);
    return 0;
}

/*
 * Start the thread of a vcpu that is not running yet.
 *
 * Returns 0 on success, -EINVAL when the vcpu already has a thread, or
 * the negated pthread_create() error code when the thread could not be
 * created.
 */
int kvmbits_vcpu_up(struct xenvcpu *vcpu)
{
    int rc;

    if (vcpu->thread)
        return -EINVAL;

    rc = pthread_create(&vcpu->thread, NULL, vcpu_thread, vcpu);
    if (rc != 0) {
        /* *thread is undefined after a failed pthread_create(); restore
         * the "no thread" marker the rest of the file tests for */
        vcpu->thread = 0;
        return -rc;
    }
    return 0;
}

/*
 * Take a vcpu offline.  Not implemented yet: always returns -ENOSYS and
 * does not touch the vcpu.
 */
int kvmbits_vcpu_down(struct xenvcpu *vcpu)
{
    (void)vcpu;     /* FIXME */
    return -ENOSYS;
}

/*
 * Stop all vcpu threads by sending them SIGUSR1 (the handler then waits
 * for SIGUSR2).  The calling thread is signalled too if it is a vcpu
 * thread.  Returns the result of kvmbits_signal_vcpus().
 */
int kvmbits_vcpus_stop(struct xenvm *xen)
{
    const int include_self = 1;

    return kvmbits_signal_vcpus(xen, SIGUSR1, include_self);
}

/*
 * Resume all vcpu threads by sending them SIGUSR2, the signal a stopped
 * thread is waiting for.  The calling thread is signalled too if it is a
 * vcpu thread.  Returns the result of kvmbits_signal_vcpus().
 */
int kvmbits_vcpus_cont(struct xenvm *xen)
{
    const int include_self = 1;

    return kvmbits_signal_vcpus(xen, SIGUSR2, include_self);
}
