// -*- C++ -*-
//
// The Hoard Multiprocessor Memory Allocator
// www.hoard.org
//
// Author: Emery Berger, http://www.cs.umass.edu/~emery
//
// Copyright (c) 1998-2002, The University of Texas at Austin.
//
// This library is free software; you can redistribute it and/or modify
// it under the terms of the GNU Library General Public License as
// published by the Free Software Foundation, http://www.fsf.org.
//
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Library General Public License for more details.
//
//////////////////////////////////////////////////////////////////////////////

#if defined(USE_HOARD) && defined(_WIN32)
#pragma comment(lib, "libhoard.lib")
#endif

#include "arch-specific.h"

// How many iterations we spin waiting for a lock.
// Declared as an anonymous enum so it is a compile-time integer constant.
// NOTE(review): presumably the threshold after which a spinning thread
// should yield the processor -- confirm against the users of this constant.
enum { SPIN_LIMIT = 100 };

// Encodings of the two states of a user-level lock word.
// On HP PA-RISC the encoding is inverted (unlocked is 1, locked is 0);
// on every other platform unlocked is 0 and locked is 1.
#if defined(__hppa) || defined(__hppa__)
enum { LOCKED = 0, UNLOCKED = 1 };
#else
enum { LOCKED = 1, UNLOCKED = 0 };
#endif

extern "C" {

#if defined(_WIN32)

  // -------------------------- WINDOWS -----------------------

  // Hand-written x86 exchange: stores newVal into *oldVal and returns a
  // previously read value of *oldVal.
  //
  // The function is `naked` (no compiler-generated prologue/epilogue), so
  // the asm body supplies its own `ret`.  It reads its arguments from ECX
  // (the pointer) and EDX (the new value) and leaves its result in EAX.
  //
  // NOTE(review): the value returned in EAX comes from the separate
  // `mov eax, [ecx]` load, not from the `xchg` itself (which leaves the old
  // memory contents in EDX).  Another writer could change the word between
  // the two instructions -- confirm before relying on this as an atomic
  // exchange.
  __declspec(naked)
    unsigned long __fastcall myExchange (volatile unsigned long *oldVal,
					 volatile unsigned long newVal) throw()
  {
    __asm
      {
	mov eax, [ecx]
	  xchg [ecx], edx
   	  ret     
	  }
  }

  // Same as myExchange, but the exchange instruction carries an explicit
  // `lock` prefix.  Stores newVal into *oldVal and returns a previously
  // read value of *oldVal in EAX.
  //
  // The function is `naked`, so the asm body supplies its own `ret`; the
  // pointer is read from ECX and the new value from EDX.
  //
  // NOTE(review): as in myExchange, the returned value is produced by the
  // separate `mov eax, [ecx]` load rather than by the `xchg`, so the
  // read-and-replace is not a single atomic step -- confirm before use.
  __declspec(naked)
    unsigned long __fastcall myInterlockedExchange (volatile unsigned long *oldVal,
						    volatile unsigned long newVal) throw()
  {
    __asm
      {
	mov eax, [ecx]
	  lock xchg [ecx], edx
   	  ret     
	  }
  }

  // Atomically replaces *oldval with newval and returns the value that
  // was stored there before, by delegating to the Win32
  // InterlockedExchange primitive.
  unsigned long hoardInterlockedExchange (unsigned long * oldval,
					  unsigned long newval)
  {
    // The Win32 call operates on a (signed) long; reinterpret the word.
    long * const target = reinterpret_cast<long *>(oldval);
    const unsigned long previous = InterlockedExchange (target, newval);
    return previous;
  }

  // Starts a new thread that runs function(arg) and stores the resulting
  // thread handle in t.  Default security attributes, default stack size,
  // and no creation flags are used; the thread id is not requested.
  //
  // NOTE(review): `function` has the signature void(*)(void*) but is cast
  // to LPTHREAD_START_ROUTINE; confirm the signature/calling-convention
  // mismatch is acceptable on the targeted compilers.  The result of
  // CreateThread is not checked here.
  void hoardCreateThread (hoardThreadType& t,
			  void (*function) (void *),
			  void * arg)
  {
    LPTHREAD_START_ROUTINE entryPoint = (LPTHREAD_START_ROUTINE) function;
    LPVOID entryArg = (LPVOID) arg;
    t = CreateThread (0, 0, entryPoint, entryArg, 0, 0);
  }

  // Blocks, with no timeout, until the thread identified by handle t
  // becomes signaled.
  // NOTE(review): the handle is not closed here -- confirm who owns it.
  void hoardJoinThread (hoardThreadType& t)
  {
    (void) WaitForSingleObject (t, INFINITE);
  }

  // Concurrency-level hint.  Intentionally does nothing in the Windows
  // build; the argument is ignored.
  void hoardSetConcurrency (int)
  {
    // Nothing to do.
  }

  // Returns an integer identifier for the calling thread.
  //
  // Windows thread id's are even, so we divide them by two
  // in order to get a reasonable thread identifier.
  //
  // The previous version looked up the processor count and branched on
  // it, but both branches computed exactly the same value, so the lookup
  // and the branch have been removed.
  int hoardGetThreadID (void) {
    return (int) (GetCurrentThreadId() >> 1);
  }

  // Puts the lock word into its initial, unlocked state.
  void hoardLockInit (hoardLockType& mutex)
  {
    mutex = UNLOCKED;
  }

  // Acquires the spin lock `mutex`; returns only once the caller owns it.
  //
  // Each attempt atomically writes LOCKED into the lock word; the lock is
  // ours when the value that was there before was UNLOCKED.
  //
  // On a single-processor machine every failed attempt yields
  // immediately.  Otherwise we spin through SPIN_LIMIT failed attempts
  // before yielding, and then start counting again.
  void hoardLock (hoardLockType& mutex) {
    static int numProcessors = hoardGetNumProcessors();
    volatile long * const lockWord = reinterpret_cast<volatile long *>(&mutex);

    // A limit of zero makes the very first failed attempt yield.
    const int spinLimit = (numProcessors == 1) ? 0 : SPIN_LIMIT;

    int spincount = 0;
    while (InterlockedExchange (lockWord, LOCKED) != UNLOCKED) {
      spincount++;
      if (spincount > spinLimit) {
        hoardYield();
        spincount = 0;
      }
    }
  }

  // Gives up the processor by sleeping for a zero-length interval.
  void hoardYield (void)
  {
    Sleep (0);
  }

  // Releases the lock by writing UNLOCKED into the lock word with an
  // ordinary assignment (no interlocked operation is used).
  void hoardUnlock (hoardLockType& mutex)
  {
    mutex = UNLOCKED;
  }

#if 0
  // Compiled out.  Would tear down a lock; its only statement (deleting a
  // Win32 critical section) is itself commented out, so even if this were
  // enabled it would do nothing.
  void hoardLockDestroy (hoardLockType& mutex) {
    //	DeleteCriticalSection(&mutex);
  }
#endif

  // File-local lock word, initially unlocked.
  // NOTE(review): presumably meant to serialize calls into a
  // non-thread-safe backing allocator; no enabled code in the Windows
  // section appears to acquire it -- confirm whether it is still needed.
  static hoardLockType memoryLock = UNLOCKED;
  //extern "C" void * dlmalloc (size_t);
  //extern "C" void dlfree (void *);

  // Obtains `size` bytes from the process's default Win32 heap and
  // returns the pointer produced by HeapAlloc (no flags are passed).
  // The heap handle is looked up once and kept in a function-local static.
  void * hoardGetMemory (size_t size)
  {
    static HANDLE processHeap = GetProcessHeap();
    return HeapAlloc (processHeap, 0, size);
  }


  void hoardFreeMemory (void * ptr)
  {
#if 0
    hoardLock (memoryLock);
    dlfree (ptr);
    hoardUnlock (memoryLock);
#else
    HeapFree (GetProcessHeap(), 0, ptr);
#endif
  }

  int hoardGetPageSize (void)
  {
    SYSTEM_INFO infoReturn[1];
    GetSystemInfo (infoReturn);
    return (int) (infoReturn -> dwPageSize);
  }


  int hoardGetNumProcessors (void)
  {
    static int numProcessors = 0;
    if (numProcessors == 0) {
      SYSTEM_INFO infoReturn[1];
      GetSystemInfo (infoReturn);
      numProcessors = (int) (infoReturn -> dwNumberOfProcessors);
    }
    return numProcessors;
  }

  // -------------------------------------------------------------

#else // UNIX

#if USE_SPROC
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <ulocks.h>
#endif

  // Starts a new thread of control running function(arg) and stores its
  // identifier in t.
  //
  // With USE_SPROC the thread is created by sproc() with the PR_SADDR
  // flag; otherwise a pthread is created.  On AIX the pthread is given
  // system contention scope.
  //
  // The attribute object is destroyed once pthread_create has consumed
  // it; the previous version initialised it but never released it.
  // Creation failures are not reported (the function returns void).
  void hoardCreateThread (hoardThreadType& t,
			  void * (*function) (void *),
			  void * arg)
  {
#if USE_SPROC
    typedef void (*sprocFunction) (void *);
    t = sproc ((sprocFunction) function, PR_SADDR, arg);
#else
    pthread_attr_t attr;
    pthread_attr_init (&attr);
#if defined(_AIX)
    // Bound (kernel-level) threads.
    pthread_attr_setscope (&attr, PTHREAD_SCOPE_SYSTEM);
#endif
    pthread_create (&t, &attr, function, arg);
    // The attributes are only needed during creation.
    pthread_attr_destroy (&attr);
#endif
  }

  // Waits for the thread t to finish.  With USE_SPROC this waits on the
  // process id via waitpid(); otherwise it joins the pthread.  The
  // thread's exit status / return value is discarded in both cases.
  void hoardJoinThread (hoardThreadType& t)
  {
#if !USE_SPROC
    pthread_join (t, NULL);
#else
    waitpid (t, 0, 0);
#endif
  }

#if !defined(_GNU_SOURCE)
#if defined(__linux)
  // This extern declaration is required for some versions of Linux.
  extern "C" void pthread_setconcurrency (int n);
#endif
#endif

  // Passes the requested concurrency level n to the platform's threading
  // layer: usconfig(CONF_INITUSERS) with USE_SPROC, thr_setconcurrency()
  // when __SVR4 is defined (Solaris), and pthread_setconcurrency()
  // everywhere else.  Any result from those calls is ignored.
  void hoardSetConcurrency (int n)
  {
#if USE_SPROC
    usconfig (CONF_INITUSERS, n);
#elif !defined(__SVR4)
    pthread_setconcurrency (n);
#else // Solaris
    thr_setconcurrency (n);
#endif
  }


#if defined(__SVR4) // Solaris

  // Solaris's two-level threads model gives us an edge here;
  // we can hash on the LWP's id. This helps us in two ways:
  // (1) there are likely to be far fewer LWP's than threads,
  // (2) if there's a one-to-one correspondence between LWP's
  //     and the number of processors (usually the case), then
  //     the number of heaps used will be the same as the number
  //     of processors (the optimal case).
  // Here we rely on an undocumented call in libthread.so, which
  // turns out to be MUCH cheaper than the documented _lwp_self(). Go figure.

  extern "C" unsigned int lwp_self(void);
#endif

  // Returns an integer identifier for the calling thread, derived from
  // whatever cheap per-thread value the platform offers.
  //
  // The AIX branch now also tests `_AIX`, the macro this file already
  // uses when creating threads; the branch previously tested only
  // `__AIX`.
  int hoardGetThreadID (void) {
#if USE_SPROC
    // This hairiness has the same effect as calling getpid(),
    // but it's MUCH faster since it avoids making a system call
    // and just accesses the sproc-local data directly.
    int pid = (int) PRDA->sys_prda.prda_sys.t_pid;
    return pid;
#elif defined(__linux)
    // Consecutive thread id's in Linux are 1024 apart.
    return (int) pthread_self() / 1024;
#elif defined(_AIX) || defined(__AIX)
    // Consecutive thread id's in AIX are 257 apart.
    return (int) pthread_self() / 257;
#elif defined(__SVR4)
    return (int) lwp_self();
#elif defined(hpux) || defined(__hpux)
    // pthread_self() crashes until pthread.sl has been initialized. yuck.
    // The first five calls therefore report id 1 without touching
    // pthread_self(); later calls fall through to the generic return.
    static unsigned long callno = 0;
    if (callno < 5) {
      callno++;
      return 1; //assume main thread
    }
#endif
    // Generic fallback: the raw pthread id converted to int.
    return (int) pthread_self();
  }


  // Here's our own lock implementation (spin then yield). This is much
  // cheaper than the ordinary mutex, at least on Linux and Solaris.

#if USER_LOCKS

#include <sched.h>

#if defined(__sgi)
#include <mutex.h>
#endif


#if !defined(__hppa) && !defined(__hppa__)
  // Atomically:
  //   retval = *oldval;
  //   *oldval = newval;
  //   return retval;

#if defined(sparc) && !defined(__GNUC__)
  extern "C" unsigned long InterlockedExchange (unsigned long * oldval,
						unsigned long newval);
#else
  // Architecture-specific implementation of the exchange described in the
  // "Atomically:" comment above: store newval into *oldval and return the
  // value that was there before.
  //
  // Exactly one of the per-architecture sections below is expected to be
  // active; if none of the recognised macros is defined the #error near
  // the end stops the build.  Every section is expected to leave the
  // previous contents of *oldval in `newval`, which is what is returned.
  unsigned long InterlockedExchange (unsigned long * oldval,
				     unsigned long newval)
  {
#if defined(sparc)
    // SPARC: `swap` between the register holding newval and the word
    // addressed by oldval; newval is both input and output operand.
    asm volatile ("swap [%1],%0"
		  :"=r" (newval)
		  :"r" (oldval), "0" (newval)
		  : "memory");

#endif
#if defined(i386)
    // x86: `xchgl` between the register holding newval and *oldval.
    // NOTE(review): *oldval is listed only as an input ("m") although the
    // instruction writes to it; the "memory" clobber is presumably what
    // keeps this correct -- confirm.
    asm volatile ("xchgl %0, %1"
		  : "=r" (newval)
		  : "m" (*oldval), "0" (newval)
		  : "memory");
#endif
#if defined(__sgi)
    // IRIX: delegate to test_and_set and keep its result.
    newval = test_and_set (oldval, newval);
#endif
#if defined(ppc)
    // PowerPC: load-reserve / store-conditional loop bracketed by `sync`.
    // NOTE(review): the asm writes its result to `ret`, but the function
    // returns `newval`, which this section never modifies.  The
    // `xor.`/`bne` pair also appears to store only when the loaded value
    // equals the earlier plain read of *oldval -- confirm this is the
    // intended exchange semantics.
    int ret;
    asm volatile ("sync;"
		  "0:    lwarx %0,0,%1 ;"
		  "      xor. %0,%3,%0;"
		  "      bne 1f;"
		  "      stwcx. %2,0,%1;"
		  "      bne- 0b;"
		  "1:    sync"
		  : "=&r"(ret)
		  : "r"(oldval), "r"(newval), "r"(*oldval)
		  : "cr0", "memory");
#endif
#if defined (__s390__)
    // S/390: compare-and-swap (`cs`) loop; both operands are read-write
    // and registers 1 and 2 are used as scratch.
    // NOTE(review): the inline comments describe bit-number arithmetic and
    // setting/isolating a single bit, which looks like a test-and-set-bit
    // routine rather than a full-word exchange -- confirm.
    __asm__ __volatile__(
			 "   lhi   1,3\n"          /* CS must be aligned on 4 byte b. */
			 "   nr    1,%1\n"         /* isolate last 2 bits of address */
			 "   xr    %1,1\n"         /* make addr % 4 == 0 */
			 "   sll   1,3\n"
			 "   ar    %0,1\n"         /* add alignment to bitnr */ 
			 "   lhi   1,31\n"
			 "   nr    1,%0\n"         /* make shift value */
			 "   xr    %0,1\n"
			 "   srl   %0,3\n"
			 "   la    %1,0(%0,%1)\n"  /* calc. address for CS */
			 "   lhi   2,1\n"
			 "   sll   2,0(1)\n"       /* make OR mask */
			 "   l     %0,0(%1)\n"
			 "0: lr    1,%0\n"         /* CS loop starts here */
			 "   xr    1,2\n"          /* set bit */
			 "   cs    %0,1,0(%1)\n"
			 "   jl    0b\n"
			 "   nr    %0,2\n"         /* isolate old bit */
			 : "+a" (newval), "+a" (oldval) :
			 : "cc", "memory", "1", "2" );
#endif
#if !(defined(sparc) || defined(i386) || defined(__sgi) || defined(ppc) || defined(__hppa__) || defined(__s390__))
#error "Hoard does not include support for user-level locks for this platform."
#endif
    return newval;
  }
#endif

  // Public wrapper around InterlockedExchange: stores newval into *oldval
  // and hands back whatever InterlockedExchange returns.
  unsigned long hoardInterlockedExchange (unsigned long * oldval,
					  unsigned long newval)
  {
    const unsigned long previous = InterlockedExchange (oldval, newval);
    return previous;
  }

  // Puts the lock into its unlocked state using the exchange primitive;
  // the previous contents of the lock word are discarded.
  void hoardLockInit (hoardLockType& mutex)
  {
    (void) InterlockedExchange (&mutex, UNLOCKED);
  }

#include <stdio.h>

  // Acquires the spin lock `mutex`; returns only once the caller owns it.
  //
  // Each attempt exchanges LOCKED into the lock word; the lock is ours
  // when the value that was there before was UNLOCKED.  After more than
  // SPIN_LIMIT consecutive failed attempts we yield the processor and
  // start counting again.
  void hoardLock (hoardLockType& mutex) {
    int spincount = 0;
    while (InterlockedExchange (&mutex, LOCKED) != UNLOCKED) {
      spincount++;
      if (spincount > SPIN_LIMIT) {
        hoardYield();
        spincount = 0;
      }
    }
  }

  // Releases the lock by assigning UNLOCKED to the lock word directly;
  // the exchange primitive is deliberately not used here.
  void hoardUnlock (hoardLockType& mutex)
  {
    mutex = UNLOCKED;
  }

#else
  // HP PA-RISC
  //
  //HP PA-RISC does not have any kind of atomic exchange instruction
  //It only has fetch-and-clear

#if defined(__HP_aCC)
  //HP's aCC does not know inline assembler
  extern "C" unsigned long hppa_fetch_and_clear(unsigned long *oldval);
#else
  //assume GCC
  // Placeholder GCC implementation of PA-RISC fetch-and-clear.
  //
  // The #error below makes any build that reaches this definition fail
  // on purpose: the inline assembly is an unverified sketch.
  // NOTE(review): as written the body refers to `newval`, which is not
  // declared in this function, and it has no return statement; both must
  // be fixed before the #error can be removed.
  unsigned long hppa_fetch_and_clear(unsigned long *oldval) {
#error GCC inline assembler not ready - FIXME
    //ISJ: I don't have a clue about GCC inline assembler syntax
    //But it should be something roughly equivalent to the stuff below
#ifdef __LP64__
    // 64-bit build: `ldcd` form.
    asm volatile ("ldcd 0(%1), %0"
		  : "=r" (newval)
		  : "r" (oldval)   /* base */
		  : "0");
#else
    // 32-bit build: `ldcw` form.
    asm volatile ("ldcw 0(%1), %0"
		  : "=r" (newval)
		  : "r" (oldval)   /* base */
		  : "0");
#endif
  }
#endif

  // Puts the lock word into its initial, unlocked state with a plain
  // assignment.
  void hoardLockInit (hoardLockType& mutex)
  {
    mutex = UNLOCKED;
  }

  // Acquires the spin lock `mutex` using fetch-and-clear; returns only
  // once the caller owns it.
  //
  // The lock is ours when hppa_fetch_and_clear reports that the word
  // held UNLOCKED.  On a single-processor machine every failed attempt
  // yields immediately.  Otherwise we spin through SPIN_LIMIT failed
  // attempts before yielding, and then start counting again.
  void hoardLock (hoardLockType& mutex) {
    static int numProcessors = hoardGetNumProcessors();

    // A limit of zero makes the very first failed attempt yield.
    const int spinLimit = (numProcessors == 1) ? 0 : SPIN_LIMIT;

    int spincount = 0;
    while (hppa_fetch_and_clear (&mutex) != UNLOCKED) {
      spincount++;
      if (spincount > spinLimit) {
        hoardYield();
        spincount = 0;
      }
    }
  }

  // Releases the lock by assigning UNLOCKED to the lock word.
  void hoardUnlock (hoardLockType& mutex)
  {
    mutex = UNLOCKED;
  }

  //end of PA-RISC specific section
#endif

#else

  // use non-user-level locks. 

#endif // USER_LOCKS


#if defined(__SVR4)
#include <thread.h>
#endif

  // Gives up the processor: thr_yield() when __SVR4 is defined,
  // sched_yield() on every other UNIX platform.
  void hoardYield (void)
  {
#if !defined(__SVR4)
    sched_yield();
#else
    thr_yield();
#endif
  }


  extern "C" void * dlmalloc (size_t);
  extern "C" void dlfree (void *);


// Lock guarding the calls into dlmalloc/dlfree made by hoardGetMemory and
// hoardFreeMemory.  With user-level locks it starts out as an UNLOCKED
// lock word; otherwise it is a statically initialised pthread mutex.
#if USER_LOCKS
  static hoardLockType getMemoryLock = UNLOCKED;
#else
  static hoardLockType getMemoryLock = PTHREAD_MUTEX_INITIALIZER;
#endif

#include <stdio.h>

  // Obtains `size` bytes from dlmalloc while holding getMemoryLock and
  // returns dlmalloc's result unchanged.
  void * hoardGetMemory (size_t size)
  {
    void * block;

    hoardLock (getMemoryLock);
    block = dlmalloc (size);
    hoardUnlock (getMemoryLock);

    return block;
  }


  // Hands ptr back to dlfree while holding getMemoryLock.
  void hoardFreeMemory (void * ptr)
  {
    hoardLock (getMemoryLock);

    dlfree (ptr);

    hoardUnlock (getMemoryLock);
  }


  // Returns the value of sysconf(_SC_PAGESIZE), converted to int.
  int hoardGetPageSize (void)
  {
    const long pageBytes = sysconf (_SC_PAGESIZE);
    return (int) pageBytes;
  }


#if defined(linux)
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#endif

#if defined(__sgi)
#include <sys/types.h>
#include <sys/sysmp.h>
#include <sys/sysinfo.h>
#endif

#if defined(hpux) || defined(__hpux)
#include <sys/mpctl.h>
#endif

  // Returns the number of processors, caching the answer after the first
  // successful query.  The result is always at least 1: if the platform
  // query fails or reports nothing, 1 is returned.
  //
  // Changes from the previous version:
  //  * The platform sections form a single #if/#elif chain.  Previously
  //    the IRIX section and the generic sysconf section could both be
  //    active, defining the same static variable twice in one scope.
  //  * On Linux the results of open() and read() are checked, the file
  //    is read until end-of-file or until the buffer is full, and the
  //    buffer is NUL-terminated before being searched.
  //  * The cache is filled lazily instead of through a run-time
  //    initialiser on the static.
  int hoardGetNumProcessors (void)
  {
    static int numProcessors = 0;

    if (numProcessors > 0) {
      return numProcessors;
    }

    int count = 0;

#if defined(linux)
    // Ugly workaround.  Linux's sysconf indirectly calls malloc() (at
    // least on multiprocessors).  So we just read the info from the
    // proc file ourselves and count the occurrences of the word
    // "processor".

    // We only parse the first 32K of the CPU file.  By my estimates,
    // that should be more than enough for at least 64 processors.
    enum { MAX_PROCFILE_SIZE = 32768 };
    char line[MAX_PROCFILE_SIZE];
    int fd = open ("/proc/cpuinfo", O_RDONLY);
    if (fd >= 0) {
      // Keep one byte in reserve for the terminating NUL.
      size_t used = 0;
      for (;;) {
        ssize_t got = read (fd, line + used, MAX_PROCFILE_SIZE - 1 - used);
        if (got <= 0) {
          break; // end of file, error, or buffer full (zero-length read)
        }
        used += (size_t) got;
      }
      close (fd);
      line[used] = '\0';

      const char * str = strstr (line, "processor");
      while (str != NULL) {
        count++;
        str = strstr (str + 1, "processor");
      }
    }
#elif defined(__sgi)
    count = (int) sysmp (MP_NAPROCS);
#elif defined(hpux) || defined(__hpux)
    count = mpctl (MPC_GETNUMSPUS, NULL, NULL); // or pthread_num_processors_np()?
#else
    count = (int) sysconf (_SC_NPROCESSORS_ONLN);
#endif

    // Never report fewer than one processor.
    numProcessors = (count > 0) ? count : 1;
    return numProcessors;
  }

#endif // UNIX

}
