; ACL2 Version 3.6 -- A Computational Logic for Applicative Common Lisp
; Copyright (C) 2006  University of Texas at Austin

; This version of ACL2 is a descendent of ACL2 Version 1.9, Copyright
; (C) 1997 Computational Logic, Inc.  See the documentation topic NOTE-2-0.

; This program is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.

; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.

; You should have received a copy of the GNU General Public License
; along with this program; if not, write to the Free Software
; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

; Written by:  Matt Kaufmann               and J Strother Moore
; email:       Kaufmann@cs.utexas.edu      and Moore@cs.utexas.edu
; Department of Computer Sciences
; University of Texas at Austin
; Austin, TX 78712-1188 U.S.A.

; We thank David L. Rager for contributing an initial version of this file.

; This file is divided into the following sections.

; Section:  Enabling and Disabling Interrupts
; Section:  Threading Interface
; Section:  Parallelism Basis
; Section:  Work Consumer Code
; Section:  Work Producer Code
; Section:  Parallelism Primitives

; In particular, see the Essay on Parallelism Definitions and the Essay on
; Parallelism Strategy for overviews on this implementation of parallel
; evaluation.

(in-package "ACL2")

;---------------------------------------------------------------------
; Section:  Enabling and Disabling Interrupts

; "Without-interrupts" means that there will be no interrupt from the Lisp
; system, including ctrl+c from the user or an interrupt from another
; thread/process.  For example, if *thread1* is running (progn
; (without-interrupts (process0)) (process1)), then execution of
; (interrupt-thread *thread1* (lambda () (break))) will not interrupt
; (process0).

; But note that "without-interrupts" does not guarantee atomicity; for example,
; it does not mean "without-setq".

(defmacro without-interrupts (&rest forms)

; This macro prevents interrupting evaluation of any of the indicated forms in
; a parallel lisp.  In a non-parallel environment (#-acl2-par), we simply
; evaluate the forms.  This behavior takes priority over any enclosing call of
; with-interrupts.  See also with-interrupts.

  #+(and openmcl acl2-par)
  `(ccl:without-interrupts ,@forms)
  #+(and sbcl sb-thread acl2-par) 
  `(sb-sys:without-interrupts ,@forms)
  #-acl2-par
  `(progn ,@forms))

(defmacro unwind-protect-disable-interrupts-during-cleanup
  (body-form &rest cleanup-forms)

; As the name suggests, this is unwind-protect but with a guarantee that
; cleanup-form cannot be interrupted.  Note that OpenMCL's implementation
; already disables interrupts during cleanup (1.1pre and later).

  #+(and openmcl acl2-par)
  `(unwind-protect ,body-form ,@cleanup-forms)
  #+(and sbcl sb-thread acl2-par)
  `(unwind-protect ,body-form (without-interrupts ,@cleanup-forms))
  #-acl2-par
  `(unwind-protect ,body-form ,@cleanup-forms))

;---------------------------------------------------------------------
; Section:  Threading Interface
;
; The threading interface is intended for system level programmers.  It is not
; intended for the ACL2 user.  When writing system-level multi-threaded code,
; we use implementation-independent interfaces.  If you need a function not
; covered in this interface, create the interface!

; Many of the functions in this interface (lockp, make-lock, and so on) are not
; used elsewhere, but are included here in case we find a use for them later.

; We take a conservative approach for implementations that do not support
; parallelism.  For example, if the programmer asks for a semaphore or lock in
; an unsupported Lisp, then nil is returned.

; We employ counting semaphores.  For details, including a discussion of
; ordering, see comments in the definition of function make-semaphore.

; Note: We use parts of the threading interface for our implementation of the
; parallelism primitives.

(defun lockp (x)
  #+(and openmcl acl2-par) (cl:typep x 'ccl::recursive-lock)
  #+(and sbcl sb-thread acl2-par) (cl:typep x 'sb-thread::mutex)
  #-acl2-par

; We return nil in the uni-threaded case in order to stay in sync with
; make-lock, which returns nil in this case.  In a sense, we want (lockp
; (make-lock x)) to be a theorem if there is no error.

  (null x))

(defun make-lock (&optional lock-name)

; See also deflock.

; Even though OpenMCL nearly always uses a FIFO for threads blocking on a lock,
; it does not guarantee so: no such promise is made by the OpenMCL
; documentation or implementor (in fact, we are aware of a race condition that
; would violate FIFO properties for locks).  Thus, we make absolutely no
; guarantees about ordering; for example, we do not guarantee that the
; longest-blocked thread for a given lock is the one that would enter a
; lock-guarded section first.  However, we suspect that this is usually the
; case for most implementations, so assuming such an ordering property is
; probably a reasonable heuristic.  We would be somewhat surprised to find
; significant performance problems in our own application to ACL2's parallelism
; primitives due to the ordering provided by the underlying system.

  #-acl2-par
  (declare (ignore lock-name))
  #+(and openmcl acl2-par) (ccl:make-lock lock-name)
  #+(and sbcl sb-thread acl2-par) (sb-thread:make-mutex :name lock-name)
  #-acl2-par

; We return nil in the uni-threaded case in order to stay in sync with lockp.

  nil)

(defmacro deflock (lock-symbol)

; Deflock defines what some Lisps call a "recursive lock", namely a lock that
; can be grabbed more than once by the same thread, but such that if a thread
; outside the owner tries to grab it, that thread will block.

; Note that if lock-symbol is already bound, then deflock will not re-bind
; lock-symbol.

  `(defvar ,lock-symbol
     (make-lock (symbol-name ',lock-symbol))))

(defmacro reset-lock (bound-symbol)

; This macro binds the given global (but not necessarily special) variable to a
; lock that is new, at least from a programmer's perspective.

; Reset-lock should only be applied to bound-symbol if deflock has previously
; been applied to bound-symbol.

  `(setq ,bound-symbol (make-lock ,(symbol-name bound-symbol))))

(defmacro with-lock (lock-name &rest forms)

; Grab a lock, blocking until it is acquired; evaluate forms; and then release
; the lock.  This macro guarantees mutual exclusion.

  #-acl2-par
  (declare (ignore lock-name))
  (let ((forms

; We ensure that forms is not empty because otherwise, in OpenMCL alone,
; (with-lock some-lock) evaluates to t.  We keep the code simple and consistent
; by modifying forms here for all cases, not just OpenMCL.

         (or forms '(nil))))
    #+(and openmcl acl2-par)
    `(ccl:with-lock-grabbed (,lock-name) nil ,@forms)
    #+(and sbcl sb-thread acl2-par)
    `(sb-thread:with-recursive-lock (,lock-name) nil ,@forms)
    #-acl2-par
    `(progn ,@forms)))

; A future possibility, also mentioned in parallel.lisp:

;   Recycle locks, perhaps for example in wait-on-condition-variable-lockless.

; Here is code, not heavily tested, that may be useful in that regard.

#||
(progn 

; We declare some variables here that are necessary for the threading
; interface.  These support semaphore recycling via functions allocate-lock and
; free-lock, which are useful for efficient implementation of
; wait-on-condition-variable-lockless.

(defvar *lock-allocation-lock* (make-lock))
(defvar *lock-freelist* 
  #+openmcl
  (ccl::%cons-pool)
  #-openmcl 
  nil)

)

(defun allocate-lock ()
  (with-lock *lock-allocation-lock*
             (or (pop 
                  #+openmcl
                  (ccl::pool.data *lock-freelist*)
                  #-openmcl
                  *lock-freelist*)
                 (make-lock))))

(defun free-lock (lock)

; Warning: This function requires that lock is unacquired. 

; We considered creating a user-settable limit on the length of the
; *lock-freelist*.  In favor of simplicity, we have not implemented this, but
; if OS lock resources become an issue, this is worth further consideration.

; We test that the lock is not currently acquired.  Note that we do not test
; that the lock isn't already on the *lock-freelist*.  It's questionable
; whether this check is worth it, since the user can get into trouble in many
; other ways.

  (without-interrupts
   #+(and openmcl acl2-par)
   (when (not (ccl:try-lock lock))
     (error "A lock was freed while still being held."))
   #+(and sbcl sb-thread acl2-par)
   (if (sb-thread:get-mutex lock nil nil)
       (sb-thread:release-mutex lock)
     (error "A lock was freed while still being held.")))
  (with-lock *lock-allocation-lock*
             (push lock
                   #+openmcl
                   (ccl::pool.data *lock-freelist*)
                   #-openmcl
                   *lock-freelist*)))

(defun reset-lock-free-list ()

; We provide the user the ability to clear the locks stored for recycling.  If
; an ACL2 programmer wants to free OS resources for garbage collection, they
; can use this method to free the locks stored in our system.

  (with-lock *lock-allocation-lock*
             (setf *lock-freelist* 
                   #+openmcl
                   (ccl::%cons-pool)
                   #-openmcl 
                   nil)))
||#

(defun run-thread (name fn-symbol &rest args)

; Apply fn-symbol to args.  We follow the precedent set by LISP machines (and
; in turn OpenMCL), which allowed the user to spawn a thread whose initial
; function receives an arbitrary number of arguments.  

; We expect this application to occur in a fresh thread with the given name.
; When a call of this function returns, we imagine that this fresh thread can
; be garbage collected; at any rate, we don't hang on to it!

; Note that run-thread returns different types in different Lisps.

  #-acl2-par
  (declare (ignore name))
  #+(and openmcl acl2-par)
  (ccl:process-run-function name (lambda () (apply fn-symbol args)))
  #+(and sbcl sb-thread acl2-par)
  (sb-thread:make-thread (lambda () (apply fn-symbol args)) :name name)

; We're going to be nice and let the user's function still run, even though
; it's not split off.

  #-acl2-par
  (apply fn-symbol args))

(defun interrupt-thread (thread function &rest args)

; Interrupt the indicated thread and then, in that thread, apply function to
; args.  Note that function and args are all evaluated.  When this function
; application returns, the thread resumes from the interrupt (from where it
; left off).

  #-acl2-par
  (declare (ignore thread function args))
  #+(and openmcl acl2-par)
  (apply #'ccl:process-interrupt thread function args)
  #+(and sbcl sb-thread acl2-par)
  (if args
      (error "Passing arguments to interrupt-thread not supported in SBCL.")
    (sb-thread:interrupt-thread thread function))
  #-acl2-par
  nil)

(defun kill-thread (thread)
  #-acl2-par
  (declare (ignore thread))
  #+(and openmcl acl2-par)
  (ccl:process-kill thread)
  #+(and sbcl sb-thread acl2-par)
  (sb-ext:process-kill thread)
  #-acl2-par
  nil)

(defun all-threads ()
  #+(and openmcl acl2-par) 
  (ccl:all-processes)
  #+(and sbcl sb-thread acl2-par) 
  (sb-thread:list-all-threads)
  #-acl2-par 
  (error "We don't know how to list threads in this Lisp (or acl2-par is not ~
          on the features list."))

(defun current-thread ()
  #+(and openmcl acl2-par)
  ccl:*current-process*
  #+(and sbcl sb-thread acl2-par)
  sb-thread:*current-thread*
  #-acl2-par
  nil)

(defun thread-wait (fn &rest args)

; Thread-wait provides an inefficient mechanism for the current thread to wait
; until a given condition, defined by the application of fn to args, is true.
; When performance matters, we advise using a signaling mechanism over this
; hacker's function.

  #+openmcl
  (apply #'ccl:process-wait "Busy-waiting on condition for thread" fn args)
  #-openmcl 
  (loop while (not (apply fn args)) do (sleep 0.05)))

#+(and sbcl sb-thread acl2-par (not acl2-loop-only))
(defstruct sbcl-semaphore 
  (lock (sb-thread:make-mutex))
  (cv (sb-thread:make-waitqueue)) ; condition variable
  (count 0))

(defun make-semaphore (&optional name)

; Make-semaphore, signal-semaphore, and semaphorep work together to implement
; counting semaphores for the threading interface.

; This function creates "counting semaphores", which are data structures that
; include a "count" field, which is a natural number.  A thread can "wait on" a
; counting semaphore, and it will block in the case that the semaphore's count
; is 0.  To "signal" such a semaphore means to increment that field and to
; notify a unique waiting thread (we will discuss a relaxation of this
; uniqueness shortly) that the semaphore's count has been incremented.  Then
; this thread, which is said to "receive" the signal, decrements the
; semaphore's count and is then unblocked.  This mechanism is typically much
; faster than busy waiting.

; In principle more than one waiting thread could be notified (though this
; seems rare in practice).  In this case, only one would be the receiving
; thread, i.e., the one that decrements the semaphore's count and is then
; unblocked.

; If semaphore usage seems to perform inefficiently, could this be due to
; ordering issues?  For example, even though OpenMCL nearly always uses a FIFO
; for blocked threads, it does not guarantee so: no such promise is made by the
; OpenMCL documentation or implementor.  Thus, we make absolutely no guarantees
; about ordering; for example, we do not guarantee that the longest-blocked
; thread for a given semaphore is the one that would receive a signal.
; However, we suspect that this will usually be the case for most
; implementations, so assuming such an ordering property is probably a
; reasonable heuristic.  We would be somewhat surprised to find significant
; performance problems in our own application to ACL2's parallelism primitives
; due to the ordering provided by the underlying system.

; OpenMCL provides us with semaphores for signaling.  SBCL provides condition
; variables for signaling.  Since we want to code for one type of signaling
; between parents and children, we create a semaphore wrapper for SBCL's
; condition variables.  The structure sbcl-semaphore implements the data for
; this wrapper.

; Followup: SBCL has recently implemented sempahores, and the parallelism code
; should maybe be changed to reflect this.  It probably depends on whether
; their implementation provides previously discussed
; semaphore-nofication-object's.

  (declare (ignore name))
  #+(and openmcl acl2-par)
  (ccl:make-semaphore)
  #+(and sbcl sb-thread acl2-par)
  (make-sbcl-semaphore)
  #-acl2-par

; We return nil in the uni-threaded case in order to stay in sync with
; semaphorep.

  nil)

(defun semaphorep (semaphore)

; Make-semaphore, signal-semaphore, and semaphorep work together to implement
; counting semaphores for our threading interface.

; This function recognizes our notion of semaphore structures.

  #+(and openmcl acl2-par)
  (typep semaphore 'ccl::semaphore)
  #+(and sbcl sb-thread acl2-par)
  (and (sbcl-semaphore-p semaphore)
       (typep (sbcl-semaphore-lock semaphore) 'sb-thread::mutex)
       (typep (sbcl-semaphore-cv semaphore) 'sb-thread::waitqueue)
       (integerp (sbcl-semaphore-count semaphore)))
  #-acl2-par

; We return nil in the uni-threaded case in order to stay in sync with
; make-semaphore, which returns nil in this case.  In a sense, we want
; (semaphorep (make-semaphore x)) to be a theorem if there is no error.

  (null semaphore))

(defun signal-semaphore (semaphore)

; Make-semaphore, signal-semaphore, and semaphorep work together to implement
; counting semaphores for our threading interface.

; This function is executed for side effect; the value returned is irrelevant.

  #-acl2-par
  (declare (ignore semaphore))
  #+(and openmcl acl2-par)
  (ccl:signal-semaphore semaphore)
  #+(and sbcl sb-thread acl2-par)
  (sb-thread:with-recursive-lock
   ((sbcl-semaphore-lock semaphore))
   (without-interrupts  
    (incf (sbcl-semaphore-count semaphore))
    (sb-thread:condition-notify (sbcl-semaphore-cv semaphore))))
  #-acl2-par
  nil)

(progn 

; We declare some variables here that are necessary for the threading
; interface.  These support semaphore recycling via functions
; allocate-semaphore and free-semaphore, which is useful for efficiency
; (especially when there are thousands of sempaphores) and for avoiding errors
; in early versions (at least) of OpenMCL.

(defvar *semaphore-allocation-lock* (make-lock))
(defvar *semaphore-freelist* 
  #+openmcl
  (ccl::%cons-pool)
  #-openmcl 
  nil)
)

(defun allocate-semaphore ()
  (without-interrupts 
   (with-lock *semaphore-allocation-lock*
             (or (pop 
                  #+openmcl
                  (ccl::pool.data *semaphore-freelist*)
                  #-openmcl
                  *semaphore-freelist*)
                 (make-semaphore)))))

(defun free-semaphore (s)

; Warning: This function assumes that s is properly initialized.  In
; particular, it must have a count field of 0, and for SBCL, the lock field
; must be free (which is guaranteed if we only use the threading interface).

; We test that the semaphore is properly initialized.  Note that we do not test
; that the semaphore isn't already on the *semaphore-freelist*.  It's
; questionable whether this check is worth it, since the user can get into
; trouble in many other ways.

; We considered creating a user-settable limit on the length of the
; *semaphore-freelist*.  In favor of simplicity, we have not
; implemented this, but if OS semaphore resources again become an
; issue, this is worth further consideration.  If this limit is
; implemented, once reached, probably all semaphores should be
; discarded and a full gc called.  This full gc is required to truly
; free the OS-level semaphores underneath the LISP implementation.

  (without-interrupts
   #+openmcl
   (loop while (timed-wait-on-semaphore s 0))
   #+(and sbcl sb-thread)
   (let ((lock (sbcl-semaphore-lock s)))

; Note that we must test the count before the mutex, so that we don't acquire
; the mutex and then not release it.

     (if (or (not (equal (sbcl-semaphore-count s) 0))
             (not (sb-thread:get-mutex lock nil nil)))
         (error "It is a design violation to free a semaphore while its lock ~
                is acquired.")
       (sb-thread:release-mutex lock))))
  (with-lock *semaphore-allocation-lock*
             (push s 
                   #+openmcl
                   (ccl::pool.data *semaphore-freelist*)
                   #-openmcl
                   *semaphore-freelist*)))

(defun reset-semaphore-free-list ()

; We provide the user the ability to clear the semaphores stored for recycling.
; If an ACL2 programmer wants to free OS resources for garbage collection, they
; can use this method to free the semaphores stored in our system.

  (with-lock *semaphore-allocation-lock*
             (setf *semaphore-freelist* 
                   #+openmcl
                   (ccl::%cons-pool)
                   #-openmcl 
                   nil)))

(defun wait-on-semaphore (semaphore &optional semaphore-notification-object)

; This function always returns t.  It only returns normally after receiving a
; signal for the given semaphore, setting the notification status of
; semaphore-notification-object (if supplied and not nil) to true; see
; semaphore-notification-status.  But control can leave this function
; abnormally, for example if the thread executing a call of this function is
; interrupted (e.g., with interface function interrupt-thread) with code that
; does a throw, in which case semaphore-notification-object is unmodified.

; We need the ability to know whether we received a signal or not.  OpenMCL
; provides this through a semaphore-notification-object.  As it turns out, SBCL
; does not provide this mechanism currently, so we modify our wrapper to
; "unreceive the signal" in the semaphore.  We do this by not decrementing the
; count of it unless we also modify the semaphore-notification object.  This
; means we have to resignal the semaphore if we were interrupted while
; signaling, but we would have to do this anyway.

  #-acl2-par
  (declare (ignore semaphore semaphore-notification-object))

  #+(and openmcl acl2-par)
  (ccl:wait-on-semaphore semaphore semaphore-notification-object)

  #+(and sbcl sb-thread acl2-par)
  (let ((supposedly-did-not-receive-signal-p t))
    
    (sb-thread:with-recursive-lock 
     ((sbcl-semaphore-lock semaphore))
     
     (unwind-protect-disable-interrupts-during-cleanup
      (progn 
        (loop while (<= (sbcl-semaphore-count semaphore) 0) do 
          
; The current thread missed the chance to decrement and must rewait.
          
          (sb-thread:condition-wait (sbcl-semaphore-cv semaphore) 
                                    (sbcl-semaphore-lock semaphore)))      
        (setq supposedly-did-not-receive-signal-p nil))
      (if supposedly-did-not-receive-signal-p
          
; The current thread may have received the signal but been unable to record it.
; In this case, the current thread will signal the condition variable again, so
; that any other thread waiting on the semaphore can have a chance at acquiring
; the said semaphore.
          
          (sb-thread:condition-notify (sbcl-semaphore-cv semaphore) 
                                      (sbcl-semaphore-lock semaphore))
        
; The current thread was able to record the reception of the signal.  The
; current thread will decrement the count of the semaphore and set the
; semaphore-notification-object.
        
        (progn
          (decf (sbcl-semaphore-count semaphore))
          (when semaphore-notification-object 
            (set-semaphore-notification-status semaphore-notification-object)))))))
  
  #-acl2-par
  t) ; default is to receive a semaphore/lock

(defun timed-wait-on-semaphore (semaphore length-in-seconds)

; It would be possible to manually implement a timed-wait-on-semaphore in SBCL,
; but it requires an extra thread every time a current thread waits.  This
; extra thread could be given a pointer to the current thread, sleep for 60
; seconds, and throw the thread if the current thread hadn't set a variable in
; a shared array.  This variable would be set whenever the current thread had
; successfully received the signal to wake up.  The problem with this idea is
; that it requires double the number of threads.  As a result, we leave the
; spawned worker threads to forever exist in SBCL, with the hope that SBCL
; will one day implement a timed-condition-wait.

; We do not simply reduce timed-wait-on-semaphore to wait-on-semaphore outside
; of OpenMCL, because the timeout version has different behavior than the
; non-timeout version.  We want the hard run-time error here.

  #+openmcl
  (ccl:timed-wait-on-semaphore semaphore length-in-seconds)
  #-openmcl
  (error "Timed waiting not supported outside openmcl" nil))

(defun make-semaphore-notification ()

; This function returns an object that records when a corresponding semaphore
; has been signaled (for use when wait-on-semaphore is called with that
; semaphore and that object).

  #+(and openmcl acl2-par)
  (ccl:make-semaphore-notification)
  #+(and sbcl sb-thread acl2-par)
  (make-array 1 :initial-element nil)
  #-acl2-par
  nil)

(defun semaphore-notification-status (semaphore-notification-object)
  (declare (ignorable semaphore-notification-object))
  #+(and openmcl acl2-par)
  (ccl:semaphore-notification-status semaphore-notification-object)
  #+(and sbcl sb-thread acl2-par)
  (aref semaphore-notification-object 0)
  #-acl2-par 
; t may be the wrong default, but we don't have a use case for this return
; value yet, so we postpone thinking about the "right" value until we are aware
; of a need.
  t) 
  
(defun clear-semaphore-notification-status (semaphore-notification-object)
  (declare (ignorable semaphore-notification-object))
  #+(and openmcl acl2-par)
  (ccl:clear-semaphore-notification-status semaphore-notification-object)
  #+(and sbcl sb-thread acl2-par)
  (setf (aref semaphore-notification-object 0) nil)
  #-acl2-par
  nil)

; We implement this only for SBCL, because even a system-level programmer is
; not expected to use this function.  We use it only from within the threading
; interface to implement wait-on-semaphore for SBCL.

(defun set-semaphore-notification-status (semaphore-notification-object)
  (declare (ignorable semaphore-notification-object))
  #+(and sbcl sb-thread)
  (setf (aref semaphore-notification-object 0) t)
  #-(and sbcl sb-thread)
  (error "Set-semaphore-notification-status not supported outside SBCL" 
         nil))

; Essay on Condition Variables

; A condition variable is a data structure that can be passed to corresponding
; "wait" and "signal" functions.  When a thread calls the wait function on a
; condition variable, c, the thread blocks until "receiving a signal" from the
; application of the signal function to c.  Only one signal is sent per call of
; the signal function; so, at most one thread will unblock.  (There is a third
; notion for condition variable, namely the broadcast function, which is like
; the signal function except that all threads blocking on the given condition
; variable will unblock.  But we do not support broadcast functions in this
; interface, in part because we use semaphores for OpenMCL, and there's no way
; to broadcast when you're really using a semaphore.)

; The design of our parallelism library is simpler when using condition
; variables for the following reason: Since a worker must wait for two
; conditions before consuming work, it is better to use a condition variable
; and test those two conditions upon waking, rather than try and use two
; semaphores.

; Implementation Note: As of March 2007, our OpenMCL implementation does not
; yield true condition variables.  A condition variable degrades to a
; semaphore, so if one thread first signals a condition variable, then that
; signal has been stored.  Then later (perhaps much later), when another thread
; waits for that signal, that thread will be able to proceed by decrementing
; the count.  As a result the later thread will "receive" the signal, even
; though that signal occurred in the past.  Fortunately, this isn't a
; contradiction of the semantics of condition variables, since with condition
; variables there is no specification of how far into the future the waiting
; thread will receive a signal from the signalling thread.

; Note: Condition variables should not be used to store state.  They are only a
; signaling mechanism, and any state update implied by receiving a condition
; variable's signal should be checked.  This usage is believed to be consistent
; with traditional condition variable semantics.

(defun make-condition-variable ()

; If OpenMCL implements condition variables, we will want to change the OpenMCL
; expansion and remove the implementation note above.

  #+(and openmcl acl2-par) 
  (ccl:make-semaphore)

  #+(and sbcl sb-thread acl2-par)
  (sb-thread:make-waitqueue)

  #-acl2-par

; We may wish to have assertions that evaluation of (make-condition-variable)
; is non-nil.  So we return t, even though as of this writing there are no such
; assertions.

  t)

(defmacro signal-condition-variable (cv) 
  #-acl2-par
  (declare (ignore cv))

  #+(and openmcl acl2-par) 
  `(ccl:signal-semaphore ,cv)
  
  #+(and sbcl sb-thread acl2-par)

; According to an email sent by Gabor Melis, of SBCL help, on 2007-02-25, if
; there are two threads waiting on a condition variable, and a third thread
; signals the condition variable twice before either can receive the signal,
; then both threads should receive the signal.  If only one thread unblocks, it
; is considered a bug.

  `(sb-thread:condition-notify ,cv)
  
  #-acl2-par
  t)

(defun wait-on-condition-variable-lockless (cv &optional s)

; Wait-on-condition-variable-lockless takes a required condition variable and
; an optional amount of timeout value.  Since timeout-bound waiting is
; unsupported in SBCL, an error occurs when the user tries to include a timeout
; value.

; Here, s is the number of seconds allowed to elapse before unblocking occurs
; (in essence a timeout).  This function returns t if we acquire the semaphore
; (i.e., we don't time out), and otherwise returns nil.

  #-acl2-par
  (declare (ignore cv s))

  #+(and openmcl acl2-par)
  (if s
      (ccl:timed-wait-on-semaphore cv s)
    (ccl:wait-on-semaphore cv))

  #+(and sbcl sb-thread acl2-par)
  (if s
      (error "Timed waiting on condition variables unsupported in SBCL")
    
; Since we do not want lock acquisition to be a bottleneck, we create a new
; lock each time we wait.  It's too bad that SBCL doesn't provide a lock-free
; signaling mechanism.  But since it doesn't, we do the inefficient thing of
; creating a fresh lock each time that becomes garbage when we return.

    (let ((lock (make-lock)))
      (with-lock lock (sb-thread:condition-wait cv lock))
      t))
   
  #-acl2-par
  nil) ; the default is to never receive a signal

; End of threading interface

;---------------------------------------------------------------------
; Section:  Parallelism Basis

; In this section we outline definitions and strategies for parallel evaluation
; and define constants, structures, variables, and other basic parallelism
; infrastructure.

; Essay on Parallelism Definitions

; Core
;
; A core is a unit inside a computer that can do useful work.  It has its own
; instruction pointers and usually accesses shared memory.  In the old days, we
; had "dual processors."  This is an example of a two core system.  A
; 2006-vintage example of a four core system is "dual sockets" with "dual core
; technology."

; Process
;
; We generally use the term "process" as a verb, meaning: run a set of
; instructions.  For example, the system can process a closure.

; Thread
;
; We use the OS definition of a thread as a lightweight process that shares
; memory with other threads in the same process.  A thread in our system is in
; one of the following three states.
;
;   1. Idle - The thread is waiting until both a piece of work (see below) and
;   a core are available.
;
;   2. Active - The thread has been allocated a core and is processing some
;   work.
;
;   3. Pending - This state occurs iff the thread in this state is associated
;   with a parent piece of work, and it is waiting for the children of that
;   piece of work to complete and for sufficient CPU core resources.  A thread
;   in this state is often waiting on a signaling mechanism.

; Closure
; 
; We use the term "closure" in the Lisp sense: a function that captures values
; of variables from the lexical environment in which it is formed.  A closure
; thus contains enough information to be applied to a list of arguments.  We
; create closures in the process of saving work to be performed.

; Work
;
; A piece of work contains all the data necessary for a worker thread to
; process one closure, save its result somewhere that a parent can read it, and
; communicate that it is finished.  It also contains some data necessary to
; implement features like the early termination of parallel and/or.  Comments
; at parallelism-piece give implementation details.

; Roughly, work can be in any of four states: unassigned, starting, pending, or
; resumed.  A piece of work will be processed by a single worker thread (not
; including, of course, child work, which will be processed by other worker
; threads).  When a core becomes available, a thread can grab an unassigned
; piece of work, at which time the thread and the piece of work leave their
; initial states together.  From that point forward until the piece of work is
; complete, the piece of work and its associated worker thread are considered
; to be in corresponding states (active/started,resumed or pending).  Initially
; they are in their active/started states.  Later, if child work is created,
; then at that time the thread and its associated piece of work both enter the
; pending state.  When all child work terminates and either a CPU core becomes
; available or a heuristic allows an exception to that requirement, the piece
; of work enteres the resumed state and its associated worker thread re-enters
; the active state.  This heuristic (implemented in
; wait-for-resumptive-parallelism-resources) gives priority to such resumptions
; over starting new pieces of work.

; Parallelism Primitive
;
; A macro that enables the user to introduce parallelism into a computation:
; one of plet, pargs, pand, and por.

; End of Essay on Parallelism Definitions

; Essay on Parallelism Strategy

; Whenever a parallelism primitive is used, the following steps occur.  The
; text between the < and > describes the state after the previous step
; finishes.  

;  1. If there is a granularity form, the form is evaluated.  If the form
;     returns nil, the parallelism primitive expands to the serial equivalent;
;     otherwise we continue.

; < granularity form has returned true or was omitted - the system was given a
; "large" amount of work >

;  2. If we heuristically determine that the system is already overwhelmed with
;     too much work (see parallelism-resources-available for details), then the
;     primitive expands to its serial equivalent; otherwise we continue.

;  3. Create closures for each primitive's arguments, as follows.
;     - Plet:  one closure for each form assigned to a bound variable
;     - Pargs:  one closure for each argument to the function call
;     - Pand/Por: one closure for each argument

; < have closures in memory representing computation to parallelize >

;  4. Create the data structures for pieces of work that worker threads are to
;     process.  One such data structure (documented in *work-queue* below) is
;     created for each computation to be spawned.  Among the fields of each
;     such structure is a closure that represents that computation.  Siblings
;     have data structures that share some fields, such as a result-array that
;     is to contain the values returned by the sibling computations.
;
;     The system then adds these pieces of work to the global *work-queue* for
;     worker threads to pop off the queue and process.
;
;     Note that Step 2 avoids creating undesirably many pieces of work.
;     (Actually the heuristics used in Step 2 don't provide exact guarantees,
;     since two computations that reach Step 2 simultaneously might both
;     receive the go-ahead even though together, they create work that exceeds
;     the heuristic work limit).

; < now have unassigned work in the work-queue >

; 5.  After the parent thread adds the work to the queue, it will check to see
;     if more worker threads are needed and spawn them if necessary.  Note
;     however that if there are more threads than cores, then any newly spawned
;     thread will wait on a semaphore, and only begins evaluating the work when
;     a core becomes available.  Each core is assigned to at most one thread at
;     any time (but if this decision is revisited, then it should be documented
;     here and in the Parallelism Variables section.  Note that this decision
;     is implemented by setting *idle-core-count* to (1- *core-count*) in
;     reset-parallelism-variables). 


;     Note that by limiting the amount of work in the system at Step 2, we
;     avoid creating more threads than the system can handle.

; < now have enough worker threads to process the work >

;  6. The parent thread waits for its children to signal their completion.  It
;     is crucial for efficiency that this waiting be implemented using a
;     signaling mechanism rather than as busy waiting.

; < the parent is waiting for the worker threads to process the work >

; 7.  At this point, the child threads begin processing the work on the queue.
;     As they are allocated resources, they each pull off a piece of work from
;     the work queue and save their results in the associated result-array.
;     After a child thread finishes a piece of work, it will check to see if
;     its siblings' computations are still necessary.  If not, the child will
;     remove these computations from the work queue and interrupt each of its
;     running sibling threads with a primitive that supplies a function for
;     that thread to execute.  This function throws to the tag
;     :result-no-longer-needed, causing the interrupted sibling to abort
;     evaluation of that piece of work, signal the parent (in an
;     unwind-protect's cleanup form on the way out to the catch), catch that
;     tag, and finally reenter the stalled state (where the controlling loop
;     will find it something new to do).  We take care to guarantee that this
;     mechanism works even if a child receives more than one interrupt.  Note
;     that when a child is interrupted in this manner, the value stored for the
;     child is a don't-care.

; < all of the children are done computing, the required results are in the 
;   results-array, and the parent has been signaled a number of times equal to
;   the number of children >

; 8.  The parent thread (from steps 1-6) resumes.  It finds the results stored
;     in its results array.  If the primitive is a: 
;     - Plet:  it executes the body of the plet with the calculated bindings
;     - Pargs:  it applies the called function to the calculated arguments
;     - Pand, Por:  it applies a functionalized "and" or "or" to the calculated
;                   arguments.  The result is Booleanized.

; End of Essay on Parallelism Strategy

; Parallelism Constants

(progn

(defun core-count-raw (&optional (ctx nil) default)

; If ctx is supplied, then we cause an error using the given ctx.  Otherwise we
; return a suitable default value (see below).

  #+openmcl (declare (ignore ctx default))
  #+openmcl (ccl:cpu-count)
  #-openmcl
  (if ctx
      (er hard ctx
          "It is illegal to call cpu-core-count in this Common Lisp ~
           implementation.")
  
; If the host Lisp does not provide a means for obtaining the number of cores,
; then we simply estimate on the high side.  A high estimate is desired in
; order to make it unlikely that we have needlessly idle cores.  We thus
; believe that 8 cores is a reasonable estimate for early 2007; but we may well
; want to increase this number later.

    (or default 8)))

; *Core-count* is the total number of cores in the system.

(defvar *core-count* 
  (core-count-raw))

(defvar *unassigned-and-active-work-count-limit*

; The *unassigned-and-active-work-count-limit* limits work on the *work-queue*
; to what we think the system will be able to process in a reasonable amount of
; time.  Suppose we have 8 CPU cores.  This means that there can be 8 active
; work consumers, and that generally not many more than 24 pieces of
; paralellism work are stored in the *work-queue* to be processed.  This
; provides us the guarantee that if all worker threads were immediately to
; finish their piece of parallelism work, that each of them would immediately
; be able to grab another piece from the work queue.

; We could increase the following coefficient from 4 and further guarantee that
; consumers have parallelism work to process, but this would come at the
; expense of backlogging the *work-queue".  We prefer simply to avoid the
; otherwise parallelized computations in favor of their serial equivalents.

  (* 4 *core-count*))
  
(defvar *total-work-limit* ; unassigned, started, resumed AND pending

; The number of pieces of work in the system, *parallelism-work-count*, must be
; less than *total-work-limit* in order to enable creation of new pieces of
; work.  (However, we could go from 49 to 69 pieces of work when encountering a
; pand; just not from 50 to 52.)

; Why limit the amount of work in the system?  :Doc parallelism-how-to
; (subtopic "Another Granularity Issue Related to Thread Limitations") provides
; an example showing how cdr recursion can rapidly create threads.  That
; example shows that if there is no limit on the amount of work we may create,
; then eventually, many successive cdrs starting at the top will correspond to
; waiting threads.  If we do not limit the amount of work that can be created,
; this can exhaust the supply of Lisp threads available to process the elements
; of the list.

  (let ((val

; Warning: It is possible, in principle to create (+ val
; *max-idle-thread-count*) threads.  Presumably you'll get a hard Lisp error
; (or seg fault!) if your Lisp cannot create that many threads.

         50)
        (bound (* 2 *core-count*)))
    (when (< val bound)
      (error "The variable *total-work-limit* needs to be at least ~s, i.e., ~%~
              at least double the *core-count*.  Please redefine ~%~
              *total-work-limit* so that it is not ~s."
             bound
             val))
    val))
  
; We don't want to spawn more worker threads (which are initially idle) when we
; already have sufficiently many idle worker threads.  We use
; *max-idle-thread-count* to limit this spawning in function
; spawn-worker-threads-if-needed.

(defvar *max-idle-thread-count* (* 2 *core-count*))

; *intial-threads* stores a list of threads that are considered to be part of
; the non-threaded part of ACL2.  When terminating parallelism threads, only
; those not appearing in this list will be terminated.  Warning: If ACL2 uses
; parallelism during the build process, that this variable could incorrectly
; record parallelism threads as initial threads.

(defvar *initial-threads* (all-threads))

) ; end of constant list


; Parallelism Structures

; If the shape of parallelism-piece changes, update the *work-queue*
; documentation in the section "Parallelism Variables."

(defstruct parallelism-piece ; piece of work

; A data item in the work queue has the following contents, and we often call
; each a "piece of work."

; thread-array - the array that holds the threads spawned for that closure's
; particular parent

; result-array - the array that holds the results for that closure's particular
; parent, where each value is either nil (no result yet) or a cons whose cdr is
; the result

; array-index - the index into the above two arrays for this particular closure

; semaphore-to-signal-as-last-act - the semaphore to signal right before the
; spawned thread dies

; closure - the closure to process by spawning a thread

; throw-siblings-when-function - the function to funcall on the current
; thread's result to see if its siblings should be terminated.  The function
; will also remove work from the work-queue and throw the siblings if
; termination should occur.

  (thread-array nil)
  (result-array nil)
  (array-index -1)
  (semaphore-to-signal-as-last-act nil)
  (closure nil)
  (throw-siblings-when-function nil))

; Parallelism Variables

(progn 

; Keep this progn in sync with reset-parallelism-variables, which resets the
; variables defined here.  Note that none of the variables are initialized
; here, so reset-parallelism-variables must be called before evaluating
; parallelism primitives (an exception is *throwable-worker-thread* since it is
; first called in reset-parallelism-variables).

; *Idle-thread-count* is updated both when a thread is created and right before
; it expires.  It is also updated when a worker thread gets some work to do and
; after it is done with that work.

  (defvar *idle-thread-count*)
  (deflock *idle-thread-count-lock*)

; *Idle-core-count* is only used to estimate resource availability.  The number
; itself is always kept accurate using a lock, but because we read it without a
; lock when calculating resource availability, the value actually read is only
; an estimate.  It defaults to (1- *core-count*), because the current thread is
; considered active.

; There are two pairs of places that *idle-core-count* is updated.  First,
; whenever a worker thread begins processing work, the count is decremented.
; This decrement is paired with the increment that occurs after a worker thread
; finishes work.  It is also incremented and decremented in
; eval-and-save-result, before and after a parent waits for its children.

; Note: At different stages of development we have contemplated having a
; "*virtual-core-count*", exceeding the number of CPU cores, that bounds the
; number of active threads.  Since our initial tests did not show a performance
; improvement by using this trick, we have not employed a *virtual-core-count*.
; If we later do employ this trick, the documentation in step 5 of the Essay on
; Parallelism Strategy will need to be updated.

  (defvar *idle-core-count*) 
  (deflock *idle-core-count-lock*)

; *Unassigned-and-active-work-count* tracks the amount of parallelism work in
; the system, other than pending work.  It is increased when a parallelism
; primitive adds work to the system.  This increase is paired with the final
; decrease in consume-work-on-work-queue-when-its-there, which occurs when a
; piece of work finishes.  It is decremented and incremented (respectively)
; when a parent waits on children and when it resumes after waiting on
; children.

  (defvar *unassigned-and-active-work-count*)
  (deflock *unassigned-and-active-work-count-lock*)

; *Total-work-count* tracks the total amount of parallelism work.  This
; includes unassigned, started, pending, and resumed work.
  
  (defvar *total-work-count*)
  (deflock *total-work-count-lock*)

; We maintain a queue of work to process.  See parallelism-piece for
; documentation on pieces of work.  Even though *work-queue* is a list, we
; think of it as a structure that can be destructively modified -- so beware
; sharing any structure with *work-queue*!

  (defvar *work-queue*)
  (deflock *work-queue-lock*)
  
; An idle thread waits for the condition variable
; *check-work-and-core-availability-cv* to be signaled, at which time it looks
; for work on the *work-queue* and an idle core to use.  This condition can be
; signaled by the addition of new work or by the availabilty of a CPU core.

; Warning: In the former case, a parent thread must always signal this
; semaphore *after* it has already added the work to the queue.  Otherwise, a
; child can attempt to acquire work, fail, and then go wait on the semaphore
; again.  Since the parent has already signaled, there is no guarantee that the
; work they place on the queue will ever be processed.  (The latter case also
; requires analogous care.)

; Why are there two condition variables, one for idle threads and one for
; resuming threads?  Suppose that idle and resuming threads waited on the same
; condition variable.  We would then have no guarantee that resuming threads
; would be signaled before the idle threads (which is necessary to establish
; the priority explained in wait-for-resumptive-parallelism-resources).  Using
; separate condition variables allows both an idle and resuming thread to be
; signaled.  Then whichever thread's heuristics allow it to execute will claim
; access to the CPU core.  There is no problem if both their heuritistics allow
; them to continue.

  (defvar *check-work-and-core-availability-cv*)
  (defvar *check-core-availability-for-resuming-cv*)
  
; When we terminate threads due to a break and abort, we need a way to
; terminate all threads.  We implement this by having them throw the
; :worker-thread-no-longer-needed tag.  Unfortunately, sometimes the threads
; are outside the scope of the associated catch, when throwing the tag would
; cause a warning.  We avoid this warning by maintaining the dynamically-bound
; variable *throwable-worker-thread*.  When the throwable context is entered,
; we let a new copy of the variable into existence and set it to T.  Now, when
; we throw :worker-thread-no-longer-needed, we only throw it if
; *throwable-worker-thread* is non-nil.

  (defvar *throwable-worker-thread* nil)

; *total-parallelism-piece-historical-count* tracks the total number of pieces
; of parallelism work processed over the lifetime of the ACL2 session.  It is ;
; reset whenever the parallelism variables are reset.  It is only used for ;
; informational purposes, and the system does not depend on its accuracy in any
; way.  It therefore does not have an associated lock.

  (defvar *total-parallelism-piece-historical-count*)

) ; end of parallelism variables

; Following are definitions of functions that help us restore the
; parallelism system to a stable state after an interrupt occurs.

(defun throw-all-threads-in-list (thread-list)

; We interrupt each of the given threads with a throw to the catch at the top
; of consume-work-on-work-queue-when-its-there, which is the function called
; by run-thread in spawn-worker-threads-if-needed.

; Compare with kill-all-threads-in-list, which kills all of the given threads
; (typically all user-produced threads), not just those self-identified as
; being within the associated catch block.

  (if (endp thread-list)
      nil
    (progn
      (interrupt-thread 
       (car thread-list) 
       #'(lambda () (when *throwable-worker-thread* 
                      (throw :worker-thread-no-longer-needed nil))))
      (throw-all-threads-in-list (cdr thread-list)))))

(defun kill-all-threads-in-list (thread-list)

; Compare with throw-all-threads-in-list, which uses throw instead of killing
; threads directly, but only affects threads self identified as being within an
; associated catch block.

  (if (endp thread-list)
      nil
    (progn
      (kill-thread (car thread-list))
      (kill-all-threads-in-list (cdr thread-list)))))

(defun all-threads-except-initial-threads-are-dead ()
  #+sbcl
  (<= (length (all-threads)) 1)
  #-sbcl
  (null (set-difference (all-threads) *initial-threads*)))


(defun send-die-to-all-except-initial-threads ()

; This function is evaluated only for side effect.

  (let ((target-threads (set-difference (all-threads)
                                        *initial-threads*)))
    #+acl2-par
    (throw-all-threads-in-list target-threads)
    #-acl2-par nil)
  (thread-wait 'all-threads-except-initial-threads-are-dead))
  
(defun kill-all-except-initial-threads ()

; This function is evaluated only for side effect.

  (let ((target-threads (set-difference (all-threads)
                                        *initial-threads*)))
    (kill-all-threads-in-list target-threads))
  (thread-wait 'all-threads-except-initial-threads-are-dead))

(defun reset-parallelism-variables ()
 
; We use this function (a) to kill all worker threads, (b) to reset "most" of
; the parallelism variables, and (c) to reset the lock and semaphore recycling
; systems.  Keep (b) in sync with the progn above that declares the variables
; reset here, in the sense that this function assigns values to exactly those
; variables.

; If a user kills threads directly from raw Lisp, for example using functions
; above, then they should call reset-parallelism-variables.  Note that
; reset-parallelism-variables is called automatically on any call of LD,
; including the case that an ACL2 user interrupts with control-c and then
; aborts to get back to the ACL2 top-level loop.

; (a) Kill all worker threads.

  (send-die-to-all-except-initial-threads)

; (b) Reset "most" of the parallelism variables.

  (setf *idle-thread-count* 0)
  (reset-lock *idle-thread-count-lock*)

  (setf *idle-core-count* (1- *core-count*))
  (reset-lock *idle-core-count-lock*)
  
  (setf *unassigned-and-active-work-count* 1) 
  (reset-lock *unassigned-and-active-work-count-lock*)

  (setf *total-work-count* 1)
  (reset-lock *total-work-count-lock*)
  
  (setf *work-queue* nil)
  (reset-lock *work-queue-lock*)
 
  (setf *check-work-and-core-availability-cv* (make-condition-variable))
  (setf *check-core-availability-for-resuming-cv* (make-condition-variable))

  (setf *throwable-worker-thread* nil)

  (setf *total-parallelism-piece-historical-count* 0)

  (setf *initial-threads* (all-threads))

; (c) Reset the lock and semaphore recycling system.

; The definitions for the following two are commented out above.
; (setf *lock-allocation-lock* (make-lock))
; (reset-lock-free-list)

  (setf *semaphore-allocation-lock* (make-lock))
  (reset-semaphore-free-list)

)

; We reset parallelism variables as a standard part of compilation to make
; sure the code behind declaring variables and resetting variables is
; consistent and that we are in a stable state.

(reset-parallelism-variables) 

;---------------------------------------------------------------------
; Section:  Work Consumer Code

; We develop functions that assign threads to process work.

(defun eval-and-save-result (work)

; Work is a piece of parallelism work.  Among its fields are a closure and an
; array.  We evaluate this closure and save the result into this array.  No
; lock is required because no other thread will be writing to the same position
; in the array.

; Keep this in sync with the comment in parallelism-piece, where we explain
; that the result is the cdr of the cons stored in the result array at the
; appropriate position.

  (assert work)
  (let ((result-array (parallelism-piece-result-array work))
        (array-index (parallelism-piece-array-index work))
        (closure (parallelism-piece-closure work)))
    (setf (aref result-array array-index)
          (cons t (funcall closure)))))

(defun pop-work-and-set-thread ()
  
; Once we exit the without-interrupts that must enclose a call to
; pop-work-and-set-thread, our siblings can interrupt us so that we execute a
; throw to the tag :result-no-longer-needed.  The reason they can access us is
; that they will have a pointer to us in the thread array.

; There is a race condition between when work is popped from the *work-queue*
; and when the current thread is stored in the thread-array.  This race
; condition could be eliminated by holding *work-queue-lock* during the
; function's entire execution.  Since (1) we want to minimize the duration
; locks are held, (2) the chance of this race condition occuring is small and
; (3) there is no safety penalty when this race condition occurs (instead an
; opportunity for early termination is missed), we only hold the lock for the
; amount of time it takes to ready and modify the *work-queue*

  (let ((work (with-lock *work-queue-lock*
                         (when (consp *work-queue*)
                           (pop *work-queue*))))
        (thread (current-thread)))
    (when work
      (assert thread)
      (assert (parallelism-piece-thread-array work))

; Record that the current thread is the one assigned to do this piece of work:

      (setf (aref (parallelism-piece-thread-array work) 
                  (parallelism-piece-array-index work)) 
            thread))
    work))

(defun consume-work-on-work-queue-when-its-there ()

; This function is an infinite loop.  However, the thread running it can be
; waiting on a condition variable and will expire if it waits too long.

; Each iteration through the main loop will start by trying to grab a piece of
; work to process.  When it succeeds, then it will process that piece of work
; and wait again on a condition variable before starting the next iteration.
; But ideally, if it has to wait too long for a piece of work to grab then we
; return from this function (with expiration of the current thread); see below.
  
  (catch :worker-thread-no-longer-needed
    (let* ((*throwable-worker-thread* t))
      (loop ; "forever" - really until :worker-thread-no-longer-needed thrown

; Wait until there are both a piece of work and an idle core.  In openmcl, if
; the thread waits too long, it throws to the catch above and returns from this
; function.

       (loop while (not (and *work-queue* (< 0 *idle-core-count*)))

; We can't grab work yet, so we wait until somebody signals us to try again, by
; returning a non-nil value to the call of not, just below.  If however nobody
; signals us then ideally (and in OpenMCL but not SBCL) a timeout occurs that
; returns nil to this call of not, so we give up with a throw.

             do (when (not #+(and openmcl acl2-par)
                           (wait-on-condition-variable-lockless
                            *check-work-and-core-availability-cv* 15)
                           #+(and sbcl sb-thread acl2-par)
                           (wait-on-condition-variable-lockless
                            *check-work-and-core-availability-cv*)
                           #-acl2-par
                           t)
                  (throw :worker-thread-no-longer-needed nil)))
        
; Now very likely there are both a piece of work and an idle core to process
; it.  But a race condition allows either of these to disappear before we can
; claim a piece of work and a CPU core, which explains the use of `when'
; below.

       (unwind-protect-disable-interrupts-during-cleanup
        (when (<= 0 (with-lock *idle-core-count-lock* ; allocate CPU core
                               
; We will do a corresponding increment of *idle-core-count* in the cleanup form
; of this unwind-protect.  Note that the current thread cannot be interrupted
; (except by direct user intervention, for which we may provide only minimal
; protection) until the call of pop-work-and-set-thread below (see long comment
; above that call), because no other thread has a pointer to this one until
; that time.
                               
                               (decf *idle-core-count*)))
          (catch :result-no-longer-needed
            (let ((work nil))
              (unwind-protect-disable-interrupts-during-cleanup
               (progn
                 (without-interrupts 
                  (setq work
                                           
; The following call has the side effect of putting the current thread into a
; thread array, such that this presence allows the current thread to be
; interrupted by another (via interrupt-thread, in throw-threads-in-array).  So
; until this point, the current thread will not be told to do a throw.
                                           
; We rely on the following claim: If any state has been changed by this call of
; pop-work-and-set-thread, then that call completes and work is set to a
; non-nil value.  This claim guarantees that if any state has been changed,
; then the cleanup form just below will be executed and will clean up properly.
; For example, we would have a problem if pop-work-and-set-thread were
; interrupted after the decrement of *idle-thread-count*, but before work is
; set, since then the matching increment in the cleanup form below would be
; skipped.  For another example, if we complete the call of
; pop-work-and-set-thread but not the enclosing setq for work, then we miss the
; semaphore signaling in the cleanup form below.
                        
                        (pop-work-and-set-thread))
                  (when work (with-lock *idle-thread-count-lock* 
                                        (decf *idle-thread-count*))))
                 (when work
                   
; The consumer now has a core (see the <= test above) and a piece of work.
                   
                   (eval-and-save-result work)
                   
                   (let* ((thread-array (parallelism-piece-thread-array work))
                          (result-array (parallelism-piece-result-array work))
                          (array-index (parallelism-piece-array-index work))
                          (throw-siblings-when-function 
                           (parallelism-piece-throw-siblings-when-function work)))
                     (setf (aref thread-array array-index) nil)
                     
; The nil setting just above guarantees that the current thread doesn't
; interrupt itself by way of the early termination function.
                     
                     (when throw-siblings-when-function
                       (funcall throw-siblings-when-function
                                (aref result-array array-index))))))
               
               (when work ; process this cleanup form if we acquired work
                 (let* ((semaphore-to-signal-as-last-act
                         (parallelism-piece-semaphore-to-signal-as-last-act
                          work))
                        (thread-array (parallelism-piece-thread-array work))
                        (array-index (parallelism-piece-array-index work)))
                   (incf *total-parallelism-piece-historical-count*)
                   (setf (aref thread-array array-index) nil)
                   (with-lock *idle-thread-count-lock*
                              
; Above we argued that if *idle-thread-count* is decremented, then work is set
; and hence we get to this point so that we can do the corresponding
; increment.  In the other direction, if we get here, then how do we know that 
; *idle-thread-count* was decremented?  We know because if we get here, then
; work is non-nil and hence pop-work-and-set-thread must have completed.
                              
                              (incf *idle-thread-count*))
                   
; Each of the following two decrements undoes the corresponding increment done
; when the piece of work was first created and queued.
                   
                   (with-lock *total-work-count-lock*
                              (decf *total-work-count*))
                   (with-lock *unassigned-and-active-work-count-lock* 
                              (decf *unassigned-and-active-work-count*))
                   (assert (semaphorep semaphore-to-signal-as-last-act))
                   (signal-semaphore semaphore-to-signal-as-last-act)))))
            ) ; end catch :result-no-longer-needed
          ) ; end when CPU core allocation 
        
        (with-lock *idle-core-count-lock*
                   (incf *idle-core-count*))
        (signal-condition-variable
         *check-work-and-core-availability-cv*)
        (signal-condition-variable
         *check-core-availability-for-resuming-cv*))))
    
    ) ; end catch :worker-thread-no-longer-needed
  
; The current thread is about to expire because all it was given to do was to
; run this function.
  
  (with-lock *idle-thread-count-lock* (decf *idle-thread-count*)))

(defun spawn-worker-threads-if-needed ()

; This function must be called with interrupts disabled.  Otherwise it is
; possible for the *idle-thread-count* to be incremented even though no new
; worker thread is spawned.

  (loop while (< *idle-thread-count* *max-idle-thread-count*) 

; Note that the above test could be true, yet *idle-thread-count* could be
; incremented before we get to the lock just below.  But we want as little
; bottleneck as possible for scaling later, and the practical worst consequence is
; that we spawn extra threads here.

; Another possibility is that we spawn too few threads here, because the final
; decrement of *idle-thread-count* in consume-work-on-work-queue-when-its-there
; has not occurred even though a worker thread has decided to expire.  If this
; occurs, then we may not have the expected allotment of idle threads for
; awhile, but we expect the other idle threads (if any) and the active threads
; to suffice.  Eventually a new parallelism primitive call will invoke this
; function again, at a time when the about-to-expire threads have already
; updated *idle-thread-count*, which will allow this function to create the
; expected number of threads.  The chance of any of this kind of issue arising
; is probably extremely small.

; NOTE: Consider coming up with a design that's easier to understand.

    do
    (progn (with-lock *idle-thread-count-lock* (incf *idle-thread-count*))
           ;(format t "param parent thread ~a: ~s~%" (current-thread) acl2::*param*)
           (run-thread 
            "Worker thread" 
            'consume-work-on-work-queue-when-its-there))))

;---------------------------------------------------------------------
; Section:  Work Producer Code

; We develop functions that create work, to be later processed by threads.  Our
; main concern is to keep the work queue sufficiently populated so as to keep
; CPU cores busy, while limiting the total amount of work so that the number of
; threads necessary to evaluate that work does not execede the number of
; threads that the underlying Lisp supports creating.  (See also comments in
; *total-work-limit*.)

(defun add-work-list-to-queue (work-list)

; Call this function inside without-interrupts, in order to maintain the
; invariant that when this function exits, the counts are accurate.

; WARNING!  This function destructively modifies *work-queue*.

  (let ((work-list-length (length work-list))) 
    (with-lock *work-queue-lock*
               
; In naive performance tests using a parallel version of Fibonacci, we found
; that (pfib 45) took about 19.35 seconds with (nconc *work-queue* work-list),
; as opposed to 19.7 seconds when we reversed the argument order.  We have
; other evidence that suggests switching the argument order.  But we follow
; Halstead's 1989 paper "New Ideas in Parallel Lisp: Language Design,
; Implementation, and Programming Tools", by doing the oldest work first.
               
               (setf *work-queue*
                     (nconc *work-queue* work-list)))
    (with-lock *total-work-count-lock*
               (incf *total-work-count* work-list-length))
    (with-lock *unassigned-and-active-work-count-lock*
               (incf *unassigned-and-active-work-count* work-list-length))
    (dotimes (i work-list-length)
      (signal-condition-variable *check-work-and-core-availability-cv*))))

(defun combine-array-results-into-list (result-array current-position acc)
  (if (< current-position 0)
      acc
    (combine-array-results-into-list 
     result-array
     (1- current-position)
     (cons (cdr ; entry is a cons whose cdr is the result
            (aref result-array current-position))
           acc))))

(defun remove-thread-array-from-work-queue-rec
  (work-queue thread-array array-positions-left)

; The function calling remove-thread-array-from-work-queue must hold the lock
; *work-queue-lock*.

; This function must be called with interrupts disabled.

  (cond ((equal array-positions-left 0)
         work-queue)
        ((atom work-queue)
         nil)
        ((equal thread-array (parallelism-piece-thread-array (car work-queue)))
         (progn
           (with-lock *total-work-count-lock* 
                      (decf *total-work-count*))
           (with-lock *unassigned-and-active-work-count-lock*
                      (decf *unassigned-and-active-work-count*))

; we must signal the parent
           
           (assert 
            (semaphorep (parallelism-piece-semaphore-to-signal-as-last-act 
                         (car work-queue))))
           (signal-semaphore
            (parallelism-piece-semaphore-to-signal-as-last-act 
             (car work-queue)))
           (remove-thread-array-from-work-queue-rec (cdr work-queue)
                                                    thread-array
                                                    (1- array-positions-left))))
        (t (cons (car work-queue) 
                 (remove-thread-array-from-work-queue-rec
                  (cdr work-queue)
                  thread-array
                  (1- array-positions-left))))))

(defun remove-thread-array-from-work-queue (thread-array)
  (without-interrupts 
   (with-lock *work-queue-lock*
              (setf *work-queue*
                    (remove-thread-array-from-work-queue-rec
                     *work-queue*
                     thread-array 
                     (length thread-array))))))

(defun terminate-siblings (thread-array)
  
; This function supports early termination by eliminating further computation
; by siblings.  Siblings not yet assigned a thread are removed from the work
; queue.  Siblings that are already active are interrupted to throw with tag
; :result-no-longer-needed.  The order of these two operations is important: if
; we do them in the other order, then we could miss a sibling that is assigned
; a thread (and removed from the work queue) just inbetween the two
; operations.
  
  (remove-thread-array-from-work-queue thread-array)
  (throw-threads-in-array thread-array (1- (length thread-array))))

(defun generate-work-list-from-closure-list-rec 
  (thread-array result-array children-done-semaphore closure-list current-position
                &optional throw-siblings-when-function)
  (if (atom closure-list)
      (assert (equal current-position (length thread-array))) ; returns nil
    (cons (make-parallelism-piece
           :thread-array thread-array
           :result-array result-array
           :array-index current-position
           :semaphore-to-signal-as-last-act children-done-semaphore
           :closure (car closure-list)
           :throw-siblings-when-function throw-siblings-when-function)
          (generate-work-list-from-closure-list-rec
           thread-array
           result-array
           children-done-semaphore
           (cdr closure-list)
           (1+ current-position)
           throw-siblings-when-function))))

(defun generate-work-list-from-closure-list
  (closure-list &optional terminate-early-function)

; Given a list of closures, we need to generate a list of work data structures
; that are in a format ready for the work queue.  Via mv, we also return the
; pointers to the thread, result, and semaphore arrays.

  (let* ((closure-count (length closure-list))
         (thread-array (make-array closure-count :initial-element nil))
         (result-array (make-array closure-count :initial-element nil))
         (children-done-semaphore (allocate-semaphore)))
    (progn ; warning: avoid prog2 as we need to return multiple value
      (assert (semaphorep children-done-semaphore)) 
      (mv (generate-work-list-from-closure-list-rec 
           thread-array
           result-array
           children-done-semaphore
           closure-list
           0
           (if terminate-early-function
               (lambda (x) ; x is (t . result)
                 (when (funcall terminate-early-function (cdr x))
                   (terminate-siblings thread-array)))
             nil))
          thread-array
          result-array
          children-done-semaphore))))

(defun parallelism-resources-available ()

; This function is our attempt to guess when resources are available.  When
; this function returns true, then resources are probably available, and a
; parallelism primitive call will opt to parallelize.  We say "probably"
; because correctness does not depend on our answering exactly.  For
; performance, we prefer that this function is reasonably close to an accurate
; implementation that would use locks.  Perhaps even more important for
; performance, however, is that we avoid the cost of locks to try to remove
; bottlenecks.

; In summary, it is unneccessary to acquire a lock, because we just don't care
; if we miss a few chances to parallelize, or parallelize a few extra times.

  (and (f-get-global 'parallel-evaluation-enabled *the-live-state*)
       (< *unassigned-and-active-work-count* 
          *unassigned-and-active-work-count-limit*)
       (< *total-work-count* *total-work-limit*)))

(defun throw-threads-in-array (thread-array current-position)

; Call this function to terminate computation for every thread in the given
; thread-array from position current-position down to position 0.  We expect
; that thread-array was either created by the current thread's parent or was
; created by the current thread (for its children).

; We require that the current thread not be in thread-array.  This requirement
; prevents the current thread from interrupting itself, which could conceivably
; abort remaining recursive calls of this function, or cause a hang in some
; Lisps since we may be operating with interrupts disabled (for example, inside
; the cleanup form of an unwind-protect in OpenMCL 1.1pre or later).

  (assert thread-array)
  (when (<= 0 current-position)
    (let ((current-thread (aref thread-array current-position)))
      (when current-thread
        (interrupt-thread current-thread 
                          
; The delayed evaluation of (aref thread-array...) below is crucial to keep a
; thread from throwing :result-no-longer-needed outside of the catch for that tag.
; Consume-work-on-work-queue-when-its-there will set the (aref thread-array...)
; to nil when the thread should not be thrown.
                          
                          (lambda ()
                            (when (aref thread-array current-position)
                              (throw :result-no-longer-needed nil))))))
    (throw-threads-in-array thread-array (1- current-position))))

(defun decrement-children-left (children-left-ptr semaphore-notification-obj)

; This function should be called with interrupts disabled.

   (when (semaphore-notification-status semaphore-notification-obj)
     (decf (aref children-left-ptr 0))
     (clear-semaphore-notification-status semaphore-notification-obj)))

(defun wait-for-children-to-finish 
  (semaphore children-left-ptr semaphore-notification-obj) 
  
; This function is called both in the normal case and in the early-termination
; case.

  (assert children-left-ptr)
  (when (<= 1 (aref children-left-ptr 0))
    (assert (not (semaphore-notification-status semaphore-notification-obj)))
    (unwind-protect-disable-interrupts-during-cleanup
     (wait-on-semaphore semaphore 
                        semaphore-notification-obj)
     (decrement-children-left children-left-ptr
                              semaphore-notification-obj))
    (wait-for-children-to-finish semaphore
                                 children-left-ptr
                                 semaphore-notification-obj)))

(defun wait-for-resumptive-parallelism-resources ()

; A thread resuming execution after its children finish has a higher priority
; than a thread just beginning execution.  As such, resuming threads are
; allowed to "borrow" up to *core-count* CPU cores.  That is implemented by
; waiting until *idle-core-count* is greater than the negation of the
; *core-count*.  This is different from a thread just beginning execution,
; which waits for *idle-core-count* to be greater than 0.

  (loop while (<= *idle-core-count* (- *core-count*))

; So, *idle-core-count* is running a deficit that is at least the number of
; cores: there are already *core-count* additional active threads beyond the
; normal limit of *core-count*.

          do (wait-on-condition-variable-lockless 
              *check-core-availability-for-resuming-cv*))
  (with-lock *unassigned-and-active-work-count-lock*
             (incf *unassigned-and-active-work-count*))
  (with-lock *idle-core-count-lock*
             (decf *idle-core-count*)))

(defun early-terminate-children-and-rewait 
  (children-done-semaphore children-left-ptr semaphore-notification-obj
                           thread-array)
  
; This function performs three kinds of actions.

; A. It signals children-done-semaphore once for each child that is unassigned
; (i.e. still on the work queue) and removes that child from the work queue.

; B. It interrups each assigned child's thread with a throw that terminates
; processing of its work.  Note that we must do Step B after Step A: otherwise
; threads might grab work after Step B but before Step A, resulting in child
; work that is no longer available to terminate unless we call this function
; again.

; C. The above throw from Step B eventually causes the interrupted threads to
; signal children-done-semaphore.  The current thread waits for those remaining
; signals.

  (when (< 0 (aref children-left-ptr 0))
    (remove-thread-array-from-work-queue ; A

; Signal children-done-semaphore, which is in each piece of work in
; closure-list.

     thread-array)
    (throw-threads-in-array thread-array ; B
                            (1- (length thread-array)))
    (wait-for-children-to-finish         ; C
     children-done-semaphore
     children-left-ptr
     semaphore-notification-obj)))

#+acl2-par
(defun prepare-to-wait-for-children ()
           
; This function should be executed with interrupts disabled, after all child
; work is added to the work queue but before the current thread waits on such
; work to finish.

; First, since we are about to enter the pending state, we must free CPU core
; resources and notify other threads.

  (with-lock *idle-core-count-lock*
             (incf *idle-core-count*))
  (signal-condition-variable *check-work-and-core-availability-cv*)
  (signal-condition-variable *check-core-availability-for-resuming-cv*)

; Second, record that we are no longer active.  (Note: We could avoid the
; following form (thus saving a lock) by incrementing
; *unassigned-and-active-work-count* by one less in add-work-list-to-queue.)
          
  (with-lock *unassigned-and-active-work-count-lock*
             (decf *unassigned-and-active-work-count*)))

#+acl2-par
(defun parallelize-closure-list (closure-list &optional terminate-early-function)

; Given a list of closures, we:

; 1. Create a list of pieces of work (see defstruct parallelism-piece).

; 2. If there aren't enough idle worker threads, we spawn a reasonably
; sufficient number of new worker threads, so that CPU cores are kept busy but
; without the needless overhead of useless threads.  Note that when a thread
; isn't already assigned work, it is waiting for notification to look for work
; to do.

; 3. Add the work to the work queue, which notifies the worker threads of the
; additional work.  

; 4. Free parallelism resources (specifically, a CPU core), since we are about
; to become idle as we wait children to finish.  Issue the proper notifications
; (via condition variables) so that other threads are aware of the freed
; resources.

; 5. Wait for the children to finish.  In the event of receiving an early
; termination from our parent (a.k.a. the grandparent of our children) or our
; sibling (a.k.a. the uncle of our children), we signal our children to
; terminate early, and we wait again.

; Note that if the current thread's children decide the remaining child results
; are irrelevant, that the current thread will never know it.  The children
; will terminate early amongst themselves without any parent intervention.

; 6. Resume when resources become available, reclaiming parallelism resources
; (see wait-for-resumptive-parallelism-resources).

; 7. Return the result.

; It's silly to parallelize just 1 (or no!) thing.  The definitions of pargs,
; plet, pand, and por should prevent this assertion from failing, but we have
; it here as a check that this is true.

  (assert (and (consp closure-list) (cdr closure-list))) 
    
  (let ((work-list-setup-p nil)
        (semaphore-notification-obj (make-semaphore-notification))
        (children-left-ptr (make-array 1 :initial-element (length closure-list))))

; 1. Create a list of pieces of work.
    (mv-let 
     (work-list thread-array result-array children-done-semaphore)
     (generate-work-list-from-closure-list closure-list
                                           terminate-early-function)
     (assert (semaphorep children-done-semaphore))
     (unwind-protect-disable-interrupts-during-cleanup
      (progn
        (without-interrupts
         
; 2. Spawn worker threads so that CPU cores are kept busy.
         (spawn-worker-threads-if-needed)
         
; 3. Add the work to the work queue.
         (setq work-list-setup-p (progn (add-work-list-to-queue work-list) t))
         
; 4. Free parallelism resources.
         (prepare-to-wait-for-children))

; 5a. Wait for children to finish.  But note that we may be interrupted by our
; sibling or our parent before this wait is completed.

; Now that the two operations under the above without-interrupts are complete,
; it is once again OK to be interrupted with a function that throws to the tag
; :results-no-longer-needed.  Note that wait-for-children-to-finish is called
; again in the cleanup form below, so we don't have to worry about dangling
; child threads even if we don't complete evaluation of the following form.

        (wait-for-children-to-finish children-done-semaphore
                                     children-left-ptr
                                     semaphore-notification-obj))
        
; We are entering the cleanup form, which we always need to run (in particular,
; so that we can resume and return a result).  But why must we run without
; interrupts?  Suppose for example we have been interrupted (to do a throw) by
; the terminate-early-function of one of our siblings or by our parent.  Then
; we must wait for all child pieces of work to terminate (see
; early-terminate-children-and-rewait) before we return.  And this waiting must
; be non-abortable; otherwise, for example, we could be left (after Control-c
; and an abort) with orphaned child threads.

      (progn 
        (when work-list-setup-p ; children were added to *work-queue*
          
; If we were thrown by a sibling or parent, it's possible that our children
; didn't finish.  We now throw our children and wait for them.
            
; 5b. Complete processing of our children in case we were interrupted when we
; were waiting the first time.
          (early-terminate-children-and-rewait children-done-semaphore
                                               children-left-ptr
                                               semaphore-notification-obj
                                               thread-array)

; AS OF *HERE*, ALL OF THIS PARENT'S CHILD WORK IS "DONE"
            
; 6. Resume when resources become available.
          (wait-for-resumptive-parallelism-resources)
          (assert (eq (aref children-left-ptr 0) 0))

; Free semaphore for semaphore recycling.
          (free-semaphore children-done-semaphore))))
     
; 7. Return the result.
     (combine-array-results-into-list
      result-array
      (1- (length result-array))
      nil))))


; Parallelize-fn Booleanizes the results from pand/por.

#+acl2-par
(defun parallelize-fn (parent-fun-name arg-closures &optional terminate-early-function)
  
; It's inefficient to parallelize just one (or no!) computation.  The
; definitions of pargs, plet, pand, and por should prevent this assertion from
; failing, but we have it here as a check that this is true.

  (assert (cdr arg-closures))
  (let ((parallelize-closures-res 
         (parallelize-closure-list arg-closures terminate-early-function)))
    (if (or (equal parent-fun-name 'and-list)
            (equal parent-fun-name 'or-list))
        (funcall parent-fun-name parallelize-closures-res)
      (apply parent-fun-name parallelize-closures-res))))

(defmacro closure-for-expression (x)
  `(function (lambda () ,x)))

(defmacro closure-list-for-expression-list (x)
  (if (atom x)
      nil
    `(cons (closure-for-expression ,(car x))
           (closure-list-for-expression-list ,(cdr x)))))

;---------------------------------------------------------------------
; Section:  Parallelism Primitives

#+acl2-par
(defun parallelism-condition (gran-form-exists gran-form)
  (if gran-form-exists
      `(and (parallelism-resources-available)

; We check availability of resources before checking the granularity form,
; since the latter can be arbitrarily expensive.

            ,gran-form)
    '(parallelism-resources-available)))

#+acl2-par
(defmacro pargs (&rest forms)

; This is the raw lisp version for threaded Lisps.

  (mv-let 
   (erp msg gran-form-exists gran-form remainder-forms)
   (check-and-parse-for-granularity-form forms)
   (declare (ignore msg))
   (assert (not erp))
   (let ((function-call (car remainder-forms)))
     (if (null (cddr function-call)) ; whether there are two or more arguments
         function-call
       (list 'if
             (parallelism-condition gran-form-exists gran-form)
             (list 'parallelize-fn
                   (list 'quote (car function-call))
                   (list 'closure-list-for-expression-list
                         (cdr function-call)))
             function-call)))))

(defun plet-doublets (bindings bsym n)
  (cond ((endp bindings)
         nil)
        (t
         (cons (list (caar bindings) (list 'nth n bsym))
               (plet-doublets (cdr bindings) bsym (1+ n))))))

(defun make-closures (bindings)

; We return a list of forms (function (lambda () <expr>)), each of which
; evaluates to a closure, as <expr> ranges over the expression components of
; the given plet bindings.  Note that this function is only called on the
; bindings of a plet expression that has passed translate -- so we know that
; bindings has the proper shape.

  (if (endp bindings)
      nil
    (cons `(function (lambda () ,(cadar bindings)))
          (make-closures (cdr bindings)))))

(defun identity-list (&rest rst) rst)

(defun make-list-until-non-declare (remaining-list acc)
  (if (not (caar-is-declarep remaining-list))
      (mv (reverse acc) remaining-list)
    (make-list-until-non-declare (cdr remaining-list)
                                 (cons (car remaining-list) acc))))

(defun parse-additional-declare-forms-for-let (x)

; X is a list of forms from a well-formed plet, with the plet and optional
; granularity form removed.  It thus starts with bindings and is followed by
; any finite number of valid declare forms, and finally a body.

  (mv-let (declare-forms body)
          (make-list-until-non-declare (cdr x) nil)
          (mv (car x) declare-forms body)))

#+acl2-par
(defmacro plet (&rest forms)

; This is the raw Lisp version for threaded Lisps.

  (mv-let 
   (erp msg gran-form-exists gran-form remainder-forms)
   (check-and-parse-for-granularity-form forms)
   (declare (ignore msg))
   (assert (not erp))
   (mv-let 
    (bindings declare-forms body)
    (parse-additional-declare-forms-for-let remainder-forms)
    (cond ((null (cdr bindings)) ; at most one binding
           `(let ,bindings ,@declare-forms ,@body))
          (t (list 'if
                   (parallelism-condition gran-form-exists gran-form)
                   (let ((bsym (acl2-gentemp "plet")))
                     `(let ((,bsym (parallelize-fn 'identity-list
                                                   (list ,@(make-closures bindings)))))
                        (let ,(plet-doublets bindings bsym 0)
                          ,@declare-forms
                          ,@body)))
                   `(let ,bindings ,@declare-forms ,@body)))))))

(defun and-list (x)
  (declare (xargs :guard (true-listp x)))
  (if (endp x)
      t
    (and (car x)
         (and-list (cdr x)))))

#+acl2-par
(defmacro pand (&rest forms)
  
; This is the raw Lisp version for threaded Lisps.

  (mv-let
   (erp msg gran-form-exists gran-form remainder-forms)
   (check-and-parse-for-granularity-form forms)
   (declare (ignore msg))
   (assert (not erp))
   (if (null (cdr remainder-forms)) ; whether pand has only one argument
       (list 'if (car remainder-forms) t nil)
     (let ((and-early-termination-function
            '(lambda (x) (null x))))
       (list 'if
             (parallelism-condition gran-form-exists gran-form)
             (list 'parallelize-fn ''and-list
                   (list 'closure-list-for-expression-list
                         remainder-forms)
                   and-early-termination-function)
             (list 'if (cons 'and remainder-forms) t nil))))))

(defun or-list (x)
  (declare (xargs :guard (true-listp x)))
  (if (endp x)
      nil
    (if (car x)
        t
        (or-list (cdr x)))))

#+acl2-par
(defmacro por (&rest forms)

; This is the raw Lisp version for threaded Lisps.

  (mv-let
   (erp msg gran-form-exists gran-form remainder-forms)
   (check-and-parse-for-granularity-form forms)
   (declare (ignore msg))
   (assert (not erp))
   
   (if (null (cdr remainder-forms)) ; whether por has one argument
       (list 'if (car remainder-forms) t nil)
     (let ((or-early-termination-function
            '(lambda (x) x)))
       (list 'if
             (parallelism-condition gran-form-exists gran-form)
             (list 'parallelize-fn ''or-list
                   (list 'closure-list-for-expression-list
                         remainder-forms)
                   or-early-termination-function)
             (list 'if (cons 'or remainder-forms) t nil))))))
