/************************************************************************/
/*                                                                      */
/*    vspline - a set of generic tools for creation and evaluation      */
/*              of uniform b-splines                                    */
/*                                                                      */
/*            Copyright 2015 - 2020 by Kay F. Jahnke                    */
/*                                                                      */
/*    The git repository for this software is at                        */
/*                                                                      */
/*    https://bitbucket.org/kfj/vspline                                 */
/*                                                                      */
/*    Please direct questions, bug reports, and contributions to        */
/*                                                                      */
/*    kfjahnke+vspline@gmail.com                                        */
/*                                                                      */
/*    Permission is hereby granted, free of charge, to any person       */
/*    obtaining a copy of this software and associated documentation    */
/*    files (the "Software"), to deal in the Software without           */
/*    restriction, including without limitation the rights to use,      */
/*    copy, modify, merge, publish, distribute, sublicense, and/or      */
/*    sell copies of the Software, and to permit persons to whom the    */
/*    Software is furnished to do so, subject to the following          */
/*    conditions:                                                       */
/*                                                                      */
/*    The above copyright notice and this permission notice shall be    */
/*    included in all copies or substantial portions of the             */
/*    Software.                                                         */
/*                                                                      */
/*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND    */
/*    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES   */
/*    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND          */
/*    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT       */
/*    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,      */
/*    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING      */
/*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR     */
/*    OTHER DEALINGS IN THE SOFTWARE.                                   */
/*                                                                      */
/************************************************************************/

/*! \file vector.h

    \brief code for horizontal vectorization in vspline
    
    vspline currently has three ways of approaching vectorization:
  
    - no vectorization. Scalar code is less complex since it does not
      have to aggregate the data into vectorization-friendly parcels,
      and for some data types, the performance is just as good as with
      vectorization. Use of scalar code results from setting the
      vectorization width to 1. This is usually a template argument
      going by the name 'vsize'.
  
    - Use of Vc for vectorization. This requires the presence of Vc
      during compilation and results in explicit vectorization for all
      elementary types Vc can handle. Vc provides code for several
      operations which are outside the scope of autovectorization,
      most prominently hardware gather and scatter operations, and the
      explicit vectorization with Vc makes sure that vectorization is
      indeed used whenever possible, rather than having to rely on the
      compiler to recognize the opportunity. Use of Vc has to be
      explicitly activated by defining USE_VC during compilation.
      Using this option usually produces the fastest code. The downside
      is the dependence on an external library which may or may not
      actually implement the intended vector operations with vector
      code for a given target: Newer processors may not yet be supported,
      or support may be implemented for part of the instructions only.
      Also, the Vc version coming from the distro's package management
      may not be up-to-date. Building processing pipelines based on
      Vc::SimdArray is, on the other hand, straightforward - the type
      is well-thought-out and there is good library support for many
      operations. Use of Vc triggers use of fallback code for elementary
      types which Vc can't vectorize - such types are pseudo-vectorized:
  
    - The third option is to produce code which is designed to be
      easily recognized by the compiler as amenable to autovectorization.
      This is a technique I call 'goading': data are processed in small
      aggregates of vector friendly size, resulting in inner loops
      which oftentimes are recognized by the autovectorization stage,
      resulting in hardware vector code if the compiler flags allow
      for it and the compiler can generate code for the intended target.
      Since this approach relies entirely on the compiler's capability
      to autovectorize the (deliberately vectorization-friendly) code,
      the mileage varies. If it works, this is a clean and simple
      solution. A disadvantage is the use of class simd_tv for
      vectorization, which is mildly exotic and very much a vspline
      creature - building processing pipelines using this type will
      not be as effortless as using Vc::SimdArray. As long as you're
      not building your own functors to be used with vspline's family
      of transform-like functions, the precise mode of vectorization
      remains an internal issue and you needn't concern yourself
      with it beyond choosing whether you want vspline to use Vc or not,
      and choosing a suitable vectorization width if the default does
      not suit you.
  
    It's important to understand that using simd_tv is not simply mapping,
    say, pixels of three floats to simd_tv of three floats - that would be
    'vertical' vectorization, which is represented by vspline's *scalar*
    code. Instead, vspline is coded to use *horizontal* vectorization,
    which produces vector data fitting the size of the vector unit's
    registers, where each element held by the vector has exactly the same
    meaning as every other: rather than vectors holding, like, the colour
    channels of a pixel, we have a 'red', a 'green' and a 'blue' vector
    holding, say, eight floats each. Horizontal vectorization is best
    explicitly coded, and if it is coded explicitly, the code structure
    itself suggests vectorization to the compiler. Using code like Vc
    gives more structure to this process and adds capabilities beyond the
    scope of autovectorization, but having the horizontal vectorization
    manifest in the code's structure already goes a long way, and if the
    'structurally' vectorized code autovectorizes well, that may well be
    'good enough' as it is. In my experience, it is often significantly
    faster than scalar code - provided the processor has vector units.

    Using Vc for SIMD aggregation is not always possible:
    
    - the user may not want/have Vc
    
    - Vc may not be able to vectorize a given fundamental
    
    struct simd_tv provides an implementation of a type exhibiting
    roughly the same interface as Vc::SimdArray, which is built on
    top of a vigra::TinyVector and uses loops to process the multiple
    data it holds. Many of these loops will be autovectorized,
    so the effect is quite similar to using 'proper' explicit SIMD
    code - the notable exception being gather/scatter access to memory,
    which - AFAICT - is not automatically translated to SIMD gathers
    and scatters.
    
    Most of the pseudo-SIMD capabilities are introduced by mixins,
    which might be useful to instrumentalize other 'host' types for
    SIMD emulation - std::valarray springs to mind, as it can do
    arithmetic, though this needs some finessing, as the binary
    operators return expression templates and the size is not a
    template argument.
    
    Using simd_tv as fallback if Vc is not available allows us to
    use most of the vectorized code in vspline as long as it is not
    Vc-specific. Vc-specific code is inside #ifdef USE_VC ... #endif
    preprocessor statements, and vspline provides an #else case where
    necessary (some code can be excluded completely from compilation
    if Vc is not used). Running vspline's vector code with simd_tv
    is slower than running it with Vc::SimdArray, but usually faster
    than running scalar code. I assume that the speed difference is
    largely due to the absence of hardware gather/scatter operations,
    which are not produced by autovectorization since their loop
    equivalents aren't recognized by the compilers as constructs
    which have a vectorized representation.
    
    This is new code, the emulation of Vc::SimdArray is not at all
    complete. What's provided here is the functionality of Vc::SimdArray
    which is actually used inside namespace vspline, so all vspline
    functions should function as expected. When using vspline's
    transform-like functions with functors using Vc code, the
    incomplete emulation will likely show, unless the use of Vc
    is limited to what's emulated in this header.
    
    The implementation of the mixins and the type itself are deliberately
    trivial for the time being to avoid introducing errors by trying to
    be comprehensive. If necessary, the type can evolve to perform better.
    
    Since simd_tv inherits from vigra::TinyVector, it's easy to
    'break out' of functions receiving simd_tv as arguments: simd_tv
    references can simply be cast to vigra::TinyVector references
    and passed on as such. So if you have a vspline::unary_functor
    with a vectorized eval overload, you can cast the incoming
    argument to const TinyVector&, the outgoing argument to
    TinyVector&, and then work on the TinyVector references. This
    may be helpful if the simd_tv gets in the way. If you just want
    to use vspline's transform-like functions, this gives you an
    easy way of processing arrays with multithreading and aggregation
    without forcing you to deal with the simd_tv data type.
    
    Note that this header is included by vspline/common.h, so this code
    is available throughout vspline.
*/

#ifndef VSPLINE_VECTOR_H
#define VSPLINE_VECTOR_H

#include "common.h"

#ifdef USE_VC

#include <Vc/Vc>

// KFJ 2019-02-21 Vc::SimdArray takes more than two template arguments,
// the trailing ones having defaults. Some syntactic slots expect a
// class template taking precisely a type and a size_t, so 'simd_array'
// is provided as an alias which fixes the trailing arguments at their
// defaults and exposes only the first two.

template < typename T , size_t SZ >
using simd_array = Vc::SimdArray < T , SZ > ;

// vc_in_use is true if the translation unit including this header
// is compiled with USE_VC defined, and false otherwise.

enum { vc_in_use = true } ;

#else // USE_VC is not defined

enum { vc_in_use = false } ;

#endif // USE_VC

namespace vspline
{
// The first section of code in this file provides the type 'simd_tv'
// which emulates SIMD data types and operations using a vigra::TinyVector.
// This type's capabilities are mostly provided as mixins, the type
// definition follows after the mixins.

/// generic_simd_memory_access is a mixin adding SIMD-typical memory
/// access (load, store, gather, scatter) to a class template X.
/// The mixin casts 'this' to X < T , N > and then only uses
/// operator[] on the result, so X < T , N > has to inherit from
/// this mixin and has to be indexable - nothing else is required.

template < template < typename , size_t > class X ,
           typename T , size_t N >
struct generic_simd_memory_access
{
  typedef X < T , N > derived_t ;

  /// load fetches N consecutive values, starting at p_src, with
  /// a plain loop.

  void load ( const T * const p_src )
  {
    derived_t & self = * static_cast < derived_t * > ( this ) ;
    for ( size_t lane = 0 ; lane < N ; ++lane )
      self [ lane ] = p_src [ lane ] ;
  }

  /// gather fetches N values from p_src, where the k-th value is
  /// taken from p_src [ indexes [ k ] ]. This is done with a plain
  /// loop as well. index_type is a template argument, so any type
  /// which can be subscripted with operator[] is acceptable.

  template < class index_type >
  void gather ( const T * const p_src ,
                const index_type & indexes )
  {
    derived_t & self = * static_cast < derived_t * > ( this ) ;
    for ( size_t lane = 0 ; lane < N ; ++lane )
      self [ lane ] = p_src [ indexes [ lane ] ] ;
  }

  /// store writes the N values held by the container to consecutive
  /// memory locations. Note that, in spite of its name, p_src is the
  /// *destination* of the operation.

  void store ( T * const p_src ) const
  {
    const derived_t & self = * static_cast < const derived_t * > ( this ) ;
    for ( size_t lane = 0 ; lane < N ; ++lane )
      p_src [ lane ] = self [ lane ] ;
  }

  /// scatter is gather's counterpart: the k-th value held by the
  /// container is written to p_src [ indexes [ k ] ]. Again p_src
  /// is the destination, and index_type merely has to be indexable.

  template < class index_type >
  void scatter ( T * const p_src ,
                 const index_type & indexes ) const
  {
    const derived_t & self = * static_cast < const derived_t * > ( this ) ;
    for ( size_t lane = 0 ; lane < N ; ++lane )
      p_src [ indexes [ lane ] ] = self [ lane ] ;
  }

} ;

/// mixin 'compare' adds the six relational operators to X < T , N >.
/// Each of them compares its operands element by element and yields
/// a mask, which is an X < bool , N >.

template < template < typename , size_t > class X ,
           typename T , size_t N >
struct compare
{
  typedef X < T , N > derived_t ;
  typedef X < bool , N > mask_t ;

// both operands are taken by value as derived_t. Whatever the caller
// passes is converted to derived_t if it can be used to construct
// one, so inside the function both operands can be indexed and one
// macro covers all operand combinations.

#define COMPARE_FUNC(OPFUNC,OP) \
  friend mask_t OPFUNC ( derived_t lhs , derived_t rhs ) \
  { \
    mask_t result ; \
    for ( size_t lane = 0 ; lane < N ; ++lane ) \
      result [ lane ] = ( lhs [ lane ] OP rhs [ lane ] ) ; \
    return result ; \
  }

  COMPARE_FUNC(operator<,<)
  COMPARE_FUNC(operator<=,<=)
  COMPARE_FUNC(operator>,>)
  COMPARE_FUNC(operator>=,>=)
  COMPARE_FUNC(operator==,==)
  COMPARE_FUNC(operator!=,!=)

#undef COMPARE_FUNC

  // TODO: test if providing comparison with T is faster than broadcasting
  // T arguments to derived_t by accepting the args per value
} ;

/// 'bitwise_op' 'rolls out' bitwise and, or and xor. Inside vspline,
/// this is only used for masks, so the operators are enabled only if
/// T is bool. Might be extended to integral types, though.
///
/// Both operands are taken by value as X < T , N >; the left operand's
/// copy is modified element by element and returned.

template < template < typename , size_t > class X ,
           typename T , size_t N >
struct bitwise_op
{
  typedef X < T , N > derived_t ;

// The constraint has to depend on a template argument of the function
// template itself, otherwise it is not evaluated in a SFINAE context.
// So the class' T is routed through the defaulted argument U, and
// enable_if's nested 'type' is used - naming the enable_if struct
// itself would always yield a valid type and never disable anything.

#define BITWISE_OP(OPFUNC,OPEQ) \
  template < typename U = T , \
             typename = typename std::enable_if \
                        < std::is_same < U , bool > :: value > :: type > \
  friend derived_t OPFUNC ( derived_t lhs , \
                            derived_t rhs ) \
  { \
    for ( size_t i = 0 ; i < N ; i++ ) \
      lhs [ i ] OPEQ rhs [ i ] ; \
    return lhs ; \
  }

  BITWISE_OP(operator&,&=)
  BITWISE_OP(operator|,|=)
  BITWISE_OP(operator^,^=)
  
#undef BITWISE_OP

} ;

/// 'broadcast_std_func' makes functions from namespace std available
/// for vectors by applying them to every element in turn. Only the
/// functions which vspline actually needs are enabled here; the set
/// might be extended. Most functions work without this mixin, because
/// vigra::TinyVector already provides the 'rollout'.

// TODO: abs is defined for vigra::TinyVector, but without rolling
// it out here I don't get it to work on simd_tv.

template < template < typename , size_t > class X ,
           typename T , size_t N >
struct broadcast_std_func
{
  typedef X < T , N > derived_t ;

// the argument is taken by value, so the copy can be processed
// in-place and returned.

#define BROADCAST_STD_FUNC(FUNC) \
  friend derived_t FUNC ( derived_t arg ) \
  { \
    for ( size_t lane = 0 ; lane < N ; ++lane ) \
      arg [ lane ] = std::FUNC ( arg [ lane ] ) ; \
    return arg ; \
  }

  BROADCAST_STD_FUNC(abs)
  BROADCAST_STD_FUNC(trunc)

// BROADCAST_STD_FUNC(round)
// BROADCAST_STD_FUNC(floor)
// BROADCAST_STD_FUNC(ceil)
// BROADCAST_STD_FUNC(log)
// BROADCAST_STD_FUNC(exp)
// BROADCAST_STD_FUNC(sqrt)

// BROADCAST_STD_FUNC(sin)
// BROADCAST_STD_FUNC(cos)
// BROADCAST_STD_FUNC(tan)
// BROADCAST_STD_FUNC(asin)
// BROADCAST_STD_FUNC(acos)
// BROADCAST_STD_FUNC(atan)

#undef BROADCAST_STD_FUNC

// #define BROADCAST_STD_FUNC2(FUNC) \
//   friend derived_t FUNC ( derived_t arg , derived_t arg2 ) \
//   { \
//     for ( size_t i = 0 ; i < N ; i++ ) \
//       arg [ i ] = std::FUNC ( arg [ i ] , arg2 [ i ] ) ; \
//     return arg ; \
//   }
// 
// BROADCAST_STD_FUNC2(atan2)
// 
// #undef BROADCAST_STD_FUNC2

// fma for simd_tv is currently defined via a template in common.h

// #define BROADCAST_STD_FUNC3(FUNC) \
//   friend derived_t FUNC ( const derived_t & arg1 , \
//                           const derived_t & arg2 , \
//                           const derived_t & arg3 ) \
//   { \
//     derived_t result ; \
//     for ( size_t i = 0 ; i < N ; i++ ) \
//       result [ i ] = FUNC ( arg1 [ i ] , arg2 [ i ] , arg3[i] ) ; \
//     return result ; \
//   }
// 
// BROADCAST_STD_FUNC3(fma)
// 
// #undef BROADCAST_STD_FUNC3

} ;

/// 'reduce_to_bool' provides any_of, all_of and none_of,
/// which reduce a vector to a boolean. currently unused,
/// simd_tv delegates to vigra::TinyVector's 'all' and 'any'

// template < template < typename , size_t > class X ,
//            typename T , size_t N >
// struct reduce_to_bool
// {
//   typedef X < T , N > derived_t ;
//   
//   friend bool any_of ( const derived_t & arg )
//   {
//     bool result = false ;
//     for ( size_t i = 0 ; i < N ; i++ )
//       result |= bool ( arg[i] ) ;
//     return result ;
//   }
// 
//   friend bool all_of ( const derived_t & arg )
//   {
//     bool result = true ;
//     for ( size_t i = 0 ; i < N ; i++ )
//       result &= bool ( arg[i] ) ;
//     return result ;
//   }
// 
//   friend bool none_of ( const derived_t & arg )
//   {
//     bool result = true ;
//     for ( size_t i = 0 ; i < N ; i++ )
//       result &= ( ! bool ( arg[i] ) ) ;
//     return result ;
//   }
// } ;

/// struct simd_tv inherits from TinyVector and gets the SIMD functionality
/// by inheriting a set of mixins. simd_tv is used inside vspline wherever
/// vectorized data are processed - unless Vc is available and capable of
/// vectorizing a given fundamental type. So Vc code is opt-in only -
/// vspline can function without it, oftentimes even without much of a
/// performance penalty, but if it can be used, Vc may provide an extra
/// speedup which is critical for some use scenarios. Within vspline, I
/// found the most notable speedup when performing b-spline evaluations
/// on float data, which is a very common requirement.
///
/// All helper macros used inside the class definition (OPEQ_FUNC,
/// SHIFTEQ_FUNC, OP_FUNC) are #undef'd right after use, so none of
/// them is visible to code including this header.

template < typename T , size_t N >
struct simd_tv
: public vigra::TinyVector < T , N > ,
  public generic_simd_memory_access < simd_tv , T , N > ,
  public compare < simd_tv , T , N > ,
  public bitwise_op < simd_tv , T , N > ,
  public broadcast_std_func < simd_tv , T , N >
{
  typedef T value_type ;
  typedef simd_tv < T , N > this_t ;
  typedef vigra::TinyVector < T , N > inner_t ;
  typedef simd_tv < bool , N > mask_type ;
  typedef simd_tv < int , N > index_type ;
  
  /// number of elements held by the vector

  static constexpr size_t size()
  {
    return N ;
  }

  /// emulate Vc's IndexesFromZero, which I use often: yields an
  /// index_type holding 0 , 1 , ... N - 1
  
  static const index_type IndexesFromZero()
  {
    index_type ix ;
    // the loop counter has the same type as N to avoid a comparison
    // of signed and unsigned values; index_type holds int.
    for ( size_t i = 0 ; i < N ; i++ )
      ix[i] = int ( i ) ;
    return ix ;
  }
  
  // perfect-forward any arguments to the base class ctor. Since
  // simd_tv inherits from vigra::TinyVector, this results in the
  // availability of all of vigra::TinyVector's ctor overloads.
  
  template < typename ... types >
  simd_tv ( types && ... args )
  : inner_t ( std::forward < types > ( args ) ... )
  { }

  // ctor from some other vector-like object which has the same
  // template argument signature <typename, size_t> and same size,
  // but element type U. the elements are taken singly, casting to T.
  
  template < typename U ,
             template < typename , size_t > class other_v >
  simd_tv ( const other_v < U , N > & other )
  {
    for ( size_t i = 0 ; i < N ; i++ )
      (*this)[i] = T ( other[i] ) ;
  }
  
#ifdef USE_VC

  // ctor taking a Vc::SimdArray which has additional
  // (normally default) template arguments.

  template < typename U , typename X , size_t x >
  simd_tv ( const Vc::SimdArray < U , N , X , x > & other )
  {
    for ( size_t i = 0 ; i < N ; i++ )
      (*this)[i] = T ( other[i] ) ;
  }
  
#endif

// overrides for assignment operators. For scalars, vigra only has op=
// with double arguments, here we define the operations taking the vector's
// value_type instead. We also define op= for vectors of equal type.

#define OPEQ_FUNC(OPFUNC,OPEQ) \
  this_t & OPFUNC ( const value_type & rhs ) \
  { \
    for ( size_t i = 0 ; i < N ; i++ ) \
      (*this) [ i ] OPEQ rhs ; \
    return *this ; \
  } \
  this_t & OPFUNC ( const this_t & rhs ) \
  { \
    for ( size_t i = 0 ; i < N ; i++ ) \
      (*this) [ i ] OPEQ rhs [ i ] ; \
    return *this ; \
  }

OPEQ_FUNC(operator+=,+=)
OPEQ_FUNC(operator-=,-=)
OPEQ_FUNC(operator*=,*=)
OPEQ_FUNC(operator/=,/=)
OPEQ_FUNC(operator%=,%=)
OPEQ_FUNC(operator&=,&=)
OPEQ_FUNC(operator|=,|=)
OPEQ_FUNC(operator^=,^=)

// OPEQ_FUNC is done with, don't let it leak out of this header

#undef OPEQ_FUNC

// bit-shifted assignment. The shift count is either a single int,
// which is applied to all elements, or a simd_tv of int providing
// one shift count per element.

#define SHIFTEQ_FUNC(OPFUNC,OPEQ) \
  this_t & OPFUNC ( const int & rhs ) \
  { \
    for ( size_t i = 0 ; i < N ; i++ ) \
      (*this) [ i ] OPEQ rhs ; \
    return *this ; \
  } \
  this_t & OPFUNC ( const simd_tv<int,N> & rhs ) \
  { \
    for ( size_t i = 0 ; i < N ; i++ ) \
      (*this) [ i ] OPEQ rhs [ i ] ; \
    return *this ; \
  }

SHIFTEQ_FUNC(operator<<=,<<=)
SHIFTEQ_FUNC(operator>>=,>>=)

#undef SHIFTEQ_FUNC

// binary operator and left and right scalar operations with
// a fundamental. vigra only accepts 'double' as scalar.
// We use operatorX= instead, returning the same type as
// the vector argument. Since this is code specifically
// overriding vigra::TinyVector functionality, it's not
// in a mixin.
// There are three variants per operator: vector OP scalar,
// vector OP vector and scalar OP vector. For the last one, the
// scalar is first broadcast to a vector, which then receives
// the right hand side with operatorX=.

#define OP_FUNC(OPFUNC,OPEQ) \
  this_t OPFUNC ( const value_type & rhs ) const \
  { \
    this_t help ( *this ) ; \
    for ( size_t i = 0 ; i < N ; i++ ) \
      help [ i ] OPEQ rhs ; \
    return help ; \
  } \
  this_t OPFUNC ( const this_t & rhs ) const \
  { \
    this_t help ( *this ) ; \
    for ( size_t i = 0 ; i < N ; i++ ) \
      help [ i ] OPEQ rhs [ i ] ; \
    return help ; \
  } \
  friend this_t OPFUNC ( const value_type & lhs , \
                         const this_t & rhs ) \
  { \
    this_t help ( lhs ) ; \
    for ( size_t i = 0 ; i < N ; i++ ) \
      help [ i ] OPEQ rhs [ i ] ; \
    return help ; \
  }    
  
OP_FUNC(operator+,+=)
OP_FUNC(operator-,-=)
OP_FUNC(operator*,*=)
OP_FUNC(operator/,/=)
OP_FUNC(operator%,%=)
OP_FUNC(operator&,&=)
OP_FUNC(operator|,|=)
OP_FUNC(operator^,^=)

#undef OP_FUNC

  // unary -, delegating to the base class. The base class result is
  // converted back to this_t on return.

  this_t operator-() const
  {
    const inner_t & inner ( *this ) ;
    return - inner ;
  }
  
  // vigra::TinyVector offers 'any' and 'all', so we delegate to the
  // vigra code instead of using the generic mixin 'reduce_to_bool'

  friend bool any_of( const this_t & arg )
  {
    return arg.any() ;
  }

  friend bool all_of ( const this_t & arg )
  {
    return arg.all() ;
  }

  friend bool none_of ( const this_t & arg )
  {
    return ! ( arg.any() ) ;
  }
  
} ; // end of struct simd_tv

// The next section codes use of vectorization in vspline.

/// traits class simd_traits provides three traits:
/// - 'hsize' holds the hardware vector width if applicable (used only with Vc)
/// - 'type': template yielding the vector type for a given vectorization width
/// - 'default_size': the default vectorization width to use for T
///
/// default simd_traits: without further specialization, T will be vectorized
/// as a vspline::simd_tv. This way, *all* types will be vectorized, there is
/// no more fallback to scalar code for certain types. Scalar code will only be
/// produced if the vectorization width is set to 1 in code taking this
/// datum as a template argument. Note that the type which simd_traits produces
/// for sz == 1 is T itself, not a simd_tv of one element.

template < typename T >
struct simd_traits
{
  template < size_t sz > using type =
    typename std::conditional < sz == 1 ,
                                T ,
                                vspline::simd_tv < T , sz >
                              > :: type ;
                              
  static const size_t hsize = 0 ;
  
  enum { default_size =   sizeof ( T ) > 64
                        ? 1
                        : 64 / sizeof ( T ) } ; // <<<< heuristic
} ;

#ifdef USE_VC

// in Vc ML discussion M. Kretz states that the set of types Vc can vectorize
// (with 1.3) is consistent throughout all ABIs, so the acceptable types can
// simply be listed without taking the ABI into account.
// For each of these types, 'simd_traits' is specialized so that the
// appropriate Vc::SimdArray is used. hsize is taken from Vc::Vector,
// and the default vectorization width is twice that value.

#define VC_SIMD(ELE) \
template<> struct simd_traits<ELE> \
{ \
  static const size_t hsize = Vc::Vector < ELE > :: size() ; \
  template < size_t sz > using type = \
    typename std::conditional \
             < sz != 1 , \
               Vc::SimdArray < ELE , sz > , \
               ELE \
             > :: type ; \
  enum { default_size = 2 * hsize } ; \
} ;

VC_SIMD(float)
VC_SIMD(double)
VC_SIMD(int)
VC_SIMD(unsigned int)
VC_SIMD(short)
VC_SIMD(unsigned short)

#undef VC_SIMD

#endif // USE_VC

/// with the definition of 'simd_traits', we can proceed to implement
/// 'vector_traits':
/// struct vector_traits is a traits class fixing the types used for
/// vectorized code in vspline.
/// with the types defined by vector_traits, a system of type names is
/// introduced which uses a set of patterns:
/// - 'ele' stands for 'elementary', the type of an aggregate's component
/// - 'nd' stands for 'n-dimensional', a type of an aggregate of one or more
///    components of a given elementary type.
/// - 'v' suffix indicates a 'simdized' type, vspline uses Vc::SimdArrays
///   and vigra::TinyVectors of Vc::SimdArrays if Vc is used and the type
///   can be used with Vc::SimdArray, and the equivalent types using
///   vspline::simd_tv instead of Vc::SimdArray otherwise.
/// the unspecialized definition of class vector_traits will vectorize
/// by concatenating instances of T into the type simd_traits produces,
/// taking, per default, as many T as the default_size given there.
/// This will work with any type T, even though it makes most sense with
/// fundamentals.

template < typename T ,
           size_t _vsize = 0 ,
           typename Enable = void >
struct vector_traits
{
  // T is not 'element-expandable', so 'dimension' is 1 and T is ele_type
  enum { dimension = 1 } ;
  typedef T ele_type ;
  
  // find the right vectorization width
  enum { size = _vsize == 0
                ? simd_traits < ele_type > :: default_size
                : _vsize } ;

  enum { vsize = size } ;
  enum { hsize = simd_traits < ele_type > :: hsize } ;
  
  // produce the 'synthetic' type,
  typedef vigra::TinyVector < ele_type , 1 > nd_ele_type ;
  
  // the vectorized type
  template < typename U , size_t sz >
  using vector = typename simd_traits < U > :: template type < sz > ;
  
  typedef vector < ele_type , vsize > ele_v ; 
  
  // and the 'synthetic' vectorized type
  typedef vigra::TinyVector < ele_v , 1 > nd_ele_v ;
  
  // for not 'element-expandable' T, we produce ele_v as 'type'
  typedef ele_v type ;
} ;

/// specialization of vector_traits for 'element-expandable' types.
/// These types are recognized by vigra's ExpandElementResult mechanism,
/// resulting in the formation of a 'vectorized' version of the type.
/// As explained above, vectorization is *horizontal*, so if T is, say,
/// a pixel of three floats, the type generated here will be a TinyVector
/// of three vectors of vsize floats.
/// All traits which depend on simd_traits are derived from the
/// *elementary* type, since this is the type which is vectorized.

template < typename T , size_t _vsize >
struct vector_traits
       < T ,
         _vsize ,
         typename std::enable_if
                  < vspline::is_element_expandable < T > :: value
                  > ::type
       >
{
  // T is 'element-expandable' - meaning it can be element-expanded
  // with vigra's ExpandElementResult mechanism. We use that to obtain
  // the elementary type and the dimension of T. Note that, if T is
  // fundamental, the resulting traits are the same as they would be
  // for the unspecialized case. What we're interested in here are
  // multi-channel types; that fundamentals are routed through here
  // is just as good as if they were routed through the unspecialized
  // case above.

  enum { dimension = vigra::ExpandElementResult < T > :: size } ;
  typedef typename vigra::ExpandElementResult < T > :: type ele_type ;
  
  // given the elementary type, we define nd_ele_type as a vigra::TinyVector
  // of ele_type. This is the 'synthetic' type.

  typedef vigra::TinyVector < ele_type , dimension > nd_ele_type ;
  
  // next we glean the number of elements a 'vector' should contain.
  // if the template argument 'vsize' was passed as 0, which is the default,
  // we use the default vector size which simd_traits provides. For
  // explicitly specified _vsize we take the explicitly specified value.

  enum { size = _vsize == 0
                ? simd_traits < ele_type > :: default_size
                : _vsize } ;

  // I prefer to use 'vsize' as it is more specific than mere 'size'
                
  enum { vsize = size } ;
  
  // hardware vector register size, if applicable - only used with Vc.
  // This has to be looked up for ele_type, just like default_size
  // above and like the unspecialized vector_traits does it: for a
  // multi-channel T, simd_traits < T > is the unspecialized
  // simd_traits, which reports an hsize of zero.
  
  enum { hsize = simd_traits < ele_type > :: hsize } ;

  // now we obtain the template for a vector of a given size. This will
  // be either Vc::SimdArray or vspline::simd_tv
  
  template < typename U , size_t sz >
  using vector = typename simd_traits < U > :: template type < sz > ;
  
  // using this template and the vectorization width we have established,
  // we obtain the vectorized data type for a component:

  typedef vector < ele_type , vsize > ele_v ; 
  
  // nd_ele_v is the 'synthetic' vectorized type, which is always a
  // TinyVector of the vectorized component type, possibly with only
  // one element:
  
  typedef vigra::TinyVector < ele_v , dimension > nd_ele_v ;
  
  // finally, 'type' is the 'canonical' vectorized type, meaning that if
  // T is a fundamental we produce the component vector type itself, but if
  // it is some aggregate (like a TinyVector) we produce a TinyVector of the
  // component vector data type. So if T is float, 'type' is a vector of float,
  // If T is a TinyVector of one float, 'type' is a TinyVector of one vector
  // of float.
  
  typedef typename std::conditional
    < std::is_fundamental < T > :: value ,
      ele_v ,
      nd_ele_v
    > :: type type ;
    
} ;

/// simdized_type is a shorthand for vector_traits' member 'type':
/// it yields the vectorized type for a given type T and a given
/// vectorization width N.

template < typename T , size_t N >
using simdized_type =
  typename vector_traits < T , N > :: type ;

// In order to avoid syntax which is specific to a specific vectorization
// method, I use some free functions for assignments, which avoid
// member functions of the vector objects. While this produces some
// notational inconvenience, it allows a formulation which is
// independent of the vectorization used. this way I can use Vc::SimdArray
// as a target of an assignment from another vectorized data type, which
// would be impossible with operator=, which has to be a member function.

// KFJ 2018-05-11 added variants of assign with T == U. I was
// surprised to see that this did not produce ambiguity
// problems, but it seems okay.
//
// There are now quite a few 'assign' overloads, so that the code
// has explicit variants for several argument constellations. I'm
// not sure if this is really necessary (TODO: test), but I want to
// keep the impact of 'breaking out' low, to enable pipeline code
// to move from one type of vectorized object to another without
// too much performance penalty.

/// To handle assignments of vectorized and unvectorized data uniformly,
/// I use free functions. This overload takes part in overload resolution
/// only if both T and U are fundamental types; it converts 'other' to T
/// with a function-style cast and assigns the result to 'self'.

template < typename T , typename U >
auto assign ( T & self , const U & other )
  -> typename std::enable_if
              <    std::is_fundamental < T > :: value
                && std::is_fundamental < U > :: value
              > :: type
{
  // function-style cast kept on purpose: it accepts every conversion
  // which an explicit cast between two fundamentals allows.
  self = T ( other ) ;
}

/// Variant for two fundamentals of the same type T: no conversion is
/// needed, so 'other' is assigned to 'self' directly. The overload is
/// only enabled if T is a fundamental type.

template < typename T >
auto assign ( T & self , const T & other )
  -> typename std::enable_if
              < std::is_fundamental < T > :: value > :: type
{
  self = other ;
}

/// This overload handles two simd_tv arguments with the same number
/// of elements sz and (possibly) different element types T and U.
/// Every element of 'other' is converted to T and stored at the same
/// index in 'self'.

template < typename T , typename U , size_t sz >
void assign ( vspline::simd_tv < T , sz > & self ,
              const vspline::simd_tv < U , sz > & other )
{
  // element-wise copy with conversion from U to T
  for ( size_t lane = 0 ; lane < sz ; ++lane )
    self [ lane ] = static_cast < T > ( other [ lane ] ) ;
}

/// Variant for two simd_tv with the same element type T and the same
/// size sz: no element-wise conversion is needed here, so the whole
/// object is assigned in one go with simd_tv's own assignment.

template < typename T , size_t sz >
void assign ( vspline::simd_tv < T , sz > & self ,
              const vspline::simd_tv < T , sz > & other )
{
  self = other ;
}

#ifdef USE_VC

/// with Vc in use, we have overloads for assignment from one
/// SimdArray to another and the mixed forms

/// Assignment between two Vc::SimdArray objects with sz elements each
/// and (possibly) different element types T and U: a SimdArray with
/// element type T is constructed from 'other' and then assigned to
/// 'self'.

template < typename T , typename U , size_t sz >
void assign ( Vc::SimdArray < T , sz > & self ,
              const Vc::SimdArray < U , sz > & other )
{
  typedef Vc::SimdArray < T , sz > target_type ;

  // the conversion from U to T is left to SimdArray's converting
  // constructor.
  self = target_type ( other ) ;
}

/// Variant for two Vc::SimdArray objects of identical type (same
/// element type T, same size sz): plain assignment, no conversion.

template < typename T , size_t sz >
void assign ( Vc::SimdArray < T , sz > & self ,
              const Vc::SimdArray < T , sz > & other )
{
  self = other ;
}

/// Mixed form: assignment from a simd_tv with element type U to a
/// Vc::SimdArray with element type T, both holding sz elements. The
/// elements are converted to T and copied one by one.

template < typename T , typename U , size_t sz >
void assign ( Vc::SimdArray < T , sz > & self ,
              const vspline::simd_tv < U , sz > & other )
{
  // element-wise copy with conversion from U to T
  for ( size_t lane = 0 ; lane < sz ; ++lane )
    self [ lane ] = static_cast < T > ( other [ lane ] ) ;
}

/// Mixed form for equal element types: assignment from a simd_tv to
/// a Vc::SimdArray, both with element type T and size sz. Instead of
/// copying element by element, the address of 'other' is reinterpreted
/// as a pointer to T and a SimdArray is constructed from that pointer.
//
// NOTE(review): this presumes that a simd_tv < T , sz > holds its sz
// elements of type T contiguously, starting at the address of the
// object itself, and the C-style cast also drops the const qualifier
// of 'other' - confirm against the definition of simd_tv.

template < typename T , size_t sz >
void assign ( Vc::SimdArray < T , sz > & self ,
              const vspline::simd_tv < T , sz > & other )
{
  typedef Vc::SimdArray < T , sz > target_type ;

  self = target_type ( ( T* ) ( & other ) ) ;
}

/// Mixed form: assignment from a Vc::SimdArray with element type U
/// to a simd_tv with element type T, both holding sz elements. The
/// elements are converted to T and copied one by one.

template < typename T , typename U , size_t sz >
void assign ( vspline::simd_tv < T , sz > & self ,
              const Vc::SimdArray < U , sz > & other )
{
  // element-wise copy with conversion from U to T
  for ( size_t lane = 0 ; lane < sz ; ++lane )
    self [ lane ] = static_cast < T > ( other [ lane ] ) ;
}

/// Mixed form for equal element types: assignment from a Vc::SimdArray
/// to a simd_tv, both with element type T and size sz. The address of
/// 'other' is reinterpreted as a pointer to T and a simd_tv is
/// constructed from that pointer.
//
// NOTE(review): this presumes that a Vc::SimdArray < T , sz > holds its
// sz elements of type T contiguously, starting at the address of the
// object itself, and that simd_tv can be constructed from a T*. The
// C-style cast also drops the const qualifier of 'other' - confirm
// against Vc's and simd_tv's definitions.

template < typename T , size_t sz >
void assign ( vspline::simd_tv < T , sz > & self ,
              const Vc::SimdArray < T , sz > & other )
{
  typedef vspline::simd_tv < T , sz > target_type ;

  self = target_type ( ( T* ) ( & other ) ) ;
}

#endif

/// Assignment between two TinyVectors with the same number of
/// components sz. The components are handled one by one, and each
/// pair of components is passed on to 'assign', so compilation fails
/// if T and U aren't either fundamentals or vector types. This is
/// deliberate: 'assign' is only meant for the specific argument
/// constellations occurring in its overloads, it is not a universal
/// way of assigning any two objects.

template < typename T , typename U , int sz >
void assign ( vigra::TinyVector < T , sz > & self ,
              const vigra::TinyVector < U , sz > & other )
{
  // the index is an int, like TinyVector's size template argument
  for ( int component = 0 ; component < sz ; ++component )
    assign ( self [ component ] , other [ component ] ) ;
}

/// Generic free function for masked assignment. For every index at
/// which 'mask' is true, the entry of 'other' is assigned to the entry
/// of 'self' at the same index; all other entries of 'self' are left
/// as they are. 'other' may be of any type, as long as a
/// vector < T , N > can be constructed from it - so a scalar 'other'
/// is broadcast, provided the vector type's constructor does that.
///
/// In my ongoing effort to factor out vectorization, I use this function
/// template in preference to Vc's convenient X(mask) = Y syntax. This
/// provides a uniform interface to masked assignment which can be used
/// with any indexable type.
//
// TODO investigate elaboration of this function by extending it into
// a family with diversified argument spectrum (like, scalar 'other',
// Vc::SimdArray + non-Vc mask etc.)

template < template < typename , size_t > class vector ,
           typename T , size_t N , class other_type >
void assign_if ( vector < T , N > & self ,
                 const typename
                       vector < T , N > :: mask_type & mask ,
                 const other_type & other )
{
  // first bring 'other' into the shape of the target, so that it can
  // be indexed in the same way as 'self' and 'mask'.
  vector < T , N > source ( other ) ;

  for ( size_t lane = 0 ; lane < N ; ++lane )
  {
    if ( mask [ lane ] )
    {
      self [ lane ] = source [ lane ] ;
    }
  }
}

#ifdef USE_VC

/// overload of assign_if for Vc::SimdArray, using Vc's convenient
/// syntax of X(mask) = Y, which may be more efficient as well.
/// 'self' is the target of the masked assignment, 'mask' is the
/// SimdArray's own mask_type, and 'other' is handed to Vc's masked
/// assignment as it is, without constructing a helper vector first.

template < typename T , size_t N , class other_type >  
void assign_if ( Vc::SimdArray < T , N > & self ,
                 const typename
                       Vc::SimdArray < T , N > :: mask_type & mask ,
                 const other_type & other )
{
  // Vc's write-masked assignment
  self ( mask ) = other ;
}

#endif

} ; // end of namespace vspline

#endif // #ifndef VSPLINE_VECTOR_H
