# ========================== begin_copyright_notice ============================
#
# Copyright (C) 2019-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
# =========================== end_copyright_notice =============================

#===----------------------------------------------------------------------===//
#
# This file defines all of the GenX-specific intrinsics, which correspond to
# vISA instructions.
#
# Comment lines introduced by a triple hash (###) are extracted and
# appended to docs/Targets/GenX/GenXLangRef.rst to give the GenX backend
# language reference in docs/autogenerated/Targets/GenX/GenXLangRef.rst.
#
#===------------------------------------------------------------------------===#

#------------ Currently Supported Types ----------------------
#PointerTypes = ["ptr_private", "ptr_global", "ptr_constant", "ptr_local", "ptr_generic"]
#FloatingPointTypes = ["half", "float", "double"]
#IntegerTypes = ["bool", "char", "short", "int", "long"]
#AdditionalTypes = ["vararg"]
#IntrinsicsProperties = ["None", "NoMem", "ReadArgMem", "ReadMem", "ReadWriteArgMem", "NoReturn", "NoDuplicate", "Convergent"]
#IntrinsicsProperties may be specified as a comma separated list(e.g., "Convergent,NoMem")
#
# EX. "blah": {"result" : {return_type}, "arguments" : [arg1_type, arg2_type.....], "attributes" : Property }
#
# The "any" type can be followed by a default type if a type is not explicitly specified : Ex. "any:int"
#
# 0 - LLVMMatchType<0>
# 1 - LLVMMatchType<1>
# {int} - LLVMMatchType<{int}>

#------------ Supported platforms ----------------------
# Every intrinsic has an optional field "platforms" : "CPU"
# CPU can be any from "platforms" in Intrinsics.py or "ALL"
# when field is absent - ALL by default
# additional commands :
# "CPU" = "-SKL" - unsupported since SKL
# "CPU" = "KBL+" - supported from KBL
# "CPU" = "~ICLLP" - unsupported on ICLLP
# CPU can be list:
# ["CNL+", "KBL"] - supported on KBL and on every platform starting from CNL
# ["ALL", "~TGLLP"] - supported everywhere except TGLLP

Imported_Intrinsics = \
{

##--------------------------------------------------------------------
## Start and end markers of the genx intrinsic enum values. This relies on
## tablegen outputting the intrinsics sorted by name.
    "aaaabegin" : { "result" : "anyvector",
                    "arguments" : [],
                    "attributes" : "None"
                  },
    "zzzzend" : { "result" : "anyvector",
                  "arguments" : [],
                  "attributes" : "None"
                },

### ``llvm.genx.alloca.<any type>`` : CMC internal, no VISA
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### Indicates memory allocation in thread-private memory
###
### * arg0: type to allocate in thread-private memory
###
### * Return value: offset in stack surface
###
    "alloca" : { "result" : "anyint",
                 "arguments" : ["any"],
                 "attributes" : "None"
               },

### ``llvm.genx.faddr.<any type>`` : take an address of the function provided
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### Takes an address of the provided function which then may be used
### in VISA indirect call instruction.
###
### * arg0: function to take address of
###
### * Return value: i64 address ready to be consumed by an indirect call
###
    "faddr" : { "result" : "long",
                "arguments" :  ["any"],
                "attributes" :  "NoMem"
              },

## --------------------------------
### Region/element access intrinsics
### --------------------------------
###
### ``llvm.genx.rdregion*.<return type>.<vector type>.<any int>`` : read a region, direct or single-indirect
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.rdregioni`` : integer element type (not i1)
### * ``llvm.genx.rdregionf`` : fp element type
###
### * arg0: vector to read region out of (overloaded)
### * arg1: i32 vstride in elements, constant
### * arg2: i32 width in elements, constant
### * arg3: i32 stride in elements, constant
### * arg4: i16 or vXi16 offset in bytes (overloaded)
### * arg5: i32 parent width, constant, ignored if offset is constant
###
### * Return value: the region extracted
###
### The return type must be a vector with the same element type as the input
### vector, and number of elements giving the total size of the region.
### A scalar can be used instead of a 1-vector.
###
### There are two variants, an integer one and an fp one, because the
### intrinsic declaration language does not let us declare the return type
### as any scalar or vector int or fp type.
###
### The element type must be an integral power of two number of bytes up to
### and including 8 bytes in size, thus one of i8, i16, i32, i64, half,
### float, double. In particular i1 is not allowed.
### The width must be non-zero and must divide the total size evenly.
###
### There is no requirement on vstride, width, stride or total size being
### a power of two or having any maximum.
###
### The offset in bytes arg can be i16 or vector of i16. If a vector, then
### its vector width must be the height of the region, i.e. the total
### size of the region divided by the width.
###
### The parent width arg is ignored if the offset arg is constant. If the
### offset arg is variable, then a non-undef parent width is a statement
### that the value of offset is such that a row of the region does not
### cross a multiple of parent width boundary. This is used by the backend
### to determine whether the region can be collapsed into another region.
###
    "rdregioni" : { "result" : "anyint",
                    "arguments" : ["anyvector","int","int","int","anyint","int"],
                    "attributes" : "NoMem"
                  },
    "rdregionf" : { "result" : "anyfloat",
                    "arguments" : ["anyvector","int","int","int","anyint","int"],
                    "attributes" : "NoMem"
                  },

### ``llvm.genx.wrregion*`` : write a region, direct or single-indirect
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.wrregioni.<return type>.<any int>.<any int>.<any int>`` : integer element type (not i1)
### * ``llvm.genx.wrregionf.<return type>.<any float>.<any int>.<any int>`` : fp element type
###
### * arg0: vector to write region in to
### * arg1: subvector or scalar to write into the region (overloaded)
### * arg2: i32 vstride in elements, constant
### * arg3: i32 width in elements, constant
### * arg4: i32 stride in elements, constant
### * arg5: i16 or vXi16 offset in bytes (overloaded)
### * arg6: i32 parent width, constant, ignored if offset is constant
### * arg7: vector of i1 mask, or scalar i1 (overloaded)
###
### * Return value: the updated vector with the region modified
###
### The return type must be a vector with the same type as the arg0 vector.
### The arg1 subvector must have the same element type as the arg0 vector
### and be no larger. Arg1 can be a scalar if the number of elements in
### the subregion is 1.
###
### There are two variants, an integer one and an fp one, because the
### intrinsic declaration language does not let us declare the arg1 type
### as any scalar or vector int or fp type.
###
### The element type must be an integral power of two number of bytes up to
### and including 8 bytes in size, thus one of i8, i16, i32, i64, half,
### float, double. In particular i1 is not allowed.
### The width must be non-zero and must divide the total size evenly.
###
### The arg7 mask is a vector of booleans, exactly as wide as the
### arg1 subvector, such that an element of the subvector is written into
### its place in the vector only if the corresponding element of the mask
### is true.
### Alternatively, arg7 can be a single i1 constant with value 1,
### meaning that the wrregion is unconditional.
###
### There is no requirement on vstride, width, stride or total size being
### a power of two or having any maximum.
###
### The offset in bytes arg can be i16 or vector of i16. If a vector, then
### its vector width must be the height of the region, i.e. the total
### size of the region divided by the width.
###
### After lowering, the arg1 subvector to write can be a scalar of the same
### type as an element of arg0, indicating that the region has one element.
### (Lowering lowers an insertelement to this type of wrregion.)
###
### The parent width arg is ignored if the offset arg is constant. If the
### offset arg is variable, then a non-undef parent width is a statement
### that the value of offset is such that a row of the region does not
### cross a multiple of parent width boundary. This is used by the backend
### to determine whether the region can be collapsed into another region.
###
    "wrregioni" : { "result" : "anyvector",
                    "arguments" : [0,"anyint","int","int","int","anyint","int","anyint"],
                    "attributes" : "NoMem"
                  },
    "wrregionf" : { "result" : "anyvector",
                    "arguments" : [0,"anyfloat","int","int","int","anyint","int","anyint"],
                    "attributes" : "NoMem"
                  },

### ``llvm.genx.vstore.<vector type>.<ptr type>`` : store a vector value into memory
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### This intrinsic has the exact semantics of an llvm store instruction.
### It is designed for reading and writing a pass-by-reference argument
### and it stops llvm optimizations from optimizing away accesses to the
### pass-by-reference arguments.
###
### * arg0: the vector to read from
### * arg1: the memory to be accessed
###
    "vstore" : { "result" : "void",
                 "arguments" : ["anyvector","anyptr"],
                 "attributes" : "None"
               },

### ``llvm.genx.vload.<return type>.<ptr type>`` : load a vector value from memory
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### This intrinsic has the exact semantics of an llvm load instruction.
### It is designed for reading and writing a pass-by-reference argument
### and it stops llvm optimizations from optimizing away accesses to the
### pass-by-reference arguments.
###
### * arg0: the memory to be accessed (overloaded)
### * Return value: the vector value read
###
    "vload" : { "result" : "anyvector",
                "arguments" : ["anyptr"],
                "attributes" : "None"
              },

## ------------------------------
### ALU type conversion intrinsics
### ------------------------------

### ``llvm.genx.fptosi.sat.<return type>.<any float>`` : convert floating point to signed integer with saturate
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: value to saturate, any scalar or vector floating point type (overloaded)
###
### * Return value: converted value, any scalar or vector integer type
###               (treated as signed) with same vector width as arg0
###
    "fptosi_sat" : { "result" : "anyint",
                     "arguments" : ["anyfloat"],
                     "attributes" : "NoMem"
                   },

### ``llvm.genx.fptoui.sat.<return type>.<any float>`` : convert floating point to unsigned integer with saturate
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: value to saturate, any scalar or vector floating point type (overloaded)
###
### * Return value: converted value, any scalar or vector integer type
###               (treated as unsigned) with same vector width as arg0
###
    "fptoui_sat" : { "result" : "anyint",
                     "arguments" : ["anyfloat"],
                     "attributes" : "NoMem"
                   },

### ``llvm.genx.sat.<return type>.<return type>`` : floating point saturate
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: value to saturate, any scalar or vector floating point type
###
### * Return value: saturated value, same type as arg0
###
### We represent floating point saturation by simply calling this intrinsic
### on the result of a floating point operation. This works because the
### value before saturation fits in the same type.
###
### We do not have an equivalent for integer saturation, because the
### before-saturation value needs a bigger integer type than the result.
### Instead, any integer operation that supports saturation needs an
### intrinsic for the saturating variant.
###
    "sat" : { "result" : "anyfloat",
              "arguments" : [0],
              "attributes" : "NoMem"
            },

### ``llvm.genx.*trunc.sat.<return type>.<any int>`` : integer truncation with saturation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.sstrunc.sat`` : signed result, signed operand
### * ``llvm.genx.sutrunc.sat`` : signed result, unsigned operand
### * ``llvm.genx.ustrunc.sat`` : unsigned result, signed operand
### * ``llvm.genx.uutrunc.sat`` : unsigned result, unsigned operand
###
### * arg0: value to truncate, any scalar or vector integer type (overloaded)
###
### * Return value: truncated value, any scalar or vector integer type
###               with same vector width as arg0
###
    "sstrunc_sat" : { "result" : "anyint",
                      "arguments" : ["anyint"],
                      "attributes" : "NoMem"
                    },
    "sutrunc_sat" : { "result" : "anyint",
                      "arguments" : ["anyint"],
                      "attributes" : "NoMem"
                    },
    "ustrunc_sat" : { "result" : "anyint",
                      "arguments" : ["anyint"],
                      "attributes" : "NoMem"
                    },
    "uutrunc_sat" : { "result" : "anyint",
                      "arguments" : ["anyint"],
                      "attributes" : "NoMem"
                    },

## -------------------
### Modifier intrinsics
### -------------------
###
### Abs is the only source modifier that is represented
### by an intrinsic; neg(x) uses 0-x, and not(x) uses x^-1.
###
### ``llvm.genx.abs*.<return type>`` : take absolute value
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.absf`` : abs modifier for fp
### * ``llvm.genx.absi`` : abs modifier for integer
###
### * arg0: input value, scalar/vector
###
### * Return value: result, same type
###
    "absf" : { "result" : "anyfloat",
               "arguments" : [0],
               "attributes" : "NoMem"
             },
    "absi" : { "result" : "anyint",
               "arguments" : [0],
               "attributes" : "NoMem"
             },

## ----------------------------
### Boolean reduction intrinsics
### ----------------------------

### ``llvm.genx.all.<any int>`` : true if all input elements are true
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value: v*i1 (overloaded)
###
### * Return value: i1 result
###
    "all" : { "result" : "bool",
              "arguments" : ["anyint"],
              "attributes" : "NoMem"
            },

### ``llvm.genx.any.<any int>`` : true if any input element is true
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value: v*i1 (overloaded)
###
### * Return value: i1 result
###
    "any" : { "result" : "bool",
              "arguments" : ["anyint"],
              "attributes" : "NoMem"
            },

## ----------------------------
### SIMD control flow intrinsics
### ----------------------------
###
### ``goto`` and ``join`` instructions are represented by ``llvm.genx.simdcf.goto``
### and ``llvm.genx.simdcf.join`` intrinsics.
###
### The Architectural model
### ^^^^^^^^^^^^^^^^^^^^^^^
###
### The Architectural model defines SIMD control flow in terms of each of the 32 channels
### having a PcIP (per-channel instruction pointer), which determines where a
### disabled channel will be re-enabled:
###
### * A goto has two targets, UIP (update IP) and JIP (join IP).
###
###   - A (forward) goto evaluates its vector condition, and, for each channel
###     that is enabled and the condition is true, it sets the channel's PcIP to
###     UIP, to mark that the channel is disabled until execution reaches the
###     join instruction at UIP. If, after disabling channels in this way, no
###     channels are left enabled, then execution jumps to JIP.
###
###     UIP and JIP may be different, as there may be channels already disabled
###     from an earlier goto with their PcIPs set to an earlier point than the
###     present goto's UIP. So JIP needs to be set to the earliest point that
###     a channel could have its PcIP pointing at.
###
###   - There is also a backward goto variant for use in a conditional loop
###     back edge (end of a do..while loop). It works the same as a forward goto
###     over an unconditional jump back to the top of the loop.
###
### * A join has one target, JIP. It reenables all channels that have PcIP set
###   to this join. If there are still no channels enabled, it jumps to JIP.
###
### * Each instruction's register write-back is gated by which channels are
###   enabled, unless the instruction has a nomask bit set. This is in addition
###   to optionally being gated by a predicate.
###
### * The action of the channel enable mask (and predicate) in a send depends
###   on the shared function. Some (e.g. gather and scatter) have the expected
###   semantics where disabled channels do not participate in the memory read/write,
###   and (in the case of a read) do not update that channel's result.
###
### This scheme allows arbitrarily unstructured SIMD control flow. For it to work
### and guarantee convergence, it is sufficient (not sure if it is necessary)
### for there to be a linear chain of join points, and each goto/join's UIP and
### JIP are forward in the chain, and JIPs are set correctly so it is not possible
### for execution to "miss out" a join point where a channel should have been
### enabled. (As above, a backward goto is handled in this
### model by being considered a forward goto over a backward unconditional jump.)
###
### In Gen code, this linear chain of join points does not actually have to be in
### program order, as long as the join point order with forward UIP and JIP is
### derivable.
###
### In vISA, the linear chain of join points does have to be in program order.
### vISA does not encode the JIP of a goto/join; instead it derives it itself.
### Also, vISA uses whether a goto's target is before or after to encode whether
### it is a conditional loop backedge branch.
###
### The LLVM IR model
### ^^^^^^^^^^^^^^^^^
###
### The model we use in LLVM IR is very similar to the above.
###
### The PcIP (per-channel instruction pointer) is replaced by:
###
### * a global (in the function) EM (execution mask), with each channel having a
###   bit that is 1 when the channel is enabled;
###
### * each join point has a RM (resume mask), with each channel having a bit
###   that is 1 if the channel is disabled and due to be re-enabled when execution
###   reaches that join point.
###
### A goto is represented by the ``llvm.genx.simdcf.goto`` intrinsic. Its
### inputs are the current EM value, the current RM value for its UIP, and the
### vector condition. Its results are the updated EM value, the updated RM
### value for its UIP, and a scalar bool that says whether all channels are now
### disabled and execution should branch to the JIP. This last result is then
### (usually) used in a standard LLVM conditional ``br`` instruction.
###
### A goto is implicitly attached to its UIP join by the input and output RM
### values being part of a web of RM values connected by goto and phi nodes
### and used in that join.
###
### A join is represented by the ``llvm.genx.simdcf.join`` intrinsic. Its
### inputs are the current EM value and the current RM value for this join.
### Its results are the updated EM value (this join's RM value is now effectively
### all zeros so it is not returned as a result), and a scalar bool that says whether
### all channels are still disabled and execution should branch to the JIP.
### This last result is then (optionally) used in a standard LLVM conditional
### ``br`` instruction.
###
### An instruction's register write-back being gated by which channels are enabled
### is modeled by the current EM value (or the appropriate size left slice of it)
### being used as the predicate in a select or wrregion or shared function
### intrinsic.
###
### Note that EM is always 32 bit, but a join's RM may be smaller as it has the same
### vector width as the condition on all gotos that update it.
###
### This model is equivalent to the Architectural model, as long as:
###
### * there is only ever one EM value live at a time with an initial value in a
###   function of either all ones or the passed in call mask;
###
### * for each join point, there is only ever one RM value live at a time with an
###   initial value in a function of all zeros, and a value after the join point of
###   all zeros;
###
### * it is possible to re-order the code such that the "false" target of a
###   conditional branch that a goto or join is attached to is fall-through, and
###   all JIPs and UIPs are forward.
###
### Like any other variable with multiple values transformed to SSA, different
### EM values may be joined with a phi node. Similarly, for a particular join point's
### RM, different RM values may be joined with a phi node.
###
### The  ``llvm.genx.simdcf.goto`` and ``llvm.genx.simdcf.join`` intrinsics can
### only be generated to ``goto`` and ``join`` instructions if the GenX backend
### deems them to be used in a way that is equivalent to the Architectural model. Otherwise,
### they are lowered to equivalent but slower code that implements the semantics
### of the spec of the intrinsics below.
###
### There are more detailed requirements on the use of these intrinsics to be able
### to generate them to ``goto`` and ``join`` instructions documented in the
### GenXSimdCFConformance pass.
###
### ``llvm.genx.simdcf.goto.<return type>.<vector type>.<vector type>`` : goto instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: OldEM (old execution mask): v32i1 (overloaded)
### * arg1: OldRM (old resume mask): vector of i1 (overloaded)
### * arg2: SimdCond (the SIMD control flow condition): same type as arg1
###
### Return value: struct with the following elements:
###
### * ret0: NewEM (updated execution mask): v32i1
### * ret1: NewRM (updated resume mask): same type as arg1
### * ret2: BranchCond: i1
###
### The elements of the returned struct are calculated as follows:
###
### * NewEM = OldEM & ~(SimdCond zero extended to v32i1)
### * NewRM = OldRM | (SimdCond & (OldEM truncated to size of SimdCond))
### * BranchCond = !any(NewEM truncated to size of SimdCond)
###
### ``llvm.genx.simdcf.goto`` represents a Gen goto instruction, taking a
### vector condition, modifying the global EM and the UIP's RM, and
### resulting in a scalar condition to be used in a conditional branch whose
### "true" successor is the goto's JIP.
###
### If the BranchCond result is not used, then the goto's JIP is set to the
### join immediately after.
###
### If the BranchCond result is used in a conditional branch, and JIP is
### later than the earliest join point
### where a channel would be re-enabled, then it is undefined whether the
### resulting goto instruction's JIP is as specified here, or an earlier join
### point. (This rule is to allow for the vISA finalizer re-deriving the JIPs.)
###
### If the goto intrinsic's conditional branch simply branches over an empty block
### with an unconditional branch, then the GenX backend takes the intrinsic and
### the two branches to be a do..while back edge, giving a Gen ``goto``
### instruction with BranchCtrl=1, UIP set to the successor of the unconditional
### branch (the top of the do..while loop), and JIP set to the following join
### instruction.
###
### Channels already disabled in EM remain disabled. For enabled channels,
### any channel whose element in SimdCond is true becomes disabled in EM, and
### the corresponding bit in RM is set such that the channel becomes re-enabled
### upon reaching the RM's join point. If all channels in EM are then disabled,
### then BranchCond is true and the conditional branch in which it is used
### branches to the next join point in sequence.
###
### Note that SimdCond has the same sense as in the Gen goto instruction, but
### the opposite sense to that in a vISA forward goto instruction.
###
    "simdcf_goto" : { "result" : ["anyvector","anyvector","bool"],
                      "arguments" : [0,1,1],
                      "attributes" : "NoMem"
                    },

### ``llvm.genx.simdcf.join.<return type>.<vector type>`` : join instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: OldEM (old execution mask): v32i1 (overloaded)
### * arg1: RM (resume mask): vector of i1
###
### Return value: struct with the following elements:
###
### * ret0: NewEM (updated execution mask): v32i1
### * ret1: BranchCond: i1
###
### The elements of the returned struct are calculated as follows:
###
### * NewEM = OldEM | (RM zero extended to v32i1)
### * BranchCond = !any(NewEM truncated to size of RM)
###
### This is marked as having side effects to stop LLVM removing an otherwise
### unused join at an outer endif.
###
### ``llvm.genx.simdcf.join`` represents a Gen join instruction, using the join
### point's RM, modifying the global EM, and resulting in a scalar condition to
### be used (optionally) in a conditional branch whose "true" successor is
### the join's JIP.
###
### If the BranchCond result is not used, then the join's JIP is undefined; this
### case is used when it is known that at least one channel is enabled after
### the join so JIP will never be used.
###
### If the BranchCond result is used in a conditional branch, and JIP is
### later than the earliest join point
### where a channel would be re-enabled, then it is undefined whether the
### resulting join instruction's JIP is as specified here, or an earlier join
### point. (This rule is to allow for the vISA finalizer re-deriving the JIPs.)
###
### Note that vISA does not have a join instruction; the vISA finalizer
### recovers the join points from the goto instructions assuming a linear order.
###
### Channels with a set bit in RM become enabled in EM. If all channels in EM are
### still disabled, then BranchCond is true and the conditional branch in which it
### is used branches to the next join point in sequence.
###
    "simdcf_join" : { "result" : ["anyvector","bool"],
                      "arguments" : [0,"anyvector"],
                      "attributes" : "None"
                    },

### ``llvm.genx.simdcf.savemask.<any vector>`` :
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: OldEM (old execution mask): v32i1 (overloaded)
### * ret:  temp i32 for saving the oldEM
    "simdcf_savemask" : { "result" : "int",
                          "arguments" : ["anyvector"],
                          "attributes" : "WriteMem,SideEffects"
                        },

### ``llvm.genx.simdcf.unmask.<return type>`` :
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: temp i32 from savemask
### * arg1: i32 constant, should be all-one
### * ret:  NewEM (updated execution mask, all-one): v32i1
    "simdcf_unmask" : { "result" : "anyvector",
                        "arguments" : ["int","int"],
                        "attributes" : "WriteMem,SideEffects"
                      },

### ``llvm.genx.simdcf.remask.<return type>`` :
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: OldEM (old execution mask): v32i1
### * arg1: temp i32 for restoring the EM
###
### Return value: NewEM (updated execution mask): v32i1
###
    "simdcf_remask" : { "result" : "anyvector",
                        "arguments" : [0,"int"],
                        "attributes" : "WriteMem,SideEffects"
                      },

### ``llvm.genx.simdcf.get.em`` :
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: EM (execution mask): v32i1
###
### * Return value: temp v32i1 to store EM
###
### This intrinsic prevents manipulations on EM usage
### and allows CM to create explicit value from EM.
### No masks are modified by this intrinsic.
###
### The WriteMem and SideEffects markers are used to
### prevent this instruction from being moved: in fact,
### EM is different in different locations even when the
### dominance of DF is not corrupted.
###
    "simdcf_get_em" : { "result" : "anyvector",
                        "arguments" : [0],
                        "attributes" : "WriteMem,SideEffects"
                      },

## --------------
### ALU intrinsics
### --------------

### add
### ^^^
### Non-saturating add intrinsic is not needed. A vISA non-saturating add
### where the result type is different to the operand type is represented
### by trunc/zext/sext of each operand and then an LLVM IR Add instruction.
###

### ``llvm.genx.*add.sat.<return type>.<any int>`` : add instruction with saturation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.ssadd.sat`` : result signed, operands signed
### * ``llvm.genx.suadd.sat`` : result signed, operands unsigned
### * ``llvm.genx.usadd.sat`` : result unsigned, operands signed
### * ``llvm.genx.uuadd.sat`` : result unsigned, operands unsigned
###
### * arg0: first input, any scalar/vector integer type, even i64 (overloaded)
### * arg1: second input, same type as arg0
###
### * Return value: result, any scalar or vector integer type with same
###               vector width
###
### For an fp add, use the LLVM IR FAdd instruction, followed by
### llvm.genx.sat if saturation is required.
###
    "ssadd_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1],
                    "attributes" : "NoMem"
                  },
    "suadd_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1],
                    "attributes" : "NoMem"
                  },
    "usadd_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1],
                    "attributes" : "NoMem"
                  },
    "uuadd_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1],
                    "attributes" : "NoMem"
                  },

### asr
### ^^^
### asr intrinsic is not needed. Because asr cannot overflow, an asr that
### saturates with a smaller result type than the execution type can be
### represented by an LLVM IR AShr instruction followed by an llvm.genx.sstrunc.sat.
###

### ``llvm.genx.*avg.<return type>.<any int>`` : integer averaging, no saturation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.ssavg`` : result signed, operands signed
### * ``llvm.genx.suavg`` : result signed, operands unsigned
### * ``llvm.genx.usavg`` : result unsigned, operands signed
### * ``llvm.genx.uuavg`` : result unsigned, operands unsigned
###
### * arg0: first input, any scalar/vector integer type (not i64) (overloaded)
### * arg1: second input, same type as arg0
###
### * Return value: result, any scalar/vector integer type (not i64)
###               with same vector width
###
    "ssavg" : { "result" : "anyint",
                "arguments" : ["anyint",1],
                "attributes" : "NoMem"
              },
    "suavg" : { "result" : "anyint",
                "arguments" : ["anyint",1],
                "attributes" : "NoMem"
              },
    "usavg" : { "result" : "anyint",
                "arguments" : ["anyint",1],
                "attributes" : "NoMem"
              },
    "uuavg" : { "result" : "anyint",
                "arguments" : ["anyint",1],
                "attributes" : "NoMem"
              },

### ``llvm.genx.*avg.sat.<return type>.<any int>`` : integer averaging with saturation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.ssavg.sat`` : result signed, operands signed
### * ``llvm.genx.suavg.sat`` : result signed, operands unsigned
### * ``llvm.genx.usavg.sat`` : result unsigned, operands signed
### * ``llvm.genx.uuavg.sat`` : result unsigned, operands unsigned
###
### * arg0: first input, any scalar/vector integer type (not i64) (overloaded)
### * arg1: second input, same type as arg0
###
### * Return value: result, any scalar/vector integer type (not i64)
###               with same vector width
###
    "ssavg_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1],
                    "attributes" : "NoMem"
                  },
    "suavg_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1],
                    "attributes" : "NoMem"
                  },
    "usavg_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1],
                    "attributes" : "NoMem"
                  },
    "uuavg_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1],
                    "attributes" : "NoMem"
                  },

### ``llvm.genx.*bfe.<return type>`` : bitfield extract
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.sbfe`` : bitfield extract, signed result
### * ``llvm.genx.ubfe`` : bitfield extract, unsigned result
###
### * arg0: first input, any scalar/vector i32 type
### * arg1: second input, same type as arg0
### * arg2: third input, same type as arg0
###
### * Return value: result, same type as arg0
###
    "sbfe" : { "result" : "anyint",
               "arguments" : [0,0,0],
               "attributes" : "NoMem"
             },
    "ubfe" : { "result" : "anyint",
               "arguments" : [0,0,0],
               "attributes" : "NoMem"
             },

### ``llvm.genx.bfi.<return type>`` : bitfield insert
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: first input, any scalar/vector i32 type
### * arg1: second input, same type as arg0
### * arg2: third input, same type as arg0
### * arg3: fourth input, same type as arg0
###
### * Return value: result, same type as arg0
###
    "bfi" : { "result" : "anyint",
              "arguments" : [0,0,0,0],
              "attributes" : "NoMem"
            },

### ``llvm.genx.bfrev.<return type>`` : reverse bits
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: first input, any scalar/vector i32 type
###
### * Return value: result, same type as arg0
###
    "bfrev" : { "result" : "anyint",
                "arguments" : [0],
                "attributes" : "NoMem"
              },

### ``llvm.genx.cbit.<return type>.<any int>`` : count set bits
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: first input, any scalar/vector integer type (overloaded)
###
### * Return value: result, int32 of same width as arg0
###
    "cbit" : { "result" : "anyint",
               "arguments" : ["anyint"],
               "attributes" : "NoMem"
             },

### cmp
### ^^^
### No intrinsic needed as the LLVM IR ICmp and FCmp instructions cover
### vISA functionality
###

### ``llvm.genx.cos.<return type>`` : cos instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value, any scalar/vector half/float type
###
### * Return value: result, same type
###
    "cos" : { "result" : "anyfloat",
              "arguments" : [0],
              "attributes" : "NoMem"
            },

### div
### ^^^
### No intrinsic needed as the LLVM IR SDiv, UDiv and FDiv instructions
### cover vISA functionality
###

### ``llvm.genx.ieee.div.<return type>`` : Divide, IEEE variant
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: first input, any scalar/vector float/double type
### * arg1: second input, same type
###
### * Return value: result, same type
###
    "ieee_div" : { "result" : "anyfloat",
                   "arguments" : [0,0],
                   "attributes" : "NoMem"
                 },

### ``llvm.genx.dp2.<return type>`` : dp2 instruction (dot product on groups of 4 elements)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: first input value, any vector float with a multiple of 4 elements
### * arg1: second input value, same type as arg0
###
### * Return value: result, same type
###
    "dp2" : { "result" : "anyfloat",
              "arguments" : [0,0],
              "attributes" : "NoMem"
            },

### ``llvm.genx.dp3.<return type>`` : dp3 instruction (dot product on groups of 3 elements)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: first input value, any vector float with a multiple of 4 elements
### * arg1: second input value, same type as arg0
###
### * Return value: result, same type
###
    "dp3" : { "result" : "anyfloat",
              "arguments" : [0,0],
              "attributes" : "NoMem"
            },

### ``llvm.genx.dp4.<return type>`` : dp4 instruction (dot product on groups of 4 elements)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: first input value, any vector float with a multiple of 4 elements
### * arg1: second input value, same type as arg0
###
### * Return value: result, same type
###
    "dp4" : { "result" : "anyfloat",
              "arguments" : [0,0],
              "attributes" : "NoMem"
            },

### ``llvm.genx.dph.<return type>`` : dph instruction (dot product homogenous)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: first input value, any vector float with a multiple of 4 elements
### * arg1: second input value, same type as arg0
###
### * Return value: result, same type
###
    "dph" : { "result" : "anyfloat",
              "arguments" : [0,0],
              "attributes" : "NoMem"
            },

### ``llvm.genx.exp.<return type>`` : base 2 exponent
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value, any scalar/vector half/float type
###
### * Return value: result, same type
###
    "exp" : { "result" : "anyfloat",
              "arguments" : [0],
              "attributes" : "NoMem"
            },

### ``llvm.genx.*fbh.<return type>`` : find bit high
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.sfbh`` : find bit high, signed operand
### * ``llvm.genx.ufbh`` : find bit high, unsigned operand
###
### * arg0: input value, any scalar/vector i32 type
###
### * Return value: result, same type
###
    "sfbh" : { "result" : "anyint",
               "arguments" : [0],
               "attributes" : "NoMem"
             },
    "ufbh" : { "result" : "anyint",
               "arguments" : [0],
               "attributes" : "NoMem"
             },

### ``llvm.genx.fbl.<return type>`` : find bit low
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value, any scalar/vector i32 type
###
### * Return value: result, same type
###
    "fbl" : { "result" : "anyint",
              "arguments" : [0],
              "attributes" : "NoMem"
            },

### ``llvm.genx.frc.<return type>`` : fractional part
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value, any scalar/vector float type
###
### * Return value: result, same type
###
    "frc" : { "result" : "anyfloat",
              "arguments" : [0],
              "attributes" : "NoMem"
            },

### ``llvm.genx.inv.<return type>`` : reciprocal
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value, any scalar/vector half/float type
###
### * Return value: result, same type
###
    "inv" : { "result" : "anyfloat",
              "arguments" : [0],
              "attributes" : "NoMem"
            },

### ``llvm.genx.line.<return type>`` : linear equation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: first input value, vector float with exactly 4 elements
### * arg1: second input value, vector float with a multiple of 4 elements
###
### * Return value: result, same type as arg1
###
    "line" : { "result" : "anyfloat",
               "arguments" : ["float4",0],
               "attributes" : "NoMem"
             },

### ``llvm.genx.log.<return type>`` : base 2 logarithm
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value, any scalar/vector half/float type
###
### * Return value: result, same type
###
    "log" : { "result" : "anyfloat",
              "arguments" : [0],
              "attributes" : "NoMem"
            },

### ``llvm.genx.lrp.<return type>`` : linear interpolation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: first input value, any vector float with a multiple of 4 elements
### * arg1: second input value, same type as arg0
### * arg2: third input value, same type as arg0
###
### * Return value: result, same type
###
    "lrp" : { "result" : "anyfloat",
              "arguments" : [0,0,0],
              "attributes" : "NoMem"
            },

### ``llvm.genx.lzd.<return type>`` : leading zero detection
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value, any scalar/vector i32 type
###
### * Return value: result, same type
###
    "lzd" : { "result" : "anyint",
              "arguments" : [0],
              "attributes" : "NoMem"
            },

### ``llvm.genx.*mad.<return type>.<any int>`` : mad instruction, no saturation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.ssmad`` : result signed, operands signed
### * ``llvm.genx.sumad`` : result signed, operands unsigned
### * ``llvm.genx.usmad`` : result unsigned, operands signed
### * ``llvm.genx.uumad`` : result unsigned, operands unsigned
###
### result := arg0 * arg1 + arg2
###
### * Return value: result, any scalar or vector integer type with same
###                 vector width
###
### * arg0: first input, any scalar/vector integer type (not i64) (overloaded)
### * arg1: second input, same type as arg0
### * arg2: third input, same type as result
###
    "ssmad" : { "result" : "anyint",
                "arguments" : ["anyint",1,0],
                "attributes" : "NoMem"
              },
    "sumad" : { "result" : "anyint",
                "arguments" : ["anyint",1,0],
                "attributes" : "NoMem"
              },
    "usmad" : { "result" : "anyint",
                "arguments" : ["anyint",1,0],
                "attributes" : "NoMem"
              },
    "uumad" : { "result" : "anyint",
                "arguments" : ["anyint",1,0],
                "attributes" : "NoMem"
              },

### ``llvm.genx.*mad.sat.<return type>.<any int>`` : mad instruction with saturation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.ssmad.sat`` : result signed, operands signed
### * ``llvm.genx.sumad.sat`` : result signed, operands unsigned
### * ``llvm.genx.usmad.sat`` : result unsigned, operands signed
### * ``llvm.genx.uumad.sat`` : result unsigned, operands unsigned
###
### result := sat(arg0 * arg1 + arg2)
###
### * Return value: result, any scalar or vector integer type with same
###                 vector width
###
### * arg0: first input, any scalar/vector integer type (not i64) (overloaded)
### * arg1: second input, same type as arg0
### * arg2: third input, same type as result
###
    "ssmad_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1,0],
                    "attributes" : "NoMem"
                  },
    "sumad_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1,0],
                    "attributes" : "NoMem"
                  },
    "usmad_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1,0],
                    "attributes" : "NoMem"
                  },
    "uumad_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1,0],
                    "attributes" : "NoMem"
                  },

### ``llvm.genx.*max.<return type>.<any int>`` : max instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.smax`` : result and operands signed
### * ``llvm.genx.umax`` : result and operands unsigned
### * ``llvm.genx.fmax`` : result and operands float
###
### * arg0: first input, any scalar/vector integer/float type, even i64 (overloaded)
### * arg1: second input, same type as arg0
###
### * Return value: result, any scalar or vector integer/float type with same
###               vector width
###
### There is no need for a saturating variant of this intrinsic.
### Because max cannot overflow, a saturating max can be represented
### by this non-saturating max followed by the applicable one of the
### saturating trunc intrinsics.
###
    "smax" : { "result" : "anyint",
               "arguments" : ["anyint",1],
               "attributes" : "NoMem"
             },
    "umax" : { "result" : "anyint",
               "arguments" : ["anyint",1],
               "attributes" : "NoMem"
             },
    "fmax" : { "result" : "anyfloat",
               "arguments" : ["anyfloat",1],
               "attributes" : "NoMem"
             },

### ``llvm.genx.*min.<return type>`` : min instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.smin.<any int>`` : result and operands signed
### * ``llvm.genx.umin.<any int>`` : result and operands unsigned
### * ``llvm.genx.fmin.<any float>`` : result and operands float
###
### * arg0: first input, any scalar/vector integer/float type, even i64 (overloaded)
### * arg1: second input, same type as arg0
###
### * Return value: result, any scalar or vector integer/float type with same
###               vector width
###
### There is no need for a saturating variant of this intrinsic.
### Because min cannot overflow, a saturating min can be represented
### by this non-saturating min followed by the applicable one of the
### saturating trunc intrinsics.
###
    "smin" : { "result" : "anyint",
               "arguments" : ["anyint",1],
               "attributes" : "NoMem"
             },
    "umin" : { "result" : "anyint",
               "arguments" : ["anyint",1],
               "attributes" : "NoMem"
             },
    "fmin" : { "result" : "anyfloat",
               "arguments" : ["anyfloat",1],
               "attributes" : "NoMem"
             },

### mod
### ^^^
### No intrinsic needed as the LLVM IR SRem, URem and FRem instructions
### cover vISA functionality
###

### imad
### ^^^^
###
### ``llvm.genx.*imad.<{hi, lo}>.<any int>`` : imad instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.simad`` : result and operands signed
### * ``llvm.genx.uimad`` : result and operands unsigned
###
### result := {hi, lo} = arg0 * arg1 + arg2
###
### * arg0: first input, i32 scalar/vector integer type
### * arg1: second input, same type as arg0
### * arg2: third input, same type as arg0
###
    "simad" : { "result" : ["anyint", "anyint"],
                "arguments" :  [0, 0, 0],
                "attributes" :  "NoMem"
              },
    "uimad" : { "result" : ["anyint", "anyint"],
                "arguments" :  [0, 0, 0],
                "attributes" :  "NoMem"
              },

### mul
### ^^^
### Still need non-saturating mul intrinsic as def-hoist/copy-prop in jitter
### cannot fully remove the trunc/zext/sext on each operand.
###
### ``llvm.genx.*mul.<return type>.<any int>`` : mul instruction, no saturation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.ssmul`` : result signed, operands signed
### * ``llvm.genx.sumul`` : result signed, operands unsigned
### * ``llvm.genx.usmul`` : result unsigned, operands signed
### * ``llvm.genx.uumul`` : result unsigned, operands unsigned
###
### result := arg0 * arg1
###
### * Return value: result, any scalar or vector integer type with same
###                 vector width
###
### * arg0: first input, any scalar/vector integer type (not i64) (overloaded)
### * arg1: second input, same type as arg0
###
    "ssmul" : { "result" : "anyint",
                "arguments" : ["anyint",1],
                "attributes" : "NoMem"
              },
    "sumul" : { "result" : "anyint",
                "arguments" : ["anyint",1],
                "attributes" : "NoMem"
              },
    "usmul" : { "result" : "anyint",
                "arguments" : ["anyint",1],
                "attributes" : "NoMem"
              },
    "uumul" : { "result" : "anyint",
                "arguments" : ["anyint",1],
                "attributes" : "NoMem"
              },

### ``llvm.genx.*mul.sat.<return type>.<any int>`` : mul instruction with saturation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.ssmul.sat`` : result signed, operands signed
### * ``llvm.genx.sumul.sat`` : result signed, operands unsigned
### * ``llvm.genx.usmul.sat`` : result unsigned, operands signed
### * ``llvm.genx.uumul.sat`` : result unsigned, operands unsigned
###
### * arg0: first input, any scalar/vector integer type (not i64) (overloaded)
### * arg1: second input, same type as arg0
###
### * Return value: result, any scalar/vector integer type with same
###               vector width, even i64
###
### For an fp mul, use the LLVM IR FMul instruction, followed by
### llvm.genx.sat if saturation is required.
###
    "ssmul_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1],
                    "attributes" : "NoMem"
                  },
    "sumul_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1],
                    "attributes" : "NoMem"
                  },
    "usmul_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1],
                    "attributes" : "NoMem"
                  },
    "uumul_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1],
                    "attributes" : "NoMem"
                  },

### ``llvm.genx.*mulh.<return type>.<any int>`` : mulh instruction, no saturation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.smulh`` : signed
### * ``llvm.genx.umulh`` : unsigned
###
### * arg0: first input, any scalar/vector i32 type (overloaded)
### * arg1: second input, same type as arg0
###
### * Return value: result, same type as arg0
###
    "smulh" : { "result" : "anyint",
                "arguments" : ["anyint",1],
                "attributes" : "NoMem"
              },
    "umulh" : { "result" : "anyint",
                "arguments" : ["anyint",1],
                "attributes" : "NoMem"
              },

### not
### ^^^
### Intrinsic not needed; use LLVM IR Xor instruction with -1
###

### or
### ^^
### Intrinsic not needed; use LLVM IR Or instruction
###

### ``llvm.genx.pln.<return type>.<any float>`` : plane equation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: first input value, vector float with exactly 4 elements
### * arg1: second input value, vector float with a multiple of 16 elements (overloaded)
###
### * Return value: result, vector float with half as many elements as arg1
###
    "pln" : { "result" : "anyfloat",
              "arguments" : ["float4","anyfloat"],
              "attributes" : "NoMem"
            },

### ``llvm.genx.pow.<return type>`` : power
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: first input, any scalar/vector half/float type
### * arg1: second input, same type
###
### * Return value: result, same type
###
    "pow" : { "result" : "anyfloat",
              "arguments" : [0,0],
              "attributes" : "NoMem"
            },

### ``llvm.genx.rndd.<return type>`` : round down
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value, any scalar/vector float type
###
### * Return value: result, same type
###
    "rndd" : { "result" : "anyfloat",
               "arguments" : [0],
               "attributes" : "NoMem"
             },

### ``llvm.genx.rnde.<return type>`` : round to even
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value, any scalar/vector float type
###
### * Return value: result, same type
###
    "rnde" : { "result" : "anyfloat",
               "arguments" : [0],
               "attributes" : "NoMem"
             },

### ``llvm.genx.rndu.<return type>`` : round up
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value, any scalar/vector float type
###
### * Return value: result, same type
###
    "rndu" : { "result" : "anyfloat",
               "arguments" : [0],
               "attributes" : "NoMem"
             },

### ``llvm.genx.rndz.<return type>`` : round to zero
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value, any scalar/vector float type
###
### * Return value: result, same type
###
    "rndz" : { "result" : "anyfloat",
               "arguments" : [0],
               "attributes" : "NoMem"
             },

### ``llvm.genx.rsqrt.<return type>`` : reciprocal square root
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value, any scalar/vector half/float type
###
### * Return value: result, same type
###
    "rsqrt" : { "result" : "anyfloat",
                "arguments" : [0],
                "attributes" : "NoMem"
              },

### ``llvm.genx.*sad2.<return type>.<any int>`` : two-wide sum of absolute differences
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.ssad2`` : signed argument and result
### * ``llvm.genx.usad2`` : unsigned argument and result
###
### * arg0: first input, vector of i8, multiple of 2 wide (overloaded)
### * arg1: second input, same type
###
### * Return value: result, vector of i16 of same vector width
###
    "ssad2" : { "result" : "anyint",
                "arguments" : ["anyint",1],
                "attributes" : "NoMem"
              },
    "usad2" : { "result" : "anyint",
                "arguments" : ["anyint",1],
                "attributes" : "NoMem"
              },

### ``llvm.genx.*sad2add.<return type>.<any int>`` : two-wide sum of absolute differences and add
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.sssad2add`` : signed result and args
### * ``llvm.genx.uusad2add`` : unsigned result and args
### * ``llvm.genx.ussad2add`` : unsigned result and signed args
### * ``llvm.genx.susad2add`` : signed result and unsigned args
###
### * arg0: first input, vector of i8, multiple of 2 wide (overloaded)
### * arg1: second input, same type
### * arg2: third input, vector of i16 of same vector width
###
### * Return value: result, same type as arg2
###
    "sssad2add" : { "result" : "anyint",
                    "arguments" : ["anyint",1,0],
                    "attributes" : "NoMem"
                  },
    "uusad2add" : { "result" : "anyint",
                    "arguments" : ["anyint",1,0],
                    "attributes" : "NoMem"
                  },
    "ussad2add" : { "result" : "anyint",
                    "arguments" : ["anyint",1,0],
                    "attributes" : "NoMem"
                  },
    "susad2add" : { "result" : "anyint",
                    "arguments" : ["anyint",1,0],
                    "attributes" : "NoMem"
                  },

### ``llvm.genx.*sad2add.sat.<return type>.<any int>`` : two-wide sum of absolute differences and add, saturated
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.sssad2add.sat`` : signed result and args
### * ``llvm.genx.uusad2add.sat`` : unsigned result and args
### * ``llvm.genx.ussad2add.sat`` : unsigned result and signed args
### * ``llvm.genx.susad2add.sat`` : signed result and unsigned args
###
### * arg0: first input, vector of i8, multiple of 2 wide (overloaded)
### * arg1: second input, same type
### * arg2: third input, vector of i16 of same vector width
###
### * Return value: result, same type as arg2
###
    "sssad2add_sat" : { "result" : "anyint",
                        "arguments" : ["anyint",1,0],
                        "attributes" : "NoMem"
                      },
    "uusad2add_sat" : { "result" : "anyint",
                        "arguments" : ["anyint",1,0],
                        "attributes" : "NoMem"
                      },
    "ussad2add_sat" : { "result" : "anyint",
                        "arguments" : ["anyint",1,0],
                        "attributes" : "NoMem"
                      },
    "susad2add_sat" : { "result" : "anyint",
                        "arguments" : ["anyint",1,0],
                        "attributes" : "NoMem"
                      },

### ``llvm.genx.*shl.<return type>.<any int>`` : shl instruction, no saturation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.ssshl`` : result signed, operands signed
### * ``llvm.genx.sushl`` : result signed, operands unsigned
### * ``llvm.genx.usshl`` : result unsigned, operands signed
### * ``llvm.genx.uushl`` : result unsigned, operands unsigned
###
### * arg0: first input, any scalar/vector integer type, even i64 (overloaded)
### * arg1: second input, same type as arg0
###
### * Return value: result, any scalar or vector integer type with same
###               vector width, even i64
###
    "ssshl" : { "result" : "anyint",
                "arguments" : ["anyint",1],
                "attributes" : "NoMem"
              },
    "sushl" : { "result" : "anyint",
                "arguments" : ["anyint",1],
                "attributes" : "NoMem"
              },
    "usshl" : { "result" : "anyint",
                "arguments" : ["anyint",1],
                "attributes" : "NoMem"
              },
    "uushl" : { "result" : "anyint",
                "arguments" : ["anyint",1],
                "attributes" : "NoMem"
              },

### ``llvm.genx.*shl.sat.<return type>.<any int>`` : shl instruction with saturation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.ssshl.sat`` : result signed, operands signed
### * ``llvm.genx.sushl.sat`` : result signed, operands unsigned
### * ``llvm.genx.usshl.sat`` : result unsigned, operands signed
### * ``llvm.genx.uushl.sat`` : result unsigned, operands unsigned
###
### * arg0: first input, any scalar/vector integer type, even i64 (overloaded)
### * arg1: second input, same type as arg0
###
### * Return value: result, any scalar/vector integer type with same
###               vector width, even i64
###
    "ssshl_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1],
                    "attributes" : "NoMem"
                  },
    "sushl_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1],
                    "attributes" : "NoMem"
                  },
    "usshl_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1],
                    "attributes" : "NoMem"
                  },
    "uushl_sat" : { "result" : "anyint",
                    "arguments" : ["anyint",1],
                    "attributes" : "NoMem"
                  },

### shr
### ^^^
### Intrinsic is not needed. Because shr cannot overflow, an shr that
### saturates with a smaller result type than the execution type can be
### represented by an LLVM IR Shr instruction then an llvm.genx.sstrunc.sat.
###

### ``llvm.genx.ro*.<return type>.<any int>`` : rol and ror instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.rol`` : rotate left
### * ``llvm.genx.ror`` : rotate right
###
### * arg0: first input, any scalar/vector integer type (even i64) (overloaded)
### * arg1: second input, same type as arg0
###
### * Return value: result, any scalar or vector integer type with same
###               vector width (even i64)
###
    "rol" : { "result" : "anyint",
              "arguments" : ["anyint",1],
              "attributes" : "NoMem"
            },
    "ror" : { "result" : "anyint",
              "arguments" : ["anyint",1],
              "attributes" : "NoMem"
            },

### ``llvm.genx.sin.<return type>`` : sin instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value, any scalar/vector half/float type
###
### * Return value: result, same type
###
    "sin" : { "result" : "anyfloat",
              "arguments" : [0],
              "attributes" : "NoMem"
            },

### ``llvm.genx.sqrt.<return type>`` : square root
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value, any scalar/vector half/float type
###
### * Return value: result, same type
###
    "sqrt" : { "result" : "anyfloat",
               "arguments" : [0],
               "attributes" : "NoMem"
             },

### ``llvm.genx.ieee.sqrt.<return type>`` : square root, IEEE variant
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: input value, any scalar/vector float/double type
###
### * Return value: result, same type
###
    "ieee_sqrt" : { "result" : "anyfloat",
                    "arguments" : [0],
                    "attributes" : "NoMem"
                  },

### ``llvm.genx.dpas.<return type>.<vector type>.<vector type>`` : dpas instruction (Dot Product Accumulate Systolic)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: accumulator, vector integer/float type
### * arg1: src1 (W), vector integer/float type (overloaded)
### * arg2: src2 (A), vector integer/float type (overloaded)
### * arg3: integer, encodes information about the operation type
###
### * Return value: result, same type as arg0
###
    "dpas" : { "result" : "anyvector",
               "arguments" : [0,"anyvector","anyvector","int"],
               "attributes" : "NoMem"
             },

### ``llvm.genx.dpas2.<return type>.<vector type>.<vector type>.<vector type>`` : dpas instruction (Dot Product Accumulate Systolic)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: accumulator (first input value), vector integer/float type (overloaded)
### * arg1: src1 input value, vector integer/float type (overloaded)
### * arg2: src2 input value, vector integer/float type (overloaded)
### * arg3: int, PrecisionType of src1
### * arg4: int, PrecisionType of src2
### * arg5: int, SystolicDepth
### * arg6: int, RepeatCount
### * arg7: int, signedness of dst (0 - unsigned, 1 - signed)
### * arg8: int, signedness of src0
###
### * Return value: result
###
    "dpas2" : { "result" : "anyvector",
                "arguments" : ["anyvector","anyvector","anyvector","int","int", "int", "int", "int", "int"],
                "attributes" : "NoMem"
              },

### ``llvm.genx.dpas.nosrc0.<return type>.<vector type>.<vector type>`` : dpas instruction (Dot Product Accumulate Systolic) with no src0
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: second input value, vector integer/float type (overloaded)
### * arg1: third input value, vector integer/float type (overloaded)
### * arg2: fourth input value, integer type
###
### * Return value: result
###
    # Same operand list as "dpas" without the leading accumulator operand:
    # two "anyvector" operands followed by an int.
    "dpas_nosrc0" : { "result" : "anyvector",
                      "arguments" : ["anyvector","anyvector","int"],
                      "attributes" : "NoMem"
                    },

### ``llvm.genx.dpasw.<return type>.<vector type>.<vector type>`` : dpasw instruction (Dot Product Accumulate Systolic)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: first input value, vector integer/float type
### * arg1: second input value, vector integer/float type (overloaded)
### * arg2: third input value, vector integer/float type (overloaded)
### * arg3: fourth input value, integer type
###
### * Return value: result, same type as arg0
###
    # The first argument is declared as 0 (LLVMMatchType<0>), so it takes the
    # same type as the overloaded result. The two "anyvector" operands are
    # overloaded independently; the last operand is a plain int.
    "dpasw" : { "result" : "anyvector",
                "arguments" : [0,"anyvector","anyvector","int"],
                "attributes" : "NoMem"
              },

### ``llvm.genx.dpasw.nosrc0.<return type>.<vector type>.<vector type>`` : dpasw instruction (Dot Product Accumulate Systolic) with no src0
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: second input value, vector integer/float type (overloaded)
### * arg1: third input value, vector integer/float type (overloaded)
### * arg2: fourth input value, integer type
###
### * Return value: result
###
    # Same operand list as "dpasw" without the leading operand that matches
    # the result type: two "anyvector" operands followed by an int.
    "dpasw_nosrc0" : { "result" : "anyvector",
                       "arguments" : ["anyvector","anyvector","int"],
                       "attributes" : "NoMem"
                     },

### ``llvm.genx.*dp4a*.<return type>.<vector type>.<vector type>.<vector type>`` : dp4a instruction (Dot Product 4 Accumulate)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.ssdp4a`` : result signed, operands signed
### * ``llvm.genx.sudp4a`` : result signed, operands unsigned
### * ``llvm.genx.usdp4a`` : result unsigned, operands signed
### * ``llvm.genx.uudp4a`` : result unsigned, operands unsigned
### * ``llvm.genx.ssdp4a_sat`` : result signed saturated, operands signed
### * ``llvm.genx.sudp4a_sat`` : result signed saturated, operands unsigned
### * ``llvm.genx.usdp4a_sat`` : result unsigned saturated, operands signed
### * ``llvm.genx.uudp4a_sat`` : result unsigned saturated, operands unsigned
###
###
### * arg0: first input value, vector integer type (overloaded)
### * arg1: second input value, vector integer type (overloaded)
### * arg2: third input value, vector integer type (overloaded)
###
### * Return value: result, vector integer type
###
    # All eight dp4a variants below declare the same signature ("anyvector"
    # result, three "anyvector" operands, "NoMem"); only the intrinsic name
    # differs between them.
    "ssdp4a" : { "result" : "anyvector",
                 "arguments" : ["anyvector","anyvector","anyvector"],
                 "attributes" : "NoMem"
               },
    "sudp4a" : { "result" : "anyvector",
                 "arguments" : ["anyvector","anyvector","anyvector"],
                 "attributes" : "NoMem"
               },
    "usdp4a" : { "result" : "anyvector",
                 "arguments" : ["anyvector","anyvector","anyvector"],
                 "attributes" : "NoMem"
               },
    "uudp4a" : { "result" : "anyvector",
                 "arguments" : ["anyvector","anyvector","anyvector"],
                 "attributes" : "NoMem"
               },
    "ssdp4a_sat" : { "result" : "anyvector",
                     "arguments" : ["anyvector","anyvector","anyvector"],
                     "attributes" : "NoMem"
                   },
    "sudp4a_sat" : { "result" : "anyvector",
                     "arguments" : ["anyvector","anyvector","anyvector"],
                     "attributes" : "NoMem"
                   },
    "usdp4a_sat" : { "result" : "anyvector",
                     "arguments" : ["anyvector","anyvector","anyvector"],
                     "attributes" : "NoMem"
                   },
    "uudp4a_sat" : { "result" : "anyvector",
                     "arguments" : ["anyvector","anyvector","anyvector"],
                     "attributes" : "NoMem"
                   },

### addc
### ^^^^
###
### ``llvm.genx.addc.<{carry, add}>.<any int>`` : add with carry
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.addc`` :
###
### * arg0: first input, i32 scalar/vector integer type
### * arg1: second input, same type as arg0
    # "result" is a two-element list ({carry, add} per the heading above).
    # Both arguments are declared as 0 (LLVMMatchType<0>).
    "addc" : { "result" : ["anyint", "anyint"],
               "arguments" :  [0, 0],
               "attributes" :  "NoMem"
             },

### subb
### ^^^^
###
### ``llvm.genx.subb.<{borrow, sub}>.<any int>`` : sub with borrow
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.subb`` :
###
### * arg0: first input, i32 scalar/vector integer type
### * arg1: second input, same type as arg0
    # "result" is a two-element list ({borrow, sub} per the heading above).
    # Both arguments are declared as 0 (LLVMMatchType<0>).
    "subb" : { "result" : ["anyint", "anyint"],
               "arguments" :  [0, 0],
               "attributes" :  "NoMem"
             },

### add3
### ^^^^
###
### ``llvm.genx.*add3.<return type>.<any int>`` : add3 instruction without saturation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.add3`` :
###
### * arg0: first input, any scalar/vector integer type, i16/i32 (overloaded)
### * arg1: second input, same type as arg0
### * arg2: third input, same type as arg0
    # arg0 is an "anyint" overload of its own; arg1 and arg2 are declared as 1
    # (LLVMMatchType<1>), which the reference text describes as
    # "same type as arg0".
    "add3" : { "result" : "anyint",
               "arguments" : ["anyint",1,1],
               "attributes" : "NoMem"
             },

### ``llvm.genx.*add3.sat.<return type>.<any int>`` : add3 instruction with saturation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.ssadd3.sat`` : result signed, operands signed
### * ``llvm.genx.suadd3.sat`` : result signed, operands unsigned
### * ``llvm.genx.usadd3.sat`` : result unsigned, operands signed
### * ``llvm.genx.uuadd3.sat`` : result unsigned, operands unsigned
###
### * arg0: first input, any scalar/vector integer type, i16/i32 (overloaded)
### * arg1: second input, same type as arg0
### * arg2: third input, same type as arg0
###
### * Return value: result, any scalar or vector integer type with same
###               vector width
###
    # The four saturating add3 variants share one signature: an "anyint"
    # result, an "anyint" arg0, and arg1/arg2 declared as 1 (LLVMMatchType<1>,
    # "same type as arg0" in the reference text). Only the name differs.
    "ssadd3_sat" : { "result" : "anyint",
                     "arguments" : ["anyint",1,1],
                     "attributes" : "NoMem"
                   },
    "suadd3_sat" : { "result" : "anyint",
                     "arguments" : ["anyint",1,1],
                     "attributes" : "NoMem"
                   },
    "usadd3_sat" : { "result" : "anyint",
                     "arguments" : ["anyint",1,1],
                     "attributes" : "NoMem"
                   },
    "uuadd3_sat" : { "result" : "anyint",
                     "arguments" : ["anyint",1,1],
                     "attributes" : "NoMem"
                   },

### add3c
### ^^^^^
###
### ``llvm.genx.add3c.<{carry, add3}>.<any int>`` : add3 with carry
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.add3c`` :
###
### * arg0: first input, i32 scalar/vector integer type
### * arg1: second input, same type as arg0
### * arg2: third input, same type as arg0
    # "result" is a two-element list ({carry, add3} per the heading above).
    # NOTE(review): four arguments are declared here ("anyint", 1, 1, 1) while
    # the reference text above lists only three inputs (arg0..arg2), and the
    # second result is "intvector" rather than "anyint" as in addc/subb.
    # Confirm which side is correct before relying on either.
    "add3c" : { "result" : ["anyint", "intvector"],
                "arguments" :  ["anyint",1,1,1],
                "attributes" :  "NoMem"
              },

### bfn
### ^^^
###
### ``llvm.genx.bfn.<return type>.<any int>`` : bfn instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.bfn`` :
###
### * arg0: first input, any scalar/vector integer type, i16/i32 (overloaded)
### * arg1: second input, same type as arg0
### * arg2: third input, same type as arg0
### * arg3: fourth input, byte, constant
    # arg0 is an "anyint" overload; arg1 and arg2 are declared as 1
    # (LLVMMatchType<1>, "same type as arg0" in the reference text); arg3 is
    # a "char" (the byte constant in the reference text).
    "bfn" : { "result" : "anyint",
              "arguments" : ["anyint",1,1,"char"],
              "attributes" : "NoMem"
            },

### srnd
### ^^^^
###
### ``llvm.genx.srnd.<return type>.<any float>.<any float>`` : srnd instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.srnd`` :
###
### * arg0: first input, any vector f32/hf16 type
### * arg1: second input, same type as arg0
### * Return value: result, must be half if arg0 is f32, or ub if arg0 is half.
    # The result and both operands are each declared "anyvector". No match-type
    # index is used, so the "same type as arg0" and result-type rules from the
    # reference text are not expressed in this signature.
    "srnd" : { "result" : "anyvector",
               "arguments" : ["anyvector", "anyvector"],
               "attributes" : "NoMem"
             },

### bf_cvt
### ^^^^^^
###
### ``llvm.genx.bf.cvt.<return type>.<any float>`` : bf_cvt instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.bf.cvt`` :
###
### * arg0: first input, any scalar/vector bf/float type (overloaded)
###
### * Return value: result, must be float if arg0 is half, or half if arg0 is float.
###
    # Result and operand are both declared "anyfloat"; the float<->half pairing
    # described in the reference text is not expressed in this signature.
    "bf_cvt" : { "result" : "anyfloat",
                 "arguments" : ["anyfloat"],
                 "attributes" : "NoMem"
               },

### tf32_cvt
### ^^^^^^^^
###
### ``llvm.genx.tf32.cvt.<return type>.<any float>`` : tf32_cvt instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.tf32.cvt`` :
###
### * arg0: first input, vector float type fp32/hf16
###
### * Return value: result, must be ud( Unsigned Doubleword)
###
    # Result and operand are both declared "anyvector"; the "result must be ud"
    # rule from the reference text is not expressed in this signature.
    "tf32_cvt" : { "result" : "anyvector",
                   "arguments" : ["anyvector"],
                   "attributes" : "NoMem"
                 },

### qf_cvt
### ^^^^^^
###
### ``llvm.genx.qf.cvt.<return type>.<vector type>`` : qf_cvt instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.qf.cvt`` :
###
### * arg0: first input, any scalar/vector i8/half type (overloaded)
###
### * Return value: result, must be i8 if arg0 is half, or half if arg0 is i8.
###
    # Result and operand are both declared "anyvector"; the i8<->half pairing
    # described in the reference text is not expressed in this signature.
    "qf_cvt" : { "result" : "anyvector",
                 "arguments" : ["anyvector"],
                 "attributes" : "NoMem"
               },

### ``llvm.genx.lsc.load.*.<return type if not void>.<any type>.<any type>`` : lsc_load instructions
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * ``llvm.genx.lsc.load.slm`` :
### * ``llvm.genx.lsc.load.bti`` :
### * ``llvm.genx.lsc.load.stateless`` :
### * ``llvm.genx.lsc.prefetch.bti`` :
### * ``llvm.genx.lsc.prefetch.stateless`` :
###
### * Exec_size ignored unless operation is transposed (DataOrder == Transpose)
### * arg0: {1,32}Xi1 predicate (overloaded)
### * arg1: i8 Subopcode, [MBZ]
### * arg2: i8 Caching behavior for L1, [MBC]
### * arg3: i8 Caching behavior for L3, [MBC]
### * arg4: i16 Address scale, [MBC]
### * arg5: i32 Immediate offset added to each address, [MBC]
### * arg6: i8 The datum size, [MBC]
### * arg7: i8 Number of elements to load per address (vector size), [MBC]
### * arg8: i8 Indicates if the data is transposed during the transfer, [MBC]
### * arg9: i8 Channel mask for quad versions, [MBC]
### * arg10: {1,32}Xi{16,32,64} The vector register holding offsets (overloaded)
###          for flat version Base Address + Offset[i] goes here
### * arg11: i32 surface to use for this operation. This can be an immediate or a register
###          for flat and bindless version pass zero here
###
### * Return value: the value read or void for prefetch
###
### Cache mappings are:
###
###   - 0 -> .df (default)
###   - 1 -> .uc (uncached)
###   - 2 -> .ca (cached)
###   - 3 -> .wb (writeback)
###   - 4 -> .wt (writethrough)
###   - 5 -> .st (streaming)
###   - 6 -> .ri (read-invalidate)
###
### Only certain combinations of CachingL1 with CachingL3 are valid on hardware.
###
### +---------+-----+-----------------------------------------------------------------------+
### |  L1     |  L3 | Notes                                                                 |
### +---------+-----+-----------------------------------------------------------------------+
### | .df     | .df | default behavior on both L1 and L3 (L3 uses MOCS settings)            |
### +---------+-----+-----------------------------------------------------------------------+
### | .uc     | .uc | uncached (bypass) both L1 and L3                                      |
### +---------+-----+-----------------------------------------------------------------------+
### | .st     | .uc | streaming L1 / bypass L3                                              |
### +---------+-----+-----------------------------------------------------------------------+
### | .uc     | .ca | bypass L1 / cache in L3                                               |
### +---------+-----+-----------------------------------------------------------------------+
### | .ca     | .uc | cache in L1 / bypass L3                                               |
### +---------+-----+-----------------------------------------------------------------------+
### | .ca     | .ca | cache in both L1 and L3                                               |
### +---------+-----+-----------------------------------------------------------------------+
### | .st     | .ca | streaming L1 / cache in L3                                            |
### +---------+-----+-----------------------------------------------------------------------+
### | .ri     | .ca | read-invalidate (e.g. last-use) on L1 loads / cache in L3             |
### +---------+-----+-----------------------------------------------------------------------+
###
### Immediate offset. The compiler may be able to fuse this add into the message, otherwise
### additional instructions are generated to honor the semantics.
###
### Datum size mapping is
###
###   - 1 = :u8
###   - 2 = :u16
###   - 3 = :u32
###   - 4 = :u64
###   - 5 = :u8u32 (load 8b, zero extend to 32b; store the opposite),
###   - 6 = :u16u32 (load 16b, zero extend to 32b; store the opposite),
###   - 7 = :u16u32h (load 16b into high 16 of each 32b; store the high 16)
###
    # All twelve entries below declare the same 12-operand list (arg0..arg11 in
    # the reference text above). The load variants return an "anyvector" and are
    # marked "ReadMem"; the prefetch variants return "void" and are marked "None".
    # The bindless and quad variants are defined here although the bullet list
    # above does not name them.
    "lsc_load_slm" : { "result" : "anyvector",
                       "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                       "attributes" : "ReadMem"
                     },
    "lsc_load_stateless" : { "result" : "anyvector",
                             "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                             "attributes" : "ReadMem"
                           },
    "lsc_load_bindless" : { "result" : "anyvector",
                            "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                            "attributes" : "ReadMem"
                          },
    "lsc_load_bti" : { "result" : "anyvector",
                       "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                       "attributes" : "ReadMem"
                     },
    "lsc_prefetch_slm" : { "result" : "void",
                           "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                           "attributes" : "None"
                         },
    "lsc_prefetch_bti" : { "result" : "void",
                           "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                           "attributes" : "None"
                         },
    "lsc_prefetch_stateless" : { "result" : "void",
                                 "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                                 "attributes" : "None"
                               },
    "lsc_prefetch_bindless" : { "result" : "void",
                                "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                                "attributes" : "None"
                              },
    "lsc_load_quad_slm" : { "result" : "anyvector",
                            "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                            "attributes" : "ReadMem"
                          },
    "lsc_load_quad_stateless" : { "result" : "anyvector",
                                  "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                                  "attributes" : "ReadMem"
                                },
    "lsc_load_quad_bindless" : { "result" : "anyvector",
                                 "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                                 "attributes" : "ReadMem"
                               },
    "lsc_load_quad_bti" : { "result" : "anyvector",
                            "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                            "attributes" : "ReadMem"
                          },

### ``llvm.genx.lsc.load.merge.*.<return type if not void>.<any type>.<any type>`` : lsc_load merge instructions
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * ``llvm.genx.lsc.load.merge.slm`` :
### * ``llvm.genx.lsc.load.merge.bti`` :
### * ``llvm.genx.lsc.load.merge.stateless`` :
###
### * Exec_size ignored unless operation is transposed (DataOrder == Transpose)
### * arg0: {1,32}Xi1 predicate (overloaded)
### * arg1: i8 Subopcode, [MBZ]
### * arg2: i8 Caching behavior for L1, [MBC]
### * arg3: i8 Caching behavior for L3, [MBC]
### * arg4: i16 Address scale, [MBC]
### * arg5: i32 Immediate offset added to each address, [MBC]
### * arg6: i8 The datum size, [MBC]
### * arg7: i8 Number of elements to load per address (vector size), [MBC]
### * arg8: i8 Indicates if the data is transposed during the transfer, [MBC]
### * arg9: i8 Channel mask for quad versions, [MBC]
### * arg10: {1,32}Xi{16,32,64} The vector register holding offsets (overloaded)
###          for flat version Base Address + Offset[i] goes here
### * arg11: i32 surface to use for this operation. This can be an immediate or a register
###          for flat and bindless version pass zero here
### * arg12: VXi{16,32,64} The data merged into disabled channels (overloaded)
###
### * Return value: the value read, merged with arg12 by predicate
###
### Cache mappings are:
###
###   - 0 -> .df (default)
###   - 1 -> .uc (uncached)
###   - 2 -> .ca (cached)
###   - 3 -> .wb (writeback)
###   - 4 -> .wt (writethrough)
###   - 5 -> .st (streaming)
###   - 6 -> .ri (read-invalidate)
###
### Only certain combinations of CachingL1 with CachingL3 are valid on hardware.
###
### +---------+-----+-----------------------------------------------------------------------+
### |  L1     |  L3 | Notes                                                                 |
### +---------+-----+-----------------------------------------------------------------------+
### | .df     | .df | default behavior on both L1 and L3 (L3 uses MOCS settings)            |
### +---------+-----+-----------------------------------------------------------------------+
### | .uc     | .uc | uncached (bypass) both L1 and L3                                      |
### +---------+-----+-----------------------------------------------------------------------+
### | .st     | .uc | streaming L1 / bypass L3                                              |
### +---------+-----+-----------------------------------------------------------------------+
### | .uc     | .ca | bypass L1 / cache in L3                                               |
### +---------+-----+-----------------------------------------------------------------------+
### | .ca     | .uc | cache in L1 / bypass L3                                               |
### +---------+-----+-----------------------------------------------------------------------+
### | .ca     | .ca | cache in both L1 and L3                                               |
### +---------+-----+-----------------------------------------------------------------------+
### | .st     | .ca | streaming L1 / cache in L3                                            |
### +---------+-----+-----------------------------------------------------------------------+
### | .ri     | .ca | read-invalidate (e.g. last-use) on L1 loads / cache in L3             |
### +---------+-----+-----------------------------------------------------------------------+
###
### Immediate offset. The compiler may be able to fuse this add into the message, otherwise
### additional instructions are generated to honor the semantics.
### Alternative variant for predicated variant of loads - merge destination for disabled
### lanes with values from additional input(arg12)
###
### Datum size mapping is
###
###   - 1 = :u8
###   - 2 = :u16
###   - 3 = :u32
###   - 4 = :u64
###   - 5 = :u8u32 (load 8b, zero extend to 32b; store the opposite),
###   - 6 = :u16u32 (load 16b, zero extend to 32b; store the opposite),
###   - 7 = :u16u32h (load 16b into high 16 of each 32b; store the high 16)
###
    "lsc_load_merge_slm" : { "result" : "anyvector",
                             "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                             "attributes" : "ReadMem"
                           },
    "lsc_load_merge_stateless" : { "result" : "anyvector",
                                   "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                                    "attributes" : "ReadMem"
                                 },
    "lsc_load_merge_bindless" : { "result" : "anyvector",
                                  "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                                  "attributes" : "ReadMem"
                                },
    "lsc_load_merge_bti" : { "result" : "anyvector",
                             "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                             "attributes" : "ReadMem"
                           },
    "lsc_load_merge_quad_slm" : { "result" : "anyvector",
                                  "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                                  "attributes" : "ReadMem"
                                },
    "lsc_load_merge_quad_stateless" : { "result" : "anyvector",
                                        "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                                        "attributes" : "ReadMem"
                                      },
    "lsc_load_merge_quad_bindless" : { "result" : "anyvector",
                                       "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                                       "attributes" : "ReadMem"
                                     },
    "lsc_load_merge_quad_bti" : { "result" : "anyvector",
                                  "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","int"],
                                  "attributes" : "ReadMem"
                                },

### ``llvm.genx.lsc.store.*.<any type>.<any type>.<any vector>`` : lsc_store instructions
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * ``llvm.genx.lsc.store.slm`` :
### * ``llvm.genx.lsc.store.bti`` :
### * ``llvm.genx.lsc.store.stateless`` :
###
### * Exec_size ignored unless operation is transposed (DataOrder == Transpose)
### * arg0: {1,32}Xi1 predicate (overloaded)
### * arg1: i8 Subopcode, [MBZ]
### * arg2: i8 Caching behavior for L1, [MBC]
### * arg3: i8 Caching behavior for L3, [MBC]
### * arg4: i16 Address scale, [MBC]
### * arg5: {1,32}Xi32 Immediate offset added to each address, [MBC]
### * arg6: i8 The datum size, [MBC]
### * arg7: i8 Number of elements to load per address (vector size), [MBC]
### * arg8: i8 Indicates if the data is transposed during the transfer, [MBC]
### * arg9: i8 Channel mask for quad version, [MBC]
### * arg10: {1,32}Xi{16,32,64} The vector register holding offsets (overloaded)
###          for flat version Base Address + Offset[i] goes here
### * arg11: VXi{16,32,64} The data to write (overloaded)
### * arg12: i32 surface to use for this operation. This can be an immediate or a register
###          for flat and bindless version pass zero here
###
### * Return value: void
###
### Cache mappings are:
###
###   - 0 -> .df (default)
###   - 1 -> .uc (uncached)
###   - 2 -> .ca (cached)
###   - 3 -> .wb (writeback)
###   - 4 -> .wt (writethrough)
###   - 5 -> .st (streaming)
###   - 6 -> .ri (read-invalidate)
###
### Only certain combinations of CachingL1 with CachingL3 are valid on hardware.
###
### +---------+-----+-----------------------------------------------------------------------+
### |  L1     |  L3 | Notes                                                                 |
### +---------+-----+-----------------------------------------------------------------------+
### | .df     | .df | default behavior on both L1 and L3 (L3 uses MOCS settings)            |
### +---------+-----+-----------------------------------------------------------------------+
### | .uc     | .uc | uncached (bypass) both L1 and L3                                      |
### +---------+-----+-----------------------------------------------------------------------+
### | .st     | .uc | streaming L1 / bypass L3                                              |
### +---------+-----+-----------------------------------------------------------------------+
### | .uc     | .wb | bypass L1/ writeback L3                                               |
### +---------+-----+-----------------------------------------------------------------------+
### | .wt     | .uc | writethrough L1 / bypass L3                                           |
### +---------+-----+-----------------------------------------------------------------------+
### | .wt     | .wb | writethrough L1 / writeback L3                                        |
### +---------+-----+-----------------------------------------------------------------------+
### | .st     | .wb | streaming L1 / writeback L3                                           |
### +---------+-----+-----------------------------------------------------------------------+
### | .wb     | .wb | writeback both L1 and L3                                              |
### +---------+-----+-----------------------------------------------------------------------+
###
    # All eight store entries below declare the same 13-operand list
    # (arg0..arg12 in the reference text above), return "void" and are marked
    # "None". The bindless and quad variants are defined here although the
    # bullet list above does not name them.
    "lsc_store_slm" : { "result" : "void",
                        "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","anyvector","int"],
                        "attributes" : "None"
                      },
    "lsc_store_stateless" : { "result" : "void",
                              "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","anyvector","int"],
                              "attributes" : "None"
                            },
    "lsc_store_bindless" : { "result" : "void",
                             "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","anyvector","int"],
                             "attributes" : "None"
                           },
    "lsc_store_bti" : { "result" : "void",
                        "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","anyvector","int"],
                        "attributes" : "None"
                      },
    "lsc_store_quad_slm" : { "result" : "void",
                             "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","anyvector","int"],
                             "attributes" : "None"
                           },
    "lsc_store_quad_stateless" : { "result" : "void",
                                   "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","anyvector","int"],
                                   "attributes" : "None"
                                 },
    "lsc_store_quad_bindless" : { "result" : "void",
                                  "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","anyvector","int"],
                                  "attributes" : "None"
                                },
    "lsc_store_quad_bti" : { "result" : "void",
                             "arguments" : ["any","char","char","char","short","int","char","char","char","char","any","anyvector","int"],
                             "attributes" : "None"
                           },

### ``llvm.genx.lsc.*2d.stateless.[return type].<vector type>.<address type>`` : 2d stateless load/prefetch instructions
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * ``llvm.genx.lsc.load2d.stateless.<return type>.<vector type>.<address type>`` :
### * ``llvm.genx.lsc.prefetch2d.stateless.<vector type>.<address type>`` :
###
### * Exec_size ignored unless operation is transposed (DataOrder == Transpose)
### * arg0: {1,32}Xi1 predicate (overloaded)
### * arg1: i8 Caching behavior for L1, [MBC]
### * arg2: i8 Caching behavior for L3, [MBC]
### * arg3: i8 The datum size, [MBC]
### * arg4: i8 Indicates if the data is transposed during the transfer, [MBC]
### * arg5: i8 number of blocks, [MBC]
### * arg6: i16 BlockWidth, [MBC]
### * arg7: i16 BlockHeight, [MBC]
### * arg8: i8 VNNI. This performs a VNNI transform during the access.
### * arg9: i32/i64 surface base address for this operation.
### * arg10: i32 surface width minus 1.
### * arg11: i32 surface height minus 1.
### * arg12: i32 surface pitch minus 1.
### * arg13: i32 Src0AddrX, the base X position of the 2D region to load or store.
### * arg14: i32 Src0AddrY, the base Y position of the 2D region to load or store.
###
### * Return value: the value read or void for prefetch
###
    # Load and prefetch declare the same 15-operand list (arg0..arg14 in the
    # reference text above). The load returns an "anyvector" and is marked
    # "ReadMem"; the prefetch returns "void" and is marked "None". Operands 6
    # and 7 (BlockWidth/BlockHeight) are declared "short".
    "lsc_load2d_stateless" : { "result" : "anyvector",
                               "arguments" : ["anyvector","char","char","char","char","char","short","short","char","anyint","int","int","int","int","int"],
                               "attributes" : "ReadMem"
                             },
    "lsc_prefetch2d_stateless" : { "result" : "void",
                                   "arguments" : ["anyvector","char","char","char","char","char","short","short","char","anyint","int","int","int","int","int"],
                                   "attributes" : "None"
                                 },

### ``llvm.genx.lsc.store2d.stateless.<vector type>.<address type>.<vector type>`` : 2d stateless store
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * Exec_size ignored unless operation is transposed (DataOrder == Transpose)
### * arg0: {1,32}Xi1 predicate (overloaded)
### * arg1: i8 Caching behavior for L1, [MBC]
### * arg2: i8 Caching behavior for L3, [MBC]
### * arg3: i8 The datum size, [MBC]
### * arg4: i8 Indicates if the data is transposed during the transfer, [MBC]
### * arg5: i8 number of blocks, [MBC]
### * arg6: i16 BlockWidth, [MBC]
### * arg7: i16 BlockHeight, [MBC]
### * arg8: i8 VNNI. This performs a VNNI transform during the access.
### * arg9: i32/i64 surface base address for this operation.
### * arg10: i32 surface width minus 1.
### * arg11: i32 surface height minus 1.
### * arg12: i32 surface pitch minus 1.
### * arg13: i32 Src0AddrX, the base X position of the 2D region to load or store.
### * arg14: i32 Src0AddrY, the base Y position of the 2D region to load or store.
### * arg15: data to write (overloaded)
###
### * Return value: void
###
    "lsc_store2d_stateless" : { "result" : "void",
                                "arguments" : ["anyvector","char","char","char","char","char","short","short","char","anyint","int","int","int","int","int","anyvector"],
                                "attributes" : "None"
                              },


### ``llvm.genx.lsc.atomic.*.<return type>.<any type>.<any vector>`` : lsc_atomic instructions
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### !!! Those are legacy ones! Use xatomic version instead !!!
###
### * ``llvm.genx.lsc.atomic.bti`` :
### * ``llvm.genx.lsc.atomic.slm`` :
### * ``llvm.genx.lsc.atomic.stateless`` :
### * ``llvm.genx.lsc.atomic.bindless`` :
###
### * arg0: {1,32}Xi1 predicate (overloaded)
### * arg1: i8 Subopcode, [MBZ]
### * arg2: i8 Caching behavior for L1, [MBC]
### * arg3: i8 Caching behavior for L3, [MBC]
### * arg4: i16 Address scale, [MBC]
### * arg5: {1,32}Xi32 Immediate offset added to each address, [MBC]
### * arg6: i8 The datum size, [MBC]
### * arg7: i8 Indicates if the data is transposed during the transfer, [MBC]
### * arg8: i8 Number of elements to load per address (vector size), [MBC]
### * arg9: i8 Channel mask, currently ignored, [MBC].
### * arg10: i32/i64 surface base address for this operation.
### * arg11: {1,32}Xi{16,32,64} The vector register holding addresses. (overloaded)
### * arg12: i32 {1,32}Xi32 Src0 or undef (same vector size as predicate)
### * arg13: i32 {1,32}Xi32 Src1 or undef (same vector size as predicate)
### * arg14: i32 {1,32}Xi32 Old value of destination (same vector size as predicate), now always undef
###
    "lsc_atomic_bti" : { "result" : "any",
                         "arguments" : ["any","char","char","char","short","int","char","char","char","char","int","anyvector",0,0,0],
                         "attributes" : "None"
                       },
    "lsc_atomic_slm" : { "result" : "any",
                         "arguments" : ["any","char","char","char","short","int","char","char","char","char","int","anyvector",0,0,0],
                         "attributes" : "None"
                       },
    "lsc_atomic_stateless" : { "result" : "any",
                               "arguments" : ["any","char","char","char","short","int","char","char","char","char","int","anyvector",0,0,0],
                               "attributes" : "None"
                             },
    "lsc_atomic_bindless" : { "result" : "any",
                              "arguments" : ["any","char","char","char","short","int","char","char","char","char","int","anyvector",0,0,0],
                              "attributes" : "None"
                            },

### ``llvm.genx.lsc.xatomic.*.<return type>.<any type>.<any vector>`` : lsc_atomic instructions
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * ``llvm.genx.lsc.xatomic.bti`` :
### * ``llvm.genx.lsc.xatomic.slm`` :
### * ``llvm.genx.lsc.xatomic.stateless`` :
### * ``llvm.genx.lsc.xatomic.bindless`` :
###
### * arg0: {1,32}Xi1 predicate (overloaded)
### * arg1: i8 Subopcode, [MBZ]
### * arg2: i8 Caching behavior for L1, [MBC]
### * arg3: i8 Caching behavior for L3, [MBC]
### * arg4: i16 Address scale, [MBC]
### * arg5: {1,32}Xi32 Immediate offset added to each address, [MBC]
### * arg6: i8 Data size, [MBC]
### * arg7: i8 Number of elements to load per address (vector size), [MBC]
### * arg8: i8 Indicates if the data is transposed during the transfer, [MBC]
### * arg9: i8 Channel mask, currently ignored, [MBC]
### * arg10: {1,32}Xi{16,32,64} The vector register holding offsets (overloaded)
###          for flat version Base Address + Offset[i] goes here
### * arg11: i32 {1,32}Xi32 Src0 or undef (same vector size as predicate)
### * arg12: i32 {1,32}Xi32 Src1 or undef (same vector size as predicate)
### * arg13: i32 surface to use for this operation. This can be an immediate or a register
###          for flat and bindless version pass zero here
### * arg14: i32 {1,32}Xi32 Old value of destination (same vector size as predicate), now always undef
###
    "lsc_xatomic_bti" : { "result" : "any",
                          "arguments" : ["any","char","char","char","short","int","char","char","char","char","anyvector",0,0,"int",0],
                          "attributes" : "None"
                        },
    "lsc_xatomic_slm" : { "result" : "any",
                          "arguments" : ["any","char","char","char","short","int","char","char","char","char","anyvector",0,0,"int",0],
                          "attributes" : "None"
                        },
    "lsc_xatomic_stateless" : { "result" : "any",
                                "arguments" : ["any","char","char","char","short","int","char","char","char","char","anyvector",0,0,"int",0],
                                "attributes" : "None"
                              },
    "lsc_xatomic_bindless" : { "result" : "any",
                               "arguments" : ["any","char","char","char","short","int","char","char","char","char","anyvector",0,0,"int",0],
                               "attributes" : "None"
                             },

### ``llvm.genx.lsc.fence.<vector type>`` : lsc_fence instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * ``llvm.genx.lsc.fence`` :
###
### * Exec_size ignored unless operation is transposed (DataOrder == Transpose)
### * arg0: {1,32}Xi1 predicate (overloaded)
### * arg1: i8 SFID
### * arg2: i8 Fence operation
### * arg3: i8 Fence operation scope
###
### [2] Mappings are:
###     0 -> .ugm (unified global memory)
###     1 -> .ugml (low-bandwidth untyped global memory)
###     2 -> .tgm (typed global memory)
###     3 -> .slm (shared local memory)
###
### [3] Mappings are:
###     0 -> .none (no operation)
###     1 -> .evict (dirty lines evicted and invalidated from L1)
###     2 -> .invalidate (invalidate all clean lines)
###     3 -> .discard (dirty and clean lines are discarded w/o eviction)
###     4 -> .clean (dirty lines are written to memory, but retained in cache in clean state)
###     5 -> .flushl3 (flush only L3)
###
### [4] Mappings are:
###     0 -> .group (flush out to the threadgroup's scope)
###     1 -> .local (flush out to the local scope)
###     2 -> .tile (tile, flush out to several DSSs)
###     3 -> .gpu (entire GPU, flush out to the GPUs LLC)
###     4 -> .gpus (all GPUs in the system, flush out to memory shared by all GPUs)
###     5 -> .system (the entire system memory space)
###     6 -> .sysacq (the entire system memory space with system-acquire semantics)
###
    "lsc_fence" : { "result" : "void",
                    "arguments" : ["anyvector","char","char","char"],
                    "attributes" : "None"
                  },

### xor
### ^^^
### Intrinsic not needed; use LLVM IR Xor instruction
###

## ---------------------------------
### vISA reserved register intrinsics
### ---------------------------------

### ``llvm.genx.thread.*`` : read thread ID register
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.thread.x`` : read vISA v1 (%thread_x)
### * ``llvm.genx.thread.y`` : read vISA v2 (%thread_y)
###
### * Return value:  i16 the value read
###
    "thread_x" : { "result" : "short",
                   "arguments" : [],
                   "attributes" : "NoMem"
                 },
    "thread_y" : { "result" : "short",
                   "arguments" : [],
                   "attributes" : "NoMem"
                 },

### ``llvm.genx.group.id.*`` : read group ID register
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.group.id.x`` : read vISA v7 (%group_id_x)
### * ``llvm.genx.group.id.y`` : read vISA v8 (%group_id_y)
### * ``llvm.genx.group.id.z`` : read vISA v23 (%group_id_z)
###
### * Return value:  i32 the value read
###
    "group_id_x" : { "result" : "int",
                     "arguments" : [],
                     "attributes" : "NoMem"
                   },
    "group_id_y" : { "result" : "int",
                     "arguments" : [],
                     "attributes" : "NoMem"
                   },
    "group_id_z" : { "result" : "int",
                     "arguments" : [],
                     "attributes" : "NoMem"
                   },

### ``llvm.genx.timestamp.<return type>`` : read vISA v11 (%timestamp)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * Return value:  vxi32 the value read
###
### The vector width must be power of 2 and no larger than 4.
###
    "timestamp" : { "result" : "anyint",
                    "arguments" : [],
                    "attributes" : "None"
                  },

### ``llvm.genx.r0.<return type>`` : read vISA v12 (%r0)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * Return value:  vxi32 or i32 the value read
###
### The vector width must be power of 2 and no larger than 8.
###
    "r0" : { "result" : "anyint",
             "arguments" : [],
             "attributes" : "ReadMem"
           },

### ``llvm.genx.sr0.<return type>`` : read vISA v13 (%sr0)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * Return value:  vxi32 the value read
###
### The vector width must be 4
###
###
    "sr0" : { "result" : "anyint",
              "arguments" : [],
              "attributes" : "ReadMem"
            },

### ``llvm.genx.set.sr0.2`` : write vISA v13(0, 2) (%sr0.2)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### arg0: data to write (1 dword)
###
### * Return value:  void
###
    "set_sr0_2" : { "result" : "void",
                    "arguments" : ["int"],
                    "attributes" : "None"
                  },

### ``llvm.genx.get.color`` : read color value of the thread origin
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### Return Value: i16 the value read
###
### This may not be the most appropriate way to access this value,
### but is a stop-gap solution.
###
    "get_color" : { "result" : "short",
                    "arguments" : [],
                    "attributes" : "NoMem"
                  },

### ``llvm.genx.get.hwid`` : read hw_id value
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### Return Value: i32 the value read
###
    "get_hwid" : { "result" : "int",
                   "arguments" : [],
                   "attributes" : "NoMem"
                 },

### ``llvm.genx.ce0`` : read channel-enable register
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### Return Value: i32 the value read
###
    "ce0" : { "result" : "int",
              "arguments" : [],
              "attributes" : "ReadMem"
            },

### ``llvm.genx.set.pause`` : set the pause register (v11.4)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### arg0: length of pause 10 bits (0-4 must be 0)
###
### Return Value: none
###
###
### Set the pause value - this pauses instruction issue until the value has been
### decremented to 0 (decrements every 32 clocks)
###
### We set this intrinsic to have side-effects (last field empty) to stop it being removed as it
### otherwise looks dead
    "set_pause" : { "result" : "void",
                    "arguments" : ["short"],
                    "attributes" : "None"
                  },

### ``llvm.genx.dummy.mov`` : insert a dummy mov to v0
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### arg0: a value that we want to mov to v0 (usually to trigger a scoreboard dependency)
###
### Return Value: none
###
###
### This is primarily used to set up scoreboard dependencies. If a value is mov'ed to v0 then it
### will trigger a scoreboard dependency check.
### As a word (16 bits) is usually the basic type of value that is worked with, you only need to
### dummy mov one of these from any payload to correctly trigger the dependency
###
### We set this intrinsic to have side-effects (last field empty) to stop it being removed as it
### otherwise looks dead and also to prevent any kind of code motion optimisation
    "dummy_mov" : { "result" : "void",
                    "arguments" : ["short"],
                    "attributes" : "None"
                  },

### The following 2 predef.reg intrinsics aren't translated directly to read/writes of the reg,
### instead they're baled together with rd/wrregions and in fact indicate that those rdr/wrrs
### should use predefined VISA registers as their sources/dests
###
### ``llvm.genx.read.predef.reg.<return type>`` : read predefined vISA reg
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: ID of the reg (1 dword)
### * arg1: value that could affect the reg being read (e.g. stackcall), may be undef
###
### * Return value:  value read
###
###
    "read_predef_reg" : { "result" : "any",
                          "arguments" : ["int", "any"],
                          "attributes" : "ReadMem"
                        },

### ``llvm.genx.write.predef.reg.<return type>.<input_type>`` : write value to predefined vISA reg
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: ID of the reg (1 dword)
### * arg1: data to write (1 dword)
###
### * Return value:  value written
###
    "write_predef_reg" : { "result" : "any",
                           "arguments" : ["int", "any"],
                           "attributes" : "WriteMem"
                         },

## --------------------------
### Shared function intrinsics
### --------------------------
### These are in the order they appear in the vISA spec, not in
### alphabetical order.
###

### ``llvm.genx.dword.atomic.*.<return type>.<vector type>.<any int>`` : dword atomic with binary operator
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.dword.atomic.add`` : vISA DWORD_ATOMIC ADD instruction
### * ``llvm.genx.dword.atomic.sub`` : vISA DWORD_ATOMIC SUB instruction
### * ``llvm.genx.dword.atomic.min`` : vISA DWORD_ATOMIC MIN instruction
### * ``llvm.genx.dword.atomic.max`` : vISA DWORD_ATOMIC MAX instruction
### * ``llvm.genx.dword.atomic.xchg`` : vISA DWORD_ATOMIC XCHG instruction
### * ``llvm.genx.dword.atomic.and`` : vISA DWORD_ATOMIC AND instruction
### * ``llvm.genx.dword.atomic.or`` : vISA DWORD_ATOMIC OR instruction
### * ``llvm.genx.dword.atomic.xor`` : vISA DWORD_ATOMIC XOR instruction
### * ``llvm.genx.dword.atomic.imin`` : vISA DWORD_ATOMIC IMIN instruction
### * ``llvm.genx.dword.atomic.imax`` : vISA DWORD_ATOMIC IMAX instruction
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 surface index
### * arg2: vXi32 element offset in bytes (overloaded)
### * arg3: vXi32 src
### * arg4: vXi32 original value of the register that the data is read into
###
### * Return value: vXi32 the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width, which must be 1, 8 or 16.
###
    "dword_atomic_add" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int","anyint",0,0],
                           "attributes" : "None",
                         },
    "dword_atomic_sub" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int","anyint",0,0],
                           "attributes" : "None",
                         },
    "dword_atomic_min" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int","anyint",0,0],
                           "attributes" : "None",
                         },
    "dword_atomic_max" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int","anyint",0,0],
                           "attributes" : "None",
                         },
    "dword_atomic_xchg" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int","anyint",0,0],
                            "attributes" : "None",
                          },
    "dword_atomic_and" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int","anyint",0,0],
                           "attributes" : "None",
                         },
    "dword_atomic_or" : { "result" : "anyvector",
                          "arguments" : ["anyvector","int","anyint",0,0],
                          "attributes" : "None",
                        },
    "dword_atomic_xor" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int","anyint",0,0],
                           "attributes" : "None",
                         },
    "dword_atomic_imin" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int","anyint",0,0],
                            "attributes" : "None",
                          },
    "dword_atomic_imax" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int","anyint",0,0],
                            "attributes" : "None",
                          },

### ``llvm.genx.dword.atomic2.*.<return type>.<vector type>.<any int>`` : dword atomic with binary operator (variant with no oldval)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.dword.atomic2.add`` : vISA DWORD_ATOMIC ADD instruction
### * ``llvm.genx.dword.atomic2.sub`` : vISA DWORD_ATOMIC SUB instruction
### * ``llvm.genx.dword.atomic2.min`` : vISA DWORD_ATOMIC MIN instruction
### * ``llvm.genx.dword.atomic2.max`` : vISA DWORD_ATOMIC MAX instruction
### * ``llvm.genx.dword.atomic2.xchg`` : vISA DWORD_ATOMIC XCHG instruction
### * ``llvm.genx.dword.atomic2.and`` : vISA DWORD_ATOMIC AND instruction
### * ``llvm.genx.dword.atomic2.or`` : vISA DWORD_ATOMIC OR instruction
### * ``llvm.genx.dword.atomic2.xor`` : vISA DWORD_ATOMIC XOR instruction
### * ``llvm.genx.dword.atomic2.imin`` : vISA DWORD_ATOMIC IMIN instruction
### * ``llvm.genx.dword.atomic2.imax`` : vISA DWORD_ATOMIC IMAX instruction
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 surface index
### * arg2: vXi32 element offset in bytes (overloaded)
### * arg3: vXi32 src
###
### * Return value: vXi32 the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width, which must be 1, 8 or 16.
###
    "dword_atomic2_add" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int","anyint",0],
                            "attributes" : "None",
                          },
    "dword_atomic2_sub" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int","anyint",0],
                            "attributes" : "None",
                          },
    "dword_atomic2_min" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int","anyint",0],
                            "attributes" : "None",
                          },
    "dword_atomic2_max" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int","anyint",0],
                            "attributes" : "None",
                          },
    "dword_atomic2_xchg" : { "result" : "anyvector",
                             "arguments" : ["anyvector","int","anyint",0],
                             "attributes" : "None",
                           },
    "dword_atomic2_and" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int","anyint",0],
                            "attributes" : "None",
                          },
    "dword_atomic2_or" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int","anyint",0],
                           "attributes" : "None",
                         },
    "dword_atomic2_xor" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int","anyint",0],
                            "attributes" : "None",
                          },
    "dword_atomic2_imin" : { "result" : "anyvector",
                             "arguments" : ["anyvector","int","anyint",0],
                             "attributes" : "None",
                           },
    "dword_atomic2_imax" : { "result" : "anyvector",
                             "arguments" : ["anyvector","int","anyint",0],
                             "attributes" : "None",
                           },

### ``llvm.genx.dword.atomic.*.<return type>.<vector type>.<any int>`` : dword atomic with fmin/fmax operation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.dword.atomic.fmin`` : vISA DWORD_ATOMIC FMIN instruction
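### * ``llvm.genx.dword.atomic.fmax`` : vISA DWORD_ATOMIC FMAX instruction
### * ``llvm.genx.dword.atomic.fadd`` : vISA DWORD_ATOMIC FADD instruction
### * ``llvm.genx.dword.atomic.fsub`` : vISA DWORD_ATOMIC FSUB instruction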
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate(overloaded)
### * arg1: i32 surface index
### * arg2: vXi32 element offset in bytes (overloaded)
### * arg3: vXfloat src
### * arg4: vXfloat original value of the register that the data is read into
###
### * Return value: vXfloat the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width, which must be 1, 8 or 16.
###
    "dword_atomic_fmin" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int","anyint",0,0],
                            "attributes" : "None",
                          },
    "dword_atomic_fmax" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int","anyint",0,0],
                            "attributes" : "None",
                          },
    "dword_atomic_fadd" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int","anyint",0,0],
                            "attributes" : "None",
                          },
    "dword_atomic_fsub" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int","anyint",0,0],
                            "attributes" : "None",
                          },

### ``llvm.genx.dword.atomic2.*.<return type>.<vector type>.<any int>`` : dword atomic with fmin/fmax operation (variant with no oldval)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.dword.atomic2.fmin`` : vISA DWORD_ATOMIC FMIN instruction
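### * ``llvm.genx.dword.atomic2.fmax`` : vISA DWORD_ATOMIC FMAX instruction
### * ``llvm.genx.dword.atomic2.fadd`` : vISA DWORD_ATOMIC FADD instruction
### * ``llvm.genx.dword.atomic2.fsub`` : vISA DWORD_ATOMIC FSUB instruction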
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate(overloaded)
### * arg1: i32 surface index
### * arg2: vXi32 element offset in bytes (overloaded)
### * arg3: vXfloat src
###
### * Return value: vXfloat the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width, which must be 1, 8 or 16.
###
    "dword_atomic2_fmin" : { "result" : "anyvector",
                             "arguments" : ["anyvector","int","anyint",0],
                             "attributes" : "None",
                           },
    "dword_atomic2_fmax" : { "result" : "anyvector",
                             "arguments" : ["anyvector","int","anyint",0],
                             "attributes" : "None",
                           },
    "dword_atomic2_fadd" : { "result" : "anyvector",
                             "arguments" : ["anyvector","int","anyint",0],
                             "attributes" : "None",
                           },
    "dword_atomic2_fsub" : { "result" : "anyvector",
                             "arguments" : ["anyvector","int","anyint",0],
                             "attributes" : "None",
                           },


### ``llvm.genx.dword.atomic.*.<return type>.<any vector>`` : dword atomic with inc/dec operation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.dword.atomic.inc`` : vISA DWORD_ATOMIC INC instruction
### * ``llvm.genx.dword.atomic.dec`` : vISA DWORD_ATOMIC DEC instruction
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 surface index
### * arg2: vXi32 element offset in bytes
### * arg3: vXi32 original value of the register that the data is read into
###
### * Return value: vXi32 the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width, which must be 1, 8 or 16.
###
    "dword_atomic_inc" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int",0,0],
                           "attributes" : "None",
                         },
    "dword_atomic_dec" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int",0,0],
                           "attributes" : "None",
                         },

### ``llvm.genx.dword.atomic2.*.<return type>.<any vector>`` : dword atomic with inc/dec operation (variant with no oldval)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.dword.atomic2.inc`` : vISA DWORD_ATOMIC INC instruction
### * ``llvm.genx.dword.atomic2.dec`` : vISA DWORD_ATOMIC DEC instruction
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 surface index
### * arg2: vXi32 element offset in bytes
###
### * Return value: vXi32 the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width, which must be 1, 8 or 16.
###
    "dword_atomic2_inc" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int",0],
                            "attributes" : "None",
                          },
    "dword_atomic2_dec" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int",0],
                            "attributes" : "None",
                          },

### ``llvm.genx.dword.atomic.cmpxchg.<return type>.<vector type>`` : vISA DWORD_ATOMIC CMPXCHG instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 surface index
### * arg2: vXi32 element offset in bytes
### * arg3: vXi32 src0
### * arg4: vXi32 src1
### * arg5: vXi32 original value of the register that the data is read into
###
### * Return value: vXi32 the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width, which must be 1, 8 or 16.
###
    "dword_atomic_cmpxchg" : { "result" : "anyvector",
                               "arguments" : ["anyvector","int",0,0,0,0],
                               "attributes" : "None",
                             },

### ``llvm.genx.dword.atomic2.cmpxchg.<return type>.<vector type>`` : vISA DWORD_ATOMIC CMPXCHG instruction (variant with no oldval)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 surface index
### * arg2: vXi32 element offset in bytes
### * arg3: vXi32 src0
### * arg4: vXi32 src1
###
### * Return value: vXi32 the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width, which must be 1, 8 or 16.
###
    "dword_atomic2_cmpxchg" : { "result" : "anyvector",
                                "arguments" : ["anyvector","int",0,0,0],
                                "attributes" : "None",
                              },

### ``llvm.genx.dword.atomic.fcmpwr.<return type>.<vector type>.<any int>`` : vISA DWORD_ATOMIC FCMPWR instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 surface index
### * arg2: vXi32 element offset in bytes (overloaded)
### * arg3: vXfloat src0
### * arg4: vXfloat src1
### * arg5: vXfloat original value of the register that the data is read into
###
### * Return value: vXfloat the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width, which must be 1, 8 or 16.
###
    "dword_atomic_fcmpwr" : { "result" : "anyvector",
                              "arguments" : ["anyvector","int","anyint",0,0,0],
                              "attributes" : "None",
                            },

### ``llvm.genx.dword.atomic2.fcmpwr.<return type>.<vector type>.<any int>`` : vISA DWORD_ATOMIC FCMPWR instruction (variant with no oldval)
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 surface index
### * arg2: vXi32 element offset in bytes (overloaded)
### * arg3: vXfloat src0
### * arg4: vXfloat src1
###
### * Return value: vXfloat the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width, which must be 1, 8 or 16.
###
    "dword_atomic2_fcmpwr" : { "result" : "anyvector",
                               "arguments" : ["anyvector","int","anyint",0,0],
                               "attributes" : "None",
                             },

### ``llvm.genx.typed.atomic.*.<return type>.<vector type>.<any int>`` : atomic typed with binary operator
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.typed.atomic.add`` : vISA TYPED_ATOMIC ADD instruction
### * ``llvm.genx.typed.atomic.sub`` : vISA TYPED_ATOMIC SUB instruction
### * ``llvm.genx.typed.atomic.min`` : vISA TYPED_ATOMIC MIN instruction
### * ``llvm.genx.typed.atomic.max`` : vISA TYPED_ATOMIC MAX instruction
### * ``llvm.genx.typed.atomic.xchg`` : vISA TYPED_ATOMIC XCHG instruction
### * ``llvm.genx.typed.atomic.and`` : vISA TYPED_ATOMIC AND instruction
### * ``llvm.genx.typed.atomic.or`` : vISA TYPED_ATOMIC OR instruction
### * ``llvm.genx.typed.atomic.xor`` : vISA TYPED_ATOMIC XOR instruction
### * ``llvm.genx.typed.atomic.imin`` : vISA TYPED_ATOMIC IMIN instruction
### * ``llvm.genx.typed.atomic.imax`` : vISA TYPED_ATOMIC IMAX instruction
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 surface index
### * arg2: vXT src
### * arg3: vXi32 u (overloaded)
### * arg4: vXi32 v - can be a constant 0 and becomes undef in lowering
### * arg5: vXi32 r - can be a constant 0 and becomes undef in lowering
### * arg6: vXi32 LOD - can be constant 0 and becomes undef in lowering
###
### * Return value: vXi32 the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width (which in reality must be 8)
###
    "typed_atomic_add" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int",0,"anyint",2,2,2],
                           "attributes" : "None"
                         },
    "typed_atomic_sub" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int",0,"anyint",2,2,2],
                           "attributes" : "None"
                         },
    "typed_atomic_min" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int",0,"anyint",2,2,2],
                           "attributes" : "None"
                         },
    "typed_atomic_max" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int",0,"anyint",2,2,2],
                           "attributes" : "None"
                         },
    "typed_atomic_xchg" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int",0,"anyint",2,2,2],
                            "attributes" : "None"
                          },
    "typed_atomic_and" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int",0,"anyint",2,2,2],
                           "attributes" : "None"
                         },
    "typed_atomic_or" : { "result" : "anyvector",
                          "arguments" : ["anyvector","int",0,"anyint",2,2,2],
                          "attributes" : "None"
                        },
    "typed_atomic_xor" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int",0,"anyint",2,2,2],
                           "attributes" : "None"
                         },
    "typed_atomic_imin" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int",0,"anyint",2,2,2],
                            "attributes" : "None"
                          },
    "typed_atomic_imax" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int",0,"anyint",2,2,2],
                            "attributes" : "None"
                          },

### ``llvm.genx.typed.atomic.*.<return type>.<vector type>.<any int>`` : atomic typed with fmin/fmax operation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
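### * ``llvm.genx.typed.atomic.fmin`` : vISA TYPED_ATOMIC FMIN instruction
### * ``llvm.genx.typed.atomic.fmax`` : vISA TYPED_ATOMIC FMAX instruction
### * ``llvm.genx.typed.atomic.fadd`` : vISA TYPED_ATOMIC FADD instruction
### * ``llvm.genx.typed.atomic.fsub`` : vISA TYPED_ATOMIC FSUB instruction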
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 surface index
### * arg2: vXfloat src
### * arg3: vXi32 u (overloaded)
### * arg4: vXi32 v - can be a constant 0 and becomes undef in lowering
### * arg5: vXi32 r - can be a constant 0 and becomes undef in lowering
### * arg6: vXi32 LOD - can be a constant 0 and becomes undef in lowering
###
### * Return value: vXfloat the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width (which in reality must be 8)
###
    "typed_atomic_fmin" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int",0,"anyint",2,2,2],
                            "attributes" : "None"
                          },
    "typed_atomic_fmax" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int",0,"anyint",2,2,2],
                            "attributes" : "None"
                          },
    "typed_atomic_fadd" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int",0,"anyint",2,2,2],
                            "attributes" : "None"
                          },
    "typed_atomic_fsub" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int",0,"anyint",2,2,2],
                            "attributes" : "None"
                          },

### ``llvm.genx.typed.atomic.*.<return type>.<vector type>.<any int>`` : atomic typed with inc/dec operation
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.typed.atomic.inc`` : vISA TYPED_ATOMIC INC instruction
### * ``llvm.genx.typed.atomic.dec`` : vISA TYPED_ATOMIC DEC instruction
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 surface index
### * arg2: vXi32 u (overloaded)
### * arg3: vXi32 v - can be a constant 0 and becomes undef in lowering
### * arg4: vXi32 r - can be a constant 0 and becomes undef in lowering
### * arg5: vXi32 LOD - can be a constant 0 and becomes undef in lowering
###
### * Return value: vXi32 the old value read
###
### Predicate, element offset, and the return value must all have the
### same vector width (which in reality must be 8)
###
    "typed_atomic_inc" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int","anyint",2,2,2],
                           "attributes" : "None"
                         },
    "typed_atomic_dec" : { "result" : "anyvector",
                           "arguments" : ["anyvector","int","anyint",2,2,2],
                           "attributes" : "None"
                         },

### ``llvm.genx.typed.atomic.cmpxchg.<return type>.<vector type>.<any int>`` : vISA TYPED_ATOMIC CMPXCHG instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 surface index
### * arg2: vXT src0
### * arg3: vXT src1
### * arg4: vXi32 u (overloaded)
### * arg5: vXi32 v - can be a constant 0 and becomes undef in lowering
### * arg6: vXi32 r - can be a constant 0 and becomes undef in lowering
### * arg7: vXi32 LOD - can be a constant 0 and becomes undef in lowering
###
### * Return value: vXi32 the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width (which in reality must be 8)
###
    "typed_atomic_cmpxchg" : { "result" : "anyvector",
                               "arguments" : ["anyvector","int",0,0,"anyint",2,2,2],
                               "attributes" : "None"
                             },

### ``llvm.genx.typed.atomic.fcmpwr.<return type>.<vector type>.<any int>`` : vISA TYPED_ATOMIC FCMPWR instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 surface index
### * arg2: vXfloat src0
### * arg3: vXfloat src1
### * arg4: vXi32 u (overloaded)
### * arg5: vXi32 v - can be a constant 0 and becomes undef in lowering
### * arg6: vXi32 r - can be a constant 0 and becomes undef in lowering
### * arg7: vXi32 LOD - can be a constant 0 and becomes undef in lowering
###
### * Return value: vXfloat the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width (which in reality must be 8)
###
    "typed_atomic_fcmpwr" : { "result" : "anyvector",
                              "arguments" : ["anyvector","int",0,0,"anyint",2,2,2],
                              "attributes" : "None"
                            },

### ``llvm.genx.gather.private.<return type>.<vector type>.<any int>`` : CMC internal, no VISA
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: v-by-i1 predicate (overloaded)
### * (Num_elts inferred from data type)
### * arg1: base pointer
### * arg2: vXi32 element offset in elements (overloaded)
### * arg3: old value of the data read
###
### * Return value: the data read
###
### The vector width of the return value is the number of elements to read,
### which must be 1, 8 or 16.
###
### The element offset arg must have the same vector width.
###
    "gather_private" : { "result" : "anyvector",
                         "arguments" : ["anyvector","anyptr","anyint",0],
                         "attributes" : "ReadMem"
                       },

### ``llvm.genx.gather.scaled.<return type>.<vector type>.<any int>`` : vISA GATHER_SCALED instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 log2 num blocks, constant (0/1/2 for num blocks 1/2/4)
### * arg2: i16 scale, constant
### * arg3: i32 surface index
### * arg4: i32 global offset in bytes
### * arg5: vXi32 element offset in bytes (overloaded)
### * arg6: old value of the data read
###
### * Return value: the data read
###
### The vector width of the element offset arg is the number of elements to
### read, which must be power of 2 and less than or equal to 32.
###
### The predicate arg must have the same vector width.
###
### The old value of the data read (the return value) must have UD, D or
### F type. For 1 and 2 byte (1 x num blocks) reads the upper bytes have
### undefined values in the returned value.
###
    "gather_scaled" : { "result" : "anyvector",
                        "arguments" : ["anyvector","int","short","int","int","anyint",0],
                        "attributes" : "ReadMem",
                      },

### ``llvm.genx.gather.scaled2`` : vISA GATHER_SCALED instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### This intrinsic doesn't have redundant predicate and old value that can be inferred
### from resulting wrregion.
###
### * (Exec_size inferred from element offset type)
### * arg0: i32 log2 num blocks, constant (0/1/2 for num blocks 1/2/4)
### * arg1: i16 scale, constant
### * arg2: i32 surface index
### * arg3: i32 global offset in bytes
### * arg4: vXi32 element offset in bytes (overloaded)
###
### * Return value: the data read
###
### The vector width of the element offset arg is the number of elements to
### read, which must be power of 2 and less than or equal to 32.
###
### For 1 and 2 byte (1 x num blocks) reads the upper bytes have
### undefined values in the returned value.
###
    "gather_scaled2" : { "result" : "anyvector",
                         "arguments" : ["int","short","int","int","anyint"],
                         "attributes" : "ReadMem",
                       },

### ``llvm.genx.gather.masked.scaled2`` : vISA GATHER_SCALED instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
###
### * (Exec_size inferred from element offset type)
### * arg0: i32 log2 num blocks, constant (0/1/2 for num blocks 1/2/4)
### * arg1: i16 scale, constant
### * arg2: i32 surface index
### * arg3: i32 global offset in bytes
### * arg4: vXi32 element offset in bytes (overloaded)
### * arg5: vXi1 predicate (overloaded)
###
### * Return value: the data read
###
    "gather_masked_scaled2" : { "result" : "anyvector",
                                "arguments" : ["int","short","int","int","anyint","anyvector"],
                                "attributes" : "ReadMem",
                              },


### ``llvm.genx.gather4.scaled.<return type>.<vector type>.<any int>`` : vISA GATHER4_SCALED instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 channel mask, constant
### * arg2: i16 scale, constant
### * arg3: i32 surface index
### * arg4: i32 global offset in bytes
### * arg5: vXi32 element offset in bytes (overloaded)
### * arg6: old value of the data read
###
### * Return value: the data read
###
### The vector width of the element offset arg is the number of elements to
### read, which must be 8 or 16.
### The predicate arg must have the same vector width.
### The instruction reads up to 4 channels per element, with the lowest 4
### bits of the channel mask arg giving the mask of channels _not_ to read.
### The number of 0 bits in that lower 4 bits of the channel mask arg is the
### number of channels to read per element.
### The vector width of the return value must be the number of elements
### times the number of channels to read per element.
### The element type of the return value must be i32 or float.
###
    "gather4_scaled" : { "result" : "anyvector",
                         "arguments" : ["anyvector","int","short","int","int","anyint",0],
                         "attributes" : "ReadMem" ,
                       },

### ``llvm.genx.gather4.scaled2`` : vISA GATHER4_SCALED instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### This intrinsic doesn't have redundant predicate and old value that can be inferred
### from resulting wrregion.
###
### * (Exec_size inferred from element offset type)
### * arg0: i32 channel mask, constant
### * arg1: i16 scale, constant
### * arg2: i32 surface index
### * arg3: i32 global offset in bytes
### * arg4: vXi32 element offset in bytes
###
### * Return value: the data read
###
### The vector width of the element offset arg is the number of elements to
### read, which must be 8 or 16.
### The instruction reads up to 4 channels per element, with the lowest 4
### bits of the channel mask arg giving the mask of channels _not_ to read.
### The number of 0 bits in that lower 4 bits of the channel mask arg is the
### number of channels to read per element.
### The vector width of the return value must be the number of elements
### times the number of channels to read per element.
### The element type of the return value must be i32 or float.
###
    "gather4_scaled2" : { "result" : "anyvector",
                          "arguments" : ["int","short","int","int","anyint"],
                          "attributes" : "ReadMem",
                        },

### ``llvm.genx.gather4.masked.scaled2`` : vISA GATHER4_SCALED instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (Exec_size inferred from element offset type)
### * arg0: i32 channel mask, constant
### * arg1: i16 scale, constant
### * arg2: i32 surface index
### * arg3: i32 global offset in bytes
### * arg4: vXi32 element offset in bytes
### * arg5: vXi1 predicate (overloaded)
###
### * Return value: the data read
###
    "gather4_masked_scaled2" : { "result" : "anyvector",
                                 "arguments" : ["int","short","int","int","anyint","anyvector"],
                                 "attributes" : "ReadMem",
                               },


### ``llvm.genx.gather4.typed.<return type>.<vector type>.<vector type>`` : vISA GATHER4_TYPED instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 channel mask, constant
### * arg1: vXi1 predicate (Num_elts inferred from element offset type) (overloaded)
### * arg2: i32 surface index
### * arg3: vXi32 U pixel address (overloaded)
### * arg4: vXi32 V pixel address
### * arg5: vXi32 R pixel address
### * arg6: old value of the data read
###
### * Return value: the data read
###
### The vector widths of the U pixel address, V pixel address and R pixel
### address args must be equal and are the number of elements to read, which
### must be 8 or 16. (16 is split into 2x 8 by the GenX backend.)
### The predicate arg must have the same vector width.
### The instruction reads up to 4 channels per element, with the lowest 4
### bits of the channel mask arg giving the mask of channels to read.
### The number of 1 bits in that lower 4 bits of the channel mask arg is the
### number of channels to read per element. Mask "0000" is not allowed.
### The vector width of the return value must be the number of elements
### times the number of channels to read per element.
### The element type of the return value must be i32 or float.
###
    "gather4_typed" : { "result" : "anyvector",
                        "arguments" : ["int","anyvector","int","anyvector",2,2,0],
                        "attributes" : "ReadMem",
                      },

### ``llvm.genx.media.ld.<return type>`` : vISA MEDIA_LD instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 modifiers, constant
### * arg1: i32 surface index
### * arg2: i32 plane, constant
### * arg3: i32 block width in bytes, constant
### * (block height inferred from return type size and block width)
### * arg4: i32 x byte offset
### * arg5: i32 y byte offset
###
### * Return value: the data read.
###
### The number of bytes taken by a row in the return value, the "rounded
### block width", is the block width rounded up to the next power of two
### no less than 4. The size of the return type must be a multiple of
### this rounded block width, and the multiplier is the block height.
###
### The block width has a maximum of 32 (64 on BDW+). The maximum byte
### size of the return type is 256.
###
    "media_ld" : { "result" : "anyvector",
                   "arguments" : ["int","int","int","int","int","int"],
                   "attributes" : "ReadMem",
                 },

### ``llvm.genx.media.st.<vector type>`` : vISA MEDIA_ST instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 modifiers, constant
### * arg1: i32 surface index
### * arg2: i32 plane, constant
### * arg3: i32 block width in bytes, constant
### * (block height inferred from data type size and block width)
### * arg4: i32 x byte offset
### * arg5: i32 y byte offset
### * arg6: data to write (overloaded)
###
### The number of bytes taken by a row in the data to write, the "rounded
### block width", is the block width rounded up to the next power of two
### no less than 4. The size of the data to write type must be a multiple of
### this rounded block width, and the multiplier is the block height.
###
### The block width has a maximum of 32 (64 on BDW+). The maximum byte
### size of the data to write is 256.
###
    "media_st" : { "result" : "void",
                   "arguments" : ["int","int","int","int","int","int","anyvector"],
                   "attributes" : "None",
                 },

### ``llvm.genx.oword.ld*.<return type>`` : oword load instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.oword.ld`` : vISA OWORD_LD instruction
### * ``llvm.genx.oword.ld.unaligned`` : vISA OWORD_LD_UNALIGNED instruction
###
### * (log2 number of owords inferred from return type)
### * arg0: i32 is_modified, constant
### * arg1: i32 surface index
### * arg2: i32 offset (in owords for .ld / in bytes for .ld.unaligned)
###
### * Return value: the data read.
###
### The byte size of the return type must be 16, 32, 64, or 128.
###
    "oword_ld" : { "result" : "anyvector",
                   "arguments" : ["int","int","int"],
                   "attributes" : "ReadMem",
                 },
    "oword_ld_unaligned" : { "result" : "anyvector",
                             "arguments" : ["int","int","int"],
                             "attributes" : "ReadMem",
                           },

### ``llvm.genx.oword.st.<vector type>`` : vISA OWORD_ST instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (log2 number of owords inferred from data type)
### * arg0: i32 surface index
### * arg1: i32 offset (in owords)
### * arg2: data to write (overloaded)
###
### The byte size of the data to write must be 16, 32, 64, or 128.
###
    "oword_st" : { "result" : "void",
                   "arguments" : ["int","int","anyvector"],
                   "attributes" : "None",
                 },

### ``llvm.genx.scatter.private.<vector type>.<ptr type>.<any int>.<vector type>`` : CM internal, no VISA
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: v-by-i1 predicate (overloaded)
### * arg1: base pointer (overloaded)
### * arg2: vXi32 element offset in elements (overloaded)
### * arg3: the data to write. The first <num_elts> elements will be used. (overloaded)
###
### The vector width of the data to write is the number of elements to write,
### which must be 1, 8 or 16.
### The element offset arg must have the same vector width.
###
    "scatter_private" : { "result" : "void",
                          "arguments" : ["anyvector","anyptr","anyint","anyvector"],
                          "attributes" : "None"
                        },

### ``llvm.genx.scatter.scaled.<vector type>.<any int>.<vector type>`` : vISA SCATTER_SCALED instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 log2 num blocks, constant (0/1/2 for num blocks 1/2/4)
### * arg2: i16 scale, constant
### * arg3: i32 surface index
### * arg4: i32 global offset in bytes
### * arg5: vXi32 element offset (overloaded)
### * arg6: data to write (overloaded)
###
### The vector width of the element offset arg is the number of elements to
### write, which must be power of 2 and less than or equal to 32.
###
### The predicate arg must have the same vector width.
###
### The data type to write must have UD, D or F type. For 1 and 2 byte (1 x num
### blocks) accesses the upper bytes will be ignored.
###
    "scatter_scaled" : { "result" : "void",
                         "arguments" : ["anyvector","int","short","int","int","anyint","anyvector"],
                         "attributes" : "None",
                       },

### ``llvm.genx.scatter4.scaled.<vector type>.<any int>.<vector type>`` : vISA SCATTER4_SCALED instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (Exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 channel mask, constant
### * arg2: i16 scale, constant
### * arg3: i32 surface index
### * arg4: i32 global offset in bytes
### * arg5: vXi32 element offset in bytes (overloaded)
### * arg6: data to write (overloaded)
###
### The vector width of the element offset arg is the number of elements to
### write, which must be 8 or 16.
### The predicate arg must have the same vector width.
### The instruction writes up to 4 channels per element, with the lowest 4
### bits of the channel mask arg giving the mask of channels _not_ to write.
### The number of 0 bits in that lower 4 bits of the channel mask arg is the
### number of channels to write per element.
### The channels to write must be contiguous and starting at channel 0.
### The vector width of the data to write must be the number of elements
### times the number of channels to write per element.
### The element type of the data to write must be i32 or float.
###
    "scatter4_scaled" : { "result" : "void",
                          "arguments" : ["anyvector","int","short","int","int","anyint","anyvector"],
                          "attributes" : "None",
                        },

### ``llvm.genx.scatter4.typed.<vector type>.<vector type>.<vector type>`` : vISA SCATTER4_TYPED instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 channel mask, constant
### * arg1: vXi1 predicate (Num_elts inferred from U pixel address type) (overloaded)
### * arg2: i32 surface index
### * arg3: v8Xi32 U pixel address (overloaded)
### * arg4: v8Xi32 V pixel address
### * arg5: v8Xi32 R pixel address
### * arg6: data to write (overloaded)
###
### The vector widths of the U pixel address, V pixel address and R pixel
### address args must be equal and are the number of elements to write, which
### must be 8 or 16. (16 is split into 2x 8 by the GenX backend.)
### The predicate arg must have the same vector width.
### The instruction writes up to 4 channels per element, with the lowest 4
### bits of the channel mask arg giving the mask of channels to write.
### The number of 1 bits in that lower 4 bits of the channel mask arg is the
### number of channels to write per element. Mask "0000" is not allowed.
### The vector width of the data to write must be the number of elements
### times the number of channels to write per element.
### The element type of the source value must be i32 or float.
###
    "scatter4_typed" : { "result" : "void",
                         "arguments" : ["int","anyvector","int","anyvector",1,1,"anyvector"],
                         "attributes" : "None",
                       },

### ``llvm.genx.transpose.ld.<return type>`` : vISA TRANSPOSE_LD instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 surface index
### * arg1: i32 log2 block width in i32s, constant (0-3)
### * (log2 block height inferred from block width and data type, 0-3)
### * arg2: i32 X offset
### * arg3: i32 Y offset
###
### * Return value: the data read
###
### The vector width of the return value is the number of elements to read.
### This must be a multiple of the block width. The block height is then
### inferred from those values.
### The element type of the return value must be i32 or float.
###
    "transpose_ld" : { "result" : "anyvector",
                       "arguments" : ["int","int","int","int"],
                       "attributes" : "ReadMem"
                     },

### ``llvm.genx.untyped.atomic.*.<return type>.<vector type>`` : vISA UNTYPED_ATOMIC with binary operator
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.untyped.atomic.add`` : vISA UNTYPED_ATOMIC ADD instruction
### * ``llvm.genx.untyped.atomic.sub`` : vISA UNTYPED_ATOMIC SUB instruction
### * ``llvm.genx.untyped.atomic.min`` : vISA UNTYPED_ATOMIC MIN instruction
### * ``llvm.genx.untyped.atomic.max`` : vISA UNTYPED_ATOMIC MAX instruction
### * ``llvm.genx.untyped.atomic.xchg`` : vISA UNTYPED_ATOMIC XCHG instruction
### * ``llvm.genx.untyped.atomic.and`` : vISA UNTYPED_ATOMIC AND instruction
### * ``llvm.genx.untyped.atomic.or`` : vISA UNTYPED_ATOMIC OR instruction
### * ``llvm.genx.untyped.atomic.xor`` : vISA UNTYPED_ATOMIC XOR instruction
### * ``llvm.genx.untyped.atomic.imin`` : vISA UNTYPED_ATOMIC IMIN instruction
### * ``llvm.genx.untyped.atomic.imax`` : vISA UNTYPED_ATOMIC IMAX instruction
###
### * arg0: vXi1 predicate (Num_elts inferred from element offset type) (overloaded)
### * arg1: i32 surface index
### * arg2: i32 global offset in i32s
### * arg3: vXi32 element offset in i32s
### * arg4: vXi32 src
### * arg5: vXi32 original value of the register that the data is read into
###
### * Return value: vXi32 the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width, which must be 8 or 16.
###
    "untyped_atomic_add" : { "result" : "anyvector",
                             "arguments" : ["anyvector","int","int",0,0,0],
                             "attributes" : "None"
                           },
    "untyped_atomic_sub" : { "result" : "anyvector",
                             "arguments" : ["anyvector","int","int",0,0,0],
                             "attributes" : "None"
                           },
    "untyped_atomic_min" : { "result" : "anyvector",
                             "arguments" : ["anyvector","int","int",0,0,0],
                             "attributes" : "None"
                           },
    "untyped_atomic_max" : { "result" : "anyvector",
                             "arguments" : ["anyvector","int","int",0,0,0],
                             "attributes" : "None"
                           },
    "untyped_atomic_xchg" : { "result" : "anyvector",
                              "arguments" : ["anyvector","int","int",0,0,0],
                              "attributes" : "None"
                            },
    "untyped_atomic_and" : { "result" : "anyvector",
                             "arguments" : ["anyvector","int","int",0,0,0],
                             "attributes" : "None"
                           },
    "untyped_atomic_or" : { "result" : "anyvector",
                            "arguments" : ["anyvector","int","int",0,0,0],
                            "attributes" : "None"
                          },
    "untyped_atomic_xor" : { "result" : "anyvector",
                             "arguments" : ["anyvector","int","int",0,0,0],
                             "attributes" : "None"
                           },
    "untyped_atomic_imin" : { "result" : "anyvector",
                              "arguments" : ["anyvector","int","int",0,0,0],
                              "attributes" : "None"
                            },
    "untyped_atomic_imax" : { "result" : "anyvector",
                              "arguments" : ["anyvector","int","int",0,0,0],
                              "attributes" : "None"
                            },

### ``llvm.genx.untyped.atomic.*.<return type>.<vector type>`` : vISA UNTYPED_ATOMIC with inc/dec
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.untyped.atomic.inc`` : vISA UNTYPED_ATOMIC INC instruction
### * ``llvm.genx.untyped.atomic.dec`` : vISA UNTYPED_ATOMIC DEC instruction
###
### * arg0: vXi1 predicate (Num_elts inferred from element offset type) (overloaded)
### * arg1: i32 surface index
### * arg2: i32 global offset in i32s
### * arg3: vXi32 element offset in i32s
### * arg4: vXi32 original value of the register that the data is read into
###
### * Return value: vXi32 the old value read
###
### Predicate, element offset and the return value must have the same vector
### width, which must be 8 or 16.
###
    "untyped_atomic_inc" : { "result" : "anyvector",
                             "arguments" : ["anyvector","int","int",0,0],
                             "attributes" : "None"
                           },
    "untyped_atomic_dec" : { "result" : "anyvector",
                             "arguments" : ["anyvector","int","int",0,0],
                             "attributes" : "None"
                           },

### ``llvm.genx.untyped.atomic.cmpxchg.<return type>.<vector type>`` : vISA UNTYPED_ATOMIC CMPXCHG instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: vXi1 predicate (Num_elts inferred from element offset type) (overloaded)
### * arg1: i32 surface index
### * arg2: i32 global offset in i32s
### * arg3: vXi32 element offset in i32s
### * arg4: vXi32 src0
### * arg5: vXi32 src1
### * arg6: vXi32 original value of the register that the data is read into
###
### * Return value: vXi32 the old value read
###
### Predicate, element offset, src0, src1, and the return value must all have
### the same vector width, which must be 8 or 16.
###
    "untyped_atomic_cmpxchg" : { "result" : "anyvector",
                                 "arguments" : ["anyvector","int","int",0,0,0,0],
                                 "attributes" : "None"
                               },

### ``llvm.genx.svm.block.ld*.<return type>.<address type>`` : vISA SVM BLOCK_LD instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * ``llvm.genx.svm.block.ld`` : vISA SVM BLOCK_LD instruction with oword alignment
### * ``llvm.genx.svm.block.ld.unaligned`` : vISA SVM BLOCK_LD instruction with
###   dword alignment
###
### * (log2 number of oword inferred from data type)
### * arg0: i32/i64 address
###
### * Return value: data read
###
### The data read must have a size that is a power of two from 16 to 128
### bytes.
###
    "svm_block_ld" : { "result" : "anyvector",
                       "arguments" : ["anyint"],
                       "attributes" : "ReadMem"
                     },
    "svm_block_ld_unaligned" : { "result" : "anyvector",
                                 "arguments" : ["anyint"],
                                 "attributes" : "ReadMem"
                               },

### ``llvm.genx.svm.block.st.<address type><vector type>`` : vISA SVM BLOCK_ST instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (log2 number of oword inferred from data type)
### * arg0: i32/i64 address
### * arg1: data to write (overloaded)
###
### The data to write must have a size that is a power of two from 16 to 128
### bytes.
###
    "svm_block_st" : { "result" : "void",
                       "arguments" : ["anyint","anyvector"],
                       "attributes" : "None"
                     },

### ``llvm.genx.svm.gather.<return type>.<vector type>.<any int>`` : vISA SVM GATHER instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (exec size inferred from address vector width)
### * arg0: vXi1 predicate (Num_elts inferred from this arg) (overloaded)
### * (block size inferred from data element type)
### * arg1: i32 log2 num blocks, constant (0/1/2/3 for num blocks 1/2/4/8)
### * arg2: vXi64 address (X = 8 or 16) (overloaded)
### * arg3: old value of the data read
###
### * Return value: data read
###
### The return value element type is i8 for block size 1, i32/float for
### block size 4, or i64/double for block size 8.
### The return value vector width is the address vector width times
### number of blocks (rounded up to 4 if block size is 1).
###
    "svm_gather" : { "result" : "anyvector",
                     "arguments" : ["anyvector","int","anyint",0],
                     "attributes" : "ReadMem"
                   },

### ``llvm.genx.svm.gather4.scaled.<return type>.<vector type>.<any int>`` : vISA SVM GATHER4_SCALED instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 channel mask, constant
### * arg2: i16 scale, constant
### * arg3: i64 global address in bytes
### * arg4: vXi64 element offset in bytes (overloaded)
### * arg5: old value of the data read
###
### * Return value: the data read
###
### The vector width of the element offset arg is the number of elements to
### read, which must be 8 or 16.
### The predicate arg must either have the same vector width, or be a scalar
### i1 constant with value 1.
### The instruction reads up to 4 channels per element, with the lowest 4
### bits of the channel mask arg giving the mask of channels _not_ to read.
### The number of 0 bits in that lower 4 bits of the channel mask arg is the
### number of channels to read per element.
### The vector width of the return value must be the number of elements
### times the number of channels to read per element.
### The element type of the return value must be i32 or float.
###
    "svm_gather4_scaled" : { "result" : "anyvector",
                             "arguments" : ["anyvector","int","short","long","anyint",0],
                             "attributes" : "ReadMem"
                           },

### ``llvm.genx.svm.scatter.<vector type>.<any int>.<vector type>`` : vISA SVM SCATTER instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (exec size inferred from address vector width)
### * arg0: vXi1 predicate (Num_elts inferred from element offset type) (overloaded)
### * (block size inferred from data element type)
### * arg1: i32 log2 num blocks, constant (0/1/2/3 for num blocks 1/2/4/8)
### * arg2: vXi64 address (X = 8 or 16) (overloaded)
### * arg3: data to write (overloaded)
###
### The data to write element type is i8 for block size 1, i32/float for
### block size 4, or i64/double for block size 8.
### The data vector width is the address vector width times
### number of blocks (rounded up to 4 if block size is 1).
###
    "svm_scatter" : { "result" : "void",
                      "arguments" : ["anyvector","int","anyint","anyvector"],
                      "attributes" : "None"
                    },

### ``llvm.genx.svm.scatter4.scaled.<vector type>.<any int>.<vector type>`` : vISA SVM SCATTER4_SCALED instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * (exec_size inferred from element offset type)
### * arg0: vXi1 predicate (overloaded)
### * arg1: i32 channel mask, constant
### * arg2: i16 scale, constant
### * arg3: i64 global address in bytes
### * arg4: vXi64 element offset in bytes (overloaded)
### * arg5: data to write (overloaded)
###
### The vector width of the element offset arg is the number of elements to
### write, which must be 8 or 16.
### The predicate arg must either have the same vector width, or be a scalar
### i1 constant with value 1.
### The instruction writes up to 4 channels per element, with the lowest 4
### bits of the channel mask arg giving the mask of channels _not_ to write.
### The number of 0 bits in that lower 4 bits of the channel mask arg is the
### number of channels to write per element.
### The vector width of the data to write arg must be the number of elements
### times the number of channels to write per element.
### The element type of the data to write arg must be i32 or float.
###
    "svm_scatter4_scaled" : { "result" : "void",
                              "arguments" : ["anyvector","int","short","long","anyint","anyvector"],
                              "attributes" : "None"
                            },

### ``llvm.genx.svm.atomic.*.<return type>.<vector type>.<any int>`` : vISA SVM_ATOMIC with binary operator
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.svm.atomic.add`` : vISA SVM_ATOMIC ADD instruction
### * ``llvm.genx.svm.atomic.sub`` : vISA SVM_ATOMIC SUB instruction
### * ``llvm.genx.svm.atomic.min`` : vISA SVM_ATOMIC MIN instruction
### * ``llvm.genx.svm.atomic.max`` : vISA SVM_ATOMIC MAX instruction
### * ``llvm.genx.svm.atomic.xchg`` : vISA SVM_ATOMIC XCHG instruction
### * ``llvm.genx.svm.atomic.and`` : vISA SVM_ATOMIC AND instruction
### * ``llvm.genx.svm.atomic.or`` : vISA SVM_ATOMIC OR instruction
### * ``llvm.genx.svm.atomic.xor`` : vISA SVM_ATOMIC XOR instruction
### * ``llvm.genx.svm.atomic.imin`` : vISA SVM_ATOMIC IMIN instruction
### * ``llvm.genx.svm.atomic.imax`` : vISA SVM_ATOMIC IMAX instruction
###
### * arg0: vXi1 predicate (Num_elts inferred from this arg) (overloaded)
### * arg1: vXi64 element addresses in bytes (overloaded)
### * arg2: vXi32/vXi64 src
### * arg3: original value of the register that the data is read into
###
### * Return value: vXi32/vXi64 the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width, which must be 1, 2, 4, or 8.
###
    "svm_atomic_add" : { "result" : "anyvector",
                         "arguments" : ["anyvector","anyint",0,0],
                         "attributes" : "None"
                       },
    "svm_atomic_sub" : { "result" : "anyvector",
                         "arguments" : ["anyvector","anyint",0,0],
                         "attributes" : "None"
                       },
    "svm_atomic_min" : { "result" : "anyvector",
                         "arguments" : ["anyvector","anyint",0,0],
                         "attributes" : "None"
                       },
    "svm_atomic_max" : { "result" : "anyvector",
                         "arguments" : ["anyvector","anyint",0,0],
                         "attributes" : "None"
                       },
    "svm_atomic_xchg" : { "result" : "anyvector",
                          "arguments" : ["anyvector","anyint",0,0],
                          "attributes" : "None"
                        },
    "svm_atomic_and" : { "result" : "anyvector",
                         "arguments" : ["anyvector","anyint",0,0],
                         "attributes" : "None"
                       },
    "svm_atomic_or" : { "result" : "anyvector",
                        "arguments" : ["anyvector","anyint",0,0],
                        "attributes" : "None"
                      },
    "svm_atomic_xor" : { "result" : "anyvector",
                         "arguments" : ["anyvector","anyint",0,0],
                         "attributes" : "None"
                       },
    "svm_atomic_imin" : { "result" : "anyvector",
                          "arguments" : ["anyvector","anyint",0,0],
                          "attributes" : "None"
                        },
    "svm_atomic_imax" : { "result" : "anyvector",
                          "arguments" : ["anyvector","anyint",0,0],
                          "attributes" : "None"
                        },

### ``llvm.genx.svm.atomic.*.<return type>.<vector type>.<any int>`` : vISA SVM_ATOMIC with inc/dec
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.svm.atomic.inc`` : vISA SVM_ATOMIC INC instruction
### * ``llvm.genx.svm.atomic.dec`` : vISA SVM_ATOMIC DEC instruction
###
### * arg0: vXi1 predicate (Num_elts inferred from this arg) (overloaded)
### * arg1: vXi64 element addresses in bytes (overloaded)
### * arg2: original value of the register that the data is read into
###
### * Return value: vXi32/vXi64 the old value read
###
### Predicate, element offset and the return value must have the same vector
### width, which must be 1, 2, 4 or 8.
###
    "svm_atomic_inc" : { "result" : "anyvector",
                         "arguments" : ["anyvector","anyint",0],
                         "attributes" : "None"
                       },
    "svm_atomic_dec" : { "result" : "anyvector",
                         "arguments" : ["anyvector","anyint",0],
                         "attributes" : "None"
                       },

### ``llvm.genx.svm.atomic.cmpxchg.<return type>.<vector type>.<any int>`` : vISA SVM_ATOMIC CMPXCHG instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: vXi1 predicate (Num_elts inferred from element offset type) (overloaded)
### * arg1: vXi64 element addresses in bytes (overloaded)
### * arg2: vXi32/vXi64 src0
### * arg3: vXi32/vXi64 src1
### * arg4: original value of the register that the data is read into
###
### * Return value: vXi32/vXi64 the old value read
###
### Predicate, element offset, src0, src1, and the return value must all have
### the same vector width, which must be 1, 2, 4 or 8.
###
    "svm_atomic_cmpxchg" : { "result" : "anyvector",
                             "arguments" : ["anyvector","anyint",0,0,0],
                             "attributes" : "None"
                           },

### ``llvm.genx.svm.atomic.*.<return type>.<vector type>.<any int>`` : vISA SVM_ATOMIC with floating-point binary operator
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.svm.atomic.fmin`` : vISA SVM_ATOMIC FMIN instruction
### * ``llvm.genx.svm.atomic.fmax`` : vISA SVM_ATOMIC FMAX instruction
###
### * arg0: vXi1 predicate (Num_elts inferred from this arg) (overloaded)
### * arg1: vXi64 element addresses in bytes (overloaded)
### * arg2: vXf32 src
### * arg3: original value of the register that the data is read into
###
### * Return value: vXf32 the old value read
###
### Predicate, element offset, src, and the return value must all have the
### same vector width, which must be 1, 2, 4, or 8.
###
    "svm_atomic_fmin" : { "result" : "anyvector",
                          "arguments" : ["anyvector","anyint",0,0],
                          "attributes" : "None"
                        },
    "svm_atomic_fmax" : { "result" : "anyvector",
                          "arguments" : ["anyvector","anyint",0,0],
                          "attributes" : "None"
                        },

### ``llvm.genx.svm.atomic.fcmpwr.<return type>.<vector type>.<any int>`` : vISA SVM_ATOMIC FCMPWR instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: vXi1 predicate (Num_elts inferred from element offset type) (overloaded)
### * arg1: vXi64 element addresses in bytes (overloaded)
### * arg2: vXf32 src0
### * arg3: vXf32 src1
### * arg4: original value of the register that the data is read into
###
### * Return value: vXf32 the old value read
###
### Predicate, element offset, src0, src1, and the return value must all have
### the same vector width, which must be 1, 2, 4 or 8.
###
    "svm_atomic_fcmpwr" : { "result" : "anyvector",
                            "arguments" : ["anyvector","anyint",0,0,0],
                            "attributes" : "None"
                          },

### ``llvm.genx.load.<return type>.<any int>`` : vISA LOAD (sampler load) instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 channel mask, constant (simd_mode inferred from pixel address operands)
### * arg1: i32 surface index
### * arg2: vXi32 U pixel address (overloaded)
### * arg3: vXi32 V pixel address
### * arg4: vXi32 R pixel address
###
### * Return value: the data read
###
### The vector widths of the U pixel address, V pixel address and R pixel
### address args must be equal, and either 8 or 16.
###
### The instruction reads up to 4 channels per element, with the lowest 4
### bits of the channel mask arg giving the mask of channels _not_ to read.
### The number of 0 bits in that lower 4 bits of the channel mask arg is the
### number of channels to read per element.
###
### For SIMD8 pre-BDW, the vector width of the data read must be 32.
### For SIMD8 BDW+, or for SIMD16, the vector width of the data read must be
### the SIMD width times the number of enabled channels.
###
### The element type of the return value must be i32 or float.
###
    "load" : { "result" : "anyvector",
               "arguments" : ["int","int","anyint",1,1],
               "attributes" : "ReadMem"
             },

### ``llvm.genx.sample.<return type>.<any float>`` : vISA SAMPLE instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 channel mask, constant (simd_mode inferred from pixel address operands)
### * arg1: i32 sampler index
### * arg2: i32 surface index
### * arg3: vXfloat U pixel address (overloaded)
### * arg4: vXfloat V pixel address
### * arg5: vXfloat R pixel address
###
### * Return value: the data read
###
### The vector widths of the U pixel address, V pixel address and R pixel
### address args must be equal, and either 8 or 16.
###
### The instruction reads up to 4 channels per element, with the lowest 4
### bits of the channel mask arg giving the mask of channels _not_ to read.
### The number of 0 bits in that lower 4 bits of the channel mask arg is the
### number of channels to read per element.
###
### For SIMD8 pre-BDW, the vector width of the data read must be 32.
### For SIMD8 BDW+, or for SIMD16, the vector width of the data read must be
### the SIMD width times the number of enabled channels.
###
### The element type of the return value must be i32 or float.
###
    "sample" : { "result" : "anyvector",
                 "arguments" : ["int","int","int","anyfloat",1,1],
                 "attributes" : "ReadMem"
               },

### ``llvm.genx.sample.unorm.<return type>`` : vISA SAMPLE_UNORM instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 channel mask, constant
### * arg1: i32 sampler index
### * arg2: i32 surface index
### * arg3: float U pixel address
### * arg4: float V pixel address
### * arg5: float DeltaU
### * arg6: float DeltaV
###
### * Return value: v8i16 the data read
###
### The instruction reads up to 4 channels per element, with the lowest 4
### bits of the channel mask arg giving the mask of channels _not_ to read.
### The number of 0 bits in that lower 4 bits of the channel mask arg is the
### number of channels to read per element.
###
    "sample_unorm" : { "result" : "anyvector",
                       "arguments" : ["int","int","int","float","float","float","float"],
                       "attributes" : "ReadMem"
                     },

### ``llvm.genx.3d.sample.<return type>.<vector type>....`` : vISA 3D_SAMPLE instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 opcode, constant
### * arg1: vXi1 predicate mask, used to determine execution size (overloaded)
### * arg2: i32 channel mask, constant
### * arg3: i16 aoffimmi
### * arg4: i32 sampler index
### * arg5: i32 surface index
### * argN: vXf or vXhf operand, for 6 <= N <= 20 (all overloaded)
###
### * Return value: the data read
###
    "3d_sample" : { "result" : "anyvector",
                    "arguments" : ["int","anyvector","int","short","int","int","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector"],
                    "attributes" : "ReadMem"
                  },

### ``llvm.genx.3d.load.<return type>.<vector type>....`` : vISA 3D_LOAD instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 opcode, constant
### * arg1: vXi1 predicate mask, used to determine execution size (overloaded)
### * arg2: i32 channel mask, constant
### * arg3: i16 aoffimmi
### * arg4: i32 surface index
### * argN: vXf or vXhf operand, for 5 <= N <= 19 (all overloaded)
###
### * Return value: the data read
###
    "3d_load" : { "result" : "anyvector",
                  "arguments" : ["int","anyvector","int","short","int","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector","anyvector"],
                  "attributes" : "ReadMem"
                },

### ``llvm.genx.avs.<return type>`` : vISA AVS instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 channel mask, constant
### * arg1: i32 sampler index
### * arg2: i32 surface index
### * arg3: float U offset
### * arg4: float V offset
### * arg5: float deltaU
### * arg6: float deltaV
### * arg7: float u2d
### * arg8: i32 groupID
### * arg9: i32 verticalBlockNumber
### * arg10: i32 Output format control, constant
### * arg11: float v2d
### * arg12: i32 execMode, constant
### * arg13: i8 IEFBypass
###
### * Return value: the data read.
###
### The actual data returned is determined by a combination of <channel>,
### <cntrl>, <execMode>, as well as whether output shuffle is enabled in the
### sampler state.
###
### SIMD Control Flow: channel enable is ignored.
###
    "avs" : { "result" : "anyvector",
              "arguments" : ["int","int","int","float","float","float","float","float","int","int","int","float","int","char"],
              "attributes" : "ReadMem"
            },

### ``llvm.genx.barrier`` : vISA BARRIER instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
    "barrier" : { "result" : "void",
                  "arguments" : [],
                  "attributes" : "Convergent"
                },

### ``llvm.genx.sbarrier`` : vISA SBARRIER instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i8 signal flag, constant
###
    "sbarrier" : { "result" : "void",
                   "arguments" : ["char"],
                   "attributes" : "Convergent"
                 },

### ``llvm.genx.nbarrier`` : vISA NBARRIER instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i8 signal flag, constant
### * arg1: i8 barrier id
### * arg2: i8 number of threads
###
    "nbarrier" : { "result" : "void",
                   "arguments" : ["char","char","char"],
                   "attributes" : "Convergent"
                 },

### ``llvm.genx.cache.flush`` : vISA CACHE_FLUSH instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
    "cache_flush" : { "result" : "void",
                      "arguments" : [],
                      "attributes" : "None"
                    },

### ``llvm.genx.fence`` : vISA FENCE instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i8 mask, constant
###
    "fence" : { "result" : "void",
                "arguments" : ["char"],
                "attributes" : "None"
              },

### ``llvm.genx.wait`` : vISA WAIT instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i8 thread mask
###
    "wait" : { "result" : "void",
               "arguments" : ["char"],
               "attributes" : "None"
             },

### ``llvm.genx.yield`` : vISA YIELD instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
    "yield" : { "result" : "void",
                "arguments" : [],
                "attributes" : "None"
              },

### ``llvm.genx.raw.send.<return type>.<any int>.<vector type>`` : vISA RAW_SEND instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 modifier whether it is send or sendc, constant
### * (exec_size inferred from predicate vector width, defaulting to 16
###   if predicate is i1)
### * arg1: i1/vXi1 predicate (overloaded)
### * arg2: i32 extended message descriptor, constant
### * (numsrc inferred from src size)
### * (numdst inferred from dst size)
### * arg3: i32 desc
### * arg4: src (overloaded)
### * arg5: old_dst
###
### * Return value: dst
###
### The SEND instruction has a field for the size of each of src
### and dst. These are inferred by rounding the size of each of src and
### dst up to the next whole GRF.
###
### If the send writes to the whole of dst, or the program does not care what
### was in those registers before, then set old_dst to UndefValue (of the same
### type as dst). If on the other hand the send is predicated and the program
### needs to see what was in the parts of destination registers not written
### by the send, then use old_dst as the "old value of destination registers"
### input.
###
### The predicate must be constant i1 with value 1 for a message that is not
### predicatable. For a predicatable message, it must be a vector of i1 with
### width determining the execution size.
###
    "raw_send" : { "result" : "anyvector",
                   "arguments" : ["int","anyint","int","int","anyvector",0],
                   "attributes" : "None"
                 },

### ``llvm.genx.raw.send.noresult.<any int>.<vector type>`` : vISA RAW_SEND instruction with no result
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 modifier whether it is send or sendc, constant
### * (exec_size inferred from predicate vector width, defaulting to 16
###   if predicate is i1)
### * arg1: i1/vXi1 predicate (overloaded)
### * arg2: i32 extended message descriptor, constant
### * (numsrc inferred from src size)
### * (numdst is 0)
### * arg3: i32 desc
### * arg4: src (overloaded)
###
### The SEND instruction has a field for the size of src. This is inferred by
### rounding the size of src up to the next whole GRF.
###
### The predicate must be constant i1 with value 1 for a message that is not
### predicatable. For a predicatable message, it must be a vector of i1 with
### width determining the execution size.
###
    "raw_send_noresult" : { "result" : "void",
                            "arguments" : ["int","anyint","int","int","anyvector"],
                            "attributes" : "None"
                          },

### ``llvm.genx.raw.sends.<return type>.<any int>.<vector type>.<vector type>`` : vISA RAW_SENDS instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 modifier whether it is send or sendc, constant
### * (exec_size inferred from predicate vector width, defaulting to 16
###   if predicate is i1)
### * arg1: i1/vXi1 predicate  (overloaded)
### * arg2: i8 sfid
### * arg3: i32 extended message descriptor, constant
### * (numsrc inferred from src size)
### * (numsrc2 inferred from src2 size)
### * (numdst inferred from dst size)
### * arg4: i32 desc
### * arg5: src  (overloaded)
### * arg6: src2  (overloaded)
### * arg7: old_dst
###
### * Return value: dst
###
### The SENDS instruction has a field for the size of each of src, src2
### and dst. These are inferred by rounding the size of each of src, src2 and
### dst up to the next whole GRF.
###
### If the send writes to the whole of dst, or the program does not care what
### was in those registers before, then set old_dst to UndefValue (of the same
### type as dst). If on the other hand the send is predicated and the program
### needs to see what was in the parts of destination registers not written
### by the send, then use old_dst as the "old value of destination registers"
### input.
###
### The predicate must be constant i1 with value 1 for a message that is not
### predicatable. For a predicatable message, it must be a vector of i1 with
### width determining the execution size.
###
    "raw_sends" : { "result" : "anyvector",
                    "arguments" : ["int","anyint","char","int","int","anyvector","anyvector",0],
                    "attributes" : "None"
                  },

### ``llvm.genx.raw.sends.noresult.<any int>.<vector type>.<vector type>`` : vISA RAW_SENDS instruction with no result
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 modifier whether it is send or sendc, constant
### * (exec_size inferred from predicate vector width, defaulting to 16
###   if predicate is i1)
### * arg1: i1/vXi1 predicate  (overloaded)
### * arg2: i8 sfid
### * arg3: i32 extended message descriptor
### * (numsrc inferred from src size)
### * (numsrc2 inferred from src2 size)
### * (numdst is 0)
### * arg4: i32 desc
### * arg5: src  (overloaded)
### * arg6: src2  (overloaded)
###
### The SENDS instruction has a field for the size of each of src and src2.
### These are inferred by rounding the size of each of src and src2 up to the
### next whole GRF.
###
### The predicate must be constant i1 with value 1 for a message that is not
### predicatable. For a predicatable message, it must be a vector of i1 with
### width determining the execution size.
###
    "raw_sends_noresult" : { "result" : "void",
                             "arguments" : ["int","anyint","char","int","int","anyvector","anyvector"],
                             "attributes" : "None"
                           },

### ``llvm.genx.raw.send2.<return type>.<vector type>.<vector type>`` : vISA RAW_SEND instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * arg0: i8 modifier, bit-0 represents sendc, bit-1 represents EOT
### * arg1: i8 exec_size
### * arg2: i1/vXi1 predicate  (overloaded)
### * arg3: i8 numsrc1
### * (numsrc2 is 0)
### * arg4: i8 numdst
### * arg5: i8 sfid
### * arg6: i32 extended message descriptor
### * arg7: i32 message descriptor
### * arg8: src  (overloaded)
### * (src2 is NULL)
### * arg9: old_dst
###
### * Return value: dst
###
### This intrinsic supports full encoding of the vISA raw_send instruction.
###
    "raw_send2" : { "result" : "anyvector",
                    "arguments" : ["char","char","anyvector","char","char","char","int","int","anyvector",0],
                    "attributes" : "None"
                  },

### ``llvm.genx.raw.send2.noresult.<vector type>.<vector type>`` : vISA RAW_SEND instruction with no result
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i8 modifier, bit-0 represents sendc, bit-1 represents EOT
### * arg1: i8 exec_size
### * arg2: i1/vXi1 predicate  (overloaded)
### * arg3: i8 numsrc1
### * (numsrc2 is 0)
### * (numdst is 0)
### * arg4: i8 sfid
### * arg5: i32 extended message descriptor
### * arg6: i32 message descriptor
### * arg7: src (overloaded)
### * (src2 is NULL)
###
### This intrinsic supports full encoding of the vISA raw_send instruction with no result.
###
    "raw_send2_noresult" : { "result" : "void",
                             "arguments" : ["char","char","anyvector","char","char","int","int","anyvector"],
                             "attributes" : "None"
                           },

### ``llvm.genx.raw.sends2.<return type>.<vector type>.<vector type>.<vector type>`` : vISA RAW_SENDS instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i8 modifier, bit-0 represents sendc, bit-1 represents EOT
### * arg1: i8 exec_size
### * arg2: i1/vXi1 predicate  (overloaded)
### * arg3: i8 numsrc1
### * arg4: i8 numsrc2
### * arg5: i8 numdst
### * arg6: i8 sfid
### * arg7: i32 extended message descriptor
### * arg8: i32 message descriptor
### * arg9: src  (overloaded)
### * arg10: src2  (overloaded)
### * arg11: old_dst
###
### * Return value: dst
###
### This intrinsic supports full encoding of the vISA raw_sends instruction.
###
    "raw_sends2" : { "result" : "anyvector",
                     "arguments" : ["char","char","anyvector","char","char","char","char","int","int","anyvector","anyvector",0],
                     "attributes" : "None"
                   },

### ``llvm.genx.raw.sends2.noresult.<vector type>.<vector type>.<vector type>`` : vISA RAW_SENDS instruction with no result
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i8 modifier, bit-0 represents sendc, bit-1 represents EOT
### * arg1: i8 exec_size
### * arg2: i1/vXi1 predicate  (overloaded)
### * arg3: i8 numsrc1
### * arg4: i8 numsrc2
### * (numdst is 0)
### * arg5: i8 sfid
### * arg6: i32 extended message descriptor
### * arg7: i32 message descriptor
### * arg8: src  (overloaded)
### * arg9: src2  (overloaded)
###
### This intrinsic supports full encoding of the vISA raw_sends instruction with no result.
###
    "raw_sends2_noresult" : { "result" : "void",
                              "arguments" : ["char","char","anyvector","char","char","char","int","int","anyvector","anyvector"],
                              "attributes" : "None"
                            },

## --------------------------
### Video Analytics Intrinsics
### --------------------------
###
### ``llvm.genx.va.convolve2d.<return type>`` vISA VA 2d Convolve instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: i32 surface index
### * arg2: float normalized x-coordinate
### * arg3: float normalized y-coordinate
### * arg4: i32 properties value specifying return data format and input region size, constant
###
### * Return value: v64i16 or v16i16 matrix, depending on properties value
###
    "va_convolve2d" : { "result" : "anyint",
                        "arguments" : ["int","int","float","float","int"],
                        "attributes" : "ReadMem"
                      },

### ``llvm.genx.va.hdc.convolve2d`` vISA VA HDC 2d Convolve instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: i32 surface index
### * arg2: float normalized x-coordinate
### * arg3: float normalized y-coordinate
### * arg4: i32 properties value specifying return data format and input region size, constant
### * arg5: i32 destination surface
### * arg6: i16 destination surface x-offset
### * arg7: i16 destination surface y-offset
###
    "va_hdc_convolve2d" : { "result" : "void",
                            "arguments" : ["int","int","float","float","int","int","short","short"],
                            "attributes" : "None"
                          },

### ``llvm.genx.va.erode.<return type>`` vISA VA Erode instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: i32 surface index
### * arg2: float normalized x-coordinate
### * arg3: float normalized y-coordinate
### * arg4: i32 properties value specifying return data format, constant
###
### * Return value: vXi32
###
    "va_erode" : { "result" : "anyint",
                   "arguments" : ["int","int","float","float","int"],
                   "attributes" : "ReadMem"
                 },

### ``llvm.genx.va.hdc.erode`` vISA VA HDC Erode instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: i32 surface index
### * arg2: float normalized x-coordinate
### * arg3: float normalized y-coordinate
### * arg4: i32 destination surface
### * arg5: i16 destination surface x-offset
### * arg6: i16 destination surface y-offset
###
    "va_hdc_erode" : { "result" : "void",
                       "arguments" : ["int","int","float","float","int","short","short"],
                       "attributes" : "None"
                     },

### ``llvm.genx.va.dilate.<return type>`` vISA VA Dilate instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: i32 surface index
### * arg2: float normalized x-coordinate
### * arg3: float normalized y-coordinate
### * arg4: i32 properties value specifying return data format, constant
###
### * Return value: vXi32
###
    "va_dilate" : { "result" : "anyint",
                    "arguments" : ["int","int","float","float","int"],
                    "attributes" : "ReadMem"
                  },

### ``llvm.genx.va.hdc.dilate`` vISA VA HDC Dilate instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: i32 surface index
### * arg2: float normalized x-coordinate
### * arg3: float normalized y-coordinate
### * arg4: i32 destination surface
### * arg5: i16 destination surface x-offset
### * arg6: i16 destination surface y-offset
###
    "va_hdc_dilate" : { "result" : "void",
                        "arguments" : ["int","int","float","float","int","short","short"],
                        "attributes" : "None"
                      },

### ``llvm.genx.va.minmax.<return type>`` vISA MinMax instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 surface index
### * arg1: float normalized x-coordinate
### * arg2: float normalized y-coordinate
### * arg3: i32 enable specific minmax functionality
###
### * Return: v32i8 or v16i16 depending on the surface format
###
    "va_minmax" : { "result" : "anyint",
                    "arguments" : ["int","float","float","int"],
                    "attributes" : "ReadMem"
                  },

### ``llvm.genx.va.minmax.filter.<return type>`` vISA MinMax Filter instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: i32 surface index
### * arg2: float normalized x-coordinate
### * arg3: float normalized y-coordinate
### * arg4: i32 specifies the size of the minmax value returned, constant
### * arg5: i32 specifies the return data format, constant
### * arg6: i32 enable specific minmax functionality
###
### * Return: vXi8 or vXi16 depending on return data size and format
###
    "va_minmax_filter" : { "result" : "anyint",
                           "arguments" : ["int","int","float","float","int","int","int"],
                           "attributes" : "ReadMem"
                         },

### ``llvm.genx.va.hdc.minmax.filter`` vISA HDC MinMax Filter instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: i32 surface index
### * arg2: float normalized x-coordinate
### * arg3: float normalized y-coordinate
### * arg4: i32 return data format, constant
### * arg5: i32 enable the specific minmax functionality, constant
### * arg6: i32 destination surface index
### * arg7: i16 destination surface x-offset
### * arg8: i16 destination surface y-offset
###
    "va_hdc_minmax_filter" : { "result" : "void",
                               "arguments" : ["int","int","float","float","int","int","int","short","short"],
                               "attributes" : "None"
                             },

### ``llvm.genx.va.bool.centroid.<return type>`` vISA Boolean Centroid instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: float normalized x-coordinate
### * arg2: float normalized y-coordinate
### * arg3: i8 vertical size
### * arg4: i8 horizontal size
###
### * Return: v16i8 or v16i16 depending on surface format
###
    "va_bool_centroid" : { "result" : "anyint",
                           "arguments" : ["int","float","float","char","char"],
                           "attributes" : "ReadMem"
                         },

### ``llvm.genx.va.centroid.<return type>`` vISA Centroid instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: float normalized x-coordinate
### * arg2: float normalized y-coordinate
### * arg3: i8 vertical size
###
### * Return: v32i32
###
    "va_centroid" : { "result" : "anyint",
                      "arguments" : ["int","float","float","char"],
                      "attributes" : "ReadMem"
                    },

### ``llvm.genx.va.1d.convolve.horizontal.<return type>`` vISA 1d convolve horizontal instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: i32 surface index
### * arg2: float normalized x-coordinate
### * arg3: float normalized y-coordinate
### * arg4: i32 mode, constant
###
### * Return: v16i16 or v64i16 depending on mode
###
    "va_1d_convolve_horizontal" : { "result" : "anyint",
                                    "arguments" : ["int","int","float","float","int"],
                                    "attributes" : "ReadMem"
                                  },

### ``llvm.genx.va.hdc.1d.convolve.horizontal`` vISA HDC 1d convolve horizontal instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: i32 surface index
### * arg2: float normalized x-coordinate
### * arg3: float normalized y-coordinate
### * arg4: i32 pixel size, constant
### * arg5: i32 destination surface index
### * arg6: i16 destination surface x-offset
### * arg7: i16 destination surface y-offset
###
    "va_hdc_1d_convolve_horizontal" : { "result" : "void",
                                        "arguments" : ["int","int","float","float","int","int","short","short"],
                                        "attributes" : "None"
                                      },

### ``llvm.genx.va.1d.convolve.vertical.<return type>`` vISA 1d convolve vertical instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: i32 surface index
### * arg2: float normalized x-coordinate
### * arg3: float normalized y-coordinate
### * arg4: i32 mode, constant
###
### * Return: v16i16 or v64i16 depending on mode
###
    "va_1d_convolve_vertical" : { "result" : "anyint",
                                  "arguments" : ["int","int","float","float","int"],
                                  "attributes" : "ReadMem"
                                },

### ``llvm.genx.va.hdc.1d.convolve.vertical`` vISA HDC 1d convolve vertical instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: i32 surface index
### * arg2: float normalized x-coordinate
### * arg3: float normalized y-coordinate
### * arg4: i32 pixel size, constant
### * arg5: i32 destination surface index
### * arg6: i16 destination surface x-offset
### * arg7: i16 destination surface y-offset
###
    "va_hdc_1d_convolve_vertical" : { "result" : "void",
                                      "arguments" : ["int","int","float","float","int","int","short","short"],
                                      "attributes" : "None"
                                    },

### ``llvm.genx.va.1pixel.convolve.<return type>.<any int>`` vISA 1 Pixel Convolve instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: i32 surface index
### * arg2: float normalized x-coordinate
### * arg3: float normalized y-coordinate
### * arg4: i32 mode, constant
### * arg5: v32i16 offsets (overloaded)
###
### * Return: v64i16 or v16i16 depending on mode.
###
    "va_1pixel_convolve" : { "result" : "anyint",
                             "arguments" : ["int","int","float","float","int","anyint"],
                             "attributes" : "ReadMem"
                           },

### ``llvm.genx.va.hdc.1pixel.convolve`` vISA HDC 1 Pixel Convolve instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: i32 surface index
### * arg2: float normalized x-coordinate
### * arg3: float normalized y-coordinate
### * arg4: i32 pixel size, constant
### * arg5: v32i16 offsets
### * arg6: i32 destination surface index
### * arg7: i16 destination surface x-offset
### * arg8: i16 destination surface y-offset
###
    "va_hdc_1pixel_convolve" : { "result" : "void",
                                 "arguments" : ["int","int","float","float","int","anyint","int","short","short"],
                                 "attributes" : "None"
                               },

### ``llvm.genx.va.1pixel.convolve.1x1mode.<return type>`` vISA 1 Pixel Convolve (1x1 mode) instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 sampler index
### * arg1: i32 surface index
### * arg2: float normalized x-coordinate
### * arg3: float normalized y-coordinate
###
### * Return: v64i16 or v16i16 depending on mode.
###
    "va_1pixel_convolve_1x1mode" : { "result" : "anyint",
                                     "arguments" : ["int","int","float","float"],
                                     "attributes" : "ReadMem"
                                   },

### ``llvm.genx.va.lbp.creation.<return type>`` vISA LBP Creation instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 surface index
### * arg1: float normalized x-coordinate
### * arg2: float normalized y-coordinate
### * arg3: mode, constant
###
### * Return: v64i8 or v128i8 depending on mode
###
    "va_lbp_creation" : { "result" : "anyint",
                          "arguments" : ["int","float","float","int"],
                          "attributes" : "ReadMem"
                        },

### ``llvm.genx.va.hdc.lbp.creation`` vISA HDC LBP Creation instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 surface index
### * arg1: float normalized x-coordinate
### * arg2: float normalized y-coordinate
### * arg3: mode, constant
### * arg4: i32 destination surface index
### * arg5: i16 destination surface x-offset
### * arg6: i16 destination surface y-offset
###
    "va_hdc_lbp_creation" : { "result" : "void",
                              "arguments" : ["int","float","float","int","int","short","short"],
                              "attributes" : "None"
                            },

### ``llvm.genx.va.lbp.correlation.<return type>`` vISA LBP Correlation instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 surface index
### * arg1: float normalized x-coordinate
### * arg2: float normalized y-coordinate
### * arg3: i16 horizontal disparity
###
### * Return: v64i8
###
    "va_lbp_correlation" : { "result" : "anyint",
                             "arguments" : ["int","float","float","short"],
                             "attributes" : "ReadMem"
                           },

### ``llvm.genx.va.hdc.lbp.correlation`` vISA HDC LBP Correlation instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 surface index
### * arg1: float normalized x-coordinate
### * arg2: float normalized y-coordinate
### * arg3: i16 horizontal disparity
### * arg4: i32 destination surface index
### * arg5: i16 destination surface x-offset
### * arg6: i16 destination surface y-offset
###
    "va_hdc_lbp_correlation" : { "result" : "void",
                                 "arguments" : ["int","float","float","short","int","short","short"],
                                 "attributes" : "None"
                               },

### ``llvm.genx.va.correlation.search.<return type>`` vISA Correlation Search instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i32 surface index
### * arg1: float normalized x-coordinate
### * arg2: float normalized y-coordinate
### * arg3: float normalized vertical origin
### * arg4: float normalized horizontal origin
### * arg5: i8 x-direction size
### * arg6: i8 y-direction size
### * arg7: i8 x-direction search size
### * arg8: i8 y-direction search size
###
### * Return: vXi32
###
    "va_correlation_search" : { "result" : "anyint",
                                "arguments" : ["int","float","float","float","float","char","char","char","char"],
                                "attributes" : "ReadMem"
                              },

### ``llvm.genx.va.flood.fill.<return type>.<any int>`` vISA Flood Fill instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: i8 Is8Connect, constant (valid values 0 or 1).
### * arg1: v10i16 pixel mask horizontal direction (overloaded)
### * arg2: i16 pixel mask vertical direction left
### * arg3: i16 pixel mask vertical direction right
### * arg4: i16 loop count
###
### * Return: v8i16
###
    "va_flood_fill" : { "result" : "anyint",
                        "arguments" : ["char","anyint","short","short","short"],
                        "attributes" : "ReadMem"
                      },

##--------------------------------------------------------------------
### CM codegen internal intrinsics
### ------------------------------

### ``llvm.genx.simdcf.predicate.<return type>`` : simd cf predication marker intrinsic.
###
### * arg0: vector with any element type
### * arg1: vector constant, same size as arg0
###
### * Return value: a vector composed of elements selected from arg0 or arg1
###   according to the implied SIMD CF predication mask.
###
### This is generated by clang codegen in the implementation of the CM
### reduction functions (cm_sum etc) whose behavior is sensitive to the
### surrounding SIMD CF context. It is lowered by the CMSimdCFLowering pass.
###
    "simdcf_predicate" : { "result" : "anyvector",
                           "arguments" : [0,0],
                           "attributes" : "None"
                         },

### llvm.genx.simdcf.any.<vector type> : simd cf marker intrinsic.
###
### arg0: vector of i1 (overloaded)
###
### Return value: i1 value as condition for a scalar control flow.
###
### This intrinsic is used to mark a simd cf that takes a predicate vector and
### returns a scalar value for scalar cf.
###
### This is generated by clang codegen in the implementation of SIMD control
### flow, and lowered by the CMSimdCFLowering pass.
###
    "simdcf_any" : { "result" : "bool",
                     "arguments" : ["anyvector"],
                     "attributes" : "None"
                   },

### ``llvm.genx.unmask.begin`` : simd-unmask region begin
###
### * Return value:  i32 old execution mask
###
### This intrinsic is used by the front-end to mark the beginning of
### an unmask region. It sets the execution mask to all-active and returns
### the old mask in a temp.
### This intrinsic will be replaced by genx.simdcf.unmask by SimdCFLowering.
###
    "unmask_begin" : { "result" : "int",
                       "arguments" : [],
                       "attributes" : "WriteMem,SideEffects"
                     },

### ``llvm.genx.unmask.end`` : simd-unmask region end
###
### * arg0: temp to restore the execution-mask (1 dword)
###
### * Return value:  void
###
### This intrinsic is used by the front-end to mark the end of an unmask
### region. It sets the execution mask back using the temp value from
### unmask-begin.
### This intrinsic will be replaced by genx.simdcf.remask by SimdCFLowering.
###
    "unmask_end" : { "result" : "void",
                     "arguments" : ["int"],
                     "attributes" : "WriteMem,SideEffects"
                   },

### ``llvm.genx.lane.id`` : implicit lane-id in the simd-fork statement
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.lane.id`` : read implicit lane_id
###
### * Return value:  i32
###
    "lane_id" : { "result" : "int",
                  "arguments" : [],
                  "attributes" : "NoMem"
                },

### ``llvm.genx.local.*.<return type>`` : read local ID register
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### * ``llvm.genx.local.id`` : read implicit arg local_id
### * ``llvm.genx.local.id16`` : read implicit arg local_id16
### * ``llvm.genx.local.size`` : read implicit arg local_size
###
### * Return value:  v3i32 - allows for x, y and z components
###                  v3i16   local ids in 16 bits
###
### This is generated by clang codegen and lowered by CMImpParam.
###
    "local_id" : { "result" : "anyvector",
                   "arguments" : [],
                   "attributes" : "NoMem"
                 },
    "local_id16" : { "result" : "anyvector",
                     "arguments" : [],
                     "attributes" : "NoMem"
                   },
    "local_size" : { "result" : "anyvector",
                     "arguments" : [],
                     "attributes" : "NoMem"
                   },

### ``llvm.genx.group.count.<return type>`` : read group count register
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### ``llvm.genx.group.count`` : read vISA v9 (%group_count_x)
###
### * Return value:  3xi32 the value read (allows for x, y and z components)
###
### This is generated by clang codegen and lowered by CMImpParam.
###
    "group_count" : { "result" : "anyvector",
                      "arguments" : [],
                      "attributes" : "NoMem"
                    },

### ``llvm.genx.get.scoreboard.bti`` : get scoreboard surface implicit
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### Return Value: i32 the surfaceindex of the scoreboard bti
###
### This is generated by clang codegen and lowered by CMImpParam.
###
    "get_scoreboard_bti" : { "result" : "int",
                             "arguments" : [],
                             "attributes" : "NoMem"
                           },

### ``llvm.genx.get.scoreboard.deltas`` : get scoreboard deltas
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### Return Value: vector of 16 i8 values (8 x and 8 y)
###
### This is generated by clang codegen and lowered by CMImpParam.
###
    "get_scoreboard_deltas" : { "result" : "char16",
                                "arguments" : [],
                                "attributes" : "NoMem"
                              },

### ``llvm.genx.get.scoreboard.depcnt`` : get the maximal scoreboard dependency count
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### Return Value: i32
###
### This is generated by clang codegen and lowered by CMImpParam.
###
    "get_scoreboard_depcnt" : { "result" : "int",
                                "arguments" : [],
                                "attributes" : "NoMem"
                              },

### ``llvm.genx.predefined.surface`` : get predefined surface
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### Return Value: surface index of the specified id.
###
### This is generated by clang codegen when predefined surface is accessed.
###
    "predefined_surface" : { "result" : "int",
                             "arguments" : ["int"],
                             "attributes" : "NoMem"
                           },

##--------------------------------------------------------------------
### GenX backend internal intrinsics
### --------------------------------


### llvm.genx.constanti.<return type> : copy constant to register
### llvm.genx.constantf.<return type> : copy constant to register
###
### arg0: input value (constant, any scalar or vector type other than i1 or
###         vector of i1)
###
### Return value: same type
###
### This intrinsic is inserted by the GenXLowering pass
### to load a constant in a way that stops the subsequent CSE pass
### from propagating it back into the operand using it.
###
### There are two variants simply because there is no way of saying here
### that an argument can have any scalar or vector type.
###
    "constanti" : { "result" : "anyint",
                    "arguments" : [0],
                    "attributes" : "NoMem"
                  },
    "constantf" : { "result" : "anyfloat",
                    "arguments" : [0],
                    "attributes" : "NoMem"
                  },

### llvm.genx.convert.<return type> : convert register category (non address)
###
### arg0: input value (i32 or vector of i32)
###
### Return value: converted value (same type)
###
### This intrinsic is inserted by the GenXCategory pass to represent
### a value being converted between two register categories. The input and
### result categories are not represented; they are implied by the other
### def/uses of the value. Address conversion is not covered by this
### intrinsic.
###
### The intrinsic is also inserted by GenXCoalescing to represent a copy
### of a value of category other than general. Thus the input and output
### might be both the same category, but not both general.
###
    "convert" : { "result" : "anyint",
                  "arguments" : [0],
                  "attributes" : "NoMem"
                },

### llvm.genx.convert.addr.<return type> : convert to address register category
###
### arg0: input value (i16 or vector of i16)
### arg1: constant offset (i16)
###
### Return value: converted value (same type)
###
### This intrinsic is inserted by the GenXCategoryConversion pass to represent
### a value being converted from a general value to an address, used as the
### variable index in an element or region access. There it is created with
### offset set to 0; GenXAddressCommoning may adjust that offset to try and
### stop the address conversion falling outside of the register into which it
### points to avoid going out of spec (bug 4395).
###
    "convert_addr" : { "result" : "anyint",
                       "arguments" : [0,"short"],
                       "attributes" : "NoMem"
                     },

### llvm.genx.constantpred.<return type> : load constant predicate (i1 or vector of i1)
###
### arg0: constant i1 or vector of i1
###
### Return value: loaded value, same type
###
### This intrinsic is inserted by GenXLowering to load a predicate constant.
### We could just use a bitcast, except that EarlyCSE follows
### GenXConstantMaterialization and it has a habit of putting the constant
### back in the wrregion.
    "constantpred" : { "result" : "anyint",
                       "arguments" : [0],
                       "attributes" : "NoMem"
                     },

### llvm.genx.add.addr.<return type>.<any int> : add an offset onto an address register
###
### arg0: lhs input (i16 or vector of i16) (overloaded)
### arg1: rhs input (i16 or vector of i16)
###
### Return value: result of add (same type as arg1)
###
### When the result of a constant add/sub is used as a variable index in
### a region access, GenXCategoryConversion converts it into this intrinsic
### so that it will be considered an add to an address register.
###
    "add_addr" : { "result" : "anyint",
                   "arguments" : ["anyint",0],
                   "attributes" : "NoMem"
                 },

### llvm.genx.rdpredregion.<return type>.<any int> : read region at specified offset from a predicate
###
### arg0: i1 vector (overloaded)
### arg1: constant i32 offset (in elements)
###
### Return value: v4i1/v8i1/v16i1 result of region read
###
### The number of elements to read is determined from the number of elements
### in the return type, and must be 4, 8 or 16.
### The offset must be a multiple of the number of elements.
###
    "rdpredregion" : { "result" : "anyint",
                       "arguments" : ["anyint","int"],
                       "attributes" : "NoMem"
                     },

### llvm.genx.wrpredregion.<return type>.<any int> : write region at specified offset into a predicate
###
### arg0: i1 old value of vector
### arg1: i1 subvector to write into region (overloaded)
### arg2: constant i32 offset (in elements)
###
### Return value: v4i1/v8i1/v16i1 result of region write
###
### The number of elements to write is determined from the number of elements
### in the "subvector to write" arg, and must be 4, 8 or 16.
### The offset must be a multiple of the number of elements.
###
    "wrpredregion" : { "result" : "anyint",
                       "arguments" : [0,"anyint","int"],
                       "attributes" : "NoMem"
                     },

### llvm.genx.wrpredpredregion.<return type>.<any int> : predicated write region at specified offset
### into a predicate
###
### arg0: vXi1 old value of vector
### arg1: vYi1 subvector to write into region (overloaded)
### arg2: constant i32 offset (in elements)
### arg3: vXi1 predicate
###
### Return value: vXi1 result of region write
###
### The number of elements to write is determined from the number of elements
### in the "subvector to write" arg, and must be 4, 8 or 16.
### The offset must be a multiple of the number of elements.
###
### The constant offset indexes both the vector itself and the predicate. This
### intrinsic is valid only if the predicate is an EM value, and the subvector
### operand is the result of a cmp (which is then baled in).
###
    "wrpredpredregion" : { "result" : "anyint",
                           "arguments" : [0,"anyint","int",0],
                           "attributes" : "NoMem"
                         },

### ``llvm.genx.wrconstregion.<return type>.<vector type>.<any int>.<any int>`` : write a constant region
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: vector to write region in to
### * arg1: subvector to write into the region, constant  (overloaded)
### * arg2: i32 vstride in elements, constant
### * arg3: i32 width in elements, constant
### * arg4: i32 stride in elements, constant
### * arg5: i16 or vXi16 offset in bytes, constant  (overloaded)
### * arg6: i32 parent width, constant, ignored
### * arg7: constant scalar i1 predicate value 1  (overloaded)
###
### * Return value: the updated vector with the region modified
###
### This is the same as llvm.genx.wrregion, but with the following restrictions:
###
### * the subvector to write is constant;
### * the offset is constant;
### * the predicate is 1.
###
### It is used by GenXConstants when inserting code to load a constant, and
### specifically does not participate in simplification or constant
### propagation so we do not lose that constant loading code.
###
### The operands are the same as llvm.genx.wrregion so it can mostly be handled
### by the same code as llvm.genx.wrregion.
###
    "wrconstregion" : { "result" : "anyvector",
                        "arguments" : [0,"anyvector","int","int","int","anyint","int","anyint"],
                        "attributes" : "NoMem"
                      },

### ``llvm.genx.output`` : Mark output arguments
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * Return value: void
###
### This implementation intrinsic is to mark a list of output arguments.
### This intrinsic call only extends the live range of marked arguments and
### emits no code.
###
    "output" : { "result" : "void",
                 "arguments" : ["vararg"],
                 "attributes" : "None"
               },

### ``llvm.genx.output.1.<any type>`` : Mark output argument
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * Return value: void
###
### SPIRV does not support functions with variable-length argument number,
### so output_1 is the single-argument analog of output.
### This implementation intrinsic is to mark output argument.
### This intrinsic call only extends the live range of marked argument and
### emits no code.
###
    "output_1" : { "result" : "void",
                   "arguments" : ["any"],
                   "attributes" : "None"
                 },

## ``llvm.genx.print.buffer`` : read stateless pointer to print buffer
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
## ``llvm.genx.print.buffer`` : read implicit arg print buffer ptr
##
## * Return value: i64 address of print buffer
##
## This is generated by clang codegen and lowered by CMImpParam.
##
    "print_buffer" : { "result" : "long",
                       "arguments" : [],
                       "attributes" : "None"
                     },

## ``llvm.genx.print.format.index`` : add printf format string to collection
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
## ``llvm.genx.print.format.index`` :  return index of printf format string
##
## * arg0: pointer for printf format string
##
## * Return value: i32 index of the printf format string
##
    "print_format_index" : { "result" : "int",
                             "arguments" : ["anyptr"],
                             "attributes" : "NoMem"
                           },

## ``llvm.genx.address.convert`` : convert dataport address to integer
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
##
## * arg0: pointer kernel argument (svmptr_t or state pointer like image)
##
## * Return value: i32/i64, surface/sampler index or stateless address
##
## Intrinsic is used as a temporary SPIRV instruction to convert
## distinct address arguments into simple format (i32/i64) that is
## used across all memory instructions.  This is needed to encode
## SPIRV with appropriate types for kernel arguments.
##
    "address_convert" : { "result" : "anyint",
                          "arguments" : ["anyptr"],
                          "attributes" : "NoMem"
                        },

## ``llvm.genx.gaddr`` : take an address of a global variable
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
##
## The semantics of this intrinsic is equal to ptrtoint instruction.
## Only global variable can be an argument of this intrinsic.
##
## * arg0: global variable
##
## * Return value: i64/i32 (depending on data layout) value of pointer
##
    "gaddr" : { "result" : "anyint",
                "arguments" :  ["anyptr"],
                "attributes" :  "NoMem"
              },

## ``llvm.genx.jump.table`` : CMC internal, no VISA
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
##
## * arg0: integer BasicBlock index in the full set of destinations
## * arg1-N: the full set of switch labels
##
## * Return value: selected label
##
## The intrinsic is a helper for switch jump tables generation. Arg0
## will be used by visa switchjmp as index. Return value and arg1-N are
## used to make ir semantically legal.
##
    "jump_table" : { "result" : "anyptr",
                     "arguments" :  ["anyint", "vararg"],
                     "attributes" :  "NoMem"
                   },

## ``llvm.genx.write.predef.surface`` : write predefined surface variable
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
##
## * arg0: ptr predefined surface variable
## * arg1: i32 value to write
##
## This corresponds to MOVS visa instruction and utilizes technique of using
## global variable in LLVM IR for predefined surfaces.
##
    "write_predef_surface" : { "result": "void",
                               "arguments" : ["anyptr", "int"],
                               "attributes" : "WriteMem",
                             },

## Internal VC memory intrinsics.
## These versions are supposed to use predefined visa variables like %bss.
## Intrinsics are supposed to be internal to VC backend.

## ``llvm.genx.dword.atomic2.*.predef.surface`` : dword atomic with binary operator with predefined surface
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
## * ``llvm.genx.dword.atomic2.add.predef.surface`` : vISA DWORD_ATOMIC ADD instruction
## * ``llvm.genx.dword.atomic2.sub.predef.surface`` : vISA DWORD_ATOMIC SUB instruction
## * ``llvm.genx.dword.atomic2.min.predef.surface`` : vISA DWORD_ATOMIC MIN instruction
## * ``llvm.genx.dword.atomic2.max.predef.surface`` : vISA DWORD_ATOMIC MAX instruction
## * ``llvm.genx.dword.atomic2.xchg.predef.surface`` : vISA DWORD_ATOMIC XCHG instruction
## * ``llvm.genx.dword.atomic2.and.predef.surface`` : vISA DWORD_ATOMIC AND instruction
## * ``llvm.genx.dword.atomic2.or.predef.surface`` : vISA DWORD_ATOMIC OR instruction
## * ``llvm.genx.dword.atomic2.xor.predef.surface`` : vISA DWORD_ATOMIC XOR instruction
## * ``llvm.genx.dword.atomic2.imin.predef.surface`` : vISA DWORD_ATOMIC IMIN instruction
## * ``llvm.genx.dword.atomic2.imax.predef.surface`` : vISA DWORD_ATOMIC IMAX instruction
##
## * (Exec_size inferred from element offset type)
## * arg0: vXi1 predicate (overloaded)
## * arg1: ptr predefined surface (overloaded)
## * arg2: vXi32 element offset in bytes (overloaded)
## * arg3: vXi32 src
##
## * Return value: vXi32 the old value read
##
## Predicate, element offset, src, and the return value must all have the
## same vector width, which must be 1, 8 or 16.
##
    "dword_atomic2_add_predef_surface" : { "result" : "anyvector",
                                           "arguments" : ["anyvector","anyptr","anyint",0],
                                           "attributes" : "None",
                                         },
    "dword_atomic2_sub_predef_surface" : { "result" : "anyvector",
                                           "arguments" : ["anyvector","anyptr","anyint",0],
                                           "attributes" : "None",
                                         },
    "dword_atomic2_min_predef_surface" : { "result" : "anyvector",
                                           "arguments" : ["anyvector","anyptr","anyint",0],
                                           "attributes" : "None",
                                         },
    "dword_atomic2_max_predef_surface" : { "result" : "anyvector",
                                           "arguments" : ["anyvector","anyptr","anyint",0],
                                           "attributes" : "None",
                                         },
    "dword_atomic2_xchg_predef_surface" : { "result" : "anyvector",
                                            "arguments" : ["anyvector","anyptr","anyint",0],
                                            "attributes" : "None",
                                          },
    "dword_atomic2_and_predef_surface" : { "result" : "anyvector",
                                           "arguments" : ["anyvector","anyptr","anyint",0],
                                           "attributes" : "None",
                                         },
    "dword_atomic2_or_predef_surface" : { "result" : "anyvector",
                                          "arguments" : ["anyvector","anyptr","anyint",0],
                                          "attributes" : "None",
                                        },
    "dword_atomic2_xor_predef_surface" : { "result" : "anyvector",
                                           "arguments" : ["anyvector","anyptr","anyint",0],
                                           "attributes" : "None",
                                         },
    "dword_atomic2_imin_predef_surface" : { "result" : "anyvector",
                                            "arguments" : ["anyvector","anyptr","anyint",0],
                                            "attributes" : "None",
                                          },
    "dword_atomic2_imax_predef_surface" : { "result" : "anyvector",
                                            "arguments" : ["anyvector","anyptr","anyint",0],
                                            "attributes" : "None",
                                          },

## ``llvm.genx.dword.atomic2.*.predef.surface`` : dword atomic with fmin/fmax operation with predefined surface
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
## * ``llvm.genx.dword.atomic2.fmin.predef.surface`` : vISA DWORD_ATOMIC FMIN instruction
## * ``llvm.genx.dword.atomic2.fmax.predef.surface`` : vISA DWORD_ATOMIC FMAX instruction
## * ``llvm.genx.dword.atomic2.fadd.predef.surface`` : vISA DWORD_ATOMIC FADD instruction
## * ``llvm.genx.dword.atomic2.fsub.predef.surface`` : vISA DWORD_ATOMIC FSUB instruction
##
## * (Exec_size inferred from element offset type)
## * arg0: vXi1 predicate (overloaded)
## * arg1: ptr predefined surface (overloaded)
## * arg2: vXi32 element offset in bytes (overloaded)
## * arg3: vXfloat src
##
## * Return value: vXfloat the old value read
##
## Predicate, element offset, src, and the return value must all have the
## same vector width, which must be 1, 8 or 16.
##
    "dword_atomic2_fmin_predef_surface" : { "result" : "anyvector",
                                            "arguments" : ["anyvector","anyptr","anyint",0],
                                            "attributes" : "None",
                                          },
    "dword_atomic2_fmax_predef_surface" : { "result" : "anyvector",
                                            "arguments" : ["anyvector","anyptr","anyint",0],
                                            "attributes" : "None",
                                          },
    "dword_atomic2_fadd_predef_surface" : { "result" : "anyvector",
                                            "arguments" : ["anyvector","anyptr","anyint",0],
                                            "attributes" : "None",
                                          },
    "dword_atomic2_fsub_predef_surface" : { "result" : "anyvector",
                                            "arguments" : ["anyvector","anyptr","anyint",0],
                                            "attributes" : "None",
                                          },

## ``llvm.genx.dword.atomic2.*.predef.surface`` : dword atomic with inc/dec operation with predefined surface
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
## * ``llvm.genx.dword.atomic2.inc.predef.surface`` : vISA DWORD_ATOMIC INC instruction
## * ``llvm.genx.dword.atomic2.dec.predef.surface`` : vISA DWORD_ATOMIC DEC instruction
##
## * (Exec_size inferred from element offset type)
## * arg0: vXi1 predicate (overloaded)
## * arg1: ptr predefined surface (overloaded)
## * arg2: vXi32 element offset in bytes (overloaded)
##
## * Return value: vXi32 the old value read
##
## Predicate, element offset, and the return value must all have the
## same vector width, which must be 1, 8 or 16.
##
    "dword_atomic2_inc_predef_surface" : { "result" : "anyvector",
                                           "arguments" : ["anyvector","anyptr","anyint"],
                                           "attributes" : "None",
                                         },
    "dword_atomic2_dec_predef_surface" : { "result" : "anyvector",
                                           "arguments" : ["anyvector","anyptr","anyint"],
                                           "attributes" : "None",
                                         },

## ``llvm.genx.dword.atomic2.cmpxchg.predef.surface`` : vISA DWORD_ATOMIC CMPXCHG instruction with predefined surface
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
##
## * (Exec_size inferred from element offset type)
## * arg0: vXi1 predicate (overloaded)
## * arg1: ptr predefined surface (overloaded)
## * arg2: vXi32 element offset in bytes (overloaded)
## * arg3: vXi32 src0
## * arg4: vXi32 src1
##
## * Return value: vXi32 the old value read
##
## Predicate, element offset, src0, src1, and the return value must all have
## the same vector width, which must be 1, 8 or 16.
##
    "dword_atomic2_cmpxchg_predef_surface" : { "result" : "anyvector",
                                               "arguments" : ["anyvector","anyptr","anyint",0,0],
                                               "attributes" : "None",
                                             },

## ``llvm.genx.dword.atomic2.fcmpwr.predef.surface`` : vISA DWORD_ATOMIC FCMPWR instruction with predefined surface
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
##
## * (Exec_size inferred from element offset type)
## * arg0: vXi1 predicate (overloaded)
## * arg1: ptr predefined surface (overloaded)
## * arg2: vXi32 element offset in bytes (overloaded)
## * arg3: vXfloat src0
## * arg4: vXfloat src1
##
## * Return value: vXfloat the old value read
##
## Predicate, element offset, src0, src1, and the return value must all have
## the same vector width, which must be 1, 8 or 16.
##
    "dword_atomic2_fcmpwr_predef_surface" : { "result" : "anyvector",
                                              "arguments" : ["anyvector","anyptr","anyint",0,0],
                                              "attributes" : "None",
                                            },

## ``llvm.genx.gather.masked.scaled2.predef.surface`` : vISA GATHER_SCALED instruction with predefined surface
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
##
## * (Exec_size inferred from element offset type)
## * arg0: i32 log2 num blocks, constant (0/1/2 for num blocks 1/2/4)
## * arg1: i16 scale, constant
## * arg2: ptr predefined surface (overloaded)
## * arg3: i32 global offset in bytes
## * arg4: vXi32 element offset in bytes (overloaded)
## * arg5: vXi1 predicate (overloaded)
##
## * Return value: vXi32/float the data read
##
    "gather_masked_scaled2_predef_surface" : { "result" : "anyvector",
                                               "arguments" : ["int","short","anyptr","int","anyint","anyvector"],
                                               "attributes" : "ReadMem",
                                             },

## ``llvm.genx.gather4.masked.scaled2.predef.surface`` : vISA GATHER4_SCALED instruction with predefined surface
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
##
## * (Exec_size inferred from element offset type)
## * arg0: i32 channel mask, constant
## * arg1: i16 scale, constant
## * arg2: ptr predefined surface (overloaded)
## * arg3: i32 global offset in bytes
## * arg4: vXi32 element offset in bytes (overloaded)
## * arg5: vXi1 predicate (overloaded)
##
## * Return value: vXi32/float the data read
##
    "gather4_masked_scaled2_predef_surface" : { "result" : "anyvector",
                                                "arguments" : ["int","short","anyptr","int","anyint","anyvector"],
                                                "attributes" : "ReadMem",
                                              },

## ``llvm.genx.scatter.scaled.predef.surface`` : vISA SCATTER_SCALED instruction with predefined surface
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
##
## * (Exec_size inferred from element offset type)
## * arg0: vXi1 predicate (overloaded)
## * arg1: i32 log2 num blocks, constant (0/1/2 for num blocks 1/2/4)
## * arg2: i16 scale, constant
## * arg3: ptr predefined surface (overloaded)
## * arg4: i32 global offset in bytes
## * arg5: vXi32 element offset (overloaded)
## * arg6: data to write (overloaded)
##
## The vector width of the element offset arg is the number of elements to
## write, which must be power of 2 and less than or equal to 32.
##
## The predicate arg must have the same vector width.
##
## The data type to write must have UD, D or F type. For 1 and 2 byte (1 x num
## blocks) accesses the upper bytes will be ignored.
##
    "scatter_scaled_predef_surface" : { "result" : "void",
                                        "arguments" : ["anyvector","int","short","anyptr","int","anyint","anyvector"],
                                        "attributes" : "None",
                                      },

## ``llvm.genx.scatter4.scaled.predef.surface`` : vISA SCATTER4_SCALED instruction with predefined surface
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
##
## * (Exec_size inferred from element offset type)
## * arg0: vXi1 predicate (overloaded)
## * arg1: i32 channel mask, constant
## * arg2: i16 scale, constant
## * arg3: ptr predefined surface (overloaded)
## * arg4: i32 global offset in bytes
## * arg5: vXi32 element offset in bytes (overloaded)
## * arg6: data to write (overloaded)
##
## The vector width of the element offset arg is the number of elements to
## write, which must be 8 or 16.
## The predicate arg must have the same vector width.
## The instruction writes up to 4 channels per element, with the lowest 4
## bits of the channel mask arg giving the mask of channels _not_ to write.
## The number of 0 bits in that lower 4 bits of the channel mask arg is the
## number of channels to write per element.
## The channels to write must be contiguous and starting at channel 0.
## The vector width of the data to write must be the number of elements
## times the number of channels to write per element.
## The element type of the data to write must be i32 or float.
##
    "scatter4_scaled_predef_surface" : { "result" : "void",
                                         "arguments" : ["anyvector","int","short","anyptr","int","anyint","anyvector"],
                                         "attributes" : "None",
                                       },

## ``llvm.genx.oword.ld*.predef.surface`` : oword load instruction with predefined surface
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
## * ``llvm.genx.oword.ld.predef.surface`` : vISA OWORD_LD instruction
## * ``llvm.genx.oword.ld.unaligned.predef.surface`` : vISA OWORD_LD_UNALIGNED instruction
##
## * (log2 number of owords inferred from return type)
## * arg0: i32 is_modified, constant
## * arg1: ptr predefined surface variable (overloaded)
## * arg2: i32 offset (in owords for .ld / in bytes for .ld.unaligned)
##
## * Return value: vXiN the data read.
##
## The byte size of the return type must be 16, 32, 64, or 128.
##
    "oword_ld_predef_surface" : { "result" : "anyvector",
                                  "arguments" : ["int", "anyptr", "int"],
                                  "attributes": "ReadMem",
                                },

    "oword_ld_unaligned_predef_surface" : { "result" : "anyvector",
                                            "arguments": ["int", "anyptr", "int"],
                                            "attributes" : "ReadMem",
                                          },

## ``llvm.genx.oword.st.predef.surface`` : vISA OWORD_ST instruction with predefined surface
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
##
## * (log2 number of owords inferred from the type of the data to write)
## * arg0: ptr predefined surface variable (overloaded)
## * arg1: i32 offset (in owords)
## * arg2: data to write (overloaded)
##
## The byte size of the data to write must be 16, 32, 64, or 128.
##
    "oword_st_predef_surface" : { "result" : "void",
                                  "arguments" : ["anyptr", "int", "anyvector"],
                                  "attributes" : "None",
                                },


## ``llvm.genx.*madw.<return type>.<any int>`` : madw instruction, no saturation
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
## * ``llvm.genx.smadw`` : result signed
## * ``llvm.genx.umadw`` : result unsigned
##
## result := arg0 * arg1 + arg2
##
## * Return value: result, the full 64-bit result of multiplying two 32-bit
##                 integers and adding a 32-bit integer (32b*32b+32b->64b).
##                 The low 32b of the result are stored in the lower GRF and
##                 the high 32b of the result are stored in the high GRF.
##
##                 Return width must be 2*GRF/sizeof(i32)
##                 Args width must be no more than GRF/sizeof(i32) and must be a power of two
##
## * arg0: first input, same element type as result
## * arg1: second input, same type as arg0
## * arg2: third input, same type as arg0
##
    "umadw" : { "result" : "anyint",
                "arguments" : ["anyint", 1, 1],
                "attributes" : "NoMem"
              },
    "smadw" : { "result" : "anyint",
                "arguments" : ["anyint", 1, 1],
                "attributes" : "NoMem"
              },

### ``llvm.genx.slm.init`` : slm_init instruction
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
###
### * arg0: slm size, i32 scalar integer type
###
    "slm_init" : { "result" : "void",
                   "arguments" : ["int"],
                   "attributes" : "None"
                 },
}
