libsimdpp  0.9.3
simdpp Namespace Reference

Namespaces

namespace  altivec
 
namespace  neon
 
namespace  null
 
namespace  sse
 

Classes

class  Arch
 Identifies supported instruction set. More...
 
class  float32x4
 Class representing float32x4 vector. More...
 
class  mask_float32x4
 Class representing a mask for 4x 32-bit floating-point vector. More...
 
class  float32x8
 
class  mask_float32x8
 Class representing a mask for 8x 32-bit floating-point vector. More...
 
class  float64x2
 
class  mask_float64x2
 Class representing a mask for 2x 64-bit floating-point vector. More...
 
class  float64x4
 
class  mask_float64x4
 Class representing a mask for 4x 64-bit floating-point vector. More...
 
class  int128
 Base class for all 128-bit integer objects. More...
 
class  basic_int16x16
 Generic class representing 16x 16-bit integer vector. More...
 
class  int16x16
 Class representing 16x 16-bit signed integer vector. More...
 
class  uint16x16
 Class representing 16x 16-bit unsigned integer vector. More...
 
class  mask_int16x16
 Class representing a mask for 16x 16-bit integer vector. More...
 
class  basic_int16x8
 Generic class representing 8x 16-bit integer vector. More...
 
class  int16x8
 Class representing 8x 16-bit signed integer vector. More...
 
class  uint16x8
 Class representing 8x 16-bit unsigned integer vector. More...
 
class  mask_int16x8
 Class representing mask for 8x 16-bit integer vector. More...
 
class  int256
 Base class for all 256-bit integer objects. More...
 
class  basic_int32x4
 Generic class representing 4x 32-bit integer vector. More...
 
class  int32x4
 Class representing 4x 32-bit signed integer vector. More...
 
class  uint32x4
 Class representing 4x 32-bit unsigned integer vector. More...
 
class  mask_int32x4
 Class representing mask for 4x 32-bit integer vector. More...
 
class  basic_int32x8
 Generic class representing 8x 32-bit integer vector. More...
 
class  int32x8
 Class representing 8x 32-bit signed integer vector. More...
 
class  uint32x8
 Class representing 8x 32-bit unsigned integer vector. More...
 
class  mask_int32x8
 Class representing a mask for 8x 32-bit integer vector. More...
 
class  basic_int64x2
 Generic class representing 2x 64-bit integer vector. More...
 
class  int64x2
 Class representing 2x 64-bit signed integer vector. More...
 
class  uint64x2
 Class representing 2x 64-bit unsigned integer vector. More...
 
class  mask_int64x2
 Class representing mask for 2x 64-bit integer vector. More...
 
class  basic_int64x4
 Generic class representing 4x 64-bit integer vector. More...
 
class  int64x4
 Class representing 4x 64-bit signed integer vector. More...
 
class  uint64x4
 Class representing 4x 64-bit unsigned integer vector. More...
 
class  mask_int64x4
 Class representing a mask for 4x 64-bit integer vector. More...
 
class  basic_int8x16
 Generic class representing 16x 8-bit integer vector. More...
 
class  int8x16
 Class representing 16x 8-bit signed integer vector. More...
 
class  uint8x16
 Class representing 16x 8-bit unsigned integer vector. More...
 
class  mask_int8x16
 Class representing mask for 16x 8-bit integer vector. More...
 
class  basic_int8x32
 Generic class representing 32x 8-bit integer vector. More...
 
class  int8x32
 Class representing 32x 8-bit signed integer vector. More...
 
class  uint8x32
 Class representing 32x 8-bit unsigned integer vector. More...
 
class  mask_int8x32
 Class representing a mask for 32x 8-bit integer vector. More...
 
struct  is_vector
 Allows detection whether specific type is a simdpp vector. More...
 
class  is_vector< float32x4 >
 
class  is_vector< float64x2 >
 
class  is_vector< float32x8 >
 
class  is_vector< float64x4 >
 
class  is_vector< int128 >
 
class  is_vector< int256 >
 
class  is_vector< int8x16 >
 
class  is_vector< int16x8 >
 
class  is_vector< int32x4 >
 
class  is_vector< int64x2 >
 
class  is_vector< int8x32 >
 
class  is_vector< int16x16 >
 
class  is_vector< int32x8 >
 
class  is_vector< int64x4 >
 
class  is_vector< uint8x16 >
 
class  is_vector< uint16x8 >
 
class  is_vector< uint32x4 >
 
class  is_vector< uint64x2 >
 
class  is_vector< uint8x32 >
 
class  is_vector< uint16x16 >
 
class  is_vector< uint32x8 >
 
class  is_vector< uint64x4 >
 
class  is_vector< basic_int8x16 >
 
class  is_vector< basic_int16x8 >
 
class  is_vector< basic_int32x4 >
 
class  is_vector< basic_int64x2 >
 
class  is_vector< basic_int8x32 >
 
class  is_vector< basic_int16x16 >
 
class  is_vector< basic_int32x8 >
 
class  is_vector< basic_int64x4 >
 
struct  is_mask
 Allows detection whether specific type is a simdpp mask. More...
 
class  is_mask< mask_int8x16 >
 
class  is_mask< mask_int8x32 >
 
class  is_mask< mask_int16x8 >
 
class  is_mask< mask_int16x16 >
 
class  is_mask< mask_int32x4 >
 
class  is_mask< mask_int32x8 >
 
class  is_mask< mask_int64x2 >
 
class  is_mask< mask_int64x4 >
 
class  is_mask< mask_float32x4 >
 
class  is_mask< mask_float32x8 >
 
class  is_mask< mask_float64x2 >
 
class  is_mask< mask_float64x4 >
 

Typedefs

typedef boost::function< Arch()> GetArchCb
 

Functions

Arch get_arch_gcc_builtin_cpu_supports ()
 Retrieves supported architecture using GCC __builtin_cpu_supports function. More...
 
Arch get_arch_linux_cpuinfo ()
 Retrieves supported architecture from Linux /proc/cpuinfo file. More...
 
basic_int8x16 bit_and (basic_int8x16 a, int128 b)
 Computes bitwise AND of integer vectors. More...
 
basic_int16x8 bit_and (basic_int16x8 a, int128 b)
 
basic_int32x4 bit_and (basic_int32x4 a, int128 b)
 
basic_int64x2 bit_and (basic_int64x2 a, int128 b)
 
basic_int8x32 bit_and (basic_int8x32 a, int256 b)
 
basic_int16x16 bit_and (basic_int16x16 a, int256 b)
 
basic_int32x8 bit_and (basic_int32x8 a, int256 b)
 
basic_int64x4 bit_and (basic_int64x4 a, int256 b)
 
basic_int8x16 bit_and (basic_int8x16 a, mask_int8x16 b)
 
basic_int16x8 bit_and (basic_int16x8 a, mask_int16x8 b)
 
basic_int32x4 bit_and (basic_int32x4 a, mask_int32x4 b)
 
basic_int64x2 bit_and (basic_int64x2 a, mask_int64x2 b)
 
basic_int8x32 bit_and (basic_int8x32 a, mask_int8x32 b)
 
basic_int16x16 bit_and (basic_int16x16 a, mask_int16x16 b)
 
basic_int32x8 bit_and (basic_int32x8 a, mask_int32x8 b)
 
basic_int64x4 bit_and (basic_int64x4 a, mask_int64x4 b)
 
mask_int8x16 bit_and (mask_int8x16 a, mask_int8x16 b)
 
mask_int16x8 bit_and (mask_int16x8 a, mask_int16x8 b)
 
mask_int32x4 bit_and (mask_int32x4 a, mask_int32x4 b)
 
mask_int64x2 bit_and (mask_int64x2 a, mask_int64x2 b)
 
mask_int8x32 bit_and (mask_int8x32 a, mask_int8x32 b)
 
mask_int16x16 bit_and (mask_int16x16 a, mask_int16x16 b)
 
mask_int32x8 bit_and (mask_int32x8 a, mask_int32x8 b)
 
mask_int64x4 bit_and (mask_int64x4 a, mask_int64x4 b)
 
void prefetch_read (const void *ptr)
 Prefetches data to the lowest level cache for reading. More...
 
void prefetch_write (const void *ptr)
 Prefetches data to the lowest level cache for writing. More...
 
template<class R , class T >
R bit_cast (T t)
 Casts between unrelated types. More...
 
mask_int8x16 cmp_eq (basic_int8x16 a, basic_int8x16 b)
 Compares 8-bit values for equality. More...
 
mask_int8x32 cmp_eq (basic_int8x32 a, basic_int8x32 b)
 
mask_float64x2 cmp_gt (float64x2 a, float64x2 b)
 Compares the values of two float64x2 vectors for greater-than. More...
 
mask_float64x4 cmp_gt (float64x4 a, float64x4 b)
 
mask_float64x2 cmp_ge (float64x2 a, float64x2 b)
 Compares the values of two float64x2 vectors for greater-than or equal. More...
 
mask_float64x4 cmp_ge (float64x4 a, float64x4 b)
 
basic_int16x8 to_int16x8 (int8x16 a)
 Sign extends the first 8 values of a signed int8x16 vector to 16-bits. More...
 
basic_int16x16 to_int16x16 (int8x32 a)
 Sign extends the first 16 values of a signed int8x32 vector to 16-bits. More...
 
basic_int16x8 to_int16x8 (uint8x16 a)
 Extends the first 8 values of an unsigned int8x16 vector to 16-bits. More...
 
basic_int16x16 to_int16x16 (uint8x32 a)
 Extends the first 16 values of an unsigned int8x32 vector to 16-bits. More...
 
basic_int32x4 to_int32x4 (int16x8 a)
 Sign extends the first 4 values of a signed int16x8 vector to 32-bits. More...
 
basic_int32x8 to_int32x8 (int16x16 a)
 Sign extends the first 8 values of a signed int16x16 vector to 32-bits. More...
 
basic_int32x4 to_int32x4 (uint16x8 a)
 Zero-extends the first 4 values of an unsigned int16x8 vector to 32-bits. More...
 
basic_int32x8 to_int32x8 (uint16x16 a)
 Zero-extends the first 8 values of an unsigned int16x16 vector to 32-bits. More...
 
template<unsigned id>
float extract (float32x4 a)
 Extracts an element from float32x4 vector. More...
 
template<unsigned id>
double extract (float64x2 a)
 Extracts an element from float64x2 vector. More...
 
uint16_t extract_bits_any (uint8x16 a)
 Extracts a bit from each byte of each element of an int8x16 vector. More...
 
template<unsigned id>
uint16_t extract_bits (uint8x16 a)
 Extracts a specific bit from each byte of each element of an int8x16 vector. More...
 
template<unsigned id>
basic_int8x16 insert (basic_int8x16 a, uint8_t x)
 Inserts an element into int8x16 vector at the position identified by id. More...
 
template<unsigned id>
basic_int16x8 insert (basic_int16x8 a, uint16_t x)
 Inserts an element into int16x8 vector at the position identified by id. More...
 
template<unsigned id>
basic_int32x4 insert (basic_int32x4 a, uint32_t x)
 Inserts an element into int32x4 vector at the position identified by id. More...
 
template<unsigned id>
basic_int64x2 insert (basic_int64x2 a, uint64_t x)
 Inserts an element into int64x2 vector at the position identified by id. More...
 
template<unsigned id>
float32x4 insert (float32x4 a, float x)
 Inserts an element into float32x4 vector at the position identified by id. More...
 
template<unsigned id>
float64x2 insert (float64x2 a, double x)
 Inserts an element into float64x2 vector at the position identified by id. More...
 
float32x4 abs (float32x4 a)
 Computes absolute value of floating point values. More...
 
float32x8 abs (float32x8 a)
 
basic_int8x16 add (basic_int8x16 a, basic_int8x16 b)
 Adds 8-bit integer values. More...
 
basic_int8x32 add (basic_int8x32 a, basic_int8x32 b)
 
int8x16 shift_r (int8x16 a, unsigned count)
 Shifts signed 8-bit values right by count bits while shifting in the sign bit. More...
 
int8x32 shift_r (int8x32 a, unsigned count)
 
int128 load (int128 &a, const void *p)
 Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an aligned memory location. More...
 
int256 load (int256 &a, const void *p)
 
float32x4 load (float32x4 &a, const float *p)
 
float32x8 load (float32x8 &a, const float *p)
 
float64x2 load (float64x2 &a, const double *p)
 
float64x4 load (float64x4 &a, const double *p)
 
void load_packed2 (float32x4 &a, float32x4 &b, const float *p)
 Loads 32-bit float values packed in pairs, de-interleaves them and stores the result into two vectors. More...
 
void load_packed2 (float32x8 &a, float32x8 &b, const float *p)
 
void store (void *p, int128 a)
 Stores a 128-bit or 256-bit integer vector to an aligned memory location. More...
 
void store (void *p, int256 a)
 
void store (float *p, float32x4 a)
 
void store (float *p, float32x8 a)
 
void store (double *p, float64x2 a)
 
void store (double *p, float64x4 a)
 
basic_int8x16 zip_lo (basic_int8x16 a, basic_int8x16 b)
 Interleaves the lower halves of two vectors. More...
 
basic_int8x32 zip_lo (basic_int8x32 a, basic_int8x32 b)
 
basic_int16x8 zip_lo (basic_int16x8 a, basic_int16x8 b)
 
basic_int16x16 zip_lo (basic_int16x16 a, basic_int16x16 b)
 
basic_int32x4 zip_lo (basic_int32x4 a, basic_int32x4 b)
 
basic_int32x8 zip_lo (basic_int32x8 a, basic_int32x8 b)
 
basic_int64x2 zip_lo (basic_int64x2 a, basic_int64x2 b)
 
basic_int64x4 zip_lo (basic_int64x4 a, basic_int64x4 b)
 
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
basic_int64x4 permute (basic_int64x4 a)
 Permutes the values of each set of four consecutive 64-bit values. More...
 
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
float64x4 permute (float64x4 a)
 Permutes the values of each set of four consecutive 64-bit floating-point values. More...
 
Arch this_compile_arch ()
 Returns the instruction set flags that will be required by the currently compiled code. More...
 
void transpose2 (basic_int16x8 &a0, basic_int16x8 &a1)
 Transposes four 2x2 16-bit matrices within two int16x8 vectors. More...
 
void transpose2 (basic_int16x16 &a0, basic_int16x16 &a1)
 
void transpose8 (basic_int8x16 &a0, basic_int8x16 &a1, basic_int8x16 &a2, basic_int8x16 &a3, basic_int8x16 &a4, basic_int8x16 &a5, basic_int8x16 &a6, basic_int8x16 &a7)
 Transposes two 8x8 8-bit matrices within eight int8x16 vectors. More...
 
void transpose8 (basic_int8x32 &a0, basic_int8x32 &a1, basic_int8x32 &a2, basic_int8x32 &a3, basic_int8x32 &a4, basic_int8x32 &a5, basic_int8x32 &a6, basic_int8x32 &a7)
 
void transpose8 (basic_int16x8 &a0, basic_int16x8 &a1, basic_int16x8 &a2, basic_int16x8 &a3, basic_int16x8 &a4, basic_int16x8 &a5, basic_int16x8 &a6, basic_int16x8 &a7)
 Transposes an 8x8 16-bit matrix within eight int16x8 vectors. More...
 
void transpose8 (basic_int16x16 &a0, basic_int16x16 &a1, basic_int16x16 &a2, basic_int16x16 &a3, basic_int16x16 &a4, basic_int16x16 &a5, basic_int16x16 &a6, basic_int16x16 &a7)
 
void transpose16 (basic_int8x16 &a0, basic_int8x16 &a1, basic_int8x16 &a2, basic_int8x16 &a3, basic_int8x16 &a4, basic_int8x16 &a5, basic_int8x16 &a6, basic_int8x16 &a7, basic_int8x16 &a8, basic_int8x16 &a9, basic_int8x16 &a10, basic_int8x16 &a11, basic_int8x16 &a12, basic_int8x16 &a13, basic_int8x16 &a14, basic_int8x16 &a15)
 Transposes a 16x16 8-bit matrix within sixteen int8x16 vectors. More...
 
void transpose16 (basic_int8x32 &a0, basic_int8x32 &a1, basic_int8x32 &a2, basic_int8x32 &a3, basic_int8x32 &a4, basic_int8x32 &a5, basic_int8x32 &a6, basic_int8x32 &a7, basic_int8x32 &a8, basic_int8x32 &a9, basic_int8x32 &a10, basic_int8x32 &a11, basic_int8x32 &a12, basic_int8x32 &a13, basic_int8x32 &a14, basic_int8x32 &a15)
 
Arch & operator|= (Arch &x, const Arch &y)
 Bitwise operators for Arch. More...
 
Arch & operator&= (Arch &x, const Arch &y)
 Bitwise operators for Arch. More...
 
Arch operator| (const Arch &x, const Arch &y)
 Bitwise operators for Arch. More...
 
Arch operator& (const Arch &x, const Arch &y)
 Bitwise operators for Arch. More...
 
Arch operator~ (const Arch &x)
 Bitwise operators for Arch. More...
 
float32x4 bit_and (float32x4 a, float32x4 b)
 Computes bitwise AND of floating-point vectors. More...
 
float32x8 bit_and (float32x8 a, float32x8 b)
 Computes bitwise AND of floating-point vectors. More...
 
float32x4 bit_and (float32x4 a, int128 b)
 Computes bitwise AND of floating-point vectors. More...
 
float32x8 bit_and (float32x8 a, int256 b)
 Computes bitwise AND of floating-point vectors. More...
 
float32x4 bit_and (float32x4 a, mask_float32x4 b)
 Computes bitwise AND of floating-point vectors. More...
 
float32x8 bit_and (float32x8 a, mask_float32x8 b)
 Computes bitwise AND of floating-point vectors. More...
 
mask_float32x4 bit_and (mask_float32x4 a, mask_float32x4 b)
 Computes bitwise AND of floating-point vectors. More...
 
mask_float32x8 bit_and (mask_float32x8 a, mask_float32x8 b)
 Computes bitwise AND of floating-point vectors. More...
 
float64x2 bit_and (float64x2 a, float64x2 b)
 Computes bitwise AND of floating-point vectors. More...
 
float64x4 bit_and (float64x4 a, float64x4 b)
 Computes bitwise AND of floating-point vectors. More...
 
float64x2 bit_and (float64x2 a, int128 b)
 Computes bitwise AND of floating-point vectors. More...
 
float64x4 bit_and (float64x4 a, int256 b)
 Computes bitwise AND of floating-point vectors. More...
 
float64x2 bit_and (float64x2 a, mask_float64x2 b)
 Computes bitwise AND of floating-point vectors. More...
 
float64x4 bit_and (float64x4 a, mask_float64x4 b)
 Computes bitwise AND of floating-point vectors. More...
 
mask_float64x2 bit_and (mask_float64x2 a, mask_float64x2 b)
 Computes bitwise AND of floating-point vectors. More...
 
mask_float64x4 bit_and (mask_float64x4 a, mask_float64x4 b)
 Computes bitwise AND of floating-point vectors. More...
 
basic_int8x16 bit_andnot (basic_int8x16 a, int128 b)
 Computes bitwise AND NOT of integer vectors. More...
 
basic_int16x8 bit_andnot (basic_int16x8 a, int128 b)
 Computes bitwise AND NOT of integer vectors. More...
 
basic_int32x4 bit_andnot (basic_int32x4 a, int128 b)
 Computes bitwise AND NOT of integer vectors. More...
 
basic_int64x2 bit_andnot (basic_int64x2 a, int128 b)
 Computes bitwise AND NOT of integer vectors. More...
 
basic_int8x32 bit_andnot (basic_int8x32 a, int256 b)
 Computes bitwise AND NOT of integer vectors. More...
 
basic_int16x16 bit_andnot (basic_int16x16 a, int256 b)
 Computes bitwise AND NOT of integer vectors. More...
 
basic_int32x8 bit_andnot (basic_int32x8 a, int256 b)
 Computes bitwise AND NOT of integer vectors. More...
 
basic_int64x4 bit_andnot (basic_int64x4 a, int256 b)
 Computes bitwise AND NOT of integer vectors. More...
 
basic_int8x16 bit_andnot (basic_int8x16 a, mask_int8x16 b)
 Computes bitwise AND NOT of integer vectors. More...
 
basic_int16x8 bit_andnot (basic_int16x8 a, mask_int16x8 b)
 Computes bitwise AND NOT of integer vectors. More...
 
basic_int32x4 bit_andnot (basic_int32x4 a, mask_int32x4 b)
 Computes bitwise AND NOT of integer vectors. More...
 
basic_int64x2 bit_andnot (basic_int64x2 a, mask_int64x2 b)
 Computes bitwise AND NOT of integer vectors. More...
 
basic_int8x32 bit_andnot (basic_int8x32 a, mask_int8x32 b)
 Computes bitwise AND NOT of integer vectors. More...
 
basic_int16x16 bit_andnot (basic_int16x16 a, mask_int16x16 b)
 Computes bitwise AND NOT of integer vectors. More...
 
basic_int32x8 bit_andnot (basic_int32x8 a, mask_int32x8 b)
 Computes bitwise AND NOT of integer vectors. More...
 
basic_int64x4 bit_andnot (basic_int64x4 a, mask_int64x4 b)
 Computes bitwise AND NOT of integer vectors. More...
 
mask_int8x16 bit_andnot (mask_int8x16 a, mask_int8x16 b)
 Computes bitwise AND NOT of integer vectors. More...
 
mask_int16x8 bit_andnot (mask_int16x8 a, mask_int16x8 b)
 Computes bitwise AND NOT of integer vectors. More...
 
mask_int32x4 bit_andnot (mask_int32x4 a, mask_int32x4 b)
 Computes bitwise AND NOT of integer vectors. More...
 
mask_int64x2 bit_andnot (mask_int64x2 a, mask_int64x2 b)
 Computes bitwise AND NOT of integer vectors. More...
 
mask_int8x32 bit_andnot (mask_int8x32 a, mask_int8x32 b)
 Computes bitwise AND NOT of integer vectors. More...
 
mask_int16x16 bit_andnot (mask_int16x16 a, mask_int16x16 b)
 Computes bitwise AND NOT of integer vectors. More...
 
mask_int32x8 bit_andnot (mask_int32x8 a, mask_int32x8 b)
 Computes bitwise AND NOT of integer vectors. More...
 
mask_int64x4 bit_andnot (mask_int64x4 a, mask_int64x4 b)
 Computes bitwise AND NOT of integer vectors. More...
 
float32x4 bit_andnot (float32x4 a, float32x4 b)
 Computes bitwise AND NOT of floating-point vectors. More...
 
float32x8 bit_andnot (float32x8 a, float32x8 b)
 Computes bitwise AND NOT of floating-point vectors. More...
 
float32x4 bit_andnot (float32x4 a, int128 b)
 Computes bitwise AND NOT of floating-point vectors. More...
 
float32x8 bit_andnot (float32x8 a, int256 b)
 Computes bitwise AND NOT of floating-point vectors. More...
 
float32x4 bit_andnot (float32x4 a, mask_float32x4 b)
 Computes bitwise AND NOT of floating-point vectors. More...
 
float32x8 bit_andnot (float32x8 a, mask_float32x8 b)
 Computes bitwise AND NOT of floating-point vectors. More...
 
mask_float32x4 bit_andnot (mask_float32x4 a, mask_float32x4 b)
 Computes bitwise AND NOT of floating-point vectors. More...
 
mask_float32x8 bit_andnot (mask_float32x8 a, mask_float32x8 b)
 Computes bitwise AND NOT of floating-point vectors. More...
 
float64x2 bit_andnot (float64x2 a, float64x2 b)
 Computes bitwise AND NOT of floating-point vectors. More...
 
float64x4 bit_andnot (float64x4 a, float64x4 b)
 Computes bitwise AND NOT of floating-point vectors. More...
 
float64x2 bit_andnot (float64x2 a, int128 b)
 Computes bitwise AND NOT of floating-point vectors. More...
 
float64x4 bit_andnot (float64x4 a, int256 b)
 Computes bitwise AND NOT of floating-point vectors. More...
 
float64x2 bit_andnot (float64x2 a, mask_float64x2 b)
 Computes bitwise AND NOT of floating-point vectors. More...
 
float64x4 bit_andnot (float64x4 a, mask_float64x4 b)
 Computes bitwise AND NOT of floating-point vectors. More...
 
mask_float64x2 bit_andnot (mask_float64x2 a, mask_float64x2 b)
 Computes bitwise AND NOT of floating-point vectors. More...
 
mask_float64x4 bit_andnot (mask_float64x4 a, mask_float64x4 b)
 Computes bitwise AND NOT of floating-point vectors. More...
 
basic_int8x16 bit_or (basic_int8x16 a, int128 b)
 Computes bitwise OR of integer vectors. More...
 
basic_int16x8 bit_or (basic_int16x8 a, int128 b)
 Computes bitwise OR of integer vectors. More...
 
basic_int32x4 bit_or (basic_int32x4 a, int128 b)
 Computes bitwise OR of integer vectors. More...
 
basic_int64x2 bit_or (basic_int64x2 a, int128 b)
 Computes bitwise OR of integer vectors. More...
 
basic_int8x32 bit_or (basic_int8x32 a, int256 b)
 Computes bitwise OR of integer vectors. More...
 
basic_int16x16 bit_or (basic_int16x16 a, int256 b)
 Computes bitwise OR of integer vectors. More...
 
basic_int32x8 bit_or (basic_int32x8 a, int256 b)
 Computes bitwise OR of integer vectors. More...
 
basic_int64x4 bit_or (basic_int64x4 a, int256 b)
 Computes bitwise OR of integer vectors. More...
 
mask_int8x16 bit_or (mask_int8x16 a, mask_int8x16 b)
 Computes bitwise OR of integer vectors. More...
 
mask_int16x8 bit_or (mask_int16x8 a, mask_int16x8 b)
 Computes bitwise OR of integer vectors. More...
 
mask_int32x4 bit_or (mask_int32x4 a, mask_int32x4 b)
 Computes bitwise OR of integer vectors. More...
 
mask_int64x2 bit_or (mask_int64x2 a, mask_int64x2 b)
 Computes bitwise OR of integer vectors. More...
 
mask_int8x32 bit_or (mask_int8x32 a, mask_int8x32 b)
 Computes bitwise OR of integer vectors. More...
 
mask_int16x16 bit_or (mask_int16x16 a, mask_int16x16 b)
 Computes bitwise OR of integer vectors. More...
 
mask_int32x8 bit_or (mask_int32x8 a, mask_int32x8 b)
 Computes bitwise OR of integer vectors. More...
 
mask_int64x4 bit_or (mask_int64x4 a, mask_int64x4 b)
 Computes bitwise OR of integer vectors. More...
 
float32x4 bit_or (float32x4 a, float32x4 b)
 Computes bitwise OR of floating-point vectors. More...
 
float32x8 bit_or (float32x8 a, float32x8 b)
 Computes bitwise OR of floating-point vectors. More...
 
float32x4 bit_or (float32x4 a, int128 b)
 Computes bitwise OR of floating-point vectors. More...
 
float32x8 bit_or (float32x8 a, int256 b)
 Computes bitwise OR of floating-point vectors. More...
 
float64x2 bit_or (float64x2 a, float64x2 b)
 Computes bitwise OR of floating-point vectors. More...
 
float64x4 bit_or (float64x4 a, float64x4 b)
 Computes bitwise OR of floating-point vectors. More...
 
float64x2 bit_or (float64x2 a, int128 b)
 Computes bitwise OR of floating-point vectors. More...
 
float64x4 bit_or (float64x4 a, int256 b)
 Computes bitwise OR of floating-point vectors. More...
 
mask_float32x4 bit_or (mask_float32x4 a, mask_float32x4 b)
 Computes bitwise OR of floating-point vectors. More...
 
mask_float64x2 bit_or (mask_float64x2 a, mask_float64x2 b)
 Computes bitwise OR of floating-point vectors. More...
 
mask_float32x8 bit_or (mask_float32x8 a, mask_float32x8 b)
 Computes bitwise OR of floating-point vectors. More...
 
mask_float64x4 bit_or (mask_float64x4 a, mask_float64x4 b)
 Computes bitwise OR of floating-point vectors. More...
 
basic_int8x16 bit_xor (basic_int8x16 a, int128 b)
 Computes bitwise XOR of integer vectors. More...
 
basic_int16x8 bit_xor (basic_int16x8 a, int128 b)
 Computes bitwise XOR of integer vectors. More...
 
basic_int32x4 bit_xor (basic_int32x4 a, int128 b)
 Computes bitwise XOR of integer vectors. More...
 
basic_int64x2 bit_xor (basic_int64x2 a, int128 b)
 Computes bitwise XOR of integer vectors. More...
 
basic_int8x32 bit_xor (basic_int8x32 a, int256 b)
 Computes bitwise XOR of integer vectors. More...
 
basic_int16x16 bit_xor (basic_int16x16 a, int256 b)
 Computes bitwise XOR of integer vectors. More...
 
basic_int32x8 bit_xor (basic_int32x8 a, int256 b)
 Computes bitwise XOR of integer vectors. More...
 
basic_int64x4 bit_xor (basic_int64x4 a, int256 b)
 Computes bitwise XOR of integer vectors. More...
 
mask_int8x16 bit_xor (mask_int8x16 a, mask_int8x16 b)
 Computes bitwise XOR of integer vectors. More...
 
mask_int16x8 bit_xor (mask_int16x8 a, mask_int16x8 b)
 Computes bitwise XOR of integer vectors. More...
 
mask_int32x4 bit_xor (mask_int32x4 a, mask_int32x4 b)
 Computes bitwise XOR of integer vectors. More...
 
mask_int64x2 bit_xor (mask_int64x2 a, mask_int64x2 b)
 Computes bitwise XOR of integer vectors. More...
 
mask_int8x32 bit_xor (mask_int8x32 a, mask_int8x32 b)
 Computes bitwise XOR of integer vectors. More...
 
mask_int16x16 bit_xor (mask_int16x16 a, mask_int16x16 b)
 Computes bitwise XOR of integer vectors. More...
 
mask_int32x8 bit_xor (mask_int32x8 a, mask_int32x8 b)
 Computes bitwise XOR of integer vectors. More...
 
mask_int64x4 bit_xor (mask_int64x4 a, mask_int64x4 b)
 Computes bitwise XOR of integer vectors. More...
 
float32x4 bit_xor (float32x4 a, float32x4 b)
 Computes bitwise XOR of floating-point vectors. More...
 
float32x8 bit_xor (float32x8 a, float32x8 b)
 Computes bitwise XOR of floating-point vectors. More...
 
float32x4 bit_xor (float32x4 a, int128 b)
 Computes bitwise XOR of floating-point vectors. More...
 
float32x8 bit_xor (float32x8 a, int256 b)
 Computes bitwise XOR of floating-point vectors. More...
 
float64x2 bit_xor (float64x2 a, float64x2 b)
 Computes bitwise XOR of floating-point vectors. More...
 
float64x4 bit_xor (float64x4 a, float64x4 b)
 Computes bitwise XOR of floating-point vectors. More...
 
float64x2 bit_xor (float64x2 a, int128 b)
 Computes bitwise XOR of floating-point vectors. More...
 
float64x4 bit_xor (float64x4 a, int256 b)
 Computes bitwise XOR of floating-point vectors. More...
 
mask_float32x4 bit_xor (mask_float32x4 a, mask_float32x4 b)
 Computes bitwise XOR of floating-point vectors. More...
 
mask_float64x2 bit_xor (mask_float64x2 a, mask_float64x2 b)
 Computes bitwise XOR of floating-point vectors. More...
 
mask_float32x8 bit_xor (mask_float32x8 a, mask_float32x8 b)
 Computes bitwise XOR of floating-point vectors. More...
 
mask_float64x4 bit_xor (mask_float64x4 a, mask_float64x4 b)
 Computes bitwise XOR of floating-point vectors. More...
 
basic_int8x16 bit_not (basic_int8x16 a)
 Computes bitwise NOT of an integer vector. More...
 
basic_int16x8 bit_not (basic_int16x8 a)
 Computes bitwise NOT of an integer vector. More...
 
basic_int32x4 bit_not (basic_int32x4 a)
 Computes bitwise NOT of an integer vector. More...
 
basic_int64x2 bit_not (basic_int64x2 a)
 Computes bitwise NOT of an integer vector. More...
 
basic_int8x32 bit_not (basic_int8x32 a)
 Computes bitwise NOT of an integer vector. More...
 
basic_int16x16 bit_not (basic_int16x16 a)
 Computes bitwise NOT of an integer vector. More...
 
basic_int32x8 bit_not (basic_int32x8 a)
 Computes bitwise NOT of an integer vector. More...
 
basic_int64x4 bit_not (basic_int64x4 a)
 Computes bitwise NOT of an integer vector. More...
 
mask_int8x16 bit_not (mask_int8x16 a)
 Computes bitwise NOT of an integer vector. More...
 
mask_int16x8 bit_not (mask_int16x8 a)
 Computes bitwise NOT of an integer vector. More...
 
mask_int32x4 bit_not (mask_int32x4 a)
 Computes bitwise NOT of an integer vector. More...
 
mask_int64x2 bit_not (mask_int64x2 a)
 Computes bitwise NOT of an integer vector. More...
 
mask_int8x32 bit_not (mask_int8x32 a)
 Computes bitwise NOT of an integer vector. More...
 
mask_int16x16 bit_not (mask_int16x16 a)
 Computes bitwise NOT of an integer vector. More...
 
mask_int32x8 bit_not (mask_int32x8 a)
 Computes bitwise NOT of an integer vector. More...
 
mask_int64x4 bit_not (mask_int64x4 a)
 Computes bitwise NOT of an integer vector. More...
 
float32x4 bit_not (float32x4 a)
 Computes bitwise NOT of a floating-point vector. More...
 
float64x2 bit_not (float64x2 a)
 Computes bitwise NOT of a floating-point vector. More...
 
float32x8 bit_not (float32x8 a)
 Computes bitwise NOT of a floating-point vector. More...
 
float64x4 bit_not (float64x4 a)
 Computes bitwise NOT of a floating-point vector. More...
 
mask_float32x4 bit_not (mask_float32x4 a)
 Computes bitwise NOT of a floating-point vector. More...
 
mask_float64x2 bit_not (mask_float64x2 a)
 Computes bitwise NOT of a floating-point vector. More...
 
mask_float32x8 bit_not (mask_float32x8 a)
 Computes bitwise NOT of a floating-point vector. More...
 
mask_float64x4 bit_not (mask_float64x4 a)
 Computes bitwise NOT of a floating-point vector. More...
 
mask_int16x8 cmp_eq (basic_int16x8 a, basic_int16x8 b)
 Compares 16-bit values for equality. More...
 
mask_int16x16 cmp_eq (basic_int16x16 a, basic_int16x16 b)
 Compares 16-bit values for equality. More...
 
mask_int32x4 cmp_eq (basic_int32x4 a, basic_int32x4 b)
 Compares the values of two int32x4 vectors for equality. More...
 
mask_int32x8 cmp_eq (basic_int32x8 a, basic_int32x8 b)
 Compares the values of two int32x8 vectors for equality. More...
 
mask_int64x2 cmp_eq (basic_int64x2 a, basic_int64x2 b)
 Compares the values of two int64x2 vectors for equality. More...
 
mask_int64x4 cmp_eq (basic_int64x4 a, basic_int64x4 b)
 Compares the values of two int64x4 vectors for equality. More...
 
mask_float32x4 cmp_eq (float32x4 a, float32x4 b)
 Compares the values of two float32x4 vectors for equality. More...
 
mask_float32x8 cmp_eq (float32x8 a, float32x8 b)
 Compares the values of two float32x8 vectors for equality. More...
 
mask_float64x2 cmp_eq (float64x2 a, float64x2 b)
 Compares the values of two float64x2 vectors for equality. More...
 
mask_float64x4 cmp_eq (float64x4 a, float64x4 b)
 Compares the values of two float64x4 vectors for equality. More...
 
mask_int8x16 cmp_neq (basic_int8x16 a, basic_int8x16 b)
 Compares the values of two int8x16 vectors for inequality. More...
 
mask_int8x32 cmp_neq (basic_int8x32 a, basic_int8x32 b)
 Compares the values of two int8x32 vectors for inequality. More...
 
mask_int16x8 cmp_neq (basic_int16x8 a, basic_int16x8 b)
 Compares the values of two int16x8 vectors for inequality. More...
 
mask_int16x16 cmp_neq (basic_int16x16 a, basic_int16x16 b)
 Compares the values of two int16x16 vectors for inequality. More...
 
mask_int32x4 cmp_neq (basic_int32x4 a, basic_int32x4 b)
 Compares the values of two int32x4 vectors for inequality. More...
 
mask_int32x8 cmp_neq (basic_int32x8 a, basic_int32x8 b)
 Compares the values of two int32x8 vectors for inequality. More...
 
mask_int64x2 cmp_neq (basic_int64x2 a, basic_int64x2 b)
 Compares the values of two int64x2 vectors for inequality. More...
 
mask_int64x4 cmp_neq (basic_int64x4 a, basic_int64x4 b)
 Compares the values of two int64x4 vectors for inequality. More...
 
mask_float32x4 cmp_neq (float32x4 a, float32x4 b)
 Compares the values of two float32x4 vectors for inequality. More...
 
mask_float32x8 cmp_neq (float32x8 a, float32x8 b)
 Compares the values of two float32x8 vectors for inequality. More...
 
mask_float64x2 cmp_neq (float64x2 a, float64x2 b)
 Compares the values of two float64x2 vectors for inequality. More...
 
mask_float64x4 cmp_neq (float64x4 a, float64x4 b)
 Compares the values of two float64x4 vectors for inequality. More...
 
mask_int8x16 cmp_gt (int8x16 a, int8x16 b)
 Compares the values of two signed int8x16 vectors for greater-than. More...
 
mask_int8x32 cmp_gt (int8x32 a, int8x32 b)
 Compares the values of two signed int8x32 vectors for greater-than. More...
 
mask_int8x16 cmp_gt (uint8x16 a, uint8x16 b)
 Compares the values of two unsigned int8x16 vectors for greater-than. More...
 
mask_int8x32 cmp_gt (uint8x32 a, uint8x32 b)
 Compares the values of two unsigned int8x32 vectors for greater-than. More...
 
mask_int16x8 cmp_gt (int16x8 a, int16x8 b)
 Compares the values of two signed int16x8 vectors for greater-than. More...
 
mask_int16x16 cmp_gt (int16x16 a, int16x16 b)
 Compares the values of two signed int16x8 vectors for greater-than. More...
 
mask_int16x8 cmp_gt (uint16x8 a, uint16x8 b)
 Compares the values of two unsigned int16x8 vectors for greater-than. More...
 
mask_int16x16 cmp_gt (uint16x16 a, uint16x16 b)
 Compares the values of two unsigned int16x8 vectors for greater-than. More...
 
mask_int32x4 cmp_gt (int32x4 a, int32x4 b)
 Compares the values of two signed int32x4 vectors for greater-than. More...
 
mask_int32x8 cmp_gt (int32x8 a, int32x8 b)
 Compares the values of two signed int32x4 vectors for greater-than. More...
 
mask_int32x4 cmp_gt (uint32x4 a, uint32x4 b)
 Compares the values of two unsigned int32x4 vectors for greater-than. More...
 
mask_int32x8 cmp_gt (uint32x8 a, uint32x8 b)
 Compares the values of two unsigned int32x4 vectors for greater-than. More...
 
mask_float32x4 cmp_gt (float32x4 a, float32x4 b)
 Compares the values of two float32x4 vectors for greater-than. More...
 
mask_float32x8 cmp_gt (float32x8 a, float32x8 b)
 Compares the values of two float32x4 vectors for greater-than. More...
 
mask_float32x4 cmp_ge (float32x4 a, float32x4 b)
 Compares the values of two float32x4 vectors for greater-than or equal. More...
 
mask_float32x8 cmp_ge (float32x8 a, float32x8 b)
 Compares the values of two float32x4 vectors for greater-than or equal. More...
 
mask_int8x16 cmp_lt (int8x16 a, int8x16 b)
 Compares the values of two signed int8x16 vectors for less-than. More...
 
mask_int8x32 cmp_lt (int8x32 a, int8x32 b)
 Compares the values of two signed int8x16 vectors for less-than. More...
 
mask_int8x16 cmp_lt (uint8x16 a, uint8x16 b)
 Compares the values of two unsigned int8x16 vectors for less-than. More...
 
mask_int8x32 cmp_lt (uint8x32 a, uint8x32 b)
 Compares the values of two unsigned int8x16 vectors for less-than. More...
 
mask_int16x8 cmp_lt (int16x8 a, int16x8 b)
 Compares the values of two signed int16x8 vectors for less-than. More...
 
mask_int16x16 cmp_lt (int16x16 a, int16x16 b)
 Compares the values of two signed int16x8 vectors for less-than. More...
 
mask_int16x8 cmp_lt (uint16x8 a, uint16x8 b)
 Compares the values of two unsigned int16x8 vectors for less-than. More...
 
mask_int16x16 cmp_lt (uint16x16 a, uint16x16 b)
 Compares the values of two unsigned int16x8 vectors for less-than. More...
 
mask_int32x4 cmp_lt (int32x4 a, int32x4 b)
 Compares the values of two signed int32x4 vectors for less-than. More...
 
mask_int32x8 cmp_lt (int32x8 a, int32x8 b)
 Compares the values of two signed int32x4 vectors for less-than. More...
 
mask_int32x4 cmp_lt (uint32x4 a, uint32x4 b)
 Compares the values of two unsigned int32x4 vectors for less-than. More...
 
mask_int32x8 cmp_lt (uint32x8 a, uint32x8 b)
 Compares the values of two unsigned int32x4 vectors for less-than. More...
 
mask_float32x4 cmp_lt (float32x4 a, float32x4 b)
 Compares the values of two float32x4 vectors for less-than. More...
 
mask_float32x8 cmp_lt (float32x8 a, float32x8 b)
 Compares the values of two float32x4 vectors for less-than. More...
 
mask_float64x2 cmp_lt (float64x2 a, float64x2 b)
 Compares the values of two float64x2 vectors for less-than. More...
 
mask_float64x4 cmp_lt (float64x4 a, float64x4 b)
 Compares the values of two float64x2 vectors for less-than. More...
 
mask_float32x4 cmp_le (float32x4 a, float32x4 b)
 Compares the values of two float32x4 vectors for less-than or equal. More...
 
mask_float32x8 cmp_le (float32x8 a, float32x8 b)
 Compares the values of two float32x4 vectors for less-than or equal. More...
 
mask_float64x2 cmp_le (float64x2 a, float64x2 b)
 Compares the values of two float64x2 vectors for less-than or equal. More...
 
mask_float64x4 cmp_le (float64x4 a, float64x4 b)
 Compares the values of two float64x2 vectors for less-than or equal. More...
 
basic_int32x4 to_int32x4 (float32x4 a)
 Converts the values of a float32x4 vector into signed int32_t representation using truncation if only an inexact conversion can be performed. More...
 
basic_int32x8 to_int32x8 (float32x8 a)
 Converts the values of a float32x4 vector into signed int32_t representation using truncation if only an inexact conversion can be performed. More...
 
basic_int32x4 to_int32x4 (float64x2 a)
 Converts the values of a float64x2 vector into int32_t representation using truncation. More...
 
basic_int32x8 to_int32x8 (float64x4 a)
 Converts the values of a float64x4 vector into int32_t representation using truncation. More...
 
basic_int64x2 to_int64x2 (int32x4 a)
 Extends the values of a signed int32x4 vector to 64-bits. More...
 
basic_int64x4 to_int64x4 (int32x8 a)
 Extends the values of a signed int32x4 vector to 64-bits. More...
 
basic_int64x2 to_int64x2 (uint32x4 a)
 Extends the values of an unsigned int32x4 vector to 64-bits. More...
 
basic_int64x4 to_int64x4 (uint32x8 a)
 Extends the values of an unsigned int32x8 vector to 64-bits. More...
 
float32x4 to_float32x4 (int32x4 a)
 Converts 32-bit integer values to 32-bit float values. More...
 
float32x8 to_float32x8 (int32x8 a)
 Converts 32-bit integer values to 32-bit float values. More...
 
float32x4 to_float32x4 (float64x2 a)
 Converts 64-bit float values to 32-bit float values. More...
 
float32x8 to_float32x8 (float64x4 a)
 Converts 64-bit float values to 32-bit float values. More...
 
float64x2 to_float64x2 (int32x4 a)
 Converts the 32-bit integer values to 64-bit float values. More...
 
float64x4 to_float64x4 (int32x8 a)
 Converts the 32-bit integer values to 64-bit float values. More...
 
float64x2 to_float64x2 (float32x4 a)
 Converts the 32-bit float values to 64-bit float values. More...
 
float64x4 to_float64x4 (float32x8 a)
 Converts the 32-bit float values to 64-bit float values. More...
 
template<unsigned id>
uint8_t extract (basic_int8x16 a)
 Extracts the id-th element from int8x16 vector. More...
 
template<unsigned id>
int8_t extract (int8x16 a)
 Extracts the id-th element from int8x16 vector. More...
 
template<unsigned id>
uint16_t extract (basic_int16x8 a)
 Extracts the id-th element from int16x8 vector. More...
 
template<unsigned id>
int16_t extract (int16x8 a)
 Extracts the id-th element from int16x8 vector. More...
 
template<unsigned id>
uint32_t extract (basic_int32x4 a)
 Extracts the id-th element from int32x4 vector. More...
 
template<unsigned id>
int32_t extract (int32x4 a)
 Extracts the id-th element from int32x4 vector. More...
 
template<unsigned id>
uint64_t extract (basic_int64x2 a)
 Extracts an element from int64x2 vector. More...
 
template<unsigned id>
int64_t extract (int64x2 a)
 Extracts an element from int64x2 vector. More...
 
int256 combine (int128 a, int128 b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
float32x8 combine (float32x4 a, float32x4 b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
float64x4 combine (float64x2 a, float64x2 b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<int s0, int s1>
basic_int8x16 make_shuffle_bytes16_mask (basic_int8x16 &mask)
 Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1>
basic_int8x32 make_shuffle_bytes16_mask (basic_int8x32 &mask)
 Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3>
basic_int8x16 make_shuffle_bytes16_mask (basic_int8x16 &mask)
 Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3>
basic_int8x32 make_shuffle_bytes16_mask (basic_int8x32 &mask)
 Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7>
basic_int8x16 make_shuffle_bytes16_mask (basic_int8x16 &mask)
 Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7>
basic_int8x32 make_shuffle_bytes16_mask (basic_int8x32 &mask)
 Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, int s8, int s9, int s10, int s11, int s12, int s13, int s14, int s15>
basic_int8x16 make_shuffle_bytes16_mask (basic_int8x16 &mask)
 Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, int s8, int s9, int s10, int s11, int s12, int s13, int s14, int s15>
basic_int8x32 make_shuffle_bytes16_mask (basic_int8x32 &mask)
 Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1>
basic_int16x8 make_shuffle_bytes16_mask (basic_int16x8 &mask)
 Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1>
basic_int16x16 make_shuffle_bytes16_mask (basic_int16x16 &mask)
 Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3>
basic_int16x8 make_shuffle_bytes16_mask (basic_int16x8 &mask)
 Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3>
basic_int16x16 make_shuffle_bytes16_mask (basic_int16x16 &mask)
 Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7>
basic_int16x8 make_shuffle_bytes16_mask (basic_int16x8 &mask)
 Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7>
basic_int16x16 make_shuffle_bytes16_mask (basic_int16x16 &mask)
 Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1>
basic_int32x4 make_shuffle_bytes16_mask (basic_int32x4 &mask)
 Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1>
basic_int32x8 make_shuffle_bytes16_mask (basic_int32x8 &mask)
 Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3>
basic_int32x4 make_shuffle_bytes16_mask (basic_int32x4 &mask)
 Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3>
basic_int32x8 make_shuffle_bytes16_mask (basic_int32x8 &mask)
 Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1>
basic_int64x2 make_shuffle_bytes16_mask (basic_int64x2 &mask)
 Makes a mask to shuffle an int64x2 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1>
basic_int64x4 make_shuffle_bytes16_mask (basic_int64x4 &mask)
 Makes a mask to shuffle an int64x2 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
mask_float32x4 isnan (float32x4 a)
 Checks whether elements in a are IEEE754 NaN. More...
 
mask_float32x8 isnan (float32x8 a)
 Checks whether elements in a are IEEE754 NaN. More...
 
mask_float64x2 isnan (float64x2 a)
 Checks whether elements in a are IEEE754 NaN. More...
 
mask_float64x4 isnan (float64x4 a)
 Checks whether elements in a are IEEE754 NaN. More...
 
mask_float32x4 isnan2 (float32x4 a, float32x4 b)
 Checks whether corresponding elements in either a or b are IEEE754 NaN. More...
 
mask_float32x8 isnan2 (float32x8 a, float32x8 b)
 Checks whether corresponding elements in either a or b are IEEE754 NaN. More...
 
mask_float64x2 isnan2 (float64x2 a, float64x2 b)
 Checks whether corresponding elements in either a or b are IEEE754 NaN. More...
 
mask_float64x4 isnan2 (float64x4 a, float64x4 b)
 Checks whether corresponding elements in either a or b are IEEE754 NaN. More...
 
float32x4 rcp_e (float32x4 a)
 Computes approximate reciprocal. More...
 
float32x8 rcp_e (float32x8 a)
 Computes approximate reciprocal. More...
 
float32x4 rcp_rh (float32x4 x, float32x4 a)
 Computes one Newton-Raphson iteration for reciprocal. More...
 
float32x8 rcp_rh (float32x8 x, float32x8 a)
 Computes one Newton-Raphson iteration for reciprocal. More...
 
float32x4 div (float32x4 a, float32x4 b)
 Divides the values of two vectors. More...
 
float32x8 div (float32x8 a, float32x8 b)
 Divides the values of two vectors. More...
 
float64x2 div (float64x2 a, float64x2 b)
 Divides the values of two vectors. More...
 
float64x4 div (float64x4 a, float64x4 b)
 Divides the values of two vectors. More...
 
float32x4 rsqrt_e (float32x4 a)
 Computes approximate reciprocal square root. More...
 
float32x8 rsqrt_e (float32x8 a)
 Computes approximate reciprocal square root. More...
 
float32x4 rsqrt_rh (float32x4 x, float32x4 a)
 Computes one Newton-Raphson iteration for inverse of square root. More...
 
float32x8 rsqrt_rh (float32x8 x, float32x8 a)
 Computes one Newton-Raphson iteration for inverse of square root. More...
 
float32x4 sqrt (float32x4 a)
 Computes square root. More...
 
float32x8 sqrt (float32x8 a)
 Computes square root. More...
 
float64x2 sqrt (float64x2 a)
 Computes square root. More...
 
float64x4 sqrt (float64x4 a)
 Computes square root. More...
 
float32x4 min (float32x4 a, float32x4 b)
 Computes minimum of the values in two vectors. More...
 
float32x8 min (float32x8 a, float32x8 b)
 Computes minimum of the values in two vectors. More...
 
float64x2 min (float64x2 a, float64x2 b)
 Computes minima of the values in two vectors. More...
 
float64x4 min (float64x4 a, float64x4 b)
 Computes minima of the values in two vectors. More...
 
float32x4 max (float32x4 a, float32x4 b)
 Computes maxima of the values of two vectors. More...
 
float32x8 max (float32x8 a, float32x8 b)
 Computes maxima of the values of two vectors. More...
 
float64x2 max (float64x2 a, float64x2 b)
 Computes maxima of the values of two vectors. More...
 
float64x4 max (float64x4 a, float64x4 b)
 Computes maxima of the values of two vectors. More...
 
float32x4 floor (float32x4 a)
 Rounds the values of a vector towards negative infinity. More...
 
float32x8 floor (float32x8 a)
 Rounds the values of a vector towards negative infinity. More...
 
float32x4 ceil (float32x4 a)
 Rounds the values of a vector towards positive infinity. More...
 
float32x8 ceil (float32x8 a)
 Rounds the values of a vector towards positive infinity. More...
 
float32x4 trunc (float32x4 a)
 Rounds the values of a vector towards zero. More...
 
float32x8 trunc (float32x8 a)
 Rounds the values of a vector towards zero. More...
 
float64x2 abs (float64x2 a)
 Computes absolute value of floating point values. More...
 
float64x4 abs (float64x4 a)
 Computes absolute value of floating point values. More...
 
float32x4 sign (float32x4 a)
 Extracts sign bits from the values in float32x4 vector. More...
 
float32x8 sign (float32x8 a)
 Extracts sign bits from the values in float32x4 vector. More...
 
float64x2 sign (float64x2 a)
 Extracts sign bit from the values in float64x2 vector. More...
 
float64x4 sign (float64x4 a)
 Extracts sign bit from the values in float64x2 vector. More...
 
float32x4 add (float32x4 a, float32x4 b)
 Adds the values of two vectors. More...
 
float32x8 add (float32x8 a, float32x8 b)
 Adds the values of two vectors. More...
 
float64x2 add (float64x2 a, float64x2 b)
 Adds the values of two vectors. More...
 
float64x4 add (float64x4 a, float64x4 b)
 Adds the values of two vectors. More...
 
float32x4 sub (float32x4 a, float32x4 b)
 Subtracts the values of two vectors. More...
 
float32x8 sub (float32x8 a, float32x8 b)
 Subtracts the values of two vectors. More...
 
float64x2 sub (float64x2 a, float64x2 b)
 Subtracts the values of two vectors. More...
 
float64x4 sub (float64x4 a, float64x4 b)
 Subtracts the values of two vectors. More...
 
float32x4 neg (float32x4 a)
 Negates the values of a float32x4 vector. More...
 
float32x8 neg (float32x8 a)
 Negates the values of a float32x4 vector. More...
 
float64x2 neg (float64x2 a)
 Negates the values of a vector. More...
 
float64x4 neg (float64x4 a)
 Negates the values of a vector. More...
 
float32x4 mul (float32x4 a, float32x4 b)
 Multiplies the values of two vectors. More...
 
float32x8 mul (float32x8 a, float32x8 b)
 Multiplies the values of two vectors. More...
 
float64x2 mul (float64x2 a, float64x2 b)
 Multiplies the values of two vectors. More...
 
float64x4 mul (float64x4 a, float64x4 b)
 Multiplies the values of two vectors. More...
 
float32x4 fmadd (float32x4 a, float32x4 b, float32x4 c)
 Performs a fused multiply-add operation. More...
 
float32x8 fmadd (float32x8 a, float32x8 b, float32x8 c)
 Performs a fused multiply-add operation. More...
 
float64x2 fmadd (float64x2 a, float64x2 b, float64x2 c)
 Performs a fused multiply-add operation. More...
 
float64x4 fmadd (float64x4 a, float64x4 b, float64x4 c)
 Performs a fused multiply-add operation. More...
 
float32x4 fmsub (float32x4 a, float32x4 b, float32x4 c)
 Performs a fused multiply-subtract operation. More...
 
float32x8 fmsub (float32x8 a, float32x8 b, float32x8 c)
 Performs a fused multiply-subtract operation. More...
 
float64x2 fmsub (float64x2 a, float64x2 b, float64x2 c)
 Performs a fused multiply-subtract operation. More...
 
float64x4 fmsub (float64x4 a, float64x4 b, float64x4 c)
 Performs a fused multiply-subtract operation. More...
 
int8x16 min (int8x16 a, int8x16 b)
 Computes minimum of signed 8-bit values. More...
 
int8x32 min (int8x32 a, int8x32 b)
 Computes minimum of signed 8-bit values. More...
 
uint8x16 min (uint8x16 a, uint8x16 b)
 Computes minimum of the unsigned 8-bit values. More...
 
uint8x32 min (uint8x32 a, uint8x32 b)
 Computes minimum of the unsigned 8-bit values. More...
 
int16x8 min (int16x8 a, int16x8 b)
 Computes minimum of the signed 16-bit values. More...
 
int16x16 min (int16x16 a, int16x16 b)
 Computes minimum of the signed 16-bit values. More...
 
uint16x8 min (uint16x8 a, uint16x8 b)
 Computes minimum of the unsigned 16-bit values. More...
 
uint16x16 min (uint16x16 a, uint16x16 b)
 Computes minimum of the unsigned 16-bit values. More...
 
int32x4 min (int32x4 a, int32x4 b)
 Computes minimum of the signed 32-bit values. More...
 
int32x8 min (int32x8 a, int32x8 b)
 Computes minimum of the signed 32-bit values. More...
 
uint32x4 min (uint32x4 a, uint32x4 b)
 Computes minimum of the unsigned 32-bit values. More...
 
uint32x8 min (uint32x8 a, uint32x8 b)
 Computes minimum of the unsigned 32-bit values. More...
 
int8x16 max (int8x16 a, int8x16 b)
 Computes maximum of the signed 8-bit values. More...
 
int8x32 max (int8x32 a, int8x32 b)
 Computes maximum of the signed 8-bit values. More...
 
uint8x16 max (uint8x16 a, uint8x16 b)
 Computes maximum of the unsigned 8-bit values. More...
 
uint8x32 max (uint8x32 a, uint8x32 b)
 Computes maximum of the unsigned 8-bit values. More...
 
int16x8 max (int16x8 a, int16x8 b)
 Computes maximum of the signed 16-bit values. More...
 
int16x16 max (int16x16 a, int16x16 b)
 Computes maximum of the signed 16-bit values. More...
 
uint16x8 max (uint16x8 a, uint16x8 b)
 Computes maximum of the unsigned 16-bit values. More...
 
uint16x16 max (uint16x16 a, uint16x16 b)
 Computes maximum of the unsigned 16-bit values. More...
 
int32x4 max (int32x4 a, int32x4 b)
 Computes maximum of the signed 32-bit values. More...
 
int32x8 max (int32x8 a, int32x8 b)
 Computes maximum of the signed 32-bit values. More...
 
uint32x4 max (uint32x4 a, uint32x4 b)
 Computes maximum of the unsigned 32-bit values. More...
 
uint32x8 max (uint32x8 a, uint32x8 b)
 Computes maximum of the unsigned 32-bit values. More...
 
uint8x16 avg (uint8x16 a, uint8x16 b)
 Computes rounded average of the unsigned 8-bit values. More...
 
uint8x32 avg (uint8x32 a, uint8x32 b)
 Computes rounded average of the unsigned 8-bit values. More...
 
int8x16 avg (int8x16 a, int8x16 b)
 Computes rounded average of signed 8-bit values. More...
 
int8x32 avg (int8x32 a, int8x32 b)
 Computes rounded average of signed 8-bit values. More...
 
uint16x8 avg (uint16x8 a, uint16x8 b)
 Computes rounded average of unsigned 16-bit values. More...
 
uint16x16 avg (uint16x16 a, uint16x16 b)
 Computes rounded average of unsigned 16-bit values. More...
 
int16x8 avg (int16x8 a, int16x8 b)
 Computes rounded average of signed 16-bit values. More...
 
int16x16 avg (int16x16 a, int16x16 b)
 Computes rounded average of signed 16-bit values. More...
 
uint32x4 avg (uint32x4 a, uint32x4 b)
 Computes rounded average of unsigned 32-bit values. More...
 
uint32x8 avg (uint32x8 a, uint32x8 b)
 Computes rounded average of unsigned 32-bit values. More...
 
int32x4 avg (int32x4 a, int32x4 b)
 Computes rounded average of signed 32-bit values. More...
 
int32x8 avg (int32x8 a, int32x8 b)
 Computes rounded average of signed 32-bit values. More...
 
uint8x16 avg_trunc (uint8x16 a, uint8x16 b)
 Computes truncated average of the unsigned 8-bit values. More...
 
uint8x32 avg_trunc (uint8x32 a, uint8x32 b)
 Computes truncated average of the unsigned 8-bit values. More...
 
int8x16 avg_trunc (int8x16 a, int8x16 b)
 Computes truncated average of signed 8-bit values. More...
 
int8x32 avg_trunc (int8x32 a, int8x32 b)
 Computes truncated average of signed 8-bit values. More...
 
uint16x8 avg_trunc (uint16x8 a, uint16x8 b)
 Computes truncated average of unsigned 16-bit values. More...
 
uint16x16 avg_trunc (uint16x16 a, uint16x16 b)
 Computes truncated average of unsigned 16-bit values. More...
 
int16x8 avg_trunc (int16x8 a, int16x8 b)
 Computes truncated average of signed 16-bit values. More...
 
int16x16 avg_trunc (int16x16 a, int16x16 b)
 Computes truncated average of signed 16-bit values. More...
 
uint32x4 avg_trunc (uint32x4 a, uint32x4 b)
 Computes truncated average of unsigned 32-bit values. More...
 
uint32x8 avg_trunc (uint32x8 a, uint32x8 b)
 Computes truncated average of unsigned 32-bit values. More...
 
int32x4 avg_trunc (int32x4 a, int32x4 b)
 Computes truncated average of signed 32-bit values. More...
 
int32x8 avg_trunc (int32x8 a, int32x8 b)
 Computes truncated average of signed 32-bit values. More...
 
uint8x16 abs (int8x16 a)
 Computes absolute value of 8-bit integer values. More...
 
uint8x32 abs (int8x32 a)
 Computes absolute value of 8-bit integer values. More...
 
uint16x8 abs (int16x8 a)
 Computes absolute value of 16-bit integer values. More...
 
uint16x16 abs (int16x16 a)
 Computes absolute value of 16-bit integer values. More...
 
uint32x4 abs (int32x4 a)
 Computes absolute value of 32-bit integer values. More...
 
uint32x8 abs (int32x8 a)
 Computes absolute value of 32-bit integer values. More...
 
uint64x2 abs (int64x2 a)
 Computes absolute value of 64-bit integer values. More...
 
uint64x4 abs (int64x4 a)
 Computes absolute value of 64-bit integer values. More...
 
template<unsigned P>
uint8x16 div_p (uint8x16 num, uint8x16 den)
 Divides one 8-bit unsigned number by another. More...
 
template<unsigned P>
uint16x8 div_p (uint16x8 num, uint16x8 den)
 Divides one 16-bit unsigned number by another. More...
 
basic_int16x8 add (basic_int16x8 a, basic_int16x8 b)
 Adds 16-bit integer values. More...
 
basic_int16x16 add (basic_int16x16 a, basic_int16x16 b)
 Adds 16-bit integer values. More...
 
basic_int32x4 add (basic_int32x4 a, basic_int32x4 b)
 Adds 32-bit integer values. More...
 
basic_int32x8 add (basic_int32x8 a, basic_int32x8 b)
 Adds 32-bit integer values. More...
 
basic_int64x2 add (basic_int64x2 a, basic_int64x2 b)
 Adds 64-bit integer values. More...
 
basic_int64x4 add (basic_int64x4 a, basic_int64x4 b)
 Adds 64-bit integer values. More...
 
int8x16 adds (int8x16 a, int8x16 b)
 Adds and saturates signed 8-bit integer values. More...
 
int8x32 adds (int8x32 a, int8x32 b)
 Adds and saturates signed 8-bit integer values. More...
 
int16x8 adds (int16x8 a, int16x8 b)
 Adds and saturates signed 16-bit integer values. More...
 
int16x16 adds (int16x16 a, int16x16 b)
 Adds and saturates signed 16-bit integer values. More...
 
uint8x16 adds (uint8x16 a, uint8x16 b)
 Adds and saturates unsigned 8-bit integer values. More...
 
uint8x32 adds (uint8x32 a, uint8x32 b)
 Adds and saturates unsigned 8-bit integer values. More...
 
uint16x8 adds (uint16x8 a, uint16x8 b)
 Adds and saturates unsigned 16-bit integer values. More...
 
uint16x16 adds (uint16x16 a, uint16x16 b)
 Adds and saturates unsigned 16-bit integer values. More...
 
basic_int8x16 sub (basic_int8x16 a, basic_int8x16 b)
 Subtracts 8-bit integer values. More...
 
basic_int8x32 sub (basic_int8x32 a, basic_int8x32 b)
 Subtracts 8-bit integer values. More...
 
basic_int16x8 sub (basic_int16x8 a, basic_int16x8 b)
 Subtracts 16-bit integer values. More...
 
basic_int16x16 sub (basic_int16x16 a, basic_int16x16 b)
 Subtracts 16-bit integer values. More...
 
basic_int32x4 sub (basic_int32x4 a, basic_int32x4 b)
 Subtracts 32-bit integer values. More...
 
basic_int32x8 sub (basic_int32x8 a, basic_int32x8 b)
 Subtracts 32-bit integer values. More...
 
basic_int64x2 sub (basic_int64x2 a, basic_int64x2 b)
 Subtracts 64-bit integer values. More...
 
basic_int64x4 sub (basic_int64x4 a, basic_int64x4 b)
 Subtracts 64-bit integer values. More...
 
int8x16 subs (int8x16 a, int8x16 b)
 Subtracts and saturates signed 8-bit integer values. More...
 
int8x32 subs (int8x32 a, int8x32 b)
 Subtracts and saturates signed 8-bit integer values. More...
 
int16x8 subs (int16x8 a, int16x8 b)
 Subtracts and saturates signed 16-bit integer values. More...
 
int16x16 subs (int16x16 a, int16x16 b)
 Subtracts and saturates signed 16-bit integer values. More...
 
uint8x16 subs (uint8x16 a, uint8x16 b)
 Subtracts and saturates unsigned 8-bit integer values. More...
 
uint8x32 subs (uint8x32 a, uint8x32 b)
 Subtracts and saturates unsigned 8-bit integer values. More...
 
uint16x8 subs (uint16x8 a, uint16x8 b)
 Subtracts and saturates unsigned 16-bit integer values. More...
 
uint16x16 subs (uint16x16 a, uint16x16 b)
 Subtracts and saturates unsigned 16-bit integer values. More...
 
int8x16 neg (int8x16 a)
 Negates signed 8-bit values. More...
 
int8x32 neg (int8x32 a)
 Negates signed 8-bit values. More...
 
int16x8 neg (int16x8 a)
 Negates signed 16-bit values. More...
 
int16x16 neg (int16x16 a)
 Negates signed 16-bit values. More...
 
int32x4 neg (int32x4 a)
 Negates signed 32-bit values. More...
 
int32x8 neg (int32x8 a)
 Negates signed 32-bit values. More...
 
int64x2 neg (int64x2 a)
 Negates signed 64-bit values. More...
 
int64x4 neg (int64x4 a)
 Negates signed 64-bit values. More...
 
basic_int16x8 mul_lo (basic_int16x8 a, basic_int16x8 b)
 Multiplies 16-bit values and returns the lower part of the multiplication. More...
 
basic_int16x16 mul_lo (basic_int16x16 a, basic_int16x16 b)
 Multiplies 16-bit values and returns the lower part of the multiplication. More...
 
int16x8 mul_hi (int16x8 a, int16x8 b)
 Multiplies signed 16-bit values and returns the higher half of the result. More...
 
int16x16 mul_hi (int16x16 a, int16x16 b)
 Multiplies signed 16-bit values and returns the higher half of the result. More...
 
uint16x8 mul_hi (uint16x8 a, uint16x8 b)
 Multiplies unsigned 16-bit values and returns the higher half of the result. More...
 
uint16x16 mul_hi (uint16x16 a, uint16x16 b)
 Multiplies unsigned 16-bit values and returns the higher half of the result. More...
 
int128 mul_lo (basic_int32x4 a, basic_int32x4 b)
 Multiplies 32-bit values and returns the lower half of the result. More...
 
basic_int32x8 mul_lo (basic_int32x8 a, basic_int32x8 b)
 Multiplies 32-bit values and returns the lower half of the result. More...
 
int32x4 mull_lo (int16x8 a, int16x8 b)
 Multiplies signed 16-bit values in the lower halves of the vectors and expands the results to 32 bits. More...
 
int32x8 mull_lo (int16x16 a, int16x16 b)
 Multiplies signed 16-bit values in the lower halves of the vectors and expands the results to 32 bits. More...
 
uint32x4 mull_lo (uint16x8 a, uint16x8 b)
 Multiplies unsigned 16-bit values in the lower halves of the vectors and expands the results to 32 bits. More...
 
uint32x8 mull_lo (uint16x16 a, uint16x16 b)
 Multiplies unsigned 16-bit values in the lower halves of the vectors and expands the results to 32 bits. More...
 
int32x4 mull_hi (int16x8 a, int16x8 b)
 Multiplies signed 16-bit values in the higher halves of the vectors and expands the results to 32 bits. More...
 
int32x8 mull_hi (int16x16 a, int16x16 b)
 Multiplies signed 16-bit values in the higher halves of the vectors and expands the results to 32 bits. More...
 
uint32x4 mull_hi (uint16x8 a, uint16x8 b)
 Multiplies unsigned 16-bit values in the higher halves of the vectors and expands the results to 32 bits. More...
 
uint32x8 mull_hi (uint16x16 a, uint16x16 b)
 Multiplies unsigned 16-bit values in the higher halves of the vectors and expands the results to 32 bits. More...
 
int64x2 mull_lo (int32x4 a, int32x4 b)
 Multiplies signed 32-bit values in the lower halves of the vectors and expands the results to 64 bits. More...
 
int64x4 mull_lo (int32x8 a, int32x8 b)
 Multiplies signed 32-bit values in the lower halves of the vectors and expands the results to 64 bits. More...
 
uint64x2 mull_lo (uint32x4 a, uint32x4 b)
 Multiplies unsigned 32-bit values in the lower halves of the vectors and expands the results to 64 bits. More...
 
uint64x4 mull_lo (uint32x8 a, uint32x8 b)
 Multiplies unsigned 32-bit values in the lower halves of the vectors and expands the results to 64 bits. More...
 
int64x2 mull_hi (int32x4 a, int32x4 b)
 Multiplies signed 32-bit values in the higher halves of the vectors and expands the results to 64 bits. More...
 
int64x4 mull_hi (int32x8 a, int32x8 b)
 Multiplies signed 32-bit values in the higher halves of the vectors and expands the results to 64 bits. More...
 
uint64x2 mull_hi (uint32x4 a, uint32x4 b)
 Multiplies unsigned 32-bit values in the higher halves of the vectors and expands the results to 64 bits. More...
 
uint64x4 mull_hi (uint32x8 a, uint32x8 b)
 Multiplies unsigned 32-bit values in the higher halves of the vectors and expands the results to 64 bits. More...
 
uint8x16 shift_r (uint8x16 a, unsigned count)
 Shifts unsigned 8-bit values right by count bits while shifting in zeros. More...
 
uint8x32 shift_r (uint8x32 a, unsigned count)
 Shifts unsigned 8-bit values right by count bits while shifting in zeros. More...
 
int16x8 shift_r (int16x8 a, unsigned count)
 Shifts signed 16-bit values right by count bits while shifting in the sign bit. More...
 
int16x16 shift_r (int16x16 a, unsigned count)
 Shifts signed 16-bit values right by count bits while shifting in the sign bit. More...
 
uint16x8 shift_r (uint16x8 a, unsigned count)
 Shifts unsigned 16-bit values right by count bits while shifting in zeros. More...
 
uint16x16 shift_r (uint16x16 a, unsigned count)
 Shifts unsigned 16-bit values right by count bits while shifting in zeros. More...
 
int32x4 shift_r (int32x4 a, unsigned count)
 Shifts signed 32-bit values right by count bits while shifting in the sign bit. More...
 
int32x8 shift_r (int32x8 a, unsigned count)
 Shifts signed 32-bit values right by count bits while shifting in the sign bit. More...
 
uint32x4 shift_r (uint32x4 a, unsigned count)
 Shifts unsigned 32-bit values right by count bits while shifting in zeros. More...
 
uint32x8 shift_r (uint32x8 a, unsigned count)
 Shifts unsigned 32-bit values right by count bits while shifting in zeros. More...
 
int64x2 shift_r (int64x2 a, unsigned count)
 Shifts signed 64-bit values right by count bits while shifting in the sign bit. More...
 
int64x4 shift_r (int64x4 a, unsigned count)
 Shifts signed 64-bit values right by count bits while shifting in the sign bit. More...
 
uint64x2 shift_r (uint64x2 a, unsigned count)
 Shifts unsigned 64-bit values right by count bits while shifting in zeros. More...
 
uint64x4 shift_r (uint64x4 a, unsigned count)
 Shifts unsigned 64-bit values right by count bits while shifting in zeros. More...
 
basic_int8x16 shift_l (basic_int8x16 a, unsigned count)
 Shifts 8-bit values left by count bits while shifting in zeros. More...
 
basic_int8x32 shift_l (basic_int8x32 a, unsigned count)
 Shifts 8-bit values left by count bits while shifting in zeros. More...
 
basic_int16x8 shift_l (basic_int16x8 a, unsigned count)
 Shifts 16-bit values left by count bits while shifting in zeros. More...
 
basic_int16x16 shift_l (basic_int16x16 a, unsigned count)
 Shifts 16-bit values left by count bits while shifting in zeros. More...
 
basic_int32x4 shift_l (basic_int32x4 a, unsigned count)
 Shifts 32-bit values left by count bits while shifting in zeros. More...
 
basic_int32x8 shift_l (basic_int32x8 a, unsigned count)
 Shifts 32-bit values left by count bits while shifting in zeros. More...
 
basic_int64x2 shift_l (basic_int64x2 a, unsigned count)
 Shifts 64-bit values left by count bits while shifting in zeros. More...
 
basic_int64x4 shift_l (basic_int64x4 a, unsigned count)
 Shifts 64-bit values left by count bits while shifting in zeros. More...
 
template<unsigned count>
int8x16 shift_r (int8x16 a)
 Shifts signed 8-bit values right by count bits while shifting in the sign bit. More...
 
template<unsigned count>
int8x32 shift_r (int8x32 a)
 Shifts signed 8-bit values right by count bits while shifting in the sign bit. More...
 
template<unsigned count>
uint8x16 shift_r (uint8x16 a)
 Shifts unsigned 8-bit values right by count bits while shifting in zeros. More...
 
template<unsigned count>
uint8x32 shift_r (uint8x32 a)
 Shifts unsigned 8-bit values right by count bits while shifting in zeros. More...
 
template<unsigned count>
int16x8 shift_r (int16x8 a)
 Shifts signed 16-bit values right by count bits while shifting in the sign bit. More...
 
template<unsigned count>
int16x16 shift_r (int16x16 a)
 Shifts signed 16-bit values right by count bits while shifting in the sign bit. More...
 
template<unsigned count>
uint16x8 shift_r (uint16x8 a)
 Shifts unsigned 16-bit values right by count bits while shifting in zeros. More...
 
template<unsigned count>
uint16x16 shift_r (uint16x16 a)
 Shifts unsigned 16-bit values right by count bits while shifting in zeros. More...
 
template<unsigned count>
int32x4 shift_r (int32x4 a)
 Shifts signed 32-bit values right by count bits while shifting in the sign bit. More...
 
template<unsigned count>
int32x8 shift_r (int32x8 a)
 Shifts signed 32-bit values right by count bits while shifting in the sign bit. More...
 
template<unsigned count>
uint32x4 shift_r (uint32x4 a)
 Shifts unsigned 32-bit values right by count bits while shifting in zeros. More...
 
template<unsigned count>
uint32x8 shift_r (uint32x8 a)
 Shifts unsigned 32-bit values right by count bits while shifting in zeros. More...
 
template<unsigned count>
int64x2 shift_r (int64x2 a)
 Shifts signed 64-bit values right by count bits while shifting in the sign bit. More...
 
template<unsigned count>
int64x4 shift_r (int64x4 a)
 Shifts signed 64-bit values right by count bits while shifting in the sign bit. More...
 
template<unsigned count>
uint64x2 shift_r (uint64x2 a)
 Shifts unsigned 64-bit values right by count bits while shifting in zeros. More...
 
template<unsigned count>
uint64x4 shift_r (uint64x4 a)
 Shifts unsigned 64-bit values right by count bits while shifting in zeros. More...
 
template<unsigned count>
basic_int8x16 shift_l (basic_int8x16 a)
 Shifts 8-bit values left by count bits while shifting in zeros. More...
 
template<unsigned count>
basic_int8x32 shift_l (basic_int8x32 a)
 Shifts 8-bit values left by count bits while shifting in zeros. More...
 
template<unsigned count>
basic_int16x8 shift_l (basic_int16x8 a)
 Shifts 16-bit values left by count bits while shifting in zeros. More...
 
template<unsigned count>
basic_int16x16 shift_l (basic_int16x16 a)
 Shifts 16-bit values left by count bits while shifting in zeros. More...
 
template<unsigned count>
basic_int32x4 shift_l (basic_int32x4 a)
 Shifts 32-bit values left by count bits while shifting in zeros. More...
 
template<unsigned count>
basic_int32x8 shift_l (basic_int32x8 a)
 Shifts 32-bit values left by count bits while shifting in zeros. More...
 
template<unsigned count>
basic_int64x2 shift_l (basic_int64x2 a)
 Shifts 64-bit values left by count bits while shifting in zeros. More...
 
template<unsigned count>
basic_int64x4 shift_l (basic_int64x4 a)
 Shifts 64-bit values left by count bits while shifting in zeros. More...
 
basic_int8x16 load_u (basic_int8x16 &a, const void *p)
 Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...
 
basic_int16x8 load_u (basic_int16x8 &a, const void *p)
 Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...
 
basic_int32x4 load_u (basic_int32x4 &a, const void *p)
 Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...
 
basic_int64x2 load_u (basic_int64x2 &a, const void *p)
 Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...
 
float32x4 load_u (float32x4 &a, const float *p)
 Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...
 
float64x2 load_u (float64x2 &a, const double *p)
 Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...
 
basic_int8x32 load_u (basic_int8x32 &a, const void *p)
 Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...
 
basic_int16x16 load_u (basic_int16x16 &a, const void *p)
 Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...
 
basic_int32x8 load_u (basic_int32x8 &a, const void *p)
 Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...
 
basic_int64x4 load_u (basic_int64x4 &a, const void *p)
 Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...
 
float32x8 load_u (float32x8 &a, const float *p)
 Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...
 
float64x4 load_u (float64x4 &a, const double *p)
 Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...
 
void load_packed2 (basic_int8x16 &a, basic_int8x16 &b, const void *p)
 Loads 8-bit values packed in pairs, de-interleaves them and stores the result into two vectors. More...
 
void load_packed2 (basic_int8x32 &a, basic_int8x32 &b, const void *p)
 Loads 8-bit values packed in pairs, de-interleaves them and stores the result into two vectors. More...
 
void load_packed2 (basic_int16x8 &a, basic_int16x8 &b, const void *p)
 Loads 16-bit values packed in pairs, de-interleaves them and stores the result into two vectors. More...
 
void load_packed2 (basic_int16x16 &a, basic_int16x16 &b, const void *p)
 Loads 16-bit values packed in pairs, de-interleaves them and stores the result into two vectors. More...
 
void load_packed2 (basic_int32x4 &a, basic_int32x4 &b, const void *p)
 Loads 32-bit values packed in pairs, de-interleaves them and stores the result into two vectors. More...
 
void load_packed2 (basic_int32x8 &a, basic_int32x8 &b, const void *p)
 Loads 32-bit values packed in pairs, de-interleaves them and stores the result into two vectors. More...
 
void load_packed2 (basic_int64x2 &a, basic_int64x2 &b, const void *p)
 Loads 64-bit values packed in pairs, de-interleaves them and stores the result into two vectors. More...
 
void load_packed2 (basic_int64x4 &a, basic_int64x4 &b, const void *p)
 Loads 64-bit values packed in pairs, de-interleaves them and stores the result into two vectors. More...
 
void load_packed2 (float64x2 &a, float64x2 &b, const double *p)
 Loads 64-bit float values packed in pairs, de-interleaves them and stores the result into two vectors. More...
 
void load_packed2 (float64x4 &a, float64x4 &b, const double *p)
 Loads 64-bit float values packed in pairs, de-interleaves them and stores the result into two vectors. More...
 
void load_packed3 (basic_int8x16 &a, basic_int8x16 &b, basic_int8x16 &c, const void *p)
 Loads 8-bit values packed in triplets, de-interleaves them and stores the result into three vectors. More...
 
void load_packed3 (basic_int8x32 &a, basic_int8x32 &b, basic_int8x32 &c, const void *p)
 Loads 8-bit values packed in triplets, de-interleaves them and stores the result into three vectors. More...
 
void load_packed3 (basic_int16x8 &a, basic_int16x8 &b, basic_int16x8 &c, const void *p)
 Loads 16-bit values packed in triplets, de-interleaves them and stores the result into three vectors. More...
 
void load_packed3 (basic_int16x16 &a, basic_int16x16 &b, basic_int16x16 &c, const void *p)
 Loads 16-bit values packed in triplets, de-interleaves them and stores the result into three vectors. More...
 
void load_packed3 (basic_int32x4 &a, basic_int32x4 &b, basic_int32x4 &c, const void *p)
 Loads 32-bit values packed in triplets, de-interleaves them and stores the result into three vectors. More...
 
void load_packed3 (basic_int32x8 &a, basic_int32x8 &b, basic_int32x8 &c, const void *p)
 Loads 32-bit values packed in triplets, de-interleaves them and stores the result into three vectors. More...
 
void load_packed3 (basic_int64x2 &a, basic_int64x2 &b, basic_int64x2 &c, const void *p)
 Loads 64-bit values packed in triplets, de-interleaves them and stores the result into three vectors. More...
 
void load_packed3 (basic_int64x4 &a, basic_int64x4 &b, basic_int64x4 &c, const void *p)
 Loads 64-bit values packed in triplets, de-interleaves them and stores the result into three vectors. More...
 
void load_packed3 (float32x4 &a, float32x4 &b, float32x4 &c, const float *p)
 Loads 32-bit floating-point values packed in triplets, de-interleaves them and stores the result into three vectors. More...
 
void load_packed3 (float32x8 &a, float32x8 &b, float32x8 &c, const float *p)
 Loads 32-bit floating-point values packed in triplets, de-interleaves them and stores the result into three vectors. More...
 
void load_packed3 (float64x2 &a, float64x2 &b, float64x2 &c, const double *p)
 Loads 64-bit floating-point values packed in triplets, de-interleaves them and stores the result into three vectors. More...
 
void load_packed3 (float64x4 &a, float64x4 &b, float64x4 &c, const double *p)
 Loads 64-bit floating-point values packed in triplets, de-interleaves them and stores the result into three vectors. More...
 
void load_packed4 (basic_int8x16 &a, basic_int8x16 &b, basic_int8x16 &c, basic_int8x16 &d, const void *p)
 Loads 8-bit values packed in quartets, de-interleaves them and stores the result into four vectors. More...
 
void load_packed4 (basic_int8x32 &a, basic_int8x32 &b, basic_int8x32 &c, basic_int8x32 &d, const void *p)
 Loads 8-bit values packed in quartets, de-interleaves them and stores the result into four vectors. More...
 
void load_packed4 (basic_int16x8 &a, basic_int16x8 &b, basic_int16x8 &c, basic_int16x8 &d, const void *p)
 Loads 16-bit values packed in quartets, de-interleaves them and stores the result into four vectors. More...
 
void load_packed4 (basic_int16x16 &a, basic_int16x16 &b, basic_int16x16 &c, basic_int16x16 &d, const void *p)
 Loads 16-bit values packed in quartets, de-interleaves them and stores the result into four vectors. More...
 
void load_packed4 (basic_int32x4 &a, basic_int32x4 &b, basic_int32x4 &c, basic_int32x4 &d, const void *p)
 Loads 32-bit values packed in quartets, de-interleaves them and stores the result into four vectors. More...
 
void load_packed4 (basic_int32x8 &a, basic_int32x8 &b, basic_int32x8 &c, basic_int32x8 &d, const void *p)
 Loads 32-bit values packed in quartets, de-interleaves them and stores the result into four vectors. More...
 
void load_packed4 (basic_int64x2 &a, basic_int64x2 &b, basic_int64x2 &c, basic_int64x2 &d, const void *p)
 Loads 64-bit values packed in quartets, de-interleaves them and stores the result into four vectors. More...
 
void load_packed4 (basic_int64x4 &a, basic_int64x4 &b, basic_int64x4 &c, basic_int64x4 &d, const void *p)
 Loads 64-bit values packed in quartets, de-interleaves them and stores the result into four vectors. More...
 
void load_packed4 (float32x4 &a, float32x4 &b, float32x4 &c, float32x4 &d, const float *p)
 Loads 32-bit floating-point values packed in quartets, de-interleaves them and stores the result into four vectors. More...
 
void load_packed4 (float32x8 &a, float32x8 &b, float32x8 &c, float32x8 &d, const float *p)
 Loads 32-bit floating-point values packed in quartets, de-interleaves them and stores the result into four vectors. More...
 
void load_packed4 (float64x2 &a, float64x2 &b, float64x2 &c, float64x2 &d, const double *p)
 Loads 64-bit floating-point values packed in quartets, de-interleaves them and stores the result into four vectors. More...
 
void load_packed4 (float64x4 &a, float64x4 &b, float64x4 &c, float64x4 &d, const double *p)
 Loads 64-bit floating-point values packed in quartets, de-interleaves them and stores the result into four vectors. More...
 
void stream (void *p, int128 a)
 Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory without polluting the caches, if possible. More...
 
void stream (void *p, int256 a)
 Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory without polluting the caches, if possible. More...
 
void stream (float *p, float32x4 a)
 Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory without polluting the caches, if possible. More...
 
void stream (float *p, float32x8 a)
 Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory without polluting the caches, if possible. More...
 
void stream (double *p, float64x2 a)
 Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory without polluting the caches, if possible. More...
 
void stream (double *p, float64x4 a)
 Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory without polluting the caches, if possible. More...
 
void store_first (void *p, basic_int8x16 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_first (void *p, basic_int8x32 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_first (void *p, basic_int16x8 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_first (void *p, basic_int16x16 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_first (void *p, basic_int32x4 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_first (void *p, basic_int32x8 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_first (void *p, basic_int64x2 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_first (void *p, basic_int64x4 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_first (float *p, float32x4 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_first (float *p, float32x8 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_first (double *p, float64x2 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_first (double *p, float64x4 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_last (void *p, basic_int8x16 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_last (void *p, basic_int8x32 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_last (void *p, basic_int16x8 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_last (void *p, basic_int16x16 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_last (void *p, basic_int32x4 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_last (void *p, basic_int32x8 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_last (void *p, basic_int64x2 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_last (void *p, basic_int64x4 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_last (float *p, float32x4 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_last (float *p, float32x8 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_last (double *p, float64x2 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_last (double *p, float64x4 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...
 
void store_packed2 (void *p, basic_int8x16 a, basic_int8x16 b)
 Interleaves 8-bit values from two vectors and stores the result into successive locations starting from p. More...
 
void store_packed2 (void *p, basic_int8x32 a, basic_int8x32 b)
 Interleaves 8-bit values from two vectors and stores the result into successive locations starting from p. More...
 
void store_packed2 (void *p, basic_int16x8 a, basic_int16x8 b)
 Interleaves 16-bit values from two vectors and stores the result into successive locations starting from p. More...
 
void store_packed2 (void *p, basic_int16x16 a, basic_int16x16 b)
 Interleaves 16-bit values from two vectors and stores the result into successive locations starting from p. More...
 
void store_packed2 (void *p, basic_int32x4 a, basic_int32x4 b)
 Interleaves 32-bit values from two vectors and stores the result into successive locations starting from p. More...
 
void store_packed2 (void *p, basic_int32x8 a, basic_int32x8 b)
 Interleaves 32-bit values from two vectors and stores the result into successive locations starting from p. More...
 
void store_packed2 (void *p, basic_int64x2 a, basic_int64x2 b)
 Interleaves 64-bit values from two vectors and stores the result into successive locations starting from p. More...
 
void store_packed2 (void *p, basic_int64x4 a, basic_int64x4 b)
 Interleaves 64-bit values from two vectors and stores the result into successive locations starting from p. More...
 
void store_packed2 (float *p, float32x4 a, float32x4 b)
 Interleaves 32-bit floating-point values from two vectors and stores the result into successive locations starting from p. More...
 
void store_packed2 (float *p, float32x8 a, float32x8 b)
 Interleaves 32-bit floating-point values from two vectors and stores the result into successive locations starting from p. More...
 
void store_packed2 (double *p, float64x2 a, float64x2 b)
 Interleaves 64-bit floating-point values from two vectors and stores the result into successive locations starting from p. More...
 
void store_packed2 (double *p, float64x4 a, float64x4 b)
 Interleaves 64-bit floating-point values from two vectors and stores the result into successive locations starting from p. More...
 
void store_packed3 (void *p, basic_int8x16 a, basic_int8x16 b, basic_int8x16 c)
 Interleaves 8-bit values from three vectors and stores the result into successive locations starting from p. More...
 
void store_packed3 (void *p, basic_int8x32 a, basic_int8x32 b, basic_int8x32 c)
 Interleaves 8-bit values from three vectors and stores the result into successive locations starting from p. More...
 
void store_packed3 (void *p, basic_int16x8 a, basic_int16x8 b, basic_int16x8 c)
 Interleaves 16-bit values from three vectors and stores the result into successive locations starting from p. More...
 
void store_packed3 (void *p, basic_int16x16 a, basic_int16x16 b, basic_int16x16 c)
 Interleaves 16-bit values from three vectors and stores the result into successive locations starting from p. More...
 
void store_packed3 (void *p, basic_int32x4 a, basic_int32x4 b, basic_int32x4 c)
 Interleaves 32-bit values from three vectors and stores the result into successive locations starting from p. More...
 
void store_packed3 (void *p, basic_int32x8 a, basic_int32x8 b, basic_int32x8 c)
 Interleaves 32-bit values from three vectors and stores the result into successive locations starting from p. More...
 
void store_packed3 (void *p, basic_int64x2 a, basic_int64x2 b, basic_int64x2 c)
 Interleaves 64-bit values from three vectors and stores the result into successive locations starting from p. More...
 
void store_packed3 (void *p, basic_int64x4 a, basic_int64x4 b, basic_int64x4 c)
 Interleaves 64-bit values from three vectors and stores the result into successive locations starting from p. More...
 
void store_packed3 (float *p, float32x4 a, float32x4 b, float32x4 c)
 Interleaves 32-bit floating-point values from three vectors and stores the result into successive locations starting from p. More...
 
void store_packed3 (float *p, float32x8 a, float32x8 b, float32x8 c)
 Interleaves 32-bit floating-point values from three vectors and stores the result into successive locations starting from p. More...
 
void store_packed3 (double *p, float64x2 a, float64x2 b, float64x2 c)
 Interleaves 64-bit floating-point values from three vectors and stores the result into successive locations starting from p. More...
 
void store_packed3 (double *p, float64x4 a, float64x4 b, float64x4 c)
 Interleaves 64-bit floating-point values from three vectors and stores the result into successive locations starting from p. More...
 
void store_packed4 (void *p, basic_int8x16 a, basic_int8x16 b, basic_int8x16 c, basic_int8x16 d)
 Interleaves 8-bit values from four vectors and stores the result into successive locations starting from p. More...
 
void store_packed4 (void *p, basic_int8x32 a, basic_int8x32 b, basic_int8x32 c, basic_int8x32 d)
 Interleaves 8-bit values from four vectors and stores the result into successive locations starting from p. More...
 
void store_packed4 (void *p, basic_int16x8 a, basic_int16x8 b, basic_int16x8 c, basic_int16x8 d)
 Interleaves 16-bit values from four vectors and stores the result into successive locations starting from p. More...
 
void store_packed4 (void *p, basic_int16x16 a, basic_int16x16 b, basic_int16x16 c, basic_int16x16 d)
 Interleaves 16-bit values from four vectors and stores the result into successive locations starting from p. More...
 
void store_packed4 (void *p, basic_int32x4 a, basic_int32x4 b, basic_int32x4 c, basic_int32x4 d)
 Interleaves 32-bit values from four vectors and stores the result into successive locations starting from p. More...
 
void store_packed4 (void *p, basic_int32x8 a, basic_int32x8 b, basic_int32x8 c, basic_int32x8 d)
 Interleaves 32-bit values from four vectors and stores the result into successive locations starting from p. More...
 
void store_packed4 (void *p, basic_int64x2 a, basic_int64x2 b, basic_int64x2 c, basic_int64x2 d)
 Interleaves 64-bit values from four vectors and stores the result into successive locations starting from p. More...
 
void store_packed4 (void *p, basic_int64x4 a, basic_int64x4 b, basic_int64x4 c, basic_int64x4 d)
 Interleaves 64-bit values from four vectors and stores the result into successive locations starting from p. More...
 
void store_packed4 (float *p, float32x4 a, float32x4 b, float32x4 c, float32x4 d)
 Interleaves 32-bit floating-point values from four vectors and stores the result into successive locations starting from p. More...
 
void store_packed4 (float *p, float32x8 a, float32x8 b, float32x8 c, float32x8 d)
 Interleaves 32-bit floating-point values from four vectors and stores the result into successive locations starting from p. More...
 
void store_packed4 (double *p, float64x2 a, float64x2 b, float64x2 c, float64x2 d)
 Interleaves 64-bit floating-point values from four vectors and stores the result into successive locations starting from p. More...
 
void store_packed4 (double *p, float64x4 a, float64x4 b, float64x4 c, float64x4 d)
 Interleaves 64-bit floating-point values from four vectors and stores the result into successive locations starting from p. More...
 
float32x4 zip_lo (float32x4 a, float32x4 b)
 Interleaves the lower halves of two vectors. More...
 
float32x8 zip_lo (float32x8 a, float32x8 b)
 Interleaves the lower halves of two vectors. More...
 
float64x2 zip_lo (float64x2 a, float64x2 b)
 Interleaves the lower halves of two vectors. More...
 
float64x4 zip_lo (float64x4 a, float64x4 b)
 Interleaves the lower halves of two vectors. More...
 
basic_int8x16 zip_hi (basic_int8x16 a, basic_int8x16 b)
 Interleaves the higher halves of two vectors. More...
 
basic_int8x32 zip_hi (basic_int8x32 a, basic_int8x32 b)
 Interleaves the higher halves of two vectors. More...
 
basic_int16x8 zip_hi (basic_int16x8 a, basic_int16x8 b)
 Interleaves the higher halves of two vectors. More...
 
basic_int16x16 zip_hi (basic_int16x16 a, basic_int16x16 b)
 Interleaves the higher halves of two vectors. More...
 
basic_int32x4 zip_hi (basic_int32x4 a, basic_int32x4 b)
 Interleaves the higher halves of two vectors. More...
 
basic_int32x8 zip_hi (basic_int32x8 a, basic_int32x8 b)
 Interleaves the higher halves of two vectors. More...
 
basic_int64x2 zip_hi (basic_int64x2 a, basic_int64x2 b)
 Interleaves the higher halves of two vectors. More...
 
basic_int64x4 zip_hi (basic_int64x4 a, basic_int64x4 b)
 Interleaves the higher halves of two vectors. More...
 
float32x4 zip_hi (float32x4 a, float32x4 b)
 Interleaves the higher halves of two vectors. More...
 
float32x8 zip_hi (float32x8 a, float32x8 b)
 Interleaves the higher halves of two vectors. More...
 
float64x2 zip_hi (float64x2 a, float64x2 b)
 Interleaves the higher halves of two vectors. More...
 
float64x4 zip_hi (float64x4 a, float64x4 b)
 Interleaves the higher halves of two vectors. More...
 
template<unsigned shift>
basic_int8x16 move_l (basic_int8x16 a)
 Moves the elements in an int8x16 vector to the left by shift positions. More...
 
template<unsigned shift>
basic_int8x32 move_l (basic_int8x32 a)
 Moves the 8-bit elements in a vector to the left by shift positions. More...
 
template<unsigned shift>
basic_int16x8 move_l (basic_int16x8 a)
 Moves the 16-bit elements in a vector to the left by shift positions. More...
 
template<unsigned shift>
basic_int16x16 move_l (basic_int16x16 a)
 Moves the 16-bit elements in a vector to the left by shift positions. More...
 
template<unsigned shift>
basic_int32x4 move_l (basic_int32x4 a)
 Moves the 32-bit elements in a vector to the left by shift positions. More...
 
template<unsigned shift>
basic_int32x8 move_l (basic_int32x8 a)
 Moves the 32-bit elements in a vector to the left by shift positions. More...
 
template<unsigned shift>
basic_int64x2 move_l (basic_int64x2 a)
 Moves the 64-bit elements in a vector to the left by shift positions. More...
 
template<unsigned shift>
basic_int64x4 move_l (basic_int64x4 a)
 Moves the 64-bit elements in a vector to the left by shift positions. More...
 
template<unsigned shift>
float32x4 move_l (float32x4 a)
 Moves the 32-bit elements in a vector to the left by shift positions. More...
 
template<unsigned shift>
float32x8 move_l (float32x8 a)
 Moves the 32-bit elements in a vector to the left by shift positions. More...
 
template<unsigned shift>
float64x2 move_l (float64x2 a)
 Moves the 64-bit elements in a vector to the left by shift positions. More...
 
template<unsigned shift>
float64x4 move_l (float64x4 a)
 Moves the 64-bit elements in a vector to the left by shift positions. More...
 
template<unsigned shift>
basic_int8x16 move_r (basic_int8x16 a)
 Moves the 8-bit elements in a vector to the right by shift positions. More...
 
template<unsigned shift>
basic_int8x32 move_r (basic_int8x32 a)
 Moves the 8-bit elements in a vector to the right by shift positions. More...
 
template<unsigned shift>
basic_int16x8 move_r (basic_int16x8 a)
 Moves the 16-bit elements in a vector to the right by shift positions. More...
 
template<unsigned shift>
basic_int16x16 move_r (basic_int16x16 a)
 Moves the 16-bit elements in a vector to the right by shift positions. More...
 
template<unsigned shift>
basic_int32x4 move_r (basic_int32x4 a)
 Moves the 32-bit elements in a vector to the right by shift positions. More...
 
template<unsigned shift>
basic_int32x8 move_r (basic_int32x8 a)
 Moves the 32-bit elements in a vector to the right by shift positions. More...
 
template<unsigned shift>
basic_int64x2 move_r (basic_int64x2 a)
 Moves the 64-bit elements in a vector to the right by shift positions. More...
 
template<unsigned shift>
basic_int64x4 move_r (basic_int64x4 a)
 Moves the 64-bit elements in a vector to the right by shift positions. More...
 
template<unsigned shift>
float32x4 move_r (float32x4 a)
 Moves the 32-bit elements in a vector to the right by shift positions. More...
 
template<unsigned shift>
float32x8 move_r (float32x8 a)
 Moves the 32-bit elements in a vector to the right by shift positions. More...
 
template<unsigned shift>
float64x2 move_r (float64x2 a)
 Moves the 64-bit elements in a vector to the right by shift positions. More...
 
template<unsigned shift>
float64x4 move_r (float64x4 a)
 Moves the 64-bit elements in a vector to the right by shift positions. More...
 
template<unsigned s>
basic_int8x16 broadcast (basic_int8x16 a)
 Broadcasts the specified 8-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s>
basic_int8x32 broadcast (basic_int8x32 a)
 Broadcasts the specified 8-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s>
basic_int16x8 broadcast (basic_int16x8 a)
 Broadcasts the specified 16-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s>
basic_int16x16 broadcast (basic_int16x16 a)
 Broadcasts the specified 16-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s>
basic_int32x4 broadcast (basic_int32x4 a)
 Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s>
basic_int32x8 broadcast (basic_int32x8 a)
 Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s>
basic_int64x2 broadcast (basic_int64x2 a)
 Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s>
basic_int64x4 broadcast (basic_int64x4 a)
 Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s>
float32x4 broadcast (float32x4 a)
 Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s>
float32x8 broadcast (float32x8 a)
 Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s>
float64x2 broadcast (float64x2 a)
 Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s>
float64x4 broadcast (float64x4 a)
 Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s>
basic_int8x16 broadcast_w (basic_int8x16 a)
 Broadcasts the specified 8-bit value to all elements within a 128-bit lane. More...
 
template<unsigned s>
basic_int8x32 broadcast_w (basic_int8x32 a)
 Broadcasts the specified 8-bit value to all elements within a 128-bit lane. More...
 
template<unsigned s>
basic_int16x8 broadcast_w (basic_int16x8 a)
 Broadcasts the specified 16-bit value to all elements within a int16x8 vector. More...
 
template<unsigned s>
basic_int16x16 broadcast_w (basic_int16x16 a)
 Broadcasts the specified 16-bit value to all elements within a int16x8 vector. More...
 
template<unsigned s>
basic_int32x4 broadcast_w (basic_int32x4 a)
 Broadcasts the specified 32-bit value to all elements within a int32x4 vector. More...
 
template<unsigned s>
basic_int32x8 broadcast_w (basic_int32x8 a)
 Broadcasts the specified 32-bit value to all elements within a int32x4 vector. More...
 
template<unsigned s>
basic_int64x2 broadcast_w (basic_int64x2 a)
 Broadcasts the specified 64-bit value to all elements within a int64x2 vector. More...
 
template<unsigned s>
basic_int64x4 broadcast_w (basic_int64x4 a)
 Broadcasts the specified 64-bit value to all elements within a int64x2 vector. More...
 
template<unsigned s>
float32x4 broadcast_w (float32x4 a)
 Broadcasts the specified 32-bit value to all elements within a float32x4 vector. More...
 
template<unsigned s>
float32x8 broadcast_w (float32x8 a)
 Broadcasts the specified 32-bit value to all elements within a float32x4 vector. More...
 
template<unsigned s>
float64x2 broadcast_w (float64x2 a)
 Broadcasts the specified 64-bit value to all elements within a float64x2 vector. More...
 
template<unsigned s>
float64x4 broadcast_w (float64x4 a)
 Broadcasts the specified 64-bit value to all elements within a float64x2 vector. More...
 
template<unsigned shift>
basic_int8x16 align (basic_int8x16 lower, basic_int8x16 upper)
 Extracts a int8x16 vector from two concatenated int8x16 vectors. More...
 
template<unsigned shift>
basic_int8x32 align (basic_int8x32 lower, basic_int8x32 upper)
 Extracts a int8x16 vector from two concatenated int8x16 vectors. More...
 
template<unsigned shift>
basic_int16x8 align (basic_int16x8 lower, basic_int16x8 upper)
 Extracts a int16x8 vector from two concatenated int16x8 vectors. More...
 
template<unsigned shift>
basic_int16x16 align (basic_int16x16 lower, basic_int16x16 upper)
 Extracts a int16x8 vector from two concatenated int16x8 vectors. More...
 
template<unsigned shift>
basic_int32x4 align (basic_int32x4 lower, basic_int32x4 upper)
 Extracts a int32x4 vector from two concatenated int32x4 vectors. More...
 
template<unsigned shift>
basic_int32x8 align (basic_int32x8 lower, basic_int32x8 upper)
 Extracts a int32x4 vector from two concatenated int32x4 vectors. More...
 
template<unsigned shift>
basic_int64x2 align (basic_int64x2 lower, basic_int64x2 upper)
 Extracts a int64x2 vector from two concatenated int64x2 vectors. More...
 
template<unsigned shift>
basic_int64x4 align (basic_int64x4 lower, basic_int64x4 upper)
 Extracts a int64x2 vector from two concatenated int64x2 vectors. More...
 
template<unsigned shift>
float32x4 align (float32x4 lower, float32x4 upper)
 Extracts a float32x4 vector from two concatenated float32x4 vectors. More...
 
template<unsigned shift>
float32x8 align (float32x8 lower, float32x8 upper)
 Extracts a float32x4 vector from two concatenated float32x4 vectors. More...
 
template<unsigned shift>
float64x2 align (float64x2 lower, float64x2 upper)
 Extracts a float64x2 vector from two concatenated float64x2 vectors. More...
 
template<unsigned shift>
float64x4 align (float64x4 lower, float64x4 upper)
 Extracts a float64x2 vector from two concatenated float64x2 vectors. More...
 
basic_int8x16 blend (basic_int8x16 on, basic_int8x16 off, basic_int8x16 mask)
 Composes a vector from two sources according to a mask. More...
 
basic_int8x16 blend (basic_int8x16 on, basic_int8x16 off, mask_int8x16 mask)
 Composes a vector from two sources according to a mask. More...
 
basic_int8x32 blend (basic_int8x32 on, basic_int8x32 off, basic_int8x32 mask)
 Composes a vector from two sources according to a mask. More...
 
basic_int8x32 blend (basic_int8x32 on, basic_int8x32 off, mask_int8x32 mask)
 Composes a vector from two sources according to a mask. More...
 
basic_int16x8 blend (basic_int16x8 on, basic_int16x8 off, basic_int16x8 mask)
 Composes a vector from two sources according to a mask. More...
 
basic_int16x16 blend (basic_int16x16 on, basic_int16x16 off, basic_int16x16 mask)
 Composes a vector from two sources according to a mask. More...
 
basic_int16x8 blend (basic_int16x8 on, basic_int16x8 off, mask_int16x8 mask)
 Composes a vector from two sources according to a mask. More...
 
basic_int16x16 blend (basic_int16x16 on, basic_int16x16 off, mask_int16x16 mask)
 Composes a vector from two sources according to a mask. More...
 
basic_int32x4 blend (basic_int32x4 on, basic_int32x4 off, basic_int32x4 mask)
 Composes a vector from two sources according to a mask. More...
 
basic_int32x8 blend (basic_int32x8 on, basic_int32x8 off, basic_int32x8 mask)
 Composes a vector from two sources according to a mask. More...
 
basic_int32x4 blend (basic_int32x4 on, basic_int32x4 off, mask_int32x4 mask)
 Composes a vector from two sources according to a mask. More...
 
basic_int32x8 blend (basic_int32x8 on, basic_int32x8 off, mask_int32x8 mask)
 Composes a vector from two sources according to a mask. More...
 
basic_int64x2 blend (basic_int64x2 on, basic_int64x2 off, basic_int64x2 mask)
 Composes a vector from two sources according to a mask. More...
 
basic_int64x4 blend (basic_int64x4 on, basic_int64x4 off, basic_int64x4 mask)
 Composes a vector from two sources according to a mask. More...
 
basic_int64x2 blend (basic_int64x2 on, basic_int64x2 off, mask_int64x2 mask)
 Composes a vector from two sources according to a mask. More...
 
basic_int64x4 blend (basic_int64x4 on, basic_int64x4 off, mask_int64x4 mask)
 Composes a vector from two sources according to a mask. More...
 
float32x4 blend (float32x4 on, float32x4 off, float32x4 mask)
 Composes a vector from two sources according to a mask. More...
 
float32x4 blend (float32x4 on, float32x4 off, int128 mask)
 Composes a vector from two sources according to a mask. More...
 
float32x8 blend (float32x8 on, float32x8 off, float32x8 mask)
 Composes a vector from two sources according to a mask. More...
 
float32x8 blend (float32x8 on, float32x8 off, int256 mask)
 Composes a vector from two sources according to a mask. More...
 
float32x4 blend (float32x4 on, float32x4 off, mask_float32x4 mask)
 Composes a vector from two sources according to a mask. More...
 
float32x8 blend (float32x8 on, float32x8 off, mask_float32x8 mask)
 Composes a vector from two sources according to a mask. More...
 
float64x2 blend (float64x2 on, float64x2 off, float64x2 mask)
 Composes a vector from two sources according to a mask. More...
 
float64x2 blend (float64x2 on, float64x2 off, int128 mask)
 Composes a vector from two sources according to a mask. More...
 
float64x4 blend (float64x4 on, float64x4 off, float64x4 mask)
 Composes a vector from two sources according to a mask. More...
 
float64x4 blend (float64x4 on, float64x4 off, int256 mask)
 Composes a vector from two sources according to a mask. More...
 
float64x2 blend (float64x2 on, float64x2 off, mask_float64x2 mask)
 Composes a vector from two sources according to a mask. More...
 
float64x4 blend (float64x4 on, float64x4 off, mask_float64x4 mask)
 Composes a vector from two sources according to a mask. More...
 
basic_int8x16 unzip_lo (basic_int8x16 a, basic_int8x16 b)
 De-interleaves the odd(lower) elements of two int8x16 vectors. More...
 
basic_int8x32 unzip_lo (basic_int8x32 a, basic_int8x32 b)
 De-interleaves the odd(lower) elements of two int8x16 vectors. More...
 
basic_int16x8 unzip_lo (basic_int16x8 a, basic_int16x8 b)
 De-interleaves the odd(lower) elements of two int16x8 vectors. More...
 
basic_int16x16 unzip_lo (basic_int16x16 a, basic_int16x16 b)
 De-interleaves the odd(lower) elements of two int16x8 vectors. More...
 
basic_int32x4 unzip_lo (basic_int32x4 a, basic_int32x4 b)
 De-interleaves the odd(lower) elements of two int32x4 vectors. More...
 
basic_int32x8 unzip_lo (basic_int32x8 a, basic_int32x8 b)
 De-interleaves the odd(lower) elements of two int32x4 vectors. More...
 
basic_int64x2 unzip_lo (basic_int64x2 a, basic_int64x2 b)
 De-interleaves the odd(lower) elements of two int64x2 vectors. More...
 
basic_int64x4 unzip_lo (basic_int64x4 a, basic_int64x4 b)
 De-interleaves the odd(lower) elements of two int64x2 vectors. More...
 
float32x4 unzip_lo (float32x4 a, float32x4 b)
 De-interleaves the odd(lower) elements of two float32x4 vectors. More...
 
float32x8 unzip_lo (float32x8 a, float32x8 b)
 De-interleaves the odd(lower) elements of two float32x4 vectors. More...
 
float64x2 unzip_lo (float64x2 a, float64x2 b)
 De-interleaves the odd(lower) elements of two float64x2 vectors. More...
 
float64x4 unzip_lo (float64x4 a, float64x4 b)
 De-interleaves the odd(lower) elements of two float64x2 vectors. More...
 
basic_int8x16 unzip_hi (basic_int8x16 a, basic_int8x16 b)
 De-interleaves the even(higher) elements of two int8x16 vectors. More...
 
basic_int8x32 unzip_hi (basic_int8x32 a, basic_int8x32 b)
 De-interleaves the even(higher) elements of two int8x16 vectors. More...
 
basic_int16x8 unzip_hi (basic_int16x8 a, basic_int16x8 b)
 De-interleaves the even(higher) elements of two int16x8 vectors. More...
 
basic_int16x16 unzip_hi (basic_int16x16 a, basic_int16x16 b)
 De-interleaves the even(higher) elements of two int16x8 vectors. More...
 
basic_int32x4 unzip_hi (basic_int32x4 a, basic_int32x4 b)
 De-interleaves the even(higher) elements of two int32x4 vectors. More...
 
basic_int32x8 unzip_hi (basic_int32x8 a, basic_int32x8 b)
 De-interleaves the even(higher) elements of two int32x4 vectors. More...
 
basic_int64x2 unzip_hi (basic_int64x2 a, basic_int64x2 b)
 De-interleaves the even(higher) elements of two int64x2 vectors. More...
 
basic_int64x4 unzip_hi (basic_int64x4 a, basic_int64x4 b)
 De-interleaves the even(higher) elements of two int64x2 vectors. More...
 
float32x4 unzip_hi (float32x4 a, float32x4 b)
 De-interleaves the even(higher) elements of two float32x4 vectors. More...
 
float32x8 unzip_hi (float32x8 a, float32x8 b)
 De-interleaves the even(higher) elements of two float32x4 vectors. More...
 
float64x2 unzip_hi (float64x2 a, float64x2 b)
 De-interleaves the even(higher) elements of two float64x2 vectors. More...
 
float64x4 unzip_hi (float64x4 a, float64x4 b)
 De-interleaves the even(higher) elements of two float64x2 vectors. More...
 
int128 permute_bytes16 (int128 a, int128 mask)
 Selects bytes from a vector according to a mask. More...
 
float32x4 permute_bytes16 (float32x4 a, int128 mask)
 Selects bytes from a vector according to a mask. More...
 
float64x2 permute_bytes16 (float64x2 a, int128 mask)
 Selects bytes from a vector according to a mask. More...
 
int256 permute_bytes16 (int256 a, int256 mask)
 Selects bytes from a vector according to a mask. More...
 
float32x8 permute_bytes16 (float32x8 a, int256 mask)
 Selects bytes from a vector according to a mask. More...
 
float64x4 permute_bytes16 (float64x4 a, int256 mask)
 Selects bytes from a vector according to a mask. More...
 
int128 shuffle_bytes16 (int128 a, int128 b, int128 mask)
 Selects bytes from two vectors according to a mask. More...
 
float32x4 shuffle_bytes16 (float32x4 a, float32x4 b, int128 mask)
 Selects bytes from two vectors according to a mask. More...
 
float64x2 shuffle_bytes16 (float64x2 a, float64x2 b, int128 mask)
 Selects bytes from two vectors according to a mask. More...
 
int256 shuffle_bytes16 (int256 a, int256 b, int256 mask)
 Selects bytes from two vectors according to a mask. More...
 
float32x8 shuffle_bytes16 (float32x8 a, float32x8 b, int256 mask)
 Selects bytes from two vectors according to a mask. More...
 
float64x4 shuffle_bytes16 (float64x4 a, float64x4 b, int256 mask)
 Selects bytes from two vectors according to a mask. More...
 
int128 permute_zbytes16 (int128 a, int128 mask)
 Selects bytes from a vector according to a mask, optionally selecting zero. More...
 
float32x4 permute_zbytes16 (float32x4 a, int128 mask)
 Selects bytes from a vector according to a mask, optionally selecting zero. More...
 
float64x2 permute_zbytes16 (float64x2 a, int128 mask)
 Selects bytes from a vector according to a mask, optionally selecting zero. More...
 
int256 permute_zbytes16 (int256 a, int256 mask)
 Selects bytes from a vector according to a mask, optionally selecting zero. More...
 
float32x8 permute_zbytes16 (float32x8 a, int256 mask)
 Selects bytes from a vector according to a mask, optionally selecting zero. More...
 
float64x4 permute_zbytes16 (float64x4 a, int256 mask)
 Selects bytes from a vector according to a mask, optionally selecting zero. More...
 
int128 shuffle_zbytes16 (int128 a, int128 b, int128 mask)
 Selects bytes from two vectors according to a mask, optionally selecting zero. More...
 
float32x4 shuffle_zbytes16 (float32x4 a, float32x4 b, int128 mask)
 Selects bytes from two vectors according to a mask, optionally selecting zero. More...
 
float64x2 shuffle_zbytes16 (float64x2 a, float64x2 b, int128 mask)
 Selects bytes from two vectors according to a mask, optionally selecting zero. More...
 
int256 shuffle_zbytes16 (int256 a, int256 b, int256 mask)
 Selects bytes from two vectors according to a mask, optionally selecting zero. More...
 
float32x8 shuffle_zbytes16 (float32x8 a, float32x8 b, int256 mask)
 Selects bytes from two vectors according to a mask, optionally selecting zero. More...
 
float64x4 shuffle_zbytes16 (float64x4 a, float64x4 b, int256 mask)
 Selects bytes from two vectors according to a mask, optionally selecting zero. More...
 
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
int128 permute (basic_int16x8 a)
 Permutes the 16-bit values within each 4 consecutive values of the vector. More...
 
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
basic_int16x16 permute (basic_int16x16 a)
 Permutes the 16-bit values within each 4 consecutive values of the vector. More...
 
template<unsigned s0, unsigned s1>
basic_int16x8 permute (basic_int16x8 a)
 Permutes the 16-bit values within sets of two consecutive elements of the vector. More...
 
template<unsigned s0, unsigned s1>
basic_int16x16 permute (basic_int16x16 a)
 Permutes the 16-bit values within sets of two consecutive elements of the vector. More...
 
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
basic_int32x4 permute (basic_int32x4 a)
 Permutes the values of each set of four consecutive 32-bit values. More...
 
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
basic_int32x8 permute (basic_int32x8 a)
 Permutes the values of each set of four consecutive 32-bit values. More...
 
template<unsigned s0, unsigned s1>
basic_int32x4 permute (basic_int32x4 a)
 Permutes the values of each set of two consecutive 32-bit values. More...
 
template<unsigned s0, unsigned s1>
basic_int32x8 permute (basic_int32x8 a)
 Permutes the values of each set of two consecutive 32-bit values. More...
 
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
float32x4 permute (float32x4 a)
 Permutes the values of each set of four consecutive 32-bit floating point values. More...
 
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
float32x8 permute (float32x8 a)
 Permutes the values of each set of four consecutive 32-bit floating point values. More...
 
template<unsigned s0, unsigned s1>
float32x4 permute (float32x4 a)
 Permutes the values of each set of two consecutive 32-bit floating-point values. More...
 
template<unsigned s0, unsigned s1>
float32x8 permute (float32x8 a)
 Permutes the values of each set of two consecutive 32-bit floating-point values. More...
 
template<unsigned s0, unsigned s1>
basic_int64x2 permute (basic_int64x2 a)
 Permutes the values of each set of two consecutive 64-bit values. More...
 
template<unsigned s0, unsigned s1>
basic_int64x4 permute (basic_int64x4 a)
 Permutes the values of each set of two consecutive 64-bit values. More...
 
template<unsigned s0, unsigned s1>
float64x2 permute (float64x2 a)
 Permutes the values of each set of two consecutive 64-bit floating-point values. More...
 
template<unsigned s0, unsigned s1>
float64x4 permute (float64x4 a)
 Permutes the values of each set of two consecutive 64-bit floating-point values. More...
 
template<unsigned s0, unsigned s1>
float64x2 shuffle1 (float64x2 a, float64x2 b)
 Selects 64-bit floating-point values from two vectors. More...
 
template<unsigned s0, unsigned s1>
float64x4 shuffle1 (float64x4 a, float64x4 b)
 Selects 64-bit floating-point values from two vectors. More...
 
template<unsigned s0, unsigned s1>
basic_int64x2 shuffle1 (basic_int64x2 a, basic_int64x2 b)
 Selects 64-bit values from two vectors. More...
 
template<unsigned s0, unsigned s1>
basic_int64x4 shuffle1 (basic_int64x4 a, basic_int64x4 b)
 Selects 64-bit values from two vectors. More...
 
template<unsigned a0, unsigned a1, unsigned b0, unsigned b1>
float32x4 shuffle2 (float32x4 a, float32x4 b)
 Selects 32-bit floating-point values from two vectors. More...
 
template<unsigned a0, unsigned a1, unsigned b0, unsigned b1>
float32x8 shuffle2 (float32x8 a, float32x8 b)
 Selects 32-bit floating-point values from two vectors. More...
 
template<unsigned s0, unsigned s1>
float32x4 shuffle2 (float32x4 a, float32x4 b)
 Selects 32-bit floating-point values from two vectors. More...
 
template<unsigned s0, unsigned s1>
float32x8 shuffle2 (float32x8 a, float32x8 b)
 Selects 32-bit floating-point values from two vectors. More...
 
template<unsigned a0, unsigned a1, unsigned b0, unsigned b1>
basic_int32x4 shuffle2 (basic_int32x4 a, basic_int32x4 b)
 Selects 32-bit values from two vectors. More...
 
template<unsigned a0, unsigned a1, unsigned b0, unsigned b1>
basic_int32x8 shuffle2 (basic_int32x8 a, basic_int32x8 b)
 Selects 32-bit values from two vectors. More...
 
template<unsigned s0, unsigned s1>
basic_int32x4 shuffle2 (basic_int32x4 a, basic_int32x4 b)
 Selects 32-bit values from two vectors. More...
 
template<unsigned s0, unsigned s1>
basic_int32x8 shuffle2 (basic_int32x8 a, basic_int32x8 b)
 Selects 32-bit values from two vectors. More...
 
void transpose2 (basic_int32x4 &a0, basic_int32x4 &a1)
 Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...
 
void transpose2 (basic_int32x8 &a0, basic_int32x8 &a1)
 Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...
 
void transpose2 (basic_int64x2 &a0, basic_int64x2 &a1)
 Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...
 
void transpose2 (basic_int64x4 &a0, basic_int64x4 &a1)
 Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...
 
void transpose2 (float32x4 &a0, float32x4 &a1)
 Transposes two 2x2 32-bit matrices within two float32x4 vectors. More...
 
void transpose2 (float32x8 &a0, float32x8 &a1)
 Transposes two 2x2 32-bit matrices within two float32x4 vectors. More...
 
void transpose2 (float64x2 &a0, float64x2 &a1)
 Transposes a 2x2 64-bit matrix within two float64x2 vectors. More...
 
void transpose2 (float64x4 &a0, float64x4 &a1)
 Transposes a 2x2 64-bit matrix within two float64x2 vectors. More...
 
void transpose4 (basic_int32x4 &a0, basic_int32x4 &a1, basic_int32x4 &a2, basic_int32x4 &a3)
 Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...
 
void transpose4 (basic_int8x16 &a0, basic_int8x16 &a1, basic_int8x16 &a2, basic_int8x16 &a3)
 Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...
 
void transpose4 (basic_int32x8 &a0, basic_int32x8 &a1, basic_int32x8 &a2, basic_int32x8 &a3)
 Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...
 
void transpose4 (basic_int8x32 &a0, basic_int8x32 &a1, basic_int8x32 &a2, basic_int8x32 &a3)
 Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...
 
void transpose4 (basic_int16x8 &a0, basic_int16x8 &a1, basic_int16x8 &a2, basic_int16x8 &a3)
 Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...
 
void transpose4 (basic_int16x16 &a0, basic_int16x16 &a1, basic_int16x16 &a2, basic_int16x16 &a3)
 Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...
 
void transpose4 (float32x4 &a0, float32x4 &a1, float32x4 &a2, float32x4 &a3)
 Transposes a 4x4 32-bit matrix within four float32x4 vectors. More...
 
void transpose4 (float32x8 &a0, float32x8 &a1, float32x8 &a2, float32x8 &a3)
 Transposes a 4x4 32-bit matrix within four float32x4 vectors. More...
 

Function Documentation

uint8x16 simdpp::abs ( int8x16  a)
inline

Computes absolute value of 8-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
  • In ALTIVEC this intrinsic results in at least 1-3 instructions.
256-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
uint8x32 simdpp::abs ( int8x32  a)
inline

Computes absolute value of 8-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
  • In ALTIVEC this intrinsic results in at least 1-3 instructions.
256-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
uint16x8 simdpp::abs ( int16x8  a)
inline

Computes absolute value of 16-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
  • In ALTIVEC this intrinsic results in at least 1-3 instructions.
256-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-5 instructions.
uint16x16 simdpp::abs ( int16x16  a)
inline

Computes absolute value of 16-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
  • In ALTIVEC this intrinsic results in at least 1-3 instructions.
256-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-5 instructions.
uint32x4 simdpp::abs ( int32x4  a)
inline

Computes absolute value of 32-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
  • In ALTIVEC this intrinsic results in at least 1-3 instructions.
256-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
uint32x8 simdpp::abs ( int32x8  a)
inline

Computes absolute value of 32-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
  • In ALTIVEC this intrinsic results in at least 1-3 instructions.
256-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
uint64x2 simdpp::abs ( int64x2  a)
inline

Computes absolute value of 64-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)
128-bit version:
  • In SSE2-AVX this intrinsic results in at least 5 instructions.
  • In NEON this intrinsic results in at least 6 instructions.
  • Not vectorized in ALTIVEC.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 10 instructions.
  • In NEON this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • Not vectorized in ALTIVEC.
uint64x4 simdpp::abs ( int64x4  a)
inline

Computes absolute value of 64-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)
128-bit version:
  • In SSE2-AVX this intrinsic results in at least 5 instructions.
  • In NEON this intrinsic results in at least 6 instructions.
  • Not vectorized in ALTIVEC.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 10 instructions.
  • In NEON this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • Not vectorized in ALTIVEC.
uint8x16 simdpp::avg ( uint8x16  a,
uint8x16  b 
)
inline

Computes rounded average of the unsigned 8-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint8x32 simdpp::avg ( uint8x32  a,
uint8x32  b 
)
inline

Computes rounded average of the unsigned 8-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int8x16 simdpp::avg ( int8x16  a,
int8x16  b 
)
inline

Computes rounded average of signed 8-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8-9 instructions.
  • In AVX2 this intrinsic results in at least 4-5 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int8x32 simdpp::avg ( int8x32  a,
int8x32  b 
)
inline

Computes rounded average of signed 8-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8-9 instructions.
  • In AVX2 this intrinsic results in at least 4-5 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint16x8 simdpp::avg ( uint16x8  a,
uint16x8  b 
)
inline

Computes rounded average of unsigned 16-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint16x16 simdpp::avg ( uint16x16  a,
uint16x16  b 
)
inline

Computes rounded average of unsigned 16-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int16x8 simdpp::avg ( int16x8  a,
int16x8  b 
)
inline

Computes rounded average of signed 16-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8-9 instructions.
  • In AVX2 this intrinsic results in at least 4-5 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int16x16 simdpp::avg ( int16x16  a,
int16x16  b 
)
inline

Computes rounded average of signed 16-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8-9 instructions.
  • In AVX2 this intrinsic results in at least 4-5 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint32x4 simdpp::avg ( uint32x4  a,
uint32x4  b 
)
inline

Computes rounded average of unsigned 32-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 6-7 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 12-13 instructions.
  • In AVX2 this intrinsic results in at least 6-7 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint32x8 simdpp::avg ( uint32x8  a,
uint32x8  b 
)
inline

Computes rounded average of unsigned 32-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 6-7 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 12-13 instructions.
  • In AVX2 this intrinsic results in at least 6-7 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int32x4 simdpp::avg ( int32x4  a,
int32x4  b 
)
inline

Computes rounded average of signed 32-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 9-10 instructions.
  • In NEON this intrinsic results in at least 1 instruction.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 18-19 instructions.
  • In AVX2 this intrinsic results in at least 9-10 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int32x8 simdpp::avg ( int32x8  a,
int32x8  b 
)
inline

Computes rounded average of signed 32-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 9-10 instructions.
  • In NEON this intrinsic results in at least 1 instruction.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 18-19 instructions.
  • In AVX2 this intrinsic results in at least 9-10 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint8x16 simdpp::avg_trunc ( uint8x16  a,
uint8x16  b 
)
inline

Computes truncated average of the unsigned 8-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4 instructions.
  • In NEON this intrinsic results in at least 1 instruction.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint8x32 simdpp::avg_trunc ( uint8x32  a,
uint8x32  b 
)
inline

Computes truncated average of the unsigned 8-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4 instructions.
  • In NEON this intrinsic results in at least 1 instruction.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int8x16 simdpp::avg_trunc ( int8x16  a,
int8x16  b 
)
inline

Computes truncated average of signed 8-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 14-15 instructions.
  • In AVX2 this intrinsic results in at least 7-8 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int8x32 simdpp::avg_trunc ( int8x32  a,
int8x32  b 
)
inline

Computes truncated average of signed 8-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 14-15 instructions.
  • In AVX2 this intrinsic results in at least 7-8 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint16x8 simdpp::avg_trunc ( uint16x8  a,
uint16x8  b 
)
inline

Computes truncated average of unsigned 16-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4 instructions.
  • In NEON this intrinsic results in at least 1 instruction.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint16x16 simdpp::avg_trunc ( uint16x16  a,
uint16x16  b 
)
inline

Computes truncated average of unsigned 16-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4 instructions.
  • In NEON this intrinsic results in at least 1 instruction.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int16x8 simdpp::avg_trunc ( int16x8  a,
int16x8  b 
)
inline

Computes truncated average of signed 16-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.
  • In NEON this intrinsic results in at least 1 instruction.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 14-15 instructions.
  • In AVX2 this intrinsic results in at least 7-8 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int16x16 simdpp::avg_trunc ( int16x16  a,
int16x16  b 
)
inline

Computes truncated average of signed 16-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.
  • In NEON this intrinsic results in at least 1 instruction.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 14-15 instructions.
  • In AVX2 this intrinsic results in at least 7-8 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint32x4 simdpp::avg_trunc ( uint32x4  a,
uint32x4  b 
)
inline

Computes truncated average of unsigned 32-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4 instructions.
  • In NEON this intrinsic results in at least 1 instruction.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint32x8 simdpp::avg_trunc ( uint32x8  a,
uint32x8  b 
)
inline

Computes truncated average of unsigned 32-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4 instructions.
  • In NEON this intrinsic results in at least 1 instruction.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int32x4 simdpp::avg_trunc ( int32x4  a,
int32x4  b 
)
inline

Computes truncated average of signed 32-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.
  • In ALTIVEC this intrinsic results in at least 4 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 14-15 instructions.
  • In AVX2 this intrinsic results in at least 7-8 instructions.
  • In ALTIVEC this intrinsic results in at least 8 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
int32x8 simdpp::avg_trunc ( int32x8  a,
int32x8  b 
)
inline

Computes truncated average of signed 32-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.
  • In ALTIVEC this intrinsic results in at least 4 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 14-15 instructions.
  • In AVX2 this intrinsic results in at least 7-8 instructions.
  • In ALTIVEC this intrinsic results in at least 8 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
float32x4 simdpp::ceil ( float32x4  a)
inline

Rounds the values of a vector towards positive infinity.

r0 = ceil(a0)
...
rN = ceil(aN)
128-bit version:
  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 13-15 instructions.
  • In NEON this intrinsic results in at least 11-13 instructions.
256-bit version:
  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 26-28 instructions.
  • In NEON this intrinsic results in at least 22-24 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
float32x8 simdpp::ceil ( float32x8  a)
inline

Rounds the values of a vector towards positive infinity.

r0 = ceil(a0)
...
rN = ceil(aN)
128-bit version:
  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 13-15 instructions.
  • In NEON this intrinsic results in at least 11-13 instructions.
256-bit version:
  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 26-28 instructions.
  • In NEON this intrinsic results in at least 22-24 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
mask_float32x4 simdpp::cmp_ge ( float32x4  a,
float32x4  b 
)
inline

Compares the values of two float32x4 vectors for greater-than or equal.

r0 = (a0 >= b0) ? 0xffffffff : 0x0
...
rN = (aN >= bN) ? 0xffffffff : 0x0
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_float32x8 simdpp::cmp_ge ( float32x8  a,
float32x8  b 
)
inline

Compares the values of two float32x4 vectors for greater-than or equal.

r0 = (a0 >= b0) ? 0xffffffff : 0x0
...
rN = (aN >= bN) ? 0xffffffff : 0x0
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_float64x2 simdpp::cmp_ge ( float64x2  a,
float64x2  b 
)
inline

Compares the values of two float64x2 vectors for greater-than or equal.

r0 = (a0 >= b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN >= bN) ? 0xffffffffffffffff : 0x0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
mask_float64x4 simdpp::cmp_ge ( float64x4  a,
float64x4  b 
)
inline
mask_float64x2 simdpp::cmp_gt ( float64x2  a,
float64x2  b 
)
inline

Compares the values of two float64x2 vectors for greater-than.

r0 = (a0 > b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN > bN) ? 0xffffffffffffffff : 0x0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
mask_float64x4 simdpp::cmp_gt ( float64x4  a,
float64x4  b 
)
inline
mask_float32x4 simdpp::cmp_le ( float32x4  a,
float32x4  b 
)
inline

Compares the values of two float32x4 vectors for less-than or equal.

r0 = (a0 <= b0) ? 0xffffffff : 0x0
...
rN = (aN <= bN) ? 0xffffffff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_float32x8 simdpp::cmp_le ( float32x8  a,
float32x8  b 
)
inline

Compares the values of two float32x4 vectors for less-than or equal.

r0 = (a0 <= b0) ? 0xffffffff : 0x0
...
rN = (aN <= bN) ? 0xffffffff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_float64x2 simdpp::cmp_le ( float64x2  a,
float64x2  b 
)
inline

Compares the values of two float64x2 vectors for less-than or equal.

r0 = (a0 <= b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN <= bN) ? 0xffffffffffffffff : 0x0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
mask_float64x4 simdpp::cmp_le ( float64x4  a,
float64x4  b 
)
inline

Compares the values of two float64x2 vectors for less-than or equal.

r0 = (a0 <= b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN <= bN) ? 0xffffffffffffffff : 0x0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
mask_int8x16 simdpp::cmp_lt ( int8x16  a,
int8x16  b 
)
inline

Compares the values of two signed int8x16 vectors for less-than.

r0 = (a0 < b0) ? 0xff : 0x0
...
rN = (aN < bN) ? 0xff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_int8x32 simdpp::cmp_lt ( int8x32  a,
int8x32  b 
)
inline

Compares the values of two signed int8x16 vectors for less-than.

r0 = (a0 < b0) ? 0xff : 0x0
...
rN = (aN < bN) ? 0xff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_int8x16 simdpp::cmp_lt ( uint8x16  a,
uint8x16  b 
)
inline

Compares the values of two unsigned int8x16 vectors for less-than.

r0 = (a0 < b0) ? 0xff : 0x0
...
rN = (aN < bN) ? 0xff : 0x0
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6-7 instructions.
  • In AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_int8x32 simdpp::cmp_lt ( uint8x32  a,
uint8x32  b 
)
inline

Compares the values of two unsigned int8x16 vectors for less-than.

r0 = (a0 < b0) ? 0xff : 0x0
...
rN = (aN < bN) ? 0xff : 0x0
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6-7 instructions.
  • In AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_int16x8 simdpp::cmp_lt ( int16x8  a,
int16x8  b 
)
inline

Compares the values of two signed int16x8 vectors for less-than.

r0 = (a0 < b0) ? 0xffff : 0x0
...
rN = (aN < bN) ? 0xffff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_int16x16 simdpp::cmp_lt ( int16x16  a,
int16x16  b 
)
inline

Compares the values of two signed int16x8 vectors for less-than.

r0 = (a0 < b0) ? 0xffff : 0x0
...
rN = (aN < bN) ? 0xffff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_int16x8 simdpp::cmp_lt ( uint16x8  a,
uint16x8  b 
)
inline

Compares the values of two unsigned int16x8 vectors for less-than.

r0 = (a0 < b0) ? 0xffff : 0x0
...
rN = (aN < bN) ? 0xffff : 0x0
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6-7 instructions.
  • In AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_int16x16 simdpp::cmp_lt ( uint16x16  a,
uint16x16  b 
)
inline

Compares the values of two unsigned int16x8 vectors for less-than.

r0 = (a0 < b0) ? 0xffff : 0x0
...
rN = (aN < bN) ? 0xffff : 0x0
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6-7 instructions.
  • In AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_int32x4 simdpp::cmp_lt ( int32x4  a,
int32x4  b 
)
inline

Compares the values of two signed int32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_int32x8 simdpp::cmp_lt ( int32x8  a,
int32x8  b 
)
inline

Compares the values of two signed int32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_int32x4 simdpp::cmp_lt ( uint32x4  a,
uint32x4  b 
)
inline

Compares the values of two unsigned int32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6-7 instructions.
  • In AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_int32x8 simdpp::cmp_lt ( uint32x8  a,
uint32x8  b 
)
inline

Compares the values of two unsigned int32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6-7 instructions.
  • In AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_float32x4 simdpp::cmp_lt ( float32x4  a,
float32x4  b 
)
inline

Compares the values of two float32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_float32x8 simdpp::cmp_lt ( float32x8  a,
float32x8  b 
)
inline

Compares the values of two float32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
mask_float64x2 simdpp::cmp_lt ( float64x2  a,
float64x2  b 
)
inline

Compares the values of two float64x2 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffffffffffff : 0x0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
mask_float64x4 simdpp::cmp_lt ( float64x4  a,
float64x4  b 
)
inline

Compares the values of two float64x2 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffffffffffff : 0x0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
float32x4 simdpp::div ( float32x4  a,
float32x4  b 
)
inline

Divides the values of two vectors.

r0 = a0 / b0
...
rN = aN / bN
128-bit version:
  • In NEON this intrinsic results in at least 6 instructions.
  • In ALTIVEC this intrinsic results in at least 10 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 12 instructions.
  • In ALTIVEC this intrinsic results in at least 19 instructions.
float32x8 simdpp::div ( float32x8  a,
float32x8  b 
)
inline

Divides the values of two vectors.

r0 = a0 / b0
...
rN = aN / bN
128-bit version:
  • In NEON this intrinsic results in at least 6 instructions.
  • In ALTIVEC this intrinsic results in at least 10 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 12 instructions.
  • In ALTIVEC this intrinsic results in at least 19 instructions.
float64x2 simdpp::div ( float64x2  a,
float64x2  b 
)
inline

Divides the values of two vectors.

r0 = a0 / b0
...
rN = aN / bN
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
float64x4 simdpp::div ( float64x4  a,
float64x4  b 
)
inline

Divides the values of two vectors.

r0 = a0 / b0
...
rN = aN / bN
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
template<unsigned P>
uint8x16 simdpp::div_p ( uint8x16  num,
uint8x16  den 
)

Divides one 8-bit unsigned number by another.

The precision of the operation is configurable: only P least significant bits of both numerator and denominator are considered.

r0 = num0 / den0
...
rN = numN / denN
128-bit version:
The operation costs at least 9 instructions per bit of precision.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 10 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
template<unsigned P>
uint16x8 simdpp::div_p ( uint16x8  num,
uint16x8  den 
)

Divides one 16-bit unsigned number by another.

The precision of the operation is configurable: only P least significant bits of both numerator and denominator are considered.

r0 = num0 / den0
...
rN = numN / denN
128-bit version:
The operation costs at least 9 instructions per bit of precision.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 10 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
template<unsigned id>
uint16_t simdpp::extract ( basic_int16x8  a)

Extracts the id-th element from int16x8 vector.

r = a[id]

This function may have very high latency.

  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
int16_t simdpp::extract ( int16x8  a)

Extracts the id-th element from int16x8 vector.

r = a[id]

This function may have very high latency.

  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
uint32_t simdpp::extract ( basic_int32x4  a)

Extracts the id-th element from int32x4 vector.

r = a[id]

This function may have very high latency.

  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
int32_t simdpp::extract ( int32x4  a)

Extracts the id-th element from int32x4 vector.

r = a[id]

This function may have very high latency.

  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
uint64_t simdpp::extract ( basic_int64x2  a)

Extracts an element from int64x2 vector.

r = a[id]

This function may have very high latency.

  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
  • In SSE4_1 this intrinsic results in at least 1 instruction.
  • In SSE2_32bit, SSE3_32bit and SSSE3_32bit this intrinsic results in at least 3-4 instructions.
  • In SSE4_1_32bit this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
int64_t simdpp::extract ( int64x2  a)

Extracts an element from int64x2 vector.

r = a[id]

This function may have very high latency.

  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
  • In SSE4_1 this intrinsic results in at least 1 instruction.
  • In SSE2_32bit, SSE3_32bit and SSSE3_32bit this intrinsic results in at least 3-4 instructions.
  • In SSE4_1_32bit this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
float simdpp::extract ( float32x4  a)

Extracts an element from float32x4 vector.

r = a[id]

This function may have very high latency.

  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
double simdpp::extract ( float64x2  a)

Extracts an element from float64x2 vector.

r = a[id]

This function may have very high latency.

  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
uint16_t simdpp::extract_bits ( uint8x16  a)

Extracts a specific bit from each byte of each element of an int8x16 vector.

The default template argument selects the bits from each byte in most efficient way.

r = ((a[0] & 0x80) >> 7) | ((a[1] & 0x80) >> 6) | ... | ((a[15] & 0x80) << 8)
  • In SSE2-AVX2 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 7-9 instructions.
  • In ALTIVEC this intrinsic results in at least 9-11 instructions.
uint16_t simdpp::extract_bits_any ( uint8x16  a)
inline

Extracts a bit from each byte of each element of an int8x16 vector.

This operation is only sensible if each byte within the vector is either 0x00 or 0xff.

r = ((a[0] & 0x??) ? 0x01 : 0) |
((a[1] & 0x??) ? 0x02 : 0) |
...
((a[15] & 0x??) ? 0x8000 : 0)
  • In NEON this intrinsic results in at least 6-7 instructions.
  • In ALTIVEC this intrinsic results in at least 8-9 instructions.
float32x4 simdpp::floor ( float32x4  a)
inline

Rounds the values of a vector towards negative infinity.

r0 = floor(a0)
...
rN = floor(aN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 12-14 instructions.
  • In NEON this intrinsic results in at least 10-11 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 24-26 instructions.
  • In NEON this intrinsic results in at least 20-21 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
float32x8 simdpp::floor ( float32x8  a)
inline

Rounds the values of a vector towards negative infinity.

r0 = floor(a0)
...
rN = floor(aN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 12-14 instructions.
  • In NEON this intrinsic results in at least 10-11 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 24-26 instructions.
  • In NEON this intrinsic results in at least 20-21 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
Arch simdpp::get_arch_gcc_builtin_cpu_supports ( )
inline

Retrieves supported architecture using GCC __builtin_cpu_supports function.

Works only on x86.

Arch simdpp::get_arch_linux_cpuinfo ( )
inline

Retrieves supported architecture from Linux /proc/cpuinfo file.

Works on X86 and ARM.

mask_float64x2 simdpp::isnan ( float64x2  a)
inline

Checks whether elements in a are IEEE754 NaN.

r0 = isnan(a0) ? 0xffffffffffffffff : 0
...
rN = isnan(aN) ? 0xffffffffffffffff : 0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
mask_float64x4 simdpp::isnan ( float64x4  a)
inline

Checks whether elements in a are IEEE754 NaN.

r0 = isnan(a0) ? 0xffffffffffffffff : 0
...
rN = isnan(aN) ? 0xffffffffffffffff : 0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
mask_float32x4 simdpp::isnan2 ( float32x4  a,
float32x4  b 
)
inline

Checks whether corresponding elements in either a or b are IEEE754 NaN.

r0 = isnan(a0) || isnan(b0) ? 0xffffffff : 0
...
rN = isnan(aN) || isnan(bN) ? 0xffffffff : 0
128-bit version:
  • In NEON and ALTIVEC this intrinsic results in at least 3 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 6 instructions.
mask_float32x8 simdpp::isnan2 ( float32x8  a,
float32x8  b 
)
inline

Checks whether corresponding elements in either a or b are IEEE754 NaN.

r0 = isnan(a0) || isnan(b0) ? 0xffffffff : 0
...
rN = isnan(aN) || isnan(bN) ? 0xffffffff : 0
128-bit version:
  • In NEON and ALTIVEC this intrinsic results in at least 3 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 6 instructions.
mask_float64x2 simdpp::isnan2 ( float64x2  a,
float64x2  b 
)
inline

Checks whether corresponding elements in either a or b are IEEE754 NaN.

r0 = isnan(a0) || isnan(b0) ? 0xffffffffffffffff : 0
...
rN = isnan(aN) || isnan(bN) ? 0xffffffffffffffff : 0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
mask_float64x4 simdpp::isnan2 ( float64x4  a,
float64x4  b 
)
inline

Checks whether corresponding elements in either a or b are IEEE754 NaN.

r0 = isnan(a0) || isnan(b0) ? 0xffffffffffffffff : 0
...
rN = isnan(aN) || isnan(bN) ? 0xffffffffffffffff : 0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
void simdpp::load_packed2 ( float32x4 &  a,
float32x4 &  b,
const float *  p 
)
inline

Loads 32-bit float values packed in pairs, de-interleaves them and stores the result into two vectors.

128-bit version:
a = [ *(p), *(p+2), *(p+4), *(p+6) ]
b = [ *(p+1), *(p+3), *(p+5), *(p+7) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+2), *(p+4), ... , *(p+14) ]
b = [ *(p+1), *(p+3), *(p+5), ... , *(p+15) ]
p must be aligned to 32 bytes.
void simdpp::load_packed2 ( float32x8 &  a,
float32x8 &  b,
const float *  p 
)
inline
void simdpp::load_packed2 ( float64x2 &  a,
float64x2 &  b,
const double *  p 
)
inline

Loads 64-bit float values packed in pairs, de-interleaves them and stores the result into two vectors.

128-bit version:
a = [ *(p), *(p+2) ]
b = [ *(p+1), *(p+3) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+2), *(p+4), *(p+6) ]
b = [ *(p+1), *(p+3), *(p+5), *(p+7) ]
p must be aligned to 32 bytes.
void simdpp::load_packed2 ( float64x4 &  a,
float64x4 &  b,
const double *  p 
)
inline

Loads 64-bit float values packed in pairs, de-interleaves them and stores the result into two vectors.

128-bit version:
a = [ *(p), *(p+2) ]
b = [ *(p+1), *(p+3) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+2), *(p+4), *(p+6) ]
b = [ *(p+1), *(p+3), *(p+5), *(p+7) ]
p must be aligned to 32 bytes.
void simdpp::load_packed3 ( basic_int8x16 &  a,
basic_int8x16 &  b,
basic_int8x16 &  c,
const void *  p 
)
inline

Loads 8-bit values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:
a = [ *(p), *(p+3), *(p+6), ... , *(p+45) ]
b = [ *(p+1), *(p+4), *(p+7), ... , *(p+46) ]
c = [ *(p+2), *(p+5), *(p+8), ... , *(p+47) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+3), *(p+6), ... , *(p+93) ]
b = [ *(p+1), *(p+4), *(p+7), ... , *(p+94) ]
c = [ *(p+2), *(p+5), *(p+8), ... , *(p+95) ]
p must be aligned to 32 bytes.
void simdpp::load_packed3 ( basic_int8x32 &  a,
basic_int8x32 &  b,
basic_int8x32 &  c,
const void *  p 
)
inline

Loads 8-bit values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:
a = [ *(p), *(p+3), *(p+6), ... , *(p+45) ]
b = [ *(p+1), *(p+4), *(p+7), ... , *(p+46) ]
c = [ *(p+2), *(p+5), *(p+8), ... , *(p+47) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+3), *(p+6), ... , *(p+93) ]
b = [ *(p+1), *(p+4), *(p+7), ... , *(p+94) ]
c = [ *(p+2), *(p+5), *(p+8), ... , *(p+95) ]
p must be aligned to 32 bytes.
void simdpp::load_packed3 ( basic_int16x8 &  a,
basic_int16x8 &  b,
basic_int16x8 &  c,
const void *  p 
)
inline

Loads 16-bit values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:
a = [ *(p), *(p+3), *(p+6), ... , *(p+21) ]
b = [ *(p+1), *(p+4), *(p+7), ... , *(p+22) ]
c = [ *(p+2), *(p+5), *(p+8), ... , *(p+23) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+3), *(p+6), ... , *(p+45) ]
b = [ *(p+1), *(p+4), *(p+7), ... , *(p+46) ]
c = [ *(p+2), *(p+5), *(p+8), ... , *(p+47) ]
p must be aligned to 32 bytes.
void simdpp::load_packed3 ( basic_int16x16 &  a,
basic_int16x16 &  b,
basic_int16x16 &  c,
const void *  p 
)
inline

Loads 16-bit values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:
a = [ *(p), *(p+3), *(p+6), ... , *(p+21) ]
b = [ *(p+1), *(p+4), *(p+7), ... , *(p+22) ]
c = [ *(p+2), *(p+5), *(p+8), ... , *(p+23) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+3), *(p+6), ... , *(p+45) ]
b = [ *(p+1), *(p+4), *(p+7), ... , *(p+46) ]
c = [ *(p+2), *(p+5), *(p+8), ... , *(p+47) ]
p must be aligned to 32 bytes.
void simdpp::load_packed3 ( basic_int32x4 &  a,
basic_int32x4 &  b,
basic_int32x4 &  c,
const void *  p 
)
inline

Loads 32-bit values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:
a = [ *(p), *(p+3), *(p+6), *(p+9) ]
b = [ *(p+1), *(p+4), *(p+7), *(p+10) ]
c = [ *(p+2), *(p+5), *(p+8), *(p+11) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+3), *(p+6), ... , *(p+21) ]
b = [ *(p+1), *(p+4), *(p+7), ... , *(p+22) ]
c = [ *(p+2), *(p+5), *(p+8), ... , *(p+23) ]
p must be aligned to 32 bytes.
void simdpp::load_packed3 ( basic_int32x8 &  a,
basic_int32x8 &  b,
basic_int32x8 &  c,
const void *  p 
)
inline

Loads 32-bit values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:
a = [ *(p), *(p+3), *(p+6), *(p+9) ]
b = [ *(p+1), *(p+4), *(p+7), *(p+10) ]
c = [ *(p+2), *(p+5), *(p+8), *(p+11) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+3), *(p+6), ... , *(p+21) ]
b = [ *(p+1), *(p+4), *(p+7), ... , *(p+22) ]
c = [ *(p+2), *(p+5), *(p+8), ... , *(p+23) ]
p must be aligned to 32 bytes.
void simdpp::load_packed3 ( basic_int64x2 &  a,
basic_int64x2 &  b,
basic_int64x2 &  c,
const void *  p 
)
inline

Loads 64-bit values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:
a = [ *(p), *(p+3) ]
b = [ *(p+1), *(p+4) ]
c = [ *(p+2), *(p+5) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+3), *(p+6), *(p+9) ]
b = [ *(p+1), *(p+4), *(p+7), *(p+10) ]
c = [ *(p+2), *(p+5), *(p+8), *(p+11) ]
p must be aligned to 32 bytes.
void simdpp::load_packed3 ( basic_int64x4 &  a,
basic_int64x4 &  b,
basic_int64x4 &  c,
const void *  p 
)
inline

Loads 64-bit values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:
a = [ *(p), *(p+3) ]
b = [ *(p+1), *(p+4) ]
c = [ *(p+2), *(p+5) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+3), *(p+6), *(p+9) ]
b = [ *(p+1), *(p+4), *(p+7), *(p+10) ]
c = [ *(p+2), *(p+5), *(p+8), *(p+11) ]
p must be aligned to 32 bytes.
void simdpp::load_packed3 ( float32x4 &  a,
float32x4 &  b,
float32x4 &  c,
const float *  p 
)
inline

Loads 32-bit floating point values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:
a = [ *(p), *(p+3), *(p+6), *(p+9) ]
b = [ *(p+1), *(p+4), *(p+7), *(p+10) ]
c = [ *(p+2), *(p+5), *(p+8), *(p+11) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+3), *(p+6), ... , *(p+21) ]
b = [ *(p+1), *(p+4), *(p+7), ... , *(p+22) ]
c = [ *(p+2), *(p+5), *(p+8), ... , *(p+23) ]
p must be aligned to 32 bytes.
void simdpp::load_packed3 ( float32x8 &  a,
float32x8 &  b,
float32x8 &  c,
const float *  p 
)
inline

Loads 32-bit floating point values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:
a = [ *(p), *(p+3), *(p+6), *(p+9) ]
b = [ *(p+1), *(p+4), *(p+7), *(p+10) ]
c = [ *(p+2), *(p+5), *(p+8), *(p+11) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+3), *(p+6), ... , *(p+21) ]
b = [ *(p+1), *(p+4), *(p+7), ... , *(p+22) ]
c = [ *(p+2), *(p+5), *(p+8), ... , *(p+23) ]
p must be aligned to 32 bytes.
void simdpp::load_packed3 ( float64x2 &  a,
float64x2 &  b,
float64x2 &  c,
const double *  p 
)
inline

Loads 64-bit floating point values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:
a = [ *(p), *(p+3) ]
b = [ *(p+1), *(p+4) ]
c = [ *(p+2), *(p+5) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+3), *(p+6), *(p+9) ]
b = [ *(p+1), *(p+4), *(p+7), *(p+10) ]
c = [ *(p+2), *(p+5), *(p+8), *(p+11) ]
p must be aligned to 32 bytes.
void simdpp::load_packed3 ( float64x4 &  a,
float64x4 &  b,
float64x4 &  c,
const double *  p 
)
inline

Loads 64-bit floating point values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:
a = [ *(p), *(p+3) ]
b = [ *(p+1), *(p+4) ]
c = [ *(p+2), *(p+5) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+3), *(p+6), *(p+9) ]
b = [ *(p+1), *(p+4), *(p+7), *(p+10) ]
c = [ *(p+2), *(p+5), *(p+8), *(p+11) ]
p must be aligned to 32 bytes.
void simdpp::load_packed4 ( basic_int8x16 &  a,
basic_int8x16 &  b,
basic_int8x16 &  c,
basic_int8x16 &  d,
const void *  p 
)
inline

Loads 8-bit values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:
a = [ *(p), *(p+4), *(p+8), ... , *(p+60) ]
b = [ *(p+1), *(p+5), *(p+9), ... , *(p+61) ]
c = [ *(p+2), *(p+6), *(p+10), ... , *(p+62) ]
d = [ *(p+3), *(p+7), *(p+11), ... , *(p+63) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+4), *(p+8), ... , *(p+124) ]
b = [ *(p+1), *(p+5), *(p+9), ... , *(p+125) ]
c = [ *(p+2), *(p+6), *(p+10), ... , *(p+126) ]
d = [ *(p+3), *(p+7), *(p+11), ... , *(p+127) ]
p must be aligned to 32 bytes.
void simdpp::load_packed4 ( basic_int8x32 &  a,
basic_int8x32 &  b,
basic_int8x32 &  c,
basic_int8x32 &  d,
const void *  p 
)
inline

Loads 8-bit values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:
a = [ *(p), *(p+4), *(p+8), ... , *(p+60) ]
b = [ *(p+1), *(p+5), *(p+9), ... , *(p+61) ]
c = [ *(p+2), *(p+6), *(p+10), ... , *(p+62) ]
d = [ *(p+3), *(p+7), *(p+11), ... , *(p+63) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+4), *(p+8), ... , *(p+124) ]
b = [ *(p+1), *(p+5), *(p+9), ... , *(p+125) ]
c = [ *(p+2), *(p+6), *(p+10), ... , *(p+126) ]
d = [ *(p+3), *(p+7), *(p+11), ... , *(p+127) ]
p must be aligned to 32 bytes.
void simdpp::load_packed4 ( basic_int16x8 &  a,
basic_int16x8 &  b,
basic_int16x8 &  c,
basic_int16x8 &  d,
const void *  p 
)
inline

Loads 16-bit values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:
a = [ *(p), *(p+4), *(p+8), ... , *(p+28) ]
b = [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ]
c = [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ]
d = [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+4), *(p+8), ... , *(p+60) ]
b = [ *(p+1), *(p+5), *(p+9), ... , *(p+61) ]
c = [ *(p+2), *(p+6), *(p+10), ... , *(p+62) ]
d = [ *(p+3), *(p+7), *(p+11), ... , *(p+63) ]
p must be aligned to 32 bytes.
void simdpp::load_packed4 ( basic_int16x16 &  a,
basic_int16x16 &  b,
basic_int16x16 &  c,
basic_int16x16 &  d,
const void *  p 
)
inline

Loads 16-bit values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:
a = [ *(p), *(p+4), *(p+8), ... , *(p+28) ]
b = [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ]
c = [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ]
d = [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+4), *(p+8), ... , *(p+60) ]
b = [ *(p+1), *(p+5), *(p+9), ... , *(p+61) ]
c = [ *(p+2), *(p+6), *(p+10), ... , *(p+62) ]
d = [ *(p+3), *(p+7), *(p+11), ... , *(p+63) ]
p must be aligned to 32 bytes.
void simdpp::load_packed4 ( basic_int32x4 &  a,
basic_int32x4 &  b,
basic_int32x4 &  c,
basic_int32x4 &  d,
const void *  p 
)
inline

Loads 32-bit values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:
a = [ *(p), *(p+4), *(p+8), *(p+12) ]
b = [ *(p+1), *(p+5), *(p+9), *(p+13) ]
c = [ *(p+2), *(p+6), *(p+10), *(p+14) ]
d = [ *(p+3), *(p+7), *(p+11), *(p+15) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+4), *(p+8), ... , *(p+28) ]
b = [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ]
c = [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ]
d = [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ]
p must be aligned to 32 bytes.
void simdpp::load_packed4 ( basic_int32x8 &  a,
basic_int32x8 &  b,
basic_int32x8 &  c,
basic_int32x8 &  d,
const void *  p 
)
inline

Loads 32-bit values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:
a = [ *(p), *(p+4), *(p+8), *(p+12) ]
b = [ *(p+1), *(p+5), *(p+9), *(p+13) ]
c = [ *(p+2), *(p+6), *(p+10), *(p+14) ]
d = [ *(p+3), *(p+7), *(p+11), *(p+15) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+4), *(p+8), ... , *(p+28) ]
b = [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ]
c = [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ]
d = [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ]
p must be aligned to 32 bytes.
void simdpp::load_packed4 ( basic_int64x2 &  a,
basic_int64x2 &  b,
basic_int64x2 &  c,
basic_int64x2 &  d,
const void *  p 
)
inline

Loads 64-bit values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:
a = [ *(p), *(p+4) ]
b = [ *(p+1), *(p+5) ]
c = [ *(p+2), *(p+6) ]
d = [ *(p+3), *(p+7) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+4), *(p+8), *(p+12) ]
b = [ *(p+1), *(p+5), *(p+9), *(p+13) ]
c = [ *(p+2), *(p+6), *(p+10), *(p+14) ]
d = [ *(p+3), *(p+7), *(p+11), *(p+15) ]
p must be aligned to 32 bytes.
void simdpp::load_packed4 ( basic_int64x4 &  a,
basic_int64x4 &  b,
basic_int64x4 &  c,
basic_int64x4 &  d,
const void *  p 
)
inline

Loads 64-bit values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:
a = [ *(p), *(p+4) ]
b = [ *(p+1), *(p+5) ]
c = [ *(p+2), *(p+6) ]
d = [ *(p+3), *(p+7) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+4), *(p+8), *(p+12) ]
b = [ *(p+1), *(p+5), *(p+9), *(p+13) ]
c = [ *(p+2), *(p+6), *(p+10), *(p+14) ]
d = [ *(p+3), *(p+7), *(p+11), *(p+15) ]
p must be aligned to 32 bytes.
void simdpp::load_packed4 ( float32x4 &  a,
float32x4 &  b,
float32x4 &  c,
float32x4 &  d,
const float *  p 
)
inline

Loads 32-bit floating-point values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:
a = [ *(p), *(p+4), *(p+8), *(p+12) ]
b = [ *(p+1), *(p+5), *(p+9), *(p+13) ]
c = [ *(p+2), *(p+6), *(p+10), *(p+14) ]
d = [ *(p+3), *(p+7), *(p+11), *(p+15) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+4), *(p+8), ... , *(p+28) ]
b = [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ]
c = [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ]
d = [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ]
p must be aligned to 32 bytes.
void simdpp::load_packed4 ( float32x8 &  a,
float32x8 &  b,
float32x8 &  c,
float32x8 &  d,
const float *  p 
)
inline

Loads 32-bit floating-point values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:
a = [ *(p), *(p+4), *(p+8), *(p+12) ]
b = [ *(p+1), *(p+5), *(p+9), *(p+13) ]
c = [ *(p+2), *(p+6), *(p+10), *(p+14) ]
d = [ *(p+3), *(p+7), *(p+11), *(p+15) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+4), *(p+8), ... , *(p+28) ]
b = [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ]
c = [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ]
d = [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ]
p must be aligned to 32 bytes.
void simdpp::load_packed4 ( float64x2 &  a,
float64x2 &  b,
float64x2 &  c,
float64x2 &  d,
const double *  p 
)
inline

Loads 64-bit floating-point values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:
a = [ *(p), *(p+4) ]
b = [ *(p+1), *(p+5) ]
c = [ *(p+2), *(p+6) ]
d = [ *(p+3), *(p+7) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+4), *(p+8), *(p+12) ]
b = [ *(p+1), *(p+5), *(p+9), *(p+13) ]
c = [ *(p+2), *(p+6), *(p+10), *(p+14) ]
d = [ *(p+3), *(p+7), *(p+11), *(p+15) ]
p must be aligned to 32 bytes.
void simdpp::load_packed4 ( float64x4 &  a,
float64x4 &  b,
float64x4 &  c,
float64x4 &  d,
const double *  p 
)
inline

Loads 64-bit floating-point values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:
a = [ *(p), *(p+4) ]
b = [ *(p+1), *(p+5) ]
c = [ *(p+2), *(p+6) ]
d = [ *(p+3), *(p+7) ]
p must be aligned to 16 bytes.
256-bit version:
a = [ *(p), *(p+4), *(p+8), *(p+12) ]
b = [ *(p+1), *(p+5), *(p+9), *(p+13) ]
c = [ *(p+2), *(p+6), *(p+10), *(p+14) ]
d = [ *(p+3), *(p+7), *(p+11), *(p+15) ]
p must be aligned to 32 bytes.
template<int s0, int s1, int s2, int s3>
basic_int8x16 simdpp::make_shuffle_bytes16_mask ( basic_int8x16 &  mask)

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of four adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group. Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
...
r12 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0+12] : b[s0+8])
r13 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1+12] : b[s1+8])
r14 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2+12] : b[s2+8])
r15 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3+12] : b[s3+8])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3>
basic_int8x32 simdpp::make_shuffle_bytes16_mask ( basic_int8x32 &  mask)

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of four adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group. Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
...
r12 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0+12] : b[s0+8])
r13 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1+12] : b[s1+8])
r14 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2+12] : b[s2+8])
r15 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3+12] : b[s3+8])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7>
basic_int8x16 simdpp::make_shuffle_bytes16_mask ( basic_int8x16 &  mask)

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of eight adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group. Values [0,7] select elements from the first vector. Values [8,15] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0] : b[s0-8])
...
r7 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7] : b[s7-8])
r8 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0+8] : b[s0])
...
r15 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7+8] : b[s7])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7>
basic_int8x32 simdpp::make_shuffle_bytes16_mask ( basic_int8x32 &  mask)

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of eight adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group. Values [0,7] select elements from the first vector. Values [8,15] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0] : b[s0-8])
...
r7 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7] : b[s7-8])
r8 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0+8] : b[s0])
...
r15 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7+8] : b[s7])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, int s8, int s9, int s10, int s11, int s12, int s13, int s14, int s15>
basic_int8x16 simdpp::make_shuffle_bytes16_mask ( basic_int8x16 &  mask)

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group. Values [0,15] select elements from the first vector. Values [16,31] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 16 ? a[s0] : b[s0-16])
r1 = (s1 == -1) ? 0 : (s1 < 16 ? a[s1] : b[s1-16])
...
r15 = (s15 == -1) ? 0 : (s15 < 16 ? a[s15] : b[s15-16])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, int s8, int s9, int s10, int s11, int s12, int s13, int s14, int s15>
basic_int8x32 simdpp::make_shuffle_bytes16_mask ( basic_int8x32 &  mask)

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group. Values [0,15] select elements from the first vector. Values [16,31] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 16 ? a[s0] : b[s0-16])
r1 = (s1 == -1) ? 0 : (s1 < 16 ? a[s1] : b[s1-16])
...
r15 = (s15 == -1) ? 0 : (s15 < 16 ? a[s15] : b[s15-16])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1>
basic_int16x8 simdpp::make_shuffle_bytes16_mask ( basic_int16x8 &  mask)

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of two adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group. Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])
...
r6 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+6] : b[s0+4])
r7 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+6] : b[s1+4])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1>
basic_int16x16 simdpp::make_shuffle_bytes16_mask ( basic_int16x16 &  mask)

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of two adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group. Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])
...
r6 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+6] : b[s0+4])
r7 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+6] : b[s1+4])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3>
basic_int16x8 simdpp::make_shuffle_bytes16_mask ( basic_int16x8 &  mask)

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of four adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group. Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
r4 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0+4] : b[s0])
r5 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1+4] : b[s1])
r6 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2+4] : b[s2])
r7 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3+4] : b[s3])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3>
basic_int16x16 simdpp::make_shuffle_bytes16_mask ( basic_int16x16 &  mask)

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of four adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group. Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
r4 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0+4] : b[s0])
r5 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1+4] : b[s1])
r6 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2+4] : b[s2])
r7 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3+4] : b[s3])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7>
basic_int16x8 simdpp::make_shuffle_bytes16_mask ( basic_int16x8 &  mask)

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group. Values [0,7] select elements from the first vector. Values [8,15] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0] : b[s0-8])
...
r7 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7] : b[s7-8])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7>
basic_int16x16 simdpp::make_shuffle_bytes16_mask ( basic_int16x16 &  mask)

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group. Values [0,7] select elements from the first vector. Values [8,15] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0] : b[s0-8])
...
r7 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7] : b[s7-8])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1>
basic_int32x4 simdpp::make_shuffle_bytes16_mask ( basic_int32x4 &  mask)

Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of two adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group. Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1>
basic_int32x8 simdpp::make_shuffle_bytes16_mask ( basic_int32x8 &  mask)

Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of two adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group. Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3>
basic_int32x4 simdpp::make_shuffle_bytes16_mask ( basic_int32x4 &  mask)

Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group. Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3>
basic_int32x8 simdpp::make_shuffle_bytes16_mask ( basic_int32x8 &  mask)

Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group. Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1>
basic_int64x2 simdpp::make_shuffle_bytes16_mask ( basic_int64x2 &  mask)

Makes a mask to shuffle an int64x2 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group. Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; a mask containing such values can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; a mask containing this value can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1>
basic_int64x4 simdpp::make_shuffle_bytes16_mask ( basic_int64x4 &  mask)

Makes a mask to shuffle an int64x2 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; in that case the mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; in that case the mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

int8x16 simdpp::max ( int8x16  a,
int8x16  b 
)
inline

Computes maximum of the signed 8-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 4 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int8x32 simdpp::max ( int8x32  a,
int8x32  b 
)
inline

Computes maximum of the signed 8-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 4 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint8x16 simdpp::max ( uint8x16  a,
uint8x16  b 
)
inline

Computes maximum of the unsigned 8-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint8x32 simdpp::max ( uint8x32  a,
uint8x32  b 
)
inline

Computes maximum of the unsigned 8-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int16x8 simdpp::max ( int16x8  a,
int16x8  b 
)
inline

Computes maximum of the signed 16-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int16x16 simdpp::max ( int16x16  a,
int16x16  b 
)
inline

Computes maximum of the signed 16-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint16x8 simdpp::max ( uint16x8  a,
uint16x8  b 
)
inline

Computes maximum of the unsigned 16-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint16x16 simdpp::max ( uint16x16  a,
uint16x16  b 
)
inline

Computes maximum of the unsigned 16-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int32x4 simdpp::max ( int32x4  a,
int32x4  b 
)
inline

Computes maximum of the signed 32-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 4 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int32x8 simdpp::max ( int32x8  a,
int32x8  b 
)
inline

Computes maximum of the signed 32-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 4 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint32x4 simdpp::max ( uint32x4  a,
uint32x4  b 
)
inline

Computes maximum of the unsigned 32-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint32x8 simdpp::max ( uint32x8  a,
uint32x8  b 
)
inline

Computes maximum of the unsigned 32-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float32x4 simdpp::max ( float32x4  a,
float32x4  b 
)
inline

Computes maxima of the values of two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = max(a0, b0)
...
rN = max(aN, bN)
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float32x8 simdpp::max ( float32x8  a,
float32x8  b 
)
inline

Computes maxima of the values of two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = max(a0, b0)
...
rN = max(aN, bN)
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float64x2 simdpp::max ( float64x2  a,
float64x2  b 
)
inline

Computes maxima of the values of two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = max(a0, b0)
...
rN = max(aN, bN)
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
float64x4 simdpp::max ( float64x4  a,
float64x4  b 
)
inline

Computes maxima of the values of two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = max(a0, b0)
...
rN = max(aN, bN)
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
uint8x16 simdpp::min ( uint8x16  a,
uint8x16  b 
)
inline

Computes minimum of the unsigned 8-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint8x32 simdpp::min ( uint8x32  a,
uint8x32  b 
)
inline

Computes minimum of the unsigned 8-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int16x8 simdpp::min ( int16x8  a,
int16x8  b 
)
inline

Computes minimum of the signed 16-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int16x16 simdpp::min ( int16x16  a,
int16x16  b 
)
inline

Computes minimum of the signed 16-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint16x8 simdpp::min ( uint16x8  a,
uint16x8  b 
)
inline

Computes minimum of the unsigned 16-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint16x16 simdpp::min ( uint16x16  a,
uint16x16  b 
)
inline

Computes minimum of the unsigned 16-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int32x4 simdpp::min ( int32x4  a,
int32x4  b 
)
inline

Computes minimum of the signed 32-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 4 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int32x8 simdpp::min ( int32x8  a,
int32x8  b 
)
inline

Computes minimum of the signed 32-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 4 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint32x4 simdpp::min ( uint32x4  a,
uint32x4  b 
)
inline

Computes minimum of the unsigned 32-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint32x8 simdpp::min ( uint32x8  a,
uint32x8  b 
)
inline

Computes minimum of the unsigned 32-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float32x4 simdpp::min ( float32x4  a,
float32x4  b 
)
inline

Computes minimum of the values in two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = min(a0, b0)
...
rN = min(aN, bN)
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float32x8 simdpp::min ( float32x8  a,
float32x8  b 
)
inline

Computes minimum of the values in two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = min(a0, b0)
...
rN = min(aN, bN)
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float64x2 simdpp::min ( float64x2  a,
float64x2  b 
)
inline

Computes minima of the values in two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = min(a0, b0)
...
rN = min(aN, bN)
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
float64x4 simdpp::min ( float64x4  a,
float64x4  b 
)
inline

Computes minima of the values in two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = min(a0, b0)
...
rN = min(aN, bN)
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
template<unsigned s0, unsigned s1>
basic_int16x8 simdpp::permute ( basic_int16x8  a)

Permutes the 16-bit values within sets of two consecutive elements of the vector.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
r2 = a[s0+2]
r3 = a[s1+2]
r4 = a[s0+4]
r5 = a[s1+4]
...
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 2 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 4 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1>
basic_int16x16 simdpp::permute ( basic_int16x16  a)

Permutes the 16-bit values within sets of two consecutive elements of the vector.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
r2 = a[s0+2]
r3 = a[s1+2]
r4 = a[s0+4]
r5 = a[s1+4]
...
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 2 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 4 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
basic_int32x4 simdpp::permute ( basic_int32x4  a)

Permutes the values of each set of four consecutive 32-bit values.

The selector values must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
256-bit version:
r4 = a[s0+4]
...
r7 = a[s3+4]
128-bit version:
  • In NEON this intrinsic results in at least 1-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
basic_int32x8 simdpp::permute ( basic_int32x8  a)

Permutes the values of each set of four consecutive 32-bit values.

The selector values must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
256-bit version:
r4 = a[s0+4]
...
r7 = a[s3+4]
128-bit version:
  • In NEON this intrinsic results in at least 1-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1>
basic_int32x4 simdpp::permute ( basic_int32x4  a)

Permutes the values of each set of two consecutive 32-bit values.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
r2 = a[s0+2]
r3 = a[s1+2]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = a[s0+6]
r7 = a[s1+6]
128-bit version:
  • In NEON this intrinsic results in at least 2-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1>
basic_int32x8 simdpp::permute ( basic_int32x8  a)

Permutes the values of each set of two consecutive 32-bit values.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
r2 = a[s0+2]
r3 = a[s1+2]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = a[s0+6]
r7 = a[s1+6]
128-bit version:
  • In NEON this intrinsic results in at least 2-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
float32x4 simdpp::permute ( float32x4  a)

Permutes the values of each set of four consecutive 32-bit floating point values.

The selector values must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
256-bit version:
r4 = a[s0+4]
...
r7 = a[s3+4]
128-bit version:
  • In NEON this intrinsic results in at least 1-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
float32x8 simdpp::permute ( float32x8  a)

Permutes the values of each set of four consecutive 32-bit floating point values.

The selector values must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
256-bit version:
r4 = a[s0+4]
...
r7 = a[s3+4]
128-bit version:
  • In NEON this intrinsic results in at least 1-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1>
float32x4 simdpp::permute ( float32x4  a)

Permutes the values of each set of two consecutive 32-bit floating-point values.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
r2 = a[s0+2]
r3 = a[s1+2]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = a[s0+6]
r7 = a[s1+6]
128-bit version:
  • In NEON this intrinsic results in at least 2-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1>
float32x8 simdpp::permute ( float32x8  a)

Permutes the values of each set of two consecutive 32-bit floating-point values.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
r2 = a[s0+2]
r3 = a[s1+2]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = a[s0+6]
r7 = a[s1+6]
128-bit version:
  • In NEON this intrinsic results in at least 2-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
basic_int64x4 simdpp::permute ( basic_int64x4  a)

Permutes the values of each set of four consecutive 64-bit values.

The selector values must be in range [0; 3].

r0 = a[s0]
r1 = a[s1]
r2 = a[s2]
r3 = a[s3]
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
float64x4 simdpp::permute ( float64x4  a)

Permutes the values of each set of four consecutive 64-bit floating-point values.

The selector values must be in range [0; 3].

r0 = a[s0]
r1 = a[s1]
r2 = a[s2]
r3 = a[s3]
  • In SSE2-AVX this intrinsic results in at least 1-2 instructions.
  • In NEON this intrinsic results in at least 1-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-4 instructions.
template<unsigned s0, unsigned s1>
basic_int64x2 simdpp::permute ( basic_int64x2  a)

Permutes the values of each set of two consecutive 64-bit values.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
256-bit version:
r2 = a[s0+2]
r3 = a[s1+2]
128-bit version:
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
template<unsigned s0, unsigned s1>
basic_int64x4 simdpp::permute ( basic_int64x4  a)

Permutes the values of each set of two consecutive 64-bit values.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
256-bit version:
r2 = a[s0+2]
r3 = a[s1+2]
128-bit version:
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
template<unsigned s0, unsigned s1>
float64x2 simdpp::permute ( float64x2  a)

Permutes the values of each set of two consecutive 64-bit floating-point values.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
256-bit version:
r2 = a[s0+2]
r3 = a[s1+2]
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
template<unsigned s0, unsigned s1>
float64x4 simdpp::permute ( float64x4  a)

Permutes the values of each set of two consecutive 64-bit floating-point values.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
256-bit version:
r2 = a[s0+2]
r3 = a[s1+2]
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
int128 simdpp::permute_zbytes16 ( int128  a,
int128  mask 
)
inline

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX this intrinsic results in at least 2 instructions.
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
float32x4 simdpp::permute_zbytes16 ( float32x4  a,
int128  mask 
)
inline

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX this intrinsic results in at least 2 instructions.
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
float64x2 simdpp::permute_zbytes16 ( float64x2  a,
int128  mask 
)
inline

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX this intrinsic results in at least 2 instructions.
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
int256 simdpp::permute_zbytes16 ( int256  a,
int256  mask 
)
inline

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX this intrinsic results in at least 2 instructions.
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
float32x8 simdpp::permute_zbytes16 ( float32x8  a,
int256  mask 
)
inline

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX this intrinsic results in at least 2 instructions.
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
float64x4 simdpp::permute_zbytes16 ( float64x4  a,
int256  mask 
)
inline

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX this intrinsic results in at least 2 instructions.
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
float32x4 simdpp::rcp_e ( float32x4  a)
inline

Computes approximate reciprocal.

Relative error is as follows:

  • 1/2 ULP for NULL and NEON
  • ~1/2730 for SSE2
  • 1/4096 for ALTIVEC
  • 1/256 for NEON_FLT_SP
r0 = approx(1.0f / a0)
...
rN = approx(1.0f / aN)
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float32x8 simdpp::rcp_e ( float32x8  a)
inline

Computes approximate reciprocal.

Relative error is as follows:

  • 1/2 ULP for NULL and NEON
  • ~1/2730 for SSE2
  • 1/4096 for ALTIVEC
  • 1/256 for NEON_FLT_SP
r0 = approx(1.0f / a0)
...
rN = approx(1.0f / aN)
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float32x4 simdpp::rcp_rh ( float32x4  x,
float32x4  a 
)
inline

Computes one Newton-Raphson iteration for reciprocal.

x is the current estimate, a are the values to estimate reciprocal for.

r0 = x0 * (2 - x0*a0)
...
rN = xN * (2 - xN*aN)

Using this function, division can be implemented as follows:

// a/b
float32x4 x;
x = rcp_e(b);
x = rcp_rh(x, b);
x = rcp_rh(x, b);
return mul(a, x);

Precision can be controlled by selecting the number of rcp_rh steps.

128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
256-bit version:
  • In AVX-AVX2 this intrinsic results in at least 3-4 instructions.
  • In SSE2-SSE4.1 this intrinsic results in at least 6-7 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 4-5 instructions.
float32x8 simdpp::rcp_rh ( float32x8  x,
float32x8  a 
)
inline

Computes one Newton-Raphson iteration for reciprocal.

x is the current estimate, a are the values to estimate reciprocal for.

r0 = x0 * (2 - x0*a0)
...
rN = xN * (2 - xN*aN)

Using this function, division can be implemented as follows:

// a/b
float32x4 x;
x = rcp_e(b);
x = rcp_rh(x, b);
x = rcp_rh(x, b);
return mul(a, x);

Precision can be controlled by selecting the number of rcp_rh steps.

128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
256-bit version:
  • In AVX-AVX2 this intrinsic results in at least 3-4 instructions.
  • In SSE2-SSE4.1 this intrinsic results in at least 6-7 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 4-5 instructions.
float32x4 simdpp::rsqrt_e ( float32x4  a)
inline

Computes approximate reciprocal square root.

Relative error is as follows:

  • 1/2 ULP for NULL and NEON
  • ~1/2730 for SSE2
  • 1/4096 for ALTIVEC
  • 1/256 for NEON_FLT_SP
r0 = approx(1 / sqrt(a0))
...
rN = approx(1 / sqrt(aN))
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float32x8 simdpp::rsqrt_e ( float32x8  a)
inline

Computes approximate reciprocal square root.

Relative error is as follows:

  • 1/2 ULP for NULL and NEON
  • ~1/2730 for SSE2
  • 1/4096 for ALTIVEC
  • 1/256 for NEON_FLT_SP
r0 = approx(1 / sqrt(a0))
...
rN = approx(1 / sqrt(aN))
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float32x4 simdpp::rsqrt_rh ( float32x4  x,
float32x4  a 
)
inline

Computes one Newton-Raphson iteration for inverse of square root.

x is the current estimate, a are the values to estimate the inverse square root for.

r0 = x0 * (3 - a0*x0*x0) * 0.5
...
rN = xN * (3 - aN*xN*xN) * 0.5
128-bit version:
  • In SSE2, SSE3, SSSE3 and SSE4.1 this intrinsic results in at least 5-7 instructions.
  • In NEON this intrinsic results in at least 3 instructions.
  • In ALTIVEC this intrinsic results in at least 4-6 instructions.
256-bit version:
  • In AVX-AVX2 this intrinsic results in at least 7 instructions.
  • In SSE2, SSE3, SSSE3 and SSE4.1 this intrinsic results in at least 10-12 instructions.
  • In NEON this intrinsic results in at least 6 instructions.
  • In ALTIVEC this intrinsic results in at least 8-10 instructions.
float32x8 simdpp::rsqrt_rh ( float32x8  x,
float32x8  a 
)
inline

Computes one Newton-Raphson iteration for inverse of square root.

x is the current estimate, a are the values to estimate the inverse square root for.

r0 = x0 * (3 - a0*x0*x0) * 0.5
...
rN = xN * (3 - aN*xN*xN) * 0.5
128-bit version:
  • In SSE2, SSE3, SSSE3 and SSE4.1 this intrinsic results in at least 5-7 instructions.
  • In NEON this intrinsic results in at least 3 instructions.
  • In ALTIVEC this intrinsic results in at least 4-6 instructions.
256-bit version:
  • In AVX-AVX2 this intrinsic results in at least 7 instructions.
  • In SSE2, SSE3, SSSE3 and SSE4.1 this intrinsic results in at least 10-12 instructions.
  • In NEON this intrinsic results in at least 6 instructions.
  • In ALTIVEC this intrinsic results in at least 8-10 instructions.
template<unsigned s0, unsigned s1>
float64x2 simdpp::shuffle1 ( float64x2  a,
float64x2  b 
)

Selects 64-bit floating-point values from two vectors.

The first value in each pair of values must come from a, the second - from b. The selector values must be in range [0; 1].

r0 = a[s0]
r1 = b[s1]
256-bit version:
r2 = a[s0+2]
r3 = b[s1+2]
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
template<unsigned s0, unsigned s1>
float64x4 simdpp::shuffle1 ( float64x4  a,
float64x4  b 
)

Selects 64-bit floating-point values from two vectors.

The first value in each pair of values must come from a, the second - from b. The selector values must be in range [0; 1].

r0 = a[s0]
r1 = b[s1]
256-bit version:
r2 = a[s0+2]
r3 = b[s1+2]
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
template<unsigned s0, unsigned s1>
basic_int64x2 simdpp::shuffle1 ( basic_int64x2  a,
basic_int64x2  b 
)

Selects 64-bit values from two vectors.

The first value in each pair of values must come from a, the second - from b. The selector values must be in range [0; 1].

r0 = a[s0]
r1 = b[s1]
256-bit version:
r2 = a[s0+2]
r3 = b[s1+2]
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1>
basic_int64x4 simdpp::shuffle1 ( basic_int64x4  a,
basic_int64x4  b 
)

Selects 64-bit values from two vectors.

The first value in each pair of values must come from a, the second - from b. The selector values must be in range [0; 1].

r0 = a[s0]
r1 = b[s1]
256-bit version:
r2 = a[s0+2]
r3 = b[s1+2]
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned a0, unsigned a1, unsigned b0, unsigned b1>
float32x4 simdpp::shuffle2 ( float32x4  a,
float32x4  b 
)

Selects 32-bit floating-point values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[a0]
r1 = a[a1]
r2 = b[b0]
r3 = b[b1]
256-bit version:
r4 = a[a0+4]
r5 = a[a1+4]
r6 = b[b0+4]
r7 = b[b1+4]
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
  • In NEON this intrinsic results in at least 1-4 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned a0, unsigned a1, unsigned b0, unsigned b1>
float32x8 simdpp::shuffle2 ( float32x8  a,
float32x8  b 
)

Selects 32-bit floating-point values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[a0]
r1 = a[a1]
r2 = b[b0]
r3 = b[b1]
256-bit version:
r4 = a[a0+4]
r5 = a[a1+4]
r6 = b[b0+4]
r7 = b[b1+4]
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
  • In NEON this intrinsic results in at least 1-4 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1>
float32x4 simdpp::shuffle2 ( float32x4  a,
float32x4  b 
)

Selects 32-bit floating-point values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[s0]
r1 = a[s1]
r2 = b[s0]
r3 = b[s1]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = b[s0+4]
r7 = b[s1+4]
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
  • In NEON this intrinsic results in at least 2-4 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1>
float32x8 simdpp::shuffle2 ( float32x8  a,
float32x8  b 
)

Selects 32-bit floating-point values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[s0]
r1 = a[s1]
r2 = b[s0]
r3 = b[s1]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = b[s0+4]
r7 = b[s1+4]
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
  • In NEON this intrinsic results in at least 2-4 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned a0, unsigned a1, unsigned b0, unsigned b1>
basic_int32x4 simdpp::shuffle2 ( basic_int32x4  a,
basic_int32x4  b 
)

Selects 32-bit values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[a0]
r1 = a[a1]
r2 = b[b0]
r3 = b[b1]
256-bit version:
r4 = a[a0+4]
r5 = a[a1+4]
r6 = b[b0+4]
r7 = b[b1+4]
128-bit version:
  • In NEON this intrinsic results in at least 1-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned a0, unsigned a1, unsigned b0, unsigned b1>
basic_int32x8 simdpp::shuffle2 ( basic_int32x8  a,
basic_int32x8  b 
)

Selects 32-bit values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[a0]
r1 = a[a1]
r2 = b[b0]
r3 = b[b1]
256-bit version:
r4 = a[a0+4]
r5 = a[a1+4]
r6 = b[b0+4]
r7 = b[b1+4]
128-bit version:
  • In NEON this intrinsic results in at least 1-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1>
basic_int32x4 simdpp::shuffle2 ( basic_int32x4  a,
basic_int32x4  b 
)

Selects 32-bit values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[s0]
r1 = a[s1]
r2 = b[s0]
r3 = b[s1]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = b[s0+4]
r7 = b[s1+4]
128-bit version:
  • In NEON this intrinsic results in at least 2-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1>
basic_int32x8 simdpp::shuffle2 ( basic_int32x8  a,
basic_int32x8  b 
)

Selects 32-bit values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[s0]
r1 = a[s1]
r2 = b[s0]
r3 = b[s1]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = b[s0+4]
r7 = b[s1+4]
128-bit version:
  • In NEON this intrinsic results in at least 2-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
int128 simdpp::shuffle_bytes16 ( int128  a,
int128  b,
int128  mask 
)
inline

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select. Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 6 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 12 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 8 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
float32x4 simdpp::shuffle_bytes16 ( float32x4  a,
float32x4  b,
int128  mask 
)
inline

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select. Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 6 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 12 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 8 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
float64x2 simdpp::shuffle_bytes16 ( float64x2  a,
float64x2  b,
int128  mask 
)
inline

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select. Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 6 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 12 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 8 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
int256 simdpp::shuffle_bytes16 ( int256  a,
int256  b,
int256  mask 
)
inline

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select. Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 6 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 12 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 8 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
float32x8 simdpp::shuffle_bytes16 ( float32x8  a,
float32x8  b,
int256  mask 
)
inline

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select. Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 6 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 12 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 8 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
float64x4 simdpp::shuffle_bytes16 ( float64x4  a,
float64x4  b,
int256  mask 
)
inline

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select. Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 6 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 12 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 8 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
int128 simdpp::shuffle_zbytes16 ( int128  a,
int128  b,
int128  mask 
)
inline

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select. Bit 7, if set, causes the result byte to be zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 9 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 18 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
float32x4 simdpp::shuffle_zbytes16 ( float32x4  a,
float32x4  b,
int128  mask 
)
inline

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select. Bit 7, if set, causes the result byte to be zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 9 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 18 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
float64x2 simdpp::shuffle_zbytes16 ( float64x2  a,
float64x2  b,
int128  mask 
)
inline

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select. Bit 7, if set, causes the result byte to be zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 9 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 18 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
int256 simdpp::shuffle_zbytes16 ( int256  a,
int256  b,
int256  mask 
)
inline

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select. Bit 7, if set, causes the result byte to be zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 9 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 18 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
float32x8 simdpp::shuffle_zbytes16 ( float32x8  a,
float32x8  b,
int256  mask 
)
inline

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select. Bit 7, if set, causes the result byte to be zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 9 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 18 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
float64x4 simdpp::shuffle_zbytes16 ( float64x4  a,
float64x4  b,
int256  mask 
)
inline

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select. Bit 7, if set, causes the result byte to be zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 9 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 18 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
float32x4 simdpp::sqrt ( float32x4  a)
inline

Computes square root.

r0 = sqrt(a0)
...
rN = sqrt(aN)
128-bit version:
  • In NEON this intrinsic results in at least 5 instructions.
  • In ALTIVEC this intrinsic results in at least 5-7 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 10 instructions.
  • In ALTIVEC this intrinsic results in at least 10-12 instructions.
float32x8 simdpp::sqrt ( float32x8  a)
inline

Computes square root.

r0 = sqrt(a0)
...
rN = sqrt(aN)
128-bit version:
  • In NEON this intrinsic results in at least 5 instructions.
  • In ALTIVEC this intrinsic results in at least 5-7 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 10 instructions.
  • In ALTIVEC this intrinsic results in at least 10-12 instructions.
float64x2 simdpp::sqrt ( float64x2  a)
inline

Computes square root.

r0 = sqrt(a0)
...
rN = sqrt(aN)
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
float64x4 simdpp::sqrt ( float64x4  a)
inline

Computes square root.

r0 = sqrt(a0)
...
rN = sqrt(aN)
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
void simdpp::transpose16 ( basic_int8x16 &  a0,
basic_int8x16 &  a1,
basic_int8x16 &  a2,
basic_int8x16 &  a3,
basic_int8x16 &  a4,
basic_int8x16 &  a5,
basic_int8x16 &  a6,
basic_int8x16 &  a7,
basic_int8x16 &  a8,
basic_int8x16 &  a9,
basic_int8x16 &  a10,
basic_int8x16 &  a11,
basic_int8x16 &  a12,
basic_int8x16 &  a13,
basic_int8x16 &  a14,
basic_int8x16 &  a15 
)
inline

Transposes a 16x16 8-bit matrix within sixteen int8x16 vectors.

r0 = [ a0_0; ...; a15_0 ]
r1 = [ a0_1; ...; a15_1 ]
...
r15 = [ a0_15; ...; a15_15 ]
128-bit version:
  • In SSE2-AVX2 and NEON this intrinsic results in at least 32 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 64 instructions.
  • In AVX2 this intrinsic results in at least 32 instructions.
  • In NEON this intrinsic results in at least 64 instructions.
The lower and higher 128-bit halves are processed as if the 128-bit instruction were applied to each of them separately.
void simdpp::transpose16 ( basic_int8x32 &  a0,
basic_int8x32 &  a1,
basic_int8x32 &  a2,
basic_int8x32 &  a3,
basic_int8x32 &  a4,
basic_int8x32 &  a5,
basic_int8x32 &  a6,
basic_int8x32 &  a7,
basic_int8x32 &  a8,
basic_int8x32 &  a9,
basic_int8x32 &  a10,
basic_int8x32 &  a11,
basic_int8x32 &  a12,
basic_int8x32 &  a13,
basic_int8x32 &  a14,
basic_int8x32 &  a15 
)
inline
void simdpp::transpose8 ( basic_int16x8 &  a0,
basic_int16x8 &  a1,
basic_int16x8 &  a2,
basic_int16x8 &  a3,
basic_int16x8 &  a4,
basic_int16x8 &  a5,
basic_int16x8 &  a6,
basic_int16x8 &  a7 
)
inline

Transposes an 8x8 16-bit matrix within eight int16x8 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0; ...; a7_0 ]
r1 = [ a0_1; a1_1; a2_1; a3_1; ...; a7_1 ]
...
r7 = [ a0_7; a1_7; a2_7; a3_7; ...; a7_7 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 24 instructions.
  • In NEON this intrinsic results in at least 12 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 48 instructions.
  • In AVX2 this intrinsic results in at least 24 instructions.
  • In NEON this intrinsic results in at least 24 instructions.
The lower and higher 128-bit halves are processed as if the 128-bit instruction were applied to each of them separately.
void simdpp::transpose8 ( basic_int16x16 &  a0,
basic_int16x16 &  a1,
basic_int16x16 &  a2,
basic_int16x16 &  a3,
basic_int16x16 &  a4,
basic_int16x16 &  a5,
basic_int16x16 &  a6,
basic_int16x16 &  a7 
)
inline
float32x4 simdpp::trunc ( float32x4  a)
inline

Rounds the values of a vector towards zero.

r0 = trunc(a0)
...
rN = trunc(aN)
128-bit version:
  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 7-9 instructions.
  • In NEON this intrinsic results in at least 5-6 instructions.
256-bit version:
  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 14-16 instructions.
  • In NEON this intrinsic results in at least 10-11 instructions.
  • In SSE4.1 and ALTIVEC this intrinsic results in at least 2 instructions.
float32x8 simdpp::trunc ( float32x8  a)
inline

Rounds the values of a vector towards zero.

r0 = trunc(a0)
...
rN = trunc(aN)
128-bit version:
  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 7-9 instructions.
  • In NEON this intrinsic results in at least 5-6 instructions.
256-bit version:
  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 14-16 instructions.
  • In NEON this intrinsic results in at least 10-11 instructions.
  • In SSE4.1 and ALTIVEC this intrinsic results in at least 2 instructions.