libsimdpp  1.0
simdpp Namespace Reference

Namespaces

 altivec
 
 neon
 
 sse
 

Classes

class  aligned_allocator
 An allocator that allocates memory with stricter alignment requirements than the defaults. More...
 
struct  expr_bit_and
 
struct  expr_bit_andnot
 
struct  expr_bit_not
 
struct  expr_bit_or
 
struct  expr_bit_xor
 
struct  expr_blend
 
struct  expr_splat2
 
struct  expr_splat4
 
struct  expr_splat8
 
struct  expr_splat16
 
struct  expr_vec_construct
 
struct  expr_vec_load_splat
 
struct  expr_vec_set_splat
 
struct  expr_vec_make_const
 
struct  expr_vec_load
 
struct  expr_vec_load_u
 
struct  expr_add
 
struct  expr_add_sat
 
struct  expr_sub
 
struct  expr_sub_sat
 
struct  expr_abs
 
struct  expr_neg
 
struct  expr_mul
 
struct  expr_mul_lo
 
struct  expr_mul_hi
 
struct  expr_mull
 
struct  expr_fmadd
 
struct  expr_fmsub
 
struct  expr_imm_shift_l
 
struct  expr_imm_shift_r
 
struct  any_vec
 Represents any vector that has B bytes of data. More...
 
struct  any_vec8
 
struct  any_vec16
 
struct  any_vec32
 
struct  any_vec64
 
struct  any_float32
 
struct  any_float64
 
struct  any_int8
 
struct  any_int16
 
struct  any_int32
 
struct  any_int64
 
class  float32< N, void >
 Class representing a float32 vector of arbitrary length. More...
 
class  mask_float32< N, void >
 Class representing a mask for 32-bit floating-point vector of arbitrary length. More...
 
class  float32< 4, void >
 Class representing float32x4 vector. More...
 
class  mask_float32< 4, void >
 Class representing possibly optimized mask data for 4x 32-bit floating-point vector. More...
 
class  float32< 8, void >
 Class representing float32x8 vector. More...
 
class  mask_float32< 8, void >
 Class representing possibly optimized mask data for 8x 32-bit floating-point vector. More...
 
class  float64< N, void >
 Class representing a float64 vector of arbitrary length. More...
 
class  mask_float64< N, void >
 Class representing a mask for 64-bit floating-point vector of arbitrary length. More...
 
class  float64< 2, void >
 
class  mask_float64< 2, void >
 Class representing possibly optimized mask data for 2x 64-bit floating point vector. More...
 
class  float64< 4, void >
 
class  mask_float64< 4, void >
 Class representing possibly optimized mask data for 4x 64-bit floating point vector. More...
 
class  float32
 
class  mask_float32
 
class  float64
 
class  mask_float64
 
class  int8
 
class  uint8
 
class  mask_int8
 
class  int16
 
class  uint16
 
class  mask_int16
 
class  int32
 
class  uint32
 
class  mask_int32
 
class  int64
 
class  uint64
 
class  mask_int64
 
class  int16< N, void >
 Class representing a signed int16 vector of arbitrary length. More...
 
class  uint16< N, void >
 Class representing an unsigned int16 vector of arbitrary length. More...
 
class  mask_int16< N, void >
 Class representing a mask for 16-bit integer vector of arbitrary length. More...
 
class  int16< 16, void >
 Class representing 16x 16-bit signed integer vector. More...
 
class  uint16< 16, void >
 Class representing 16x 16-bit unsigned integer vector. More...
 
class  mask_int16< 16, void >
 Class representing possibly optimized mask data for 16x 16-bit integer vector. More...
 
class  int16< 8, void >
 Class representing 8x 16-bit signed integer vector. More...
 
class  uint16< 8, void >
 Class representing 8x 16-bit unsigned integer vector. More...
 
class  mask_int16< 8, void >
 Class representing possibly optimized mask data for 8x 16-bit integer vector. More...
 
class  int32< N, void >
 Class representing a signed int32 vector of arbitrary length. More...
 
class  uint32< N, void >
 Class representing an unsigned int32 vector of arbitrary length. More...
 
class  mask_int32< N, void >
 Class representing a mask for 32-bit integer vector of arbitrary length. More...
 
class  int32< 4, void >
 Class representing 4x 32-bit signed integer vector. More...
 
class  uint32< 4, void >
 Class representing 4x 32-bit unsigned integer vector. More...
 
class  mask_int32< 4, void >
 Class representing possibly optimized mask data for 4x 32-bit integer vector. More...
 
class  int32< 8, void >
 Class representing 8x 32-bit signed integer vector. More...
 
class  uint32< 8, void >
 Class representing 8x 32-bit unsigned integer vector. More...
 
class  mask_int32< 8, void >
 Class representing possibly optimized mask data for 8x 32-bit integer vector. More...
 
class  int64< N, void >
 Class representing a signed int64 vector of arbitrary length. More...
 
class  uint64< N, void >
 Class representing an unsigned int64 vector of arbitrary length. More...
 
class  mask_int64< N, void >
 Class representing a mask for 64-bit integer vector of arbitrary length. More...
 
class  int64< 2, void >
 Class representing 2x 64-bit signed integer vector. More...
 
class  uint64< 2, void >
 Class representing 2x 64-bit unsigned integer vector. More...
 
class  mask_int64< 2, void >
 Class representing possibly optimized mask data for 2x 64-bit integer vector. More...
 
class  int64< 4, void >
 Class representing 4x 64-bit signed integer vector. More...
 
class  uint64< 4, void >
 Class representing 4x 64-bit unsigned integer vector. More...
 
class  mask_int64< 4, void >
 Class representing possibly optimized mask data for 4x 64-bit integer vector. More...
 
class  int8< N, void >
 Class representing a signed int8 vector of arbitrary length. More...
 
class  uint8< N, void >
 Class representing an unsigned int8 vector of arbitrary length. More...
 
class  mask_int8< N, void >
 Class representing a mask for 8-bit integer vector of arbitrary length. More...
 
class  int8< 16, void >
 Class representing 16x 8-bit signed integer vector. More...
 
class  uint8< 16, void >
 Class representing 16x 8-bit unsigned integer vector. More...
 
class  mask_int8< 16, void >
 Class representing possibly optimized mask data for 16x 8-bit integer vector. More...
 
class  int8< 32, void >
 Class representing 32x 8-bit signed integer vector. More...
 
class  uint8< 32, void >
 Class representing 32x 8-bit unsigned integer vector. More...
 
class  mask_int8< 32, void >
 Class representing possibly optimized mask data for 32x 8-bit integer vector. More...
 
struct  is_vector
 Allows detection whether specific type is a simdpp vector. More...
 
struct  is_vector< float32< N, E > >
 
struct  is_vector< float64< N, E > >
 
struct  is_vector< int8< N, E > >
 
struct  is_vector< int16< N, E > >
 
struct  is_vector< int32< N, E > >
 
struct  is_vector< int64< N, E > >
 
struct  is_vector< uint8< N, E > >
 
struct  is_vector< uint16< N, E > >
 
struct  is_vector< uint32< N, E > >
 
struct  is_vector< uint64< N, E > >
 
struct  is_vector< mask_int8< N, E > >
 
struct  is_vector< mask_int16< N, E > >
 
struct  is_vector< mask_int32< N, E > >
 
struct  is_vector< mask_int64< N, E > >
 
struct  is_vector< mask_float32< N, E > >
 
struct  is_vector< mask_float64< N, E > >
 
struct  is_mask
 Allows detection whether specific type is a simdpp mask. More...
 
struct  is_mask< mask_int8< N, E > >
 
struct  is_mask< mask_int16< N, E > >
 
struct  is_mask< mask_int32< N, E > >
 
struct  is_mask< mask_int64< N, E > >
 
struct  is_mask< mask_float32< N, E > >
 
struct  is_mask< mask_float64< N, E > >
 

Typedefs

using GetArchCb = std::function< Arch()>
 
using float32x4 = float32< 4 >
 
using float32x8 = float32< 8 >
 
using mask_float32x4 = mask_float32< 4 >
 
using mask_float32x8 = mask_float32< 8 >
 
using float64x2 = float64< 2 >
 
using float64x4 = float64< 4 >
 
using mask_float64x2 = mask_float64< 2 >
 
using mask_float64x4 = mask_float64< 4 >
 
using int8x16 = int8< 16 >
 
using int8x32 = int8< 32 >
 
using uint8x16 = uint8< 16 >
 
using uint8x32 = uint8< 32 >
 
using mask_int8x16 = mask_int8< 16 >
 
using mask_int8x32 = mask_int8< 32 >
 
using int16x8 = int16< 8 >
 
using int16x16 = int16< 16 >
 
using uint16x8 = uint16< 8 >
 
using uint16x16 = uint16< 16 >
 
using mask_int16x8 = mask_int16< 8 >
 
using mask_int16x16 = mask_int16< 16 >
 
using int32x4 = int32< 4 >
 
using int32x8 = int32< 8 >
 
using uint32x4 = uint32< 4 >
 
using uint32x8 = uint32< 8 >
 
using mask_int32x4 = mask_int32< 4 >
 
using mask_int32x8 = mask_int32< 8 >
 
using int64x2 = int64< 2 >
 
using int64x4 = int64< 4 >
 
using uint64x2 = uint64< 2 >
 
using uint64x4 = uint64< 4 >
 
using mask_int64x2 = mask_int64< 2 >
 
using mask_int64x4 = mask_int64< 4 >
 
using float32v = float32< SIMDPP_FAST_FLOAT32_SIZE >
 
using mask_float32v = mask_float32< SIMDPP_FAST_FLOAT32_SIZE >
 
using float64v = float64< SIMDPP_FAST_FLOAT64_SIZE >
 
using mask_float64v = mask_float64< SIMDPP_FAST_FLOAT64_SIZE >
 
using int8v = int8< SIMDPP_FAST_INT8_SIZE >
 
using uint8v = uint8< SIMDPP_FAST_INT8_SIZE >
 
using mask_int8v = mask_int8< SIMDPP_FAST_INT8_SIZE >
 
using int16v = int16< SIMDPP_FAST_INT16_SIZE >
 
using uint16v = uint16< SIMDPP_FAST_INT16_SIZE >
 
using mask_int16v = mask_int16< SIMDPP_FAST_INT16_SIZE >
 
using int32v = int32< SIMDPP_FAST_INT32_SIZE >
 
using uint32v = uint32< SIMDPP_FAST_INT32_SIZE >
 
using mask_int32v = mask_int32< SIMDPP_FAST_INT32_SIZE >
 
using int64v = int64< SIMDPP_FAST_INT64_SIZE >
 
using uint64v = uint64< SIMDPP_FAST_INT64_SIZE >
 
using mask_int64v = mask_int64< SIMDPP_FAST_INT64_SIZE >
 
using mask_float32v2 = mask_float32< SIMDPP_FAST_FLOAT32_SIZE *2 >
 
using float64v2 = float64< SIMDPP_FAST_FLOAT64_SIZE *2 >
 
using mask_float64v2 = mask_float64< SIMDPP_FAST_FLOAT64_SIZE *2 >
 
using int8v2 = int8< SIMDPP_FAST_INT8_SIZE *2 >
 
using uint8v2 = uint8< SIMDPP_FAST_INT8_SIZE *2 >
 
using mask_int8v2 = mask_int8< SIMDPP_FAST_INT8_SIZE *2 >
 
using int16v2 = int16< SIMDPP_FAST_INT16_SIZE *2 >
 
using uint16v2 = uint16< SIMDPP_FAST_INT16_SIZE *2 >
 
using mask_int16v2 = mask_int16< SIMDPP_FAST_INT16_SIZE *2 >
 
using int32v2 = int32< SIMDPP_FAST_INT32_SIZE *2 >
 
using uint32v2 = uint32< SIMDPP_FAST_INT32_SIZE *2 >
 
using mask_int32v2 = mask_int32< SIMDPP_FAST_INT32_SIZE *2 >
 
using int64v2 = int64< SIMDPP_FAST_INT64_SIZE *2 >
 
using uint64v2 = uint64< SIMDPP_FAST_INT64_SIZE *2 >
 
using mask_int64v2 = mask_int64< SIMDPP_FAST_INT64_SIZE *2 >
 
using float32v4 = float32< SIMDPP_FAST_FLOAT32_SIZE *4 >
 
using mask_float32v4 = mask_float32< SIMDPP_FAST_FLOAT32_SIZE *4 >
 
using float64v4 = float64< SIMDPP_FAST_FLOAT64_SIZE *4 >
 
using mask_float64v4 = mask_float64< SIMDPP_FAST_FLOAT64_SIZE *4 >
 
using int8v4 = int8< SIMDPP_FAST_INT8_SIZE *4 >
 
using uint8v4 = uint8< SIMDPP_FAST_INT8_SIZE *4 >
 
using mask_int8v4 = mask_int8< SIMDPP_FAST_INT8_SIZE *4 >
 
using int16v4 = int16< SIMDPP_FAST_INT16_SIZE *4 >
 
using uint16v4 = uint16< SIMDPP_FAST_INT16_SIZE *4 >
 
using mask_int16v4 = mask_int16< SIMDPP_FAST_INT16_SIZE *4 >
 
using int32v4 = int32< SIMDPP_FAST_INT32_SIZE *4 >
 
using uint32v4 = uint32< SIMDPP_FAST_INT32_SIZE *4 >
 
using mask_int32v4 = mask_int32< SIMDPP_FAST_INT32_SIZE *4 >
 
using int64v4 = int64< SIMDPP_FAST_INT64_SIZE *4 >
 
using uint64v4 = uint64< SIMDPP_FAST_INT64_SIZE *4 >
 
using mask_int64v4 = mask_int64< SIMDPP_FAST_INT64_SIZE *4 >
 

Enumerations

enum  Arch : std::uint32_t {
  Arch::NONE_NULL = 0,
  Arch::X86_SSE2 = 1 << 1,
  Arch::X86_SSE3 = 1 << 2,
  Arch::X86_SSSE3 = 1 << 3,
  Arch::X86_SSE4_1 = 1 << 4,
  Arch::X86_AVX = 1 << 5,
  Arch::X86_AVX2 = 1 << 6,
  Arch::X86_FMA3 = 1 << 7,
  Arch::X86_FMA4 = 1 << 8,
  Arch::X86_XOP = 1 << 9,
  Arch::ARM_NEON = 1 << 0,
  Arch::ARM_NEON_FLT_SP = 1 << 1,
  Arch::POWER_ALTIVEC = 1 << 0
}
 Identifies supported instruction set. More...
 

Functions

void transpose2 (uint16x8 &a0, uint16x8 &a1)
 Transposes four 2x2 16-bit matrices within two int16x8 vectors. More...
 
void transpose2 (int16x8 &a0, int16x8 &a1)
 
void transpose2 (uint16x16 &a0, uint16x16 &a1)
 
void transpose2 (int16x16 &a0, int16x16 &a1)
 
void transpose8 (uint8x16 &a0, uint8x16 &a1, uint8x16 &a2, uint8x16 &a3, uint8x16 &a4, uint8x16 &a5, uint8x16 &a6, uint8x16 &a7)
 Transposes two 8x8 8-bit matrices within eight int8x16 vectors. More...
 
void transpose8 (int8x16 &a0, int8x16 &a1, int8x16 &a2, int8x16 &a3, int8x16 &a4, int8x16 &a5, int8x16 &a6, int8x16 &a7)
 
void transpose8 (uint8x32 &a0, uint8x32 &a1, uint8x32 &a2, uint8x32 &a3, uint8x32 &a4, uint8x32 &a5, uint8x32 &a6, uint8x32 &a7)
 
void transpose8 (int8x32 &a0, int8x32 &a1, int8x32 &a2, int8x32 &a3, int8x32 &a4, int8x32 &a5, int8x32 &a6, int8x32 &a7)
 
void transpose8 (uint16x8 &a0, uint16x8 &a1, uint16x8 &a2, uint16x8 &a3, uint16x8 &a4, uint16x8 &a5, uint16x8 &a6, uint16x8 &a7)
 Transposes an 8x8 16-bit matrix within eight int16x8 vectors. More...
 
void transpose8 (int16x8 &a0, int16x8 &a1, int16x8 &a2, int16x8 &a3, int16x8 &a4, int16x8 &a5, int16x8 &a6, int16x8 &a7)
 
void transpose8 (uint16x16 &a0, uint16x16 &a1, uint16x16 &a2, uint16x16 &a3, uint16x16 &a4, uint16x16 &a5, uint16x16 &a6, uint16x16 &a7)
 
void transpose8 (int16x16 &a0, int16x16 &a1, int16x16 &a2, int16x16 &a3, int16x16 &a4, int16x16 &a5, int16x16 &a6, int16x16 &a7)
 
template<unsigned shift, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, void >::empty 
align16 (const any_vec8< N, V1 > &lower, const any_vec8< N, V2 > &upper)
 Extracts an int8x16 vector from two concatenated int8x16 vectors. More...
 
template<unsigned shift, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, void >::empty 
align8 (const any_vec16< N, V1 > &lower, const any_vec16< N, V2 > &upper)
 Extracts an int16x8 vector from two concatenated int16x8 vectors. More...
 
template<unsigned shift, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, void >::empty 
align4 (const any_vec32< N, V1 > &lower, const any_vec32< N, V2 > &upper)
 Extracts an int32x4 vector from two concatenated int32x4 vectors. More...
 
template<unsigned shift, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, void >::empty 
align2 (const any_vec64< N, V1 > &lower, const any_vec64< N, V2 > &upper)
 Extracts an int64x2 vector from two concatenated int64x2 vectors. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2< V1, V2,
void >::empty 
bit_xor (const any_vec< N, V1 > &a, const any_vec< N, V2 > &b)
 Computes bitwise XOR of integer or floating-point vectors. More...
 
void prefetch_read (const void *ptr)
 Prefetches data to the lowest level cache for reading. More...
 
void prefetch_write (const void *ptr)
 Prefetches data to the lowest level cache for writing. More...
 
template<class R , class T >
bit_cast (T t)
 Casts between unrelated types. More...
 
template<unsigned N, class V1 , class V2 >
mask_int8< N, mask_int8< N > > cmp_eq (const any_int8< N, V1 > &a, const any_int8< N, V2 > &b)
 Compares 8-bit values for equality. More...
 
template<unsigned N, class V1 , class V2 >
mask_int16< N, mask_int16< N > > cmp_eq (const any_int16< N, V1 > &a, const any_int16< N, V2 > &b)
 Compares 16-bit values for equality. More...
 
template<unsigned N, class V1 , class V2 >
mask_int32< N, mask_int32< N > > cmp_eq (const any_int32< N, V1 > &a, const any_int32< N, V2 > &b)
 Compares the values of two int32x4 vectors for equality. More...
 
template<unsigned N, class V1 , class V2 >
mask_int64< N, mask_int64< N > > cmp_eq (const any_int64< N, V1 > &a, const any_int64< N, V2 > &b)
 Compares the values of two int64x2 vectors for equality. More...
 
template<unsigned N, class V1 , class V2 >
mask_float32< N, mask_float32
< N > > 
cmp_eq (const any_float32< N, V1 > &a, const any_float32< N, V2 > &b)
 Compares the values of two float32x4 vectors for equality. More...
 
template<unsigned N, class V1 , class V2 >
mask_float64< N, mask_float64
< N > > 
cmp_eq (const any_float64< N, V1 > &a, const any_float64< N, V2 > &b)
 Compares the values of two float64x2 vectors for equality. More...
 
template<unsigned N, class E1 , class E2 >
mask_float64< N, mask_float64
< N > > 
cmp_ge (float64< N, E1 > a, float64< N, E2 > b)
 Compares the values of two float64x2 vectors for greater-than or equal. More...
 
template<unsigned N, class E1 , class E2 >
mask_int8< N, mask_int8< N > > cmp_gt (int8< N, E1 > a, int8< N, E2 > b)
 Compares the values of two signed int8x16 vectors for greater-than. More...
 
template<unsigned N, class E1 , class E2 >
mask_int8< N, mask_int8< N > > cmp_gt (uint8< N, E1 > a, uint8< N, E2 > b)
 Compares the values of two unsigned int8x16 vectors for greater-than. More...
 
template<unsigned N, class E1 , class E2 >
mask_int16< N, mask_int16< N > > cmp_gt (int16< N, E1 > a, int16< N, E2 > b)
 Compares the values of two signed int16x8 vectors for greater-than. More...
 
template<unsigned N, class E1 , class E2 >
mask_int16< N, mask_int16< N > > cmp_gt (uint16< N, E1 > a, uint16< N, E2 > b)
 Compares the values of two unsigned int16x8 vectors for greater-than. More...
 
template<unsigned N, class V1 , class V2 >
mask_int8< N, mask_int8< N > > cmp_neq (const any_int8< N, V1 > &a, const any_int8< N, V2 > &b)
 Compares the values of two int8x16 vectors for inequality. More...
 
template<unsigned N, class V1 , class V2 >
mask_int16< N, mask_int16< N > > cmp_neq (const any_int16< N, V1 > &a, const any_int16< N, V2 > &b)
 Compares the values of two int16x8 vectors for inequality. More...
 
template<unsigned N, class V1 , class V2 >
mask_int32< N, mask_int32< N > > cmp_neq (const any_int32< N, V1 > &a, const any_int32< N, V2 > &b)
 Compares the values of two int32x4 vectors for inequality. More...
 
template<unsigned N, class V1 , class V2 >
mask_int64< N, mask_int64< N > > cmp_neq (const any_int64< N, V1 > &a, const any_int64< N, V2 > &b)
 Compares the values of two int64x2 vectors for inequality. More...
 
template<unsigned N, class V1 , class V2 >
mask_float32< N, mask_float32
< N > > 
cmp_neq (const any_float32< N, V1 > &a, const any_float32< N, V2 > &b)
 Compares the values of two float32x4 vectors for inequality. More...
 
template<unsigned N, class V1 , class V2 >
mask_float64< N, mask_float64
< N > > 
cmp_neq (const any_float64< N, V1 > &a, const any_float64< N, V2 > &b)
 Compares the values of two float64x2 vectors for inequality. More...
 
template<unsigned id>
float extract (float32x4 a)
 Extracts an element from float32x4 vector. More...
 
template<unsigned id>
double extract (float64x2 a)
 Extracts an element from float64x2 vector. More...
 
uint16_t extract_bits_any (uint8x16 a)
 Extracts a bit from each byte of each element of an int8x16 vector. More...
 
template<unsigned id>
uint16_t extract_bits (uint8x16 a)
 Extracts specific bit from each byte of each element of an int8x16 vector. More...
 
template<unsigned N, class E >
float32< N, expr_abs< float32
< N, E > > > 
abs (float32< N, E > a)
 Computes absolute value of floating point values. More...
 
template<unsigned N, class E >
float64< N, expr_abs< float64
< N, E > > > 
abs (float64< N, E > a)
 Computes absolute value of floating point values. More...
 
template<unsigned N, class E1 , class E2 >
float32< N, expr_add< float32
< N, E1 >, float32< N, E2 > > > 
add (float32< N, E1 > a, float32< N, E2 > b)
 Adds the values of two vectors. More...
 
template<unsigned N, class E1 , class E2 >
float64< N, expr_add< float64
< N, E1 >, float64< N, E2 > > > 
add (float64< N, E1 > a, float64< N, E2 > b)
 Adds the values of two vectors. More...
 
template<unsigned N, class E >
float32< N, float32< N > > ceil (float32< N, E > a)
 Rounds the values of a vector towards positive infinity. More...
 
template<unsigned N, class E1 , class E2 >
float32< N, float32< N > > div (float32< N, E1 > a, float32< N, E2 > b)
 Divides the values of two vectors. More...
 
template<unsigned N, class E1 , class E2 >
float64< N, float64< N > > div (float64< N, E1 > a, float64< N, E2 > b)
 Divides the values of two vectors. More...
 
template<unsigned N, class E >
float32< N, float32< N > > floor (float32< N, E > a)
 Rounds the values of a vector towards negative infinity. More...
 
template<unsigned N, class E >
mask_float32< N, mask_float32
< N > > 
isnan (float32< N, E > a)
 Checks whether elements in a are IEEE754 NaN. More...
 
template<unsigned N, class E >
mask_float64< N, mask_float64
< N > > 
isnan (float64< N, E > a)
 Checks whether elements in a are IEEE754 NaN. More...
 
template<unsigned N, class E1 , class E2 >
mask_float32< N, mask_float32
< N > > 
isnan2 (float32< N, E1 > a, float32< N, E2 > b)
 Checks whether corresponding elements in either a or b are IEEE754 NaN. More...
 
template<unsigned N, class E1 , class E2 >
mask_float64< N, mask_float64
< N > > 
isnan2 (float64< N, E1 > a, float64< N, E2 > b)
 Checks whether corresponding elements in either a or b are IEEE754 NaN. More...
 
template<unsigned N, class E1 , class E2 >
float32< N, float32< N > > max (float32< N, E1 > a, float32< N, E2 > b)
 Computes maxima of the values of two vectors. More...
 
template<unsigned N, class E1 , class E2 >
float32< N, float32< N > > min (float32< N, E1 > a, float32< N, E2 > b)
 Computes minimum of the values in two vectors. More...
 
template<unsigned N, class E1 , class E2 >
float64< N, float64< N > > min (float64< N, E1 > a, float64< N, E2 > b)
 Computes minima of the values in two vectors. More...
 
template<unsigned N, class E1 , class E2 >
float32< N, expr_mul< float32
< N, E1 >, float32< N, E2 > > > 
mul (float32< N, E1 > a, float32< N, E2 > b)
 Multiplies the values of two vectors. More...
 
template<unsigned N, class E1 , class E2 >
float64< N, expr_mul< float64
< N, E1 >, float64< N, E2 > > > 
mul (float64< N, E1 > a, float64< N, E2 > b)
 Multiplies the values of two vectors. More...
 
template<unsigned N, class E >
float32< N, expr_neg< float32
< N, E > > > 
neg (float32< N, E > a)
 Negates the values of a float32x4 vector. More...
 
template<unsigned N, class E >
float64< N, expr_neg< float64
< N, E > > > 
neg (float64< N, E > a)
 Negates the values of a vector. More...
 
template<unsigned N, class E >
float32< N, float32< N > > rcp_e (float32< N, E > a)
 Computes approximate reciprocal. More...
 
template<unsigned N, class E >
float32< N, float32< N > > rcp_rh (float32< N, E > a)
 Computes one Newton-Raphson iteration for reciprocal. More...
 
template<unsigned N, class E >
float32< N, float32< N > > rsqrt_e (float32< N, E > a)
 Computes approximate reciprocal square root. More...
 
template<unsigned N, class E >
float32< N, float32< N > > rsqrt_rh (float32< N, E > a)
 Computes one Newton-Raphson iteration for inverse of square root. More...
 
template<unsigned N, class E >
float32< N, float32< N > > sign (float32< N, E > a)
 Extracts sign bits from the values in float32x4 vector. More...
 
template<unsigned N, class E >
float64< N, float64< N > > sign (float64< N, E > a)
 Extracts sign bit from the values in float64x2 vector. More...
 
template<unsigned N, class E1 >
float32< N, float32< N > > sqrt (float32< N, E1 > a)
 Computes square root. More...
 
template<unsigned N, class E1 >
float64< N, float64< N > > sqrt (float64< N, E1 > a)
 Computes square root. More...
 
template<unsigned N, class E1 , class E2 >
float32< N, expr_sub< float32
< N, E1 >, float32< N, E2 > > > 
sub (float32< N, E1 > a, float32< N, E2 > b)
 Subtracts the values of two vectors. More...
 
template<unsigned N, class E1 , class E2 >
float64< N, expr_sub< float64
< N, E1 >, float64< N, E2 > > > 
sub (float64< N, E1 > a, float64< N, E2 > b)
 Subtracts the values of two vectors. More...
 
template<unsigned N, class E >
float32< N, float32< N > > trunc (float32< N, E > a)
 Rounds the values of a vector towards zero. More...
 
template<unsigned N, class E >
uint8< N, expr_abs< int8< N, E > > > abs (int8< N, E > a)
 Computes absolute value of 8-bit integer values. More...
 
template<unsigned N, class E >
uint16< N, expr_abs< int16< N,
E > > > 
abs (int16< N, E > a)
 Computes absolute value of 16-bit integer values. More...
 
template<unsigned N, class E >
uint32< N, expr_abs< int32< N,
E > > > 
abs (int32< N, E > a)
 Computes absolute value of 32-bit integer values. More...
 
template<unsigned N, class E >
uint64< N, expr_abs< int64< N,
E > > > 
abs (int64< N, E > a)
 Computes absolute value of 64-bit integer values. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, expr_add< uint8< N,
typename V1::expr_type >
, uint8< N, typename
V2::expr_type > > >::type 
add (const any_int8< N, V1 > &a, const any_int8< N, V2 > &b)
 Adds 8-bit integer values. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, expr_add< uint16< N,
typename V1::expr_type >
, uint16< N, typename
V2::expr_type > > >::type 
add (const any_int16< N, V1 > &a, const any_int16< N, V2 > &b)
 Adds 16-bit integer values. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, expr_add< uint32< N,
typename V1::expr_type >
, uint32< N, typename
V2::expr_type > > >::type 
add (const any_int32< N, V1 > &a, const any_int32< N, V2 > &b)
 Adds 32-bit integer values. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, expr_add< uint64< N,
typename V1::expr_type >
, uint64< N, typename
V2::expr_type > > >::type 
add (const any_int64< N, V1 > &a, const any_int64< N, V2 > &b)
 Adds 64-bit integer values. More...
 
template<unsigned N, class E1 , class E2 >
int8< N, expr_add_sat< int8< N,
E1 >, int8< N, E2 > > > 
add_sat (int8< N, E1 > a, int8< N, E2 > b)
 Adds and saturates signed 8-bit integer values. More...
 
template<unsigned N, class E1 , class E2 >
int16< N, expr_add_sat< int16
< N, E1 >, int16< N, E2 > > > 
add_sat (int16< N, E1 > a, int16< N, E2 > b)
 Adds and saturates signed 16-bit integer values. More...
 
template<unsigned N, class E1 , class E2 >
uint8< N, expr_add_sat< uint8
< N, E1 >, uint8< N, E2 > > > 
add_sat (uint8< N, E1 > a, uint8< N, E2 > b)
 Adds and saturates unsigned 8-bit integer values. More...
 
template<unsigned N, class E1 , class E2 >
uint16< N, expr_add_sat
< uint16< N, E1 >, uint16< N,
E2 > > > 
add_sat (uint16< N, E1 > a, uint16< N, E2 > b)
 Adds and saturates unsigned 16-bit integer values. More...
 
template<unsigned N, class E1 , class E2 >
uint8< N, uint8< N > > avg (uint8< N, E1 > a, uint8< N, E2 > b)
 Computes rounded average of the unsigned 8-bit values. More...
 
template<unsigned N, class E1 , class E2 >
int8< N, int8< N > > avg (int8< N, E1 > a, int8< N, E2 > b)
 Computes rounded average of signed 8-bit values. More...
 
template<unsigned N, class E1 , class E2 >
uint16< N, uint16< N > > avg (uint16< N, E1 > a, uint16< N, E2 > b)
 Computes rounded average of unsigned 16-bit values. More...
 
template<unsigned N, class E1 , class E2 >
int16< N, int16< N > > avg (int16< N, E1 > a, int16< N, E2 > b)
 Computes rounded average of signed 16-bit values. More...
 
template<unsigned N, class E1 , class E2 >
uint32< N, uint32< N > > avg (uint32< N, E1 > a, uint32< N, E2 > b)
 Computes rounded average of unsigned 32-bit values. More...
 
template<unsigned N, class E1 , class E2 >
int32< N, int32< N > > avg (int32< N, E1 > a, int32< N, E2 > b)
 Computes rounded average of signed 32-bit values. More...
 
template<unsigned N, class E1 , class E2 >
uint8< N, uint8< N > > avg_trunc (uint8< N, E1 > a, uint8< N, E2 > b)
 Computes truncated average of the unsigned 8-bit values. More...
 
template<unsigned N, class E1 , class E2 >
int8< N, int8< N > > avg_trunc (int8< N, E1 > a, int8< N, E2 > b)
 Computes truncated average of signed 8-bit values. More...
 
template<unsigned N, class E1 , class E2 >
uint16< N, uint16< N > > avg_trunc (uint16< N, E1 > a, uint16< N, E2 > b)
 Computes truncated average of unsigned 16-bit values. More...
 
template<unsigned N, class E1 , class E2 >
int16< N, int16< N > > avg_trunc (int16< N, E1 > a, int16< N, E2 > b)
 Computes truncated average of signed 16-bit values. More...
 
template<unsigned N, class E1 , class E2 >
uint32< N, uint32< N > > avg_trunc (uint32< N, E1 > a, uint32< N, E2 > b)
 Computes truncated average of unsigned 32-bit values. More...
 
template<unsigned N, class E1 , class E2 >
int32< N, int32< N > > avg_trunc (int32< N, E1 > a, int32< N, E2 > b)
 Computes truncated average of signed 32-bit values. More...
 
template<unsigned N, class E1 , class E2 >
int8< N, int8< N > > max (int8< N, E1 > a, int8< N, E2 > b)
 Computes maximum of the signed 8-bit values. More...
 
template<unsigned N, class E1 , class E2 >
uint8< N, uint8< N > > max (uint8< N, E1 > a, uint8< N, E2 > b)
 Computes maximum of the unsigned 8-bit values. More...
 
template<unsigned N, class E1 , class E2 >
int16< N, int16< N > > max (int16< N, E1 > a, int16< N, E2 > b)
 Computes maximum of the signed 16-bit values. More...
 
template<unsigned N, class E1 , class E2 >
uint16< N, uint16< N > > max (uint16< N, E1 > a, uint16< N, E2 > b)
 Computes maximum of the unsigned 16-bit values. More...
 
template<unsigned N, class E1 , class E2 >
int32< N, int32< N > > max (int32< N, E1 > a, int32< N, E2 > b)
 Computes maximum of the signed 32-bit values. More...
 
template<unsigned N, class E1 , class E2 >
uint32< N, uint32< N > > max (uint32< N, E1 > a, uint32< N, E2 > b)
 Computes maximum of the unsigned 32-bit values. More...
 
template<unsigned N, class E1 , class E2 >
int8< N, int8< N > > min (int8< N, E1 > a, int8< N, E2 > b)
 Computes minimum of signed 8-bit values. More...
 
template<unsigned N, class E1 , class E2 >
uint8< N, uint8< N > > min (uint8< N, E1 > a, uint8< N, E2 > b)
 Computes minimum of the unsigned 8-bit values. More...
 
template<unsigned N, class E1 , class E2 >
int16< N, int16< N > > min (int16< N, E1 > a, int16< N, E2 > b)
 Computes minimum of the signed 16-bit values. More...
 
template<unsigned N, class E1 , class E2 >
uint16< N, uint16< N > > min (uint16< N, E1 > a, uint16< N, E2 > b)
 Computes minimum of the unsigned 16-bit values. More...
 
template<unsigned N, class E1 , class E2 >
int32< N, int32< N > > min (int32< N, E1 > a, int32< N, E2 > b)
 Computes minimum of the signed 32-bit values. More...
 
template<unsigned N, class E1 , class E2 >
uint32< N, uint32< N > > min (uint32< N, E1 > a, uint32< N, E2 > b)
 Computes minimum of the unsigned 32-bit values. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, expr_mul_lo< uint16< N,
typename V1::expr_type >
, uint16< N, typename
V2::expr_type > > >::type 
mul_lo (const any_int16< N, V1 > &a, const any_int16< N, V2 > &b)
 Multiplies 16-bit values and returns the lower part of the multiplication. More...
 
template<unsigned N, class E1 , class E2 >
int16< N, expr_mul_hi< int16
< N, E1 >, int16< N, E2 > > > 
mul_hi (int16< N, E1 > a, int16< N, E2 > b)
 Multiplies signed 16-bit values and returns the higher half of the result. More...
 
template<unsigned N, class E1 , class E2 >
uint16< N, expr_mul_hi< uint16
< N, E1 >, uint16< N, E2 > > > 
mul_hi (uint16< N, E1 > a, uint16< N, E2 > b)
 Multiplies unsigned 16-bit values and returns the higher half of the result. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, expr_mul_lo< uint32< N,
typename V1::expr_type >
, uint32< N, typename
V2::expr_type > > >::type 
mul_lo (const any_int32< N, V1 > &a, const any_int32< N, V2 > &b)
 Multiplies 32-bit values and returns the lower half of the result. More...
 
template<unsigned N, class E >
int8< N, expr_neg< int8< N, E > > > neg (int8< N, E > a)
 Negates signed 8-bit values. More...
 
template<unsigned N, class E >
int16< N, expr_neg< int16< N,
E > > > 
neg (int16< N, E > a)
 Negates signed 16-bit values. More...
 
template<unsigned N, class E >
int32< N, expr_neg< int32< N,
E > > > 
neg (int32< N, E > a)
 Negates signed 32-bit values. More...
 
template<unsigned N, class E >
int64< N, expr_neg< int64< N,
E > > > 
neg (int64< N, E > a)
 Negates signed 64-bit values. More...
 
template<unsigned N, class E >
int8< N, int8< N > > shift_r (int8< N, E > a, unsigned count)
 Shifts signed 8-bit values right by count bits while shifting in the sign bit. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, expr_sub< uint8< N,
typename V1::expr_type >
, uint8< N, typename
V2::expr_type > > >::type 
sub (const any_int8< N, V1 > &a, const any_int8< N, V2 > &b)
 Subtracts 8-bit integer values. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, expr_sub< uint16< N,
typename V1::expr_type >
, uint16< N, typename
V2::expr_type > > >::type 
sub (const any_int16< N, V1 > &a, const any_int16< N, V2 > &b)
 Subtracts 16-bit integer values. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, expr_sub< uint32< N,
typename V1::expr_type >
, uint32< N, typename
V2::expr_type > > >::type 
sub (const any_int32< N, V1 > &a, const any_int32< N, V2 > &b)
 Subtracts 32-bit integer values. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, expr_sub< uint64< N,
typename V1::expr_type >
, uint64< N, typename
V2::expr_type > > >::type 
sub (const any_int64< N, V1 > &a, const any_int64< N, V2 > &b)
 Subtracts 64-bit integer values. More...
 
template<unsigned N, class E1 , class E2 >
int8< N, expr_sub_sat< int8< N,
E1 >, int8< N, E2 > > > 
sub_sat (int8< N, E1 > a, int8< N, E2 > b)
 Subtracts and saturates signed 8-bit integer values. More...
 
template<unsigned N, class E1 , class E2 >
int16< N, expr_sub_sat< int16
< N, E1 >, int16< N, E2 > > > 
sub_sat (int16< N, E1 > a, int16< N, E2 > b)
 Subtracts and saturates signed 16-bit integer values. More...
 
template<unsigned N, class E1 , class E2 >
uint8< N, expr_sub_sat< uint8
< N, E1 >, uint8< N, E2 > > > 
sub_sat (uint8< N, E1 > a, uint8< N, E2 > b)
 Subtracts and saturates unsigned 8-bit integer values. More...
 
template<unsigned N, class E1 , class E2 >
uint16< N, expr_sub_sat
< uint16< N, E1 >, uint16< N,
E2 > > > 
sub_sat (uint16< N, E1 > a, uint16< N, E2 > b)
 Subtracts and saturates unsigned 16-bit integer values. More...
 
template<unsigned id>
uint8x16 insert (uint8x16 a, uint8_t x)
 Inserts an element into int8x16 vector at the position identified by id. More...
 
template<unsigned id>
uint16x8 insert (uint16x8 a, uint16_t x)
 Inserts an element into int16x8 vector at the position identified by id. More...
 
template<unsigned id>
uint32x4 insert (uint32x4 a, uint32_t x)
 Inserts an element into int32x4 vector at the position identified by id. More...
 
template<unsigned id>
uint64x2 insert (uint64x2 a, uint64_t x)
 Inserts an element into int64x2 vector at the position identified by id. More...
 
template<unsigned id>
float32x4 insert (float32x4 a, float x)
 Inserts an element into float32x4 vector at the position identified by id. More...
 
template<unsigned id>
float64x2 insert (float64x2 a, double x)
 Inserts an element into float64x2 vector at the position identified by id. More...
 
template<class V = expr_vec_load>
load (const void *p)
 Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an aligned memory location. More...
 
template<unsigned N, class V >
void load_packed2 (any_vec< N, V > &a, any_vec< N, V > &b, const void *p)
 Loads values packed in pairs, de-interleaves them and stores the result into two vectors. More...
 
template<unsigned N, class V >
void load_packed3 (any_vec< N, V > &a, any_vec< N, V > &b, any_vec< N, V > &c, const void *p)
 Loads values packed in triplets, de-interleaves them and stores the result into three vectors. More...
 
template<unsigned N, class V >
void load_packed4 (any_vec< N, V > &a, any_vec< N, V > &b, any_vec< N, V > &c, any_vec< N, V > &d, const void *p)
 Loads values packed in quartets, de-interleaves them and stores the result into four vectors. More...
 
template<class V = expr_vec_load_splat>
load_splat (const void *p)
 Loads a value from a memory location and broadcasts it to all elements of a vector. More...
 
template<class V = expr_vec_load_u>
load_u (const void *p)
 Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...
 
template<unsigned s0, unsigned s1, unsigned N, class V >
detail::get_expr_nomask< V,
void >::empty 
permute2 (const any_vec16< N, V > &a)
 Permutes the 16-bit values within sets of two consecutive elements of the vector. More...
 
template<unsigned s0, unsigned s1, unsigned N, class V >
detail::get_expr_nomask< V,
void >::empty 
permute2 (const any_vec32< N, V > &a)
 Permutes the 32-bit values within sets of two consecutive elements of the vector. More...
 
template<unsigned s0, unsigned s1, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, void >::empty 
shuffle1 (const any_vec64< N, V1 > &a, const any_vec64< N, V2 > &b)
 Selects 64-bit values from two vectors. More...
 
template<unsigned N, class V >
void store (void *p, const any_vec< N, V > &a)
 Stores a 128-bit or 256-bit integer vector to an aligned memory location. More...
 
template<unsigned N, class V >
void store_first (void *p, const any_vec< N, V > &a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
template<unsigned N, class V >
void store_last (void *p, const any_vec< N, V > &a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
template<unsigned N, class V1 , class V2 >
void store_packed2 (void *p, const any_vec< N, V1 > &a, const any_vec< N, V2 > &b)
 Interleaves values from two vectors and stores the result into successive locations starting from p. More...
 
template<unsigned N, class V1 , class V2 , class V3 >
void store_packed3 (void *p, const any_vec< N, V1 > &a, const any_vec< N, V2 > &b, const any_vec< N, V3 > &c)
 Interleaves values from three vectors and stores the result into successive locations starting from p. More...
 
template<unsigned N, class V1 , class V2 , class V3 , class V4 >
void store_packed4 (void *p, const any_vec< N, V1 > &a, const any_vec< N, V2 > &b, const any_vec< N, V3 > &c, const any_vec< N, V4 > &d)
 Interleaves values from four vectors and stores the result into successive locations starting from p. More...
 
template<unsigned N, class V >
void stream (void *p, const any_vec< N, V > &a)
 Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More...
 
uint16x16 to_int16 (int8x16 a)
 Sign extends the 16 values of a signed int8x16 vector to 16-bits. More...
 
uint16x16 to_int16 (uint8x16 a)
 Extends the 16 values of an unsigned int8x16 vector to 16-bits. More...
 
int32x8 to_int32 (int16x8 a)
 Sign extends the 8 values of a signed int16x8 vector to 32-bits. More...
 
template<unsigned N, class E1 , class E2 >
uint8< N, uint8< N > > unzip16_lo (uint8< N, E1 > a, uint8< N, E2 > b)
 De-interleaves the odd (lower) elements of two int8x16 vectors. More...
 
template<unsigned N, class E1 , class E2 >
uint16< N, uint16< N > > unzip8_lo (uint16< N, E1 > a, uint16< N, E2 > b)
 De-interleaves the odd (lower) elements of two int16x8 vectors. More...
 
template<unsigned N, class E1 , class E2 >
uint32< N, uint32< N > > unzip4_lo (uint32< N, E1 > a, uint32< N, E2 > b)
 De-interleaves the odd (lower) elements of two int32x4 vectors. More...
 
template<unsigned N, class E1 , class E2 >
uint64< N, uint64< N > > unzip2_lo (uint64< N, E1 > a, uint64< N, E2 > b)
 De-interleaves the odd (lower) elements of two int64x2 vectors. More...
 
template<unsigned N, class E1 , class E2 >
float32< N, float32< N > > unzip4_lo (float32< N, E1 > a, float32< N, E2 > b)
 De-interleaves the odd (lower) elements of two float32x4 vectors. More...
 
template<unsigned N, class E1 , class E2 >
float64< N, float64< N > > unzip2_lo (float64< N, E1 > a, float64< N, E2 > b)
 De-interleaves the odd (lower) elements of two float64x2 vectors. More...
 
Arch get_arch_gcc_builtin_cpu_supports ()
 Retrieves supported architecture using GCC __builtin_cpu_supports function. More...
 
Arch get_arch_linux_cpuinfo ()
 Retrieves supported architecture from Linux /proc/cpuinfo file. More...
 
Arch this_compile_arch ()
 Returns the instruction set flags that will be required by the currently compiled code. More...
 
void transpose2 (uint32x4 &a0, uint32x4 &a1)
 Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...
 
void transpose2 (int32x4 &a0, int32x4 &a1)
 Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...
 
void transpose2 (uint32x8 &a0, uint32x8 &a1)
 Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...
 
void transpose2 (int32x8 &a0, int32x8 &a1)
 Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...
 
void transpose2 (uint64x2 &a0, uint64x2 &a1)
 Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...
 
void transpose2 (int64x2 &a0, int64x2 &a1)
 Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...
 
void transpose2 (uint64x4 &a0, uint64x4 &a1)
 Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...
 
void transpose2 (int64x4 &a0, int64x4 &a1)
 Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...
 
void transpose2 (float32x4 &a0, float32x4 &a1)
 Transposes two 2x2 32-bit matrices within two float32x4 vectors. More...
 
void transpose2 (float32x8 &a0, float32x8 &a1)
 Transposes two 2x2 32-bit matrices within two float32x4 vectors. More...
 
void transpose2 (float64x2 &a0, float64x2 &a1)
 Transposes a 2x2 64-bit matrix within two float64x2 vectors. More...
 
void transpose2 (float64x4 &a0, float64x4 &a1)
 Transposes a 2x2 64-bit matrix within two float64x2 vectors. More...
 
void transpose4 (uint32x4 &a0, uint32x4 &a1, uint32x4 &a2, uint32x4 &a3)
 Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...
 
void transpose4 (int32x4 &a0, int32x4 &a1, int32x4 &a2, int32x4 &a3)
 Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...
 
void transpose4 (int32x8 &a0, int32x8 &a1, int32x8 &a2, int32x8 &a3)
 Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...
 
void transpose4 (uint8x16 &a0, uint8x16 &a1, uint8x16 &a2, uint8x16 &a3)
 Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...
 
void transpose4 (int8x16 &a0, int8x16 &a1, int8x16 &a2, int8x16 &a3)
 Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...
 
void transpose4 (uint32x8 &a0, uint32x8 &a1, uint32x8 &a2, uint32x8 &a3)
 Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...
 
void transpose4 (uint8x32 &a0, uint8x32 &a1, uint8x32 &a2, uint8x32 &a3)
 Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...
 
void transpose4 (int8x32 &a0, int8x32 &a1, int8x32 &a2, int8x32 &a3)
 Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...
 
void transpose4 (uint16x8 &a0, uint16x8 &a1, uint16x8 &a2, uint16x8 &a3)
 Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...
 
void transpose4 (int16x8 &a0, int16x8 &a1, int16x8 &a2, int16x8 &a3)
 Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...
 
void transpose4 (uint16x16 &a0, uint16x16 &a1, uint16x16 &a2, uint16x16 &a3)
 Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...
 
void transpose4 (int16x16 &a0, int16x16 &a1, int16x16 &a2, int16x16 &a3)
 Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...
 
void transpose4 (float32x4 &a0, float32x4 &a1, float32x4 &a2, float32x4 &a3)
 Transposes a 4x4 32-bit matrix within four float32x4 vectors. More...
 
void transpose4 (float32x8 &a0, float32x8 &a1, float32x8 &a2, float32x8 &a3)
 Transposes a 4x4 32-bit matrix within four float32x4 vectors. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr_bitwise2_and
< expr_bit_and, V1, V2 >::type 
bit_and (const any_vec< N, V1 > &a, const any_vec< N, V2 > &b)
 Computes bitwise AND of integer or floating-point vectors. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr_bitwise2_and
< expr_bit_andnot, V1, V2 >
::type 
bit_andnot (const any_vec< N, V1 > &a, const any_vec< N, V2 > &b)
 Computes bitwise AND NOT of two integer or floating-point vectors. More...
 
template<unsigned N, class V >
detail::get_expr< V,
expr_bit_not< V > >::empty 
bit_not (const any_vec< N, V > &a)
 Computes bitwise NOT of an integer or floating-point vector. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr_bit_or< V1,
V2 >::type 
bit_or (const any_vec< N, V1 > &a, const any_vec< N, V2 > &b)
 Computes bitwise OR of integer vectors. More...
 
template<unsigned N, class V1 , class V2 , class V3 >
detail::get_expr_blend< V1, V2,
V3 >::type 
blend (const any_vec< N, V1 > &on, const any_vec< N, V2 > &off, const any_vec< N, V3 > &mask)
 Composes a vector from two sources according to a mask. More...
 
template<unsigned N, class E1 , class E2 >
mask_float32< N, mask_float32
< N > > 
cmp_ge (float32< N, E1 > a, float32< N, E2 > b)
 Compares the values of two float32x4 vectors for greater-than or equal. More...
 
template<unsigned N, class E1 , class E2 >
mask_int32< N, mask_int32< N > > cmp_gt (int32< N, E1 > a, int32< N, E2 > b)
 Compares the values of two signed int32x4 vectors for greater-than. More...
 
template<unsigned N, class E1 , class E2 >
mask_int32< N, mask_int32< N > > cmp_gt (uint32< N, E1 > a, uint32< N, E2 > b)
 Compares the values of two unsigned int32x4 vectors for greater-than. More...
 
template<unsigned N, class E1 , class E2 >
mask_float32< N, mask_float32
< N > > 
cmp_gt (float32< N, E1 > a, float32< N, E2 > b)
 Compares the values of two float32x4 vectors for greater-than. More...
 
template<unsigned N, class E1 , class E2 >
mask_float64< N, mask_float64
< N > > 
cmp_gt (float64< N, E1 > a, float64< N, E2 > b)
 Compares the values of two float64x2 vectors for greater-than. More...
 
template<unsigned N, class E1 , class E2 >
mask_float32< N, mask_float32
< N > > 
cmp_le (float32< N, E1 > a, float32< N, E2 > b)
 Compares the values of two float32x4 vectors for less-than or equal. More...
 
template<unsigned N, class E1 , class E2 >
mask_float64< N, mask_float64
< N > > 
cmp_le (float64< N, E1 > a, float64< N, E2 > b)
 Compares the values of two float64x2 vectors for less-than or equal. More...
 
template<unsigned N, class E1 , class E2 >
mask_int8< N, mask_int8< N > > cmp_lt (int8< N, E1 > a, int8< N, E2 > b)
 Compares the values of two signed int8x16 vectors for less-than. More...
 
template<unsigned N, class E1 , class E2 >
mask_int8< N, mask_int8< N > > cmp_lt (uint8< N, E1 > a, uint8< N, E2 > b)
 Compares the values of two unsigned int8x16 vectors for less-than. More...
 
template<unsigned N, class E1 , class E2 >
mask_int16< N, mask_int16< N > > cmp_lt (int16< N, E1 > a, int16< N, E2 > b)
 Compares the values of two signed int16x8 vectors for less-than. More...
 
template<unsigned N, class E1 , class E2 >
mask_int16< N, mask_int16< N > > cmp_lt (uint16< N, E1 > a, uint16< N, E2 > b)
 Compares the values of two unsigned int16x8 vectors for less-than. More...
 
template<unsigned N, class E1 , class E2 >
mask_int32< N, mask_int32< N > > cmp_lt (int32< N, E1 > a, int32< N, E2 > b)
 Compares the values of two signed int32x4 vectors for less-than. More...
 
template<unsigned N, class E1 , class E2 >
mask_int32< N, mask_int32< N > > cmp_lt (uint32< N, E1 > a, uint32< N, E2 > b)
 Compares the values of two unsigned int32x4 vectors for less-than. More...
 
template<unsigned N, class E1 , class E2 >
mask_float32< N, mask_float32
< N > > 
cmp_lt (float32< N, E1 > a, float32< N, E2 > b)
 Compares the values of two float32x4 vectors for less-than. More...
 
template<unsigned N, class E1 , class E2 >
mask_float64< N, mask_float64
< N > > 
cmp_lt (float64< N, E1 > a, float64< N, E2 > b)
 Compares the values of two float64x2 vectors for less-than. More...
 
template<unsigned id>
uint8_t extract (uint8x16 a)
 Extracts the id-th element from int8x16 vector. More...
 
template<unsigned id>
int8_t extract (int8x16 a)
 Extracts the id-th element from int8x16 vector. More...
 
template<unsigned id>
uint16_t extract (uint16x8 a)
 Extracts the id-th element from int16x8 vector. More...
 
template<unsigned id>
int16_t extract (int16x8 a)
 Extracts the id-th element from int16x8 vector. More...
 
template<unsigned id>
uint32_t extract (uint32x4 a)
 Extracts the id-th element from int32x4 vector. More...
 
template<unsigned id>
int32_t extract (int32x4 a)
 Extracts the id-th element from int32x4 vector. More...
 
template<unsigned id>
uint64_t extract (uint64x2 a)
 Extracts an element from int64x2 vector. More...
 
template<unsigned id>
int64_t extract (int64x2 a)
 Extracts an element from int64x2 vector. More...
 
void split (uint8x32 a, uint8x16 &r1, uint8x16 &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
void split (uint16x16 a, uint16x8 &r1, uint16x8 &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
void split (uint32x8 a, uint32x4 &r1, uint32x4 &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
void split (uint64x4 a, uint64x2 &r1, uint64x2 &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
void split (int8x32 a, int8x16 &r1, int8x16 &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
void split (int16x16 a, int16x8 &r1, int16x8 &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
void split (int32x8 a, int32x4 &r1, int32x4 &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
void split (int64x4 a, int64x2 &r1, int64x2 &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
void split (float32x8 a, float32x4 &r1, float32x4 &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
void split (float64x4 a, float64x2 &r1, float64x2 &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
template<unsigned N>
void split (uint8< N > a, uint8< N/2 > &r1, uint8< N/2 > &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
template<unsigned N>
void split (uint16< N > a, uint16< N/2 > &r1, uint16< N/2 > &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
template<unsigned N>
void split (uint32< N > a, uint32< N/2 > &r1, uint32< N/2 > &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
template<unsigned N>
void split (uint64< N > a, uint64< N/2 > &r1, uint64< N/2 > &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
template<unsigned N>
void split (int8< N > a, int8< N/2 > &r1, int8< N/2 > &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
template<unsigned N>
void split (int16< N > a, int16< N/2 > &r1, int16< N/2 > &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
template<unsigned N>
void split (int32< N > a, int32< N/2 > &r1, int32< N/2 > &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
template<unsigned N>
void split (int64< N > a, int64< N/2 > &r1, int64< N/2 > &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
template<unsigned N>
void split (float32< N > a, float32< N/2 > &r1, float32< N/2 > &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
template<unsigned N>
void split (float64< N > a, float64< N/2 > &r1, float64< N/2 > &r2)
 Splits a 256-bit vector into two 128-bit vectors. More...
 
template<unsigned N, class E1 , class E2 , class E3 >
float32< N, expr_fmadd
< float32< N, E1 >, float32< N,
E2 >, float32< N, E3 > > > 
fmadd (float32< N, E1 > a, float32< N, E2 > b, float32< N, E3 > c)
 Performs a fused multiply-add operation. More...
 
template<unsigned N, class E1 , class E2 , class E3 >
float64< N, expr_fmadd
< float64< N, E1 >, float64< N,
E2 >, float64< N, E3 > > > 
fmadd (float64< N, E1 > a, float64< N, E2 > b, float64< N, E3 > c)
 Performs a fused multiply-add operation. More...
 
template<unsigned N, class E1 , class E2 , class E3 >
float32< N, expr_fmsub
< float32< N, E1 >, float32< N,
E2 >, float32< N, E3 > > > 
fmsub (float32< N, E1 > a, float32< N, E2 > b, float32< N, E3 > c)
 Performs a fused multiply-subtract operation. More...
 
template<unsigned N, class E1 , class E2 , class E3 >
float64< N, expr_fmsub
< float64< N, E1 >, float64< N,
E2 >, float64< N, E3 > > > 
fmsub (float64< N, E1 > a, float64< N, E2 > b, float64< N, E3 > c)
 Performs a fused multiply-subtract operation. More...
 
template<unsigned N, class E1 , class E2 >
float64< N, float64< N > > max (float64< N, E1 > a, float64< N, E2 > b)
 Computes maxima of the values of two vectors. More...
 
template<unsigned P>
uint8x16 div_p (uint8x16 num, uint8x16 den)
 Divides one 8-bit unsigned number by another. More...
 
template<unsigned P>
uint16x8 div_p (uint16x8 num, uint16x8 den)
 Divides one 16-bit unsigned number by another. More...
 
template<unsigned N, class E1 , class E2 >
int32< N, expr_mull< int16< N,
E1 >, int16< N, E2 > > > 
mull (int16< N, E1 > a, int16< N, E2 > b)
 Multiplies signed 16-bit values and expands the results to 32 bits. More...
 
template<unsigned N, class E1 , class E2 >
uint32< N, expr_mull< uint16
< N, E1 >, uint16< N, E2 > > > 
mull (uint16< N, E1 > a, uint16< N, E2 > b)
 Multiplies unsigned 16-bit values and expands the results to 32 bits. More...
 
template<unsigned N, class E1 , class E2 >
int64< N, expr_mull< int32< N,
E1 >, int32< N, E2 > > > 
mull (int32< N, E1 > a, int32< N, E2 > b)
 Multiplies signed 32-bit values and expands the results to 64 bits. More...
 
template<unsigned N, class E1 , class E2 >
uint64< N, expr_mull< uint32
< N, E1 >, uint32< N, E2 > > > 
mull (uint32< N, E1 > a, uint32< N, E2 > b)
 Multiplies unsigned 32-bit values in the lower halves of the vectors and expands the results to 64 bits. More...
 
template<unsigned N, class E >
int8< N, int8< N > > shift_l (int8< N, E > a, unsigned count)
 Shifts 8-bit values left by count bits while shifting in zeros. More...
 
template<unsigned N, class E >
uint8< N, uint8< N > > shift_l (uint8< N, E > a, unsigned count)
 Shifts 8-bit values left by count bits while shifting in zeros. More...
 
template<unsigned N, class E >
int16< N, int16< N > > shift_l (int16< N, E > a, unsigned count)
 Shifts 16-bit values left by count bits while shifting in zeros. More...
 
template<unsigned N, class E >
uint16< N, uint16< N > > shift_l (uint16< N, E > a, unsigned count)
 Shifts 16-bit values left by count bits while shifting in zeros. More...
 
template<unsigned N, class E >
int32< N, int32< N > > shift_l (int32< N, E > a, unsigned count)
 Shifts 32-bit values left by count bits while shifting in zeros. More...
 
template<unsigned N, class E >
uint32< N, uint32< N > > shift_l (uint32< N, E > a, unsigned count)
 Shifts 32-bit values left by count bits while shifting in zeros. More...
 
template<unsigned N, class E >
int64< N, int64< N > > shift_l (int64< N, E > a, unsigned count)
 Shifts 64-bit values left by count bits while shifting in zeros. More...
 
template<unsigned N, class E >
uint64< N, uint64< N > > shift_l (uint64< N, E > a, unsigned count)
 Shifts 64-bit values left by count bits while shifting in zeros. More...
 
template<unsigned count, unsigned N, class E >
int8< N, int8< N > > shift_l (int8< N, E > a)
 Shifts 8-bit values left by count bits while shifting in zeros. More...
 
template<unsigned count, unsigned N, class E >
uint8< N, uint8< N > > shift_l (uint8< N, E > a)
 Shifts 8-bit values left by count bits while shifting in zeros. More...
 
template<unsigned count, unsigned N, class E >
int16< N, int16< N > > shift_l (int16< N, E > a)
 Shifts 16-bit values left by count bits while shifting in zeros. More...
 
template<unsigned count, unsigned N, class E >
uint16< N, uint16< N > > shift_l (uint16< N, E > a)
 Shifts 16-bit values left by count bits while shifting in zeros. More...
 
template<unsigned count, unsigned N, class E >
int32< N, int32< N > > shift_l (int32< N, E > a)
 Shifts 32-bit values left by count bits while shifting in zeros. More...
 
template<unsigned count, unsigned N, class E >
uint32< N, uint32< N > > shift_l (uint32< N, E > a)
 Shifts 32-bit values left by count bits while shifting in zeros. More...
 
template<unsigned count, unsigned N, class E >
int64< N, int64< N > > shift_l (int64< N, E > a)
 Shifts 64-bit values left by count bits while shifting in zeros. More...
 
template<unsigned count, unsigned N, class E >
uint64< N, uint64< N > > shift_l (uint64< N, E > a)
 Shifts 64-bit values left by count bits while shifting in zeros. More...
 
template<unsigned N, class E >
uint8< N, uint8< N > > shift_r (uint8< N, E > a, unsigned count)
 Shifts unsigned 8-bit values right by count bits while shifting in zeros. More...
 
template<unsigned N, class E >
int16< N, int16< N > > shift_r (int16< N, E > a, unsigned count)
 Shifts signed 16-bit values right by count bits while shifting in the sign bit. More...
 
template<unsigned N, class E >
uint16< N, uint16< N > > shift_r (uint16< N, E > a, unsigned count)
 Shifts unsigned 16-bit values right by count bits while shifting in zeros. More...
 
template<unsigned N, class E >
int32< N, int32< N > > shift_r (int32< N, E > a, unsigned count)
 Shifts signed 32-bit values right by count bits while shifting in the sign bit. More...
 
template<unsigned N, class E >
uint32< N, uint32< N > > shift_r (uint32< N, E > a, unsigned count)
 Shifts unsigned 32-bit values right by count bits while shifting in zeros. More...
 
template<unsigned N, class E >
int64< N, int64< N > > shift_r (int64< N, E > a, unsigned count)
 Shifts signed 64-bit values right by count bits while shifting in the sign bit. More...
 
template<unsigned N, class E >
uint64< N, uint64< N > > shift_r (uint64< N, E > a, unsigned count)
 Shifts unsigned 64-bit values right by count bits while shifting in zeros. More...
 
template<unsigned count, unsigned N, class E >
int8< N, int8< N > > shift_r (int8< N, E > a)
 Shifts signed 8-bit values right by count bits while shifting in the sign bit. More...
 
template<unsigned count, unsigned N, class E >
uint8< N, uint8< N > > shift_r (uint8< N, E > a)
 Shifts unsigned 8-bit values right by count bits while shifting in zeros. More...
 
template<unsigned count, unsigned N, class E >
int16< N, int16< N > > shift_r (int16< N, E > a)
 Shifts signed 16-bit values right by count bits while shifting in the sign bit. More...
 
template<unsigned count, unsigned N, class E >
uint16< N, uint16< N > > shift_r (uint16< N, E > a)
 Shifts unsigned 16-bit values right by count bits while shifting in zeros. More...
 
template<unsigned count, unsigned N, class E >
int32< N, int32< N > > shift_r (int32< N, E > a)
 Shifts signed 32-bit values right by count bits while shifting in the sign bit. More...
 
template<unsigned count, unsigned N, class E >
uint32< N, uint32< N > > shift_r (uint32< N, E > a)
 Shifts unsigned 32-bit values right by count bits while shifting in zeros. More...
 
template<unsigned count, unsigned N, class E >
int64< N, int64< N > > shift_r (int64< N, E > a)
 Shifts signed 64-bit values right by count bits while shifting in the sign bit. More...
 
template<unsigned count, unsigned N, class E >
uint64< N, uint64< N > > shift_r (uint64< N, E > a)
 Shifts unsigned 64-bit values right by count bits while shifting in zeros. More...
 
template<class E1 , class E2 >
uint8x32 combine (uint8< 16, E1 > a, uint8< 16, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class E1 , class E2 >
uint16x16 combine (uint16< 8, E1 > a, uint16< 8, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class E1 , class E2 >
uint32x8 combine (uint32< 4, E1 > a, uint32< 4, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class E1 , class E2 >
uint64x4 combine (uint64< 2, E1 > a, uint64< 2, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class E1 , class E2 >
int16x16 combine (int16< 8, E1 > a, int16< 8, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class E1 , class E2 >
int32x8 combine (int32< 4, E1 > a, int32< 4, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class E1 , class E2 >
int64x4 combine (int64< 2, E1 > a, int64< 2, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class E1 , class E2 >
float32x8 combine (float32< 4, E1 > a, float32< 4, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class E1 , class E2 >
float64x4 combine (float64< 2, E1 > a, float64< 2, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<unsigned N, class E1 , class E2 >
uint8< N *2 > combine (uint8< N, E1 > a1, uint8< N, E2 > a2)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<unsigned N, class E1 , class E2 >
uint16< N *2 > combine (uint16< N, E1 > a1, uint16< N, E2 > a2)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<unsigned N, class E1 , class E2 >
uint32< N *2 > combine (uint32< N, E1 > a1, uint32< N, E2 > a2)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<unsigned N, class E1 , class E2 >
uint64< N *2 > combine (uint64< N, E1 > a1, uint64< N, E2 > a2)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<unsigned N, class E1 , class E2 >
int8< N *2 > combine (int8< N, E1 > a1, int8< N, E2 > a2)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<unsigned N, class E1 , class E2 >
int16< N *2 > combine (int16< N, E1 > a1, int16< N, E2 > a2)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<unsigned N, class E1 , class E2 >
int32< N *2 > combine (int32< N, E1 > a1, int32< N, E2 > a2)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<unsigned N, class E1 , class E2 >
int64< N *2 > combine (int64< N, E1 > a1, int64< N, E2 > a2)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<unsigned N, class E1 , class E2 >
float32< N *2 > combine (float32< N, E1 > a1, float32< N, E2 > a2)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<unsigned N, class E1 , class E2 >
float64< N *2 > combine (float64< N, E1 > a1, float64< N, E2 > a2)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class V = expr_vec_make_const<double,1>>
make_float (double v0)
 Creates a vector from floating-point values known at compile-time. More...
 
template<class V = expr_vec_make_const<double,2>>
make_float (double v0, double v1)
 Creates a vector from floating-point values known at compile-time. More...
 
template<class V = expr_vec_make_const<double,4>>
make_float (double v0, double v1, double v2, double v3)
 Creates a vector from floating-point values known at compile-time. More...
 
template<class V = expr_vec_make_const<double,8>>
make_float (double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7)
 Creates a vector from floating-point values known at compile-time. More...
 
template<class V = expr_vec_make_const<int64_t,1>>
make_int (int64_t v0)
 Creates a vector from signed integer values known at compile-time. More...
 
template<class V = expr_vec_make_const<int64_t,2>>
make_int (int64_t v0, int64_t v1)
 Creates a vector from signed integer values known at compile-time. More...
 
template<class V = expr_vec_make_const<int64_t,4>>
make_int (int64_t v0, int64_t v1, int64_t v2, int64_t v3)
 Creates a vector from signed integer values known at compile-time. More...
 
template<class V = expr_vec_make_const<int64_t,8>>
make_int (int64_t v0, int64_t v1, int64_t v2, int64_t v3, int64_t v4, int64_t v5, int64_t v6, int64_t v7)
 Creates a vector from signed integer values known at compile-time. More...
 
template<class V = expr_vec_make_const<int64_t,16>>
make_int (int64_t v0, int64_t v1, int64_t v2, int64_t v3, int64_t v4, int64_t v5, int64_t v6, int64_t v7, int64_t v8, int64_t v9, int64_t v10, int64_t v11, int64_t v12, int64_t v13, int64_t v14, int64_t v15)
 Creates a vector from signed integer values known at compile-time. More...
 
template<int s0, int s1, unsigned N>
uint8< N > make_shuffle_bytes16_mask (uint8< N > &mask)
 Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3, unsigned N>
uint8< N > make_shuffle_bytes16_mask (uint8< N > &mask)
 Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, unsigned N>
uint8< N > make_shuffle_bytes16_mask (uint8< N > &mask)
 Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, int s8, int s9, int s10, int s11, int s12, int s13, int s14, int s15, unsigned N>
uint8< N > make_shuffle_bytes16_mask (uint8< N > &mask)
 Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, unsigned N>
uint16< N > make_shuffle_bytes16_mask (uint16< N > &mask)
 Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3, unsigned N>
uint16< N > make_shuffle_bytes16_mask (uint16< N > &mask)
 Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, unsigned N>
uint16< N > make_shuffle_bytes16_mask (uint16< N > &mask)
 Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, unsigned N>
uint32< N > make_shuffle_bytes16_mask (uint32< N > &mask)
 Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, int s2, int s3, unsigned N>
uint32< N > make_shuffle_bytes16_mask (uint32< N > &mask)
 Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<int s0, int s1, unsigned N>
uint64< N > make_shuffle_bytes16_mask (uint64< N > &mask)
 Makes a mask to shuffle an int64x2 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions. More...
 
template<class V = expr_vec_make_const<uint64_t,1>>
make_uint (uint64_t v0)
 Creates a vector from unsigned integer values known at compile-time. More...
 
template<class V = expr_vec_make_const<uint64_t,2>>
make_uint (uint64_t v0, uint64_t v1)
 Creates a vector from unsigned integer values known at compile-time. More...
 
template<class V = expr_vec_make_const<uint64_t,4>>
make_uint (uint64_t v0, uint64_t v1, uint64_t v2, uint64_t v3)
 Creates a vector from unsigned integer values known at compile-time. More...
 
template<class V = expr_vec_make_const<uint64_t,8>>
make_uint (uint64_t v0, uint64_t v1, uint64_t v2, uint64_t v3, uint64_t v4, uint64_t v5, uint64_t v6, uint64_t v7)
 Creates a vector from unsigned integer values known at compile-time. More...
 
template<class V = expr_vec_make_const<uint64_t,16>>
make_uint (uint64_t v0, uint64_t v1, uint64_t v2, uint64_t v3, uint64_t v4, uint64_t v5, uint64_t v6, uint64_t v7, uint64_t v8, uint64_t v9, uint64_t v10, uint64_t v11, uint64_t v12, uint64_t v13, uint64_t v14, uint64_t v15)
 Creates a vector from unsigned integer values known at compile-time. More...
 
template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask< V,
void >::empty 
move16_l (const any_vec8< N, V > &a)
 Moves the elements in an int8x16 vector to the left by shift positions. More...
 
template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask< V,
void >::empty 
move8_l (const any_vec16< N, V > &a)
 Moves the 16-bit elements in a vector to the left by shift positions. More...
 
template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask< V,
void >::empty 
move4_l (const any_vec32< N, V > &a)
 Moves the 32-bit elements in a vector to the left by shift positions. More...
 
template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask< V,
void >::empty 
move2_l (const any_vec64< N, V > &a)
 Moves the 64-bit elements in a vector to the left by shift positions. More...
 
template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask< V,
void >::empty 
move16_r (const any_vec8< N, V > &a)
 Moves the 8-bit elements in a vector to the right by shift positions. More...
 
template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask< V,
void >::empty 
move8_r (const any_vec16< N, V > &a)
 Moves the 16-bit elements in a vector to the right by shift positions. More...
 
template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask< V,
void >::empty 
move4_r (const any_vec32< N, V > &a)
 Moves the 32-bit elements in a vector to the right by shift positions. More...
 
template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask< V,
void >::empty 
move2_r (const any_vec64< N, V > &a)
 Moves the 64-bit elements in a vector to the right by shift positions. More...
 
template<unsigned s0, unsigned s1, unsigned N, class V >
detail::get_expr_nomask< V,
void >::empty 
permute2 (const any_vec64< N, V > &a)
 Permutes the values of each set of two consecutive 64-bit values. More...
 
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N, class V >
detail::get_expr_nomask< V,
void >::empty 
permute4 (const any_vec16< N, V > &a)
 Permutes the 16-bit values within each 4 consecutive values of the vector. More...
 
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N, class V >
detail::get_expr_nomask< V,
void >::empty 
permute4 (const any_vec32< N, V > &a)
 Permutes the values of each set of four consecutive 32-bit values. More...
 
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N, class V >
detail::get_expr_nomask< V,
void >::empty 
permute4 (const any_vec64< N, V > &a)
 Permutes the values of each set of four consecutive 64-bit values. More...
 
uint8x16 permute_bytes16 (uint8x16 a, uint8x16 mask)
 Selects bytes from a vector according to a mask. More...
 
template<unsigned N>
uint8< N > permute_bytes16 (uint8< N > a, uint8< N > mask)
 Selects bytes from a vector according to a mask. More...
 
template<unsigned N>
uint16< N > permute_bytes16 (uint16< N > a, uint16< N > mask)
 Selects bytes from a vector according to a mask. More...
 
template<unsigned N>
uint32< N > permute_bytes16 (uint32< N > a, uint32< N > mask)
 Selects bytes from a vector according to a mask. More...
 
template<unsigned N>
uint64< N > permute_bytes16 (uint64< N > a, uint64< N > mask)
 Selects bytes from a vector according to a mask. More...
 
template<unsigned N>
float32< N > permute_bytes16 (float32< N > a, uint32< N > mask)
 Selects bytes from a vector according to a mask. More...
 
template<unsigned N>
float64< N > permute_bytes16 (float64< N > a, uint64< N > mask)
 Selects bytes from a vector according to a mask. More...
 
uint8x16 permute_zbytes16 (uint8x16 a, uint8x16 mask)
 Selects bytes from a vector according to a mask, optionally selecting zero. More...
 
template<unsigned N>
uint8< N > permute_zbytes16 (uint8< N > a, uint8< N > mask)
 Selects bytes from a vector according to a mask, optionally selecting zero. More...
 
template<unsigned N>
uint16< N > permute_zbytes16 (uint16< N > a, uint16< N > mask)
 Selects bytes from a vector according to a mask, optionally selecting zero. More...
 
template<unsigned N>
uint32< N > permute_zbytes16 (uint32< N > a, uint32< N > mask)
 Selects bytes from a vector according to a mask, optionally selecting zero. More...
 
template<unsigned N>
uint64< N > permute_zbytes16 (uint64< N > a, uint64< N > mask)
 Selects bytes from a vector according to a mask, optionally selecting zero. More...
 
template<unsigned N>
float32< N > permute_zbytes16 (float32< N > a, uint32< N > mask)
 Selects bytes from a vector according to a mask, optionally selecting zero. More...
 
template<unsigned N>
float64< N > permute_zbytes16 (float64< N > a, uint64< N > mask)
 Selects bytes from a vector according to a mask, optionally selecting zero. More...
 
template<class V = expr_vec_set_splat<int>>
splat (int x)
 Loads a value from a register and broadcasts it to all elements of a vector. More...
 
template<class V = expr_vec_set_splat<unsigned>>
splat (unsigned x)
 Loads a value from a register and broadcasts it to all elements of a vector. More...
 
template<class V = expr_vec_set_splat<int64_t>>
splat (int64_t x)
 Loads a value from a register and broadcasts it to all elements of a vector. More...
 
template<class V = expr_vec_set_splat<uint64_t>>
splat (uint64_t x)
 Loads a value from a register and broadcasts it to all elements of a vector. More...
 
template<class V = expr_vec_set_splat<float>>
splat (float x)
 Loads a value from a register and broadcasts it to all elements of a vector. More...
 
template<class V = expr_vec_set_splat<double>>
splat (double x)
 Loads a value from a register and broadcasts it to all elements of a vector. More...
 
template<unsigned sa0, unsigned sa1, unsigned sb0, unsigned sb1, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, void >::empty 
shuffle2 (const any_vec32< N, V1 > &a, const any_vec32< N, V2 > &b)
 Selects 32-bit floating-point values from two vectors. More...
 
template<unsigned s0, unsigned s1, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, void >::empty 
shuffle2 (const any_vec32< N, V1 > &a, const any_vec32< N, V2 > &b)
 Selects 32-bit values from two vectors. More...
 
uint8x16 shuffle_bytes16 (uint8x16 a, uint8x16 b, uint8x16 mask)
 Selects bytes from two vectors according to a mask. More...
 
template<unsigned N>
uint8< N > shuffle_bytes16 (uint8< N > a, uint8< N > b, uint8< N > mask)
 Selects bytes from two vectors according to a mask. More...
 
template<unsigned N>
uint16< N > shuffle_bytes16 (uint16< N > a, uint16< N > b, uint16< N > mask)
 Selects bytes from two vectors according to a mask. More...
 
template<unsigned N>
uint32< N > shuffle_bytes16 (uint32< N > a, uint32< N > b, uint32< N > mask)
 Selects bytes from two vectors according to a mask. More...
 
template<unsigned N>
uint64< N > shuffle_bytes16 (uint64< N > a, uint64< N > b, uint64< N > mask)
 Selects bytes from two vectors according to a mask. More...
 
template<unsigned N>
float32< N > shuffle_bytes16 (float32< N > a, float32< N > b, uint32< N > mask)
 Selects bytes from two vectors according to a mask. More...
 
template<unsigned N>
float64< N > shuffle_bytes16 (float64< N > a, float64< N > b, uint64< N > mask)
 Selects bytes from two vectors according to a mask. More...
 
uint8x16 shuffle_zbytes16 (uint8x16 a, uint8x16 b, uint8x16 mask)
 Selects bytes from two vectors according to a mask, optionally selecting zero. More...
 
template<unsigned N>
uint8< N > shuffle_zbytes16 (uint8< N > a, uint8< N > b, uint8< N > mask)
 Selects bytes from two vectors according to a mask, optionally selecting zero. More...
 
template<unsigned N>
uint16< N > shuffle_zbytes16 (uint16< N > a, uint16< N > b, uint16< N > mask)
 Selects bytes from two vectors according to a mask, optionally selecting zero. More...
 
template<unsigned N>
uint32< N > shuffle_zbytes16 (uint32< N > a, uint32< N > b, uint32< N > mask)
 Selects bytes from two vectors according to a mask, optionally selecting zero. More...
 
template<unsigned N>
uint64< N > shuffle_zbytes16 (uint64< N > a, uint64< N > b, uint64< N > mask)
 Selects bytes from two vectors according to a mask, optionally selecting zero. More...
 
template<unsigned N>
float32< N > shuffle_zbytes16 (float32< N > a, float32< N > b, uint32< N > mask)
 Selects bytes from two vectors according to a mask, optionally selecting zero. More...
 
template<unsigned N>
float64< N > shuffle_zbytes16 (float64< N > a, float64< N > b, uint64< N > mask)
 Selects bytes from two vectors according to a mask, optionally selecting zero. More...
 
template<unsigned s, unsigned N, class V >
detail::get_expr_nomask< V,
void >::empty 
splat (const any_vec< N, V > &a)
 Broadcasts the specified element to all elements. More...
 
template<unsigned s, unsigned N, class E >
int8< N, expr_splat16< s, int8
< N, E > > > 
splat16 (int8< N, E > a)
 Broadcasts the specified 8-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s, unsigned N, class E >
uint8< N, expr_splat16< s,
uint8< N, E > > > 
splat16 (uint8< N, E > a)
 Broadcasts the specified 8-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s, unsigned N, class E >
int16< N, expr_splat8< s,
int16< N, E > > > 
splat8 (int16< N, E > a)
 Broadcasts the specified 16-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s, unsigned N, class E >
uint16< N, expr_splat8< s,
uint16< N, E > > > 
splat8 (uint16< N, E > a)
 Broadcasts the specified 16-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s, unsigned N, class E >
int32< N, expr_splat4< s,
int32< N, E > > > 
splat4 (int32< N, E > a)
 Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s, unsigned N, class E >
uint32< N, expr_splat4< s,
uint32< N, E > > > 
splat4 (uint32< N, E > a)
 Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s, unsigned N, class E >
int64< N, expr_splat2< s,
int64< N, E > > > 
splat2 (int64< N, E > a)
 Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s, unsigned N, class E >
uint64< N, expr_splat2< s,
uint64< N, E > > > 
splat2 (uint64< N, E > a)
 Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s, unsigned N, class E >
float32< N, expr_splat4< s,
float32< N, E > > > 
splat4 (float32< N, E > a)
 Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...
 
template<unsigned s, unsigned N, class E >
float64< N, expr_splat2< s,
float64< N, E > > > 
splat2 (float64< N, E > a)
 Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...
 
float32x4 to_float32 (int32x4 a)
 Converts 32-bit integer values to 32-bit float values. More...
 
template<unsigned N>
float32< N > to_float32 (int32< N > a)
 Converts 32-bit integer values to 32-bit float values. More...
 
float32x4 to_float32 (float64x4 a)
 Converts 64-bit float values to 32-bit float values. More...
 
float64x4 to_float64 (int32x4 a)
 Converts the 32-bit integer values to 64-bit float values. More...
 
float64x4 to_float64 (float32x4 a)
 Converts the 32-bit float values to 64-bit float values. More...
 
int32x4 to_int32 (float32x4 a)
 Converts the values of a float32x4 vector into signed int32_t representation using truncation if only an inexact conversion can be performed. More...
 
template<unsigned N>
uint32< N > to_int32x8 (float32< N > a)
 Converts the values of a float32x4 vector into signed int32_t representation using truncation if only an inexact conversion can be performed. More...
 
int32x4 to_int32 (float64x4 a)
 Converts the values of a float64x4 vector into int32_t representation using truncation. More...
 
uint64x4 to_int64 (int32x4 a)
 Extends the values of a signed int32x4 vector to 64 bits. More...
 
uint64x4 to_int64 (uint32x4 a)
 Extends the values of an unsigned int32x4 vector to 64 bits. More...
 
template<unsigned N, class E1 , class E2 >
uint8< N, uint8< N > > unzip16_hi (uint8< N, E1 > a, uint8< N, E2 > b)
 De-interleaves the even(higher) elements of two int8x16 vectors. More...
 
template<unsigned N, class E1 , class E2 >
uint16< N, uint16< N > > unzip8_hi (uint16< N, E1 > a, uint16< N, E2 > b)
 De-interleaves the even(higher) elements of two int16x8 vectors. More...
 
template<unsigned N, class E1 , class E2 >
uint32< N, uint32< N > > unzip4_hi (uint32< N, E1 > a, uint32< N, E2 > b)
 De-interleaves the even(higher) elements of two int32x4 vectors. More...
 
template<unsigned N, class E1 , class E2 >
uint64< N, uint64< N > > unzip2_hi (uint64< N, E1 > a, uint64< N, E2 > b)
 De-interleaves the even(higher) elements of two int64x2 vectors. More...
 
template<unsigned N, class E1 , class E2 >
float32< N, float32< N > > unzip4_hi (float32< N, E1 > a, float32< N, E2 > b)
 De-interleaves the even(higher) elements of two float32x4 vectors. More...
 
template<unsigned N, class E1 , class E2 >
float64< N, float64< N > > unzip2_hi (float64< N, E1 > a, float64< N, E2 > b)
 De-interleaves the even(higher) elements of two float64x2 vectors. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, void >::empty 
zip16_hi (const any_vec8< N, V1 > &a, const any_vec8< N, V2 > &b)
 Interleaves the higher halves of two vectors. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, void >::empty 
zip8_hi (const any_vec16< N, V1 > &a, const any_vec16< N, V2 > &b)
 Interleaves the higher halves of two vectors. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, void >::empty 
zip4_hi (const any_vec32< N, V1 > &a, const any_vec32< N, V2 > &b)
 Interleaves the higher halves of two vectors. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, void >::empty 
zip2_hi (const any_vec64< N, V1 > &a, const any_vec64< N, V2 > &b)
 Interleaves the higher halves of two vectors. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, void >::empty 
zip16_lo (const any_vec8< N, V1 > &a, const any_vec8< N, V2 > &b)
 Interleaves the lower halves of two vectors. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, void >::empty 
zip8_lo (const any_vec16< N, V1 > &a, const any_vec16< N, V2 > &b)
 Interleaves the lower halves of two vectors. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, void >::empty 
zip4_lo (const any_vec32< N, V1 > &a, const any_vec32< N, V2 > &b)
 Interleaves the lower halves of two vectors. More...
 
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1,
V2, void >::empty 
zip2_lo (const any_vec64< N, V1 > &a, const any_vec64< N, V2 > &b)
 Interleaves the lower halves of two vectors. More...
 
Arch & operator|= (Arch &x, const Arch &y)
 Bitwise operators for Arch. More...
 
Arch & operator&= (Arch &x, const Arch &y)
 Bitwise operators for Arch. More...
 
Arch operator| (const Arch &x, const Arch &y)
 Bitwise operators for Arch. More...
 
Arch operator& (const Arch &x, const Arch &y)
 Bitwise operators for Arch. More...
 
Arch operator~ (const Arch &x)
 Bitwise operators for Arch. More...
 

Typedef Documentation

using simdpp::float32v = typedef float32<SIMDPP_FAST_FLOAT32_SIZE>
using simdpp::float32v4 = typedef float32<SIMDPP_FAST_FLOAT32_SIZE*4>
using simdpp::float32x4 = typedef float32<4>
using simdpp::float32x8 = typedef float32<8>
using simdpp::float64v = typedef float64<SIMDPP_FAST_FLOAT64_SIZE>
using simdpp::float64v2 = typedef float64<SIMDPP_FAST_FLOAT64_SIZE*2>
using simdpp::float64v4 = typedef float64<SIMDPP_FAST_FLOAT64_SIZE*4>
using simdpp::float64x2 = typedef float64<2>
using simdpp::float64x4 = typedef float64<4>
using simdpp::int16v = typedef int16<SIMDPP_FAST_INT16_SIZE>
using simdpp::int16v2 = typedef int16<SIMDPP_FAST_INT16_SIZE*2>
using simdpp::int16v4 = typedef int16<SIMDPP_FAST_INT16_SIZE*4>
using simdpp::int16x16 = typedef int16<16>
using simdpp::int16x8 = typedef int16<8>
using simdpp::int32v = typedef int32<SIMDPP_FAST_INT32_SIZE>
using simdpp::int32v2 = typedef int32<SIMDPP_FAST_INT32_SIZE*2>
using simdpp::int32v4 = typedef int32<SIMDPP_FAST_INT32_SIZE*4>
using simdpp::int32x4 = typedef int32<4>
using simdpp::int32x8 = typedef int32<8>
using simdpp::int64v = typedef int64<SIMDPP_FAST_INT64_SIZE>
using simdpp::int64v2 = typedef int64<SIMDPP_FAST_INT64_SIZE*2>
using simdpp::int64v4 = typedef int64<SIMDPP_FAST_INT64_SIZE*4>
using simdpp::int64x2 = typedef int64<2>
using simdpp::int64x4 = typedef int64<4>
using simdpp::int8v = typedef int8<SIMDPP_FAST_INT8_SIZE>
using simdpp::int8v2 = typedef int8<SIMDPP_FAST_INT8_SIZE*2>
using simdpp::int8v4 = typedef int8<SIMDPP_FAST_INT8_SIZE*4>
using simdpp::int8x16 = typedef int8<16>
using simdpp::int8x32 = typedef int8<32>
using simdpp::mask_float32v = typedef mask_float32<SIMDPP_FAST_FLOAT32_SIZE>
using simdpp::mask_float32v2 = typedef mask_float32<SIMDPP_FAST_FLOAT32_SIZE*2>
using simdpp::mask_float32v4 = typedef mask_float32<SIMDPP_FAST_FLOAT32_SIZE*4>
using simdpp::mask_float64v = typedef mask_float64<SIMDPP_FAST_FLOAT64_SIZE>
using simdpp::mask_float64v2 = typedef mask_float64<SIMDPP_FAST_FLOAT64_SIZE*2>
using simdpp::mask_float64v4 = typedef mask_float64<SIMDPP_FAST_FLOAT64_SIZE*4>
using simdpp::mask_int16v = typedef mask_int16<SIMDPP_FAST_INT16_SIZE>
using simdpp::mask_int16v2 = typedef mask_int16<SIMDPP_FAST_INT16_SIZE*2>
using simdpp::mask_int16v4 = typedef mask_int16<SIMDPP_FAST_INT16_SIZE*4>
using simdpp::mask_int16x16 = typedef mask_int16<16>
using simdpp::mask_int16x8 = typedef mask_int16<8>
using simdpp::mask_int32v = typedef mask_int32<SIMDPP_FAST_INT32_SIZE>
using simdpp::mask_int32v2 = typedef mask_int32<SIMDPP_FAST_INT32_SIZE*2>
using simdpp::mask_int32v4 = typedef mask_int32<SIMDPP_FAST_INT32_SIZE*4>
using simdpp::mask_int32x4 = typedef mask_int32<4>
using simdpp::mask_int32x8 = typedef mask_int32<8>
using simdpp::mask_int64v = typedef mask_int64<SIMDPP_FAST_INT64_SIZE>
using simdpp::mask_int64v2 = typedef mask_int64<SIMDPP_FAST_INT64_SIZE*2>
using simdpp::mask_int64v4 = typedef mask_int64<SIMDPP_FAST_INT64_SIZE*4>
using simdpp::mask_int64x2 = typedef mask_int64<2>
using simdpp::mask_int64x4 = typedef mask_int64<4>
using simdpp::mask_int8v = typedef mask_int8<SIMDPP_FAST_INT8_SIZE>
using simdpp::mask_int8v2 = typedef mask_int8<SIMDPP_FAST_INT8_SIZE*2>
using simdpp::mask_int8v4 = typedef mask_int8<SIMDPP_FAST_INT8_SIZE*4>
using simdpp::mask_int8x16 = typedef mask_int8<16>
using simdpp::mask_int8x32 = typedef mask_int8<32>
using simdpp::uint16v = typedef uint16<SIMDPP_FAST_INT16_SIZE>
using simdpp::uint16v2 = typedef uint16<SIMDPP_FAST_INT16_SIZE*2>
using simdpp::uint16v4 = typedef uint16<SIMDPP_FAST_INT16_SIZE*4>
using simdpp::uint16x16 = typedef uint16<16>
using simdpp::uint16x8 = typedef uint16<8>
using simdpp::uint32v = typedef uint32<SIMDPP_FAST_INT32_SIZE>
using simdpp::uint32v2 = typedef uint32<SIMDPP_FAST_INT32_SIZE*2>
using simdpp::uint32v4 = typedef uint32<SIMDPP_FAST_INT32_SIZE*4>
using simdpp::uint32x4 = typedef uint32<4>
using simdpp::uint32x8 = typedef uint32<8>
using simdpp::uint64v = typedef uint64<SIMDPP_FAST_INT64_SIZE>
using simdpp::uint64v2 = typedef uint64<SIMDPP_FAST_INT64_SIZE*2>
using simdpp::uint64v4 = typedef uint64<SIMDPP_FAST_INT64_SIZE*4>
using simdpp::uint64x2 = typedef uint64<2>
using simdpp::uint64x4 = typedef uint64<4>
using simdpp::uint8v = typedef uint8<SIMDPP_FAST_INT8_SIZE>
using simdpp::uint8v2 = typedef uint8<SIMDPP_FAST_INT8_SIZE*2>
using simdpp::uint8v4 = typedef uint8<SIMDPP_FAST_INT8_SIZE*4>
using simdpp::uint8x16 = typedef uint8<16>
using simdpp::uint8x32 = typedef uint8<32>

Function Documentation

template<unsigned N, class E >
uint8<N, expr_abs<int8<N,E> > > simdpp::abs ( int8< N, E >  a)

Computes absolute value of 8-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
  • In ALTIVEC this intrinsic results in at least 1-3 instructions.
256-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
template<unsigned N, class E >
float32<N, expr_abs<float32<N,E> > > simdpp::abs ( float32< N, E >  a)

Computes absolute value of floating point values.

r0 = abs(a0)
...
rN = abs(aN)
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2-3 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In AVX-AVX2 this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N, class E >
uint16<N, expr_abs<int16<N,E> > > simdpp::abs ( int16< N, E >  a)

Computes absolute value of 16-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
  • In ALTIVEC this intrinsic results in at least 1-3 instructions.
256-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-5 instructions.
template<unsigned N, class E >
float64<N, expr_abs<float64<N,E> > > simdpp::abs ( float64< N, E >  a)

Computes absolute value of floating point values.

r0 = abs(a0)
...
rN = abs(aN)
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-AVX2 this intrinsic results in at least 1-2 instructions.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2-3 instructions.
  • In AVX-AVX2 this intrinsic results in at least 1-2 instructions.
template<unsigned N, class E >
uint32<N, expr_abs<int32<N,E> > > simdpp::abs ( int32< N, E >  a)

Computes absolute value of 32-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
  • In ALTIVEC this intrinsic results in at least 1-3 instructions.
256-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
template<unsigned N, class E >
uint64<N, expr_abs<int64<N,E> > > simdpp::abs ( int64< N, E >  a)

Computes absolute value of 64-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)
128-bit version:
  • In SSE2-AVX this intrinsic results in at least 5 instructions.
  • In NEON this intrinsic results in at least 6 instructions.
  • Not vectorized in ALTIVEC.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 10 instructions.
  • In NEON this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • Not vectorized in ALTIVEC.
template<unsigned N, class E1 , class E2 >
float32<N, expr_add<float32<N,E1>, float32<N,E2> > > simdpp::add ( float32< N, E1 >  a,
float32< N, E2 >  b 
)

Adds the values of two vectors.

r0 = a0 + b0
...
rN = aN + bN
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, expr_add<uint8<N, typename V1::expr_type>, uint8<N, typename V2::expr_type> > >::type simdpp::add ( const any_int8< N, V1 > &  a,
const any_int8< N, V2 > &  b 
)

Adds 8-bit integer values.

r0 = a0 + b0
...
rN = aN + bN
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
float64<N, expr_add<float64<N,E1>, float64<N,E2> > > simdpp::add ( float64< N, E1 >  a,
float64< N, E2 >  b 
)

Adds the values of two vectors.

r0 = a0 + b0
...
rN = aN + bN
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, expr_add<uint16<N, typename V1::expr_type>, uint16<N, typename V2::expr_type> > >::type simdpp::add ( const any_int16< N, V1 > &  a,
const any_int16< N, V2 > &  b 
)

Adds 16-bit integer values.

r0 = a0 + b0
...
rN = aN + bN
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, expr_add<uint32<N, typename V1::expr_type>, uint32<N, typename V2::expr_type> > >::type simdpp::add ( const any_int32< N, V1 > &  a,
const any_int32< N, V2 > &  b 
)

Adds 32-bit integer values.

r0 = a0 + b0
...
rN = aN + bN
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, expr_add<uint64<N, typename V1::expr_type>, uint64<N, typename V2::expr_type> > >::type simdpp::add ( const any_int64< N, V1 > &  a,
const any_int64< N, V2 > &  b 
)

Adds 64-bit integer values.

r0 = a0 + b0
...
rN = aN + bN
128-bit version:
  • In ALTIVEC this intrinsic results in at least 5-6 instructions.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 10-11 instructions.
template<unsigned N, class E1 , class E2 >
int8<N, expr_add_sat<int8<N,E1>, int8<N,E2> > > simdpp::add_sat ( int8< N, E1 >  a,
int8< N, E2 >  b 
)

Adds and saturates signed 8-bit integer values.

r0 = signed_saturate(a0 + b0)
...
rN = signed_saturate(aN + bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
int16<N, expr_add_sat<int16<N,E1>, int16<N,E2> > > simdpp::add_sat ( int16< N, E1 >  a,
int16< N, E2 >  b 
)

Adds and saturates signed 16-bit integer values.

r0 = signed_saturate(a0 + b0)
...
rN = signed_saturate(aN + bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint8<N, expr_add_sat<uint8<N,E1>, uint8<N,E2> > > simdpp::add_sat ( uint8< N, E1 >  a,
uint8< N, E2 >  b 
)

Adds and saturates unsigned 8-bit integer values.

r0 = unsigned_saturate(a0 + b0)
...
rN = unsigned_saturate(aN + bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint16<N, expr_add_sat<uint16<N,E1>, uint16<N,E2> > > simdpp::add_sat ( uint16< N, E1 >  a,
uint16< N, E2 >  b 
)

Adds and saturates unsigned 16-bit integer values.

r0 = unsigned_saturate(a0 + b0)
...
rN = unsigned_saturate(aN + bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned shift, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, void>::empty simdpp::align16 ( const any_vec8< N, V1 > &  lower,
const any_vec8< N, V2 > &  upper 
)

Extracts an int8x16 vector from two concatenated int8x16 vectors.

shift: pos:| 0 1 . 14 15 |
0 r = [ l0 l1 . l14 l15 ]
1 r = [ l1 l2 . l15 u0 ]
2 r = [ l2 l3 . u0 u1 ]
... .. .. .. ... .. ..
15 r = [ l15 u0 . u13 u14 ]
16 r = [ u0 u1 . u14 u15 ]
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned shift, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, void>::empty simdpp::align2 ( const any_vec64< N, V1 > &  lower,
const any_vec64< N, V2 > &  upper 
)

Extracts an int64x2 vector from two concatenated int64x2 vectors.

shift: pos:| 0 1 |
0 r = [ l0 l1 ]
1 r = [ l1 u0 ]
2 r = [ u0 u1 ]
int64
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float64
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned shift, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, void>::empty simdpp::align4 ( const any_vec32< N, V1 > &  lower,
const any_vec32< N, V2 > &  upper 
)

Extracts an int32x4 vector from two concatenated int32x4 vectors.

shift: pos:| 0 1 2 3 |
0 r = [ l0 l1 l2 l3 ]
1 r = [ l1 l2 l3 u0 ]
2 r = [ l2 l3 u0 u1 ]
3 r = [ l3 u0 u1 u2 ]
4 r = [ u0 u1 u2 u3 ]
int32
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float32
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned shift, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, void>::empty simdpp::align8 ( const any_vec16< N, V1 > &  lower,
const any_vec16< N, V2 > &  upper 
)

Extracts an int16x8 vector from two concatenated int16x8 vectors.

shift: pos:| 0 1 . 6 7 |
0 r = [ l0 l1 . l6 l7 ]
1 r = [ l1 l2 . l7 u0 ]
2 r = [ l2 l3 . u0 u1 ]
... .. .. .. ... .. ..
7 r = [ l7 u0 . u5 u6 ]
8 r = [ u0 u1 . u6 u7 ]
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
256-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
All 128-bit sub-vectors are processed as if a 128-bit instruction was applied to each of them separately.
template<unsigned N, class E1 , class E2 >
uint8<N, uint8<N> > simdpp::avg ( uint8< N, E1 >  a,
uint8< N, E2 >  b 
)

Computes rounded average of the unsigned 8-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
int8<N, int8<N> > simdpp::avg ( int8< N, E1 >  a,
int8< N, E2 >  b 
)

Computes rounded average of signed 8-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8-9 instructions.
  • In AVX2 this intrinsic results in at least 4-5 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint16<N, uint16<N> > simdpp::avg ( uint16< N, E1 >  a,
uint16< N, E2 >  b 
)

Computes rounded average of unsigned 16-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
int16<N, int16<N> > simdpp::avg ( int16< N, E1 >  a,
int16< N, E2 >  b 
)

Computes rounded average of signed 16-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8-9 instructions.
  • In AVX2 this intrinsic results in at least 4-5 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint32<N, uint32<N> > simdpp::avg ( uint32< N, E1 >  a,
uint32< N, E2 >  b 
)

Computes rounded average of unsigned 32-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 6-7 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 12-13 instructions.
  • In AVX2 this intrinsic results in at least 6-7 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
int32<N, int32<N> > simdpp::avg ( int32< N, E1 >  a,
int32< N, E2 >  b 
)

Computes rounded average of signed 32-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 9-10 instructions.
  • In NEON this intrinsic results in at least 1 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 18-19 instructions.
  • In AVX2 this intrinsic results in at least 9-10 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint8<N, uint8<N> > simdpp::avg_trunc ( uint8< N, E1 >  a,
uint8< N, E2 >  b 
)

Computes truncated average of the unsigned 8-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4 instructions.
  • In NEON this intrinsic results in at least 1 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
int8<N, int8<N> > simdpp::avg_trunc ( int8< N, E1 >  a,
int8< N, E2 >  b 
)

Computes truncated average of signed 8-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 14-15 instructions.
  • In AVX2 this intrinsic results in at least 7-8 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint16<N, uint16<N> > simdpp::avg_trunc ( uint16< N, E1 >  a,
uint16< N, E2 >  b 
)

Computes truncated average of unsigned 16-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4 instructions.
  • In NEON this intrinsic results in at least 1 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
int16<N, int16<N> > simdpp::avg_trunc ( int16< N, E1 >  a,
int16< N, E2 >  b 
)

Computes truncated average of signed 16-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.
  • In NEON this intrinsic results in at least 1 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 14-15 instructions.
  • In AVX2 this intrinsic results in at least 7-8 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint32<N, uint32<N> > simdpp::avg_trunc ( uint32< N, E1 >  a,
uint32< N, E2 >  b 
)

Computes truncated average of unsigned 32-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4 instructions.
  • In NEON this intrinsic results in at least 1 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
int32<N, int32<N> > simdpp::avg_trunc ( int32< N, E1 >  a,
int32< N, E2 >  b 
)

Computes truncated average of signed 32-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.
  • In ALTIVEC this intrinsic results in at least 4 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 14-15 instructions.
  • In AVX2 this intrinsic results in at least 7-8 instructions.
  • In ALTIVEC this intrinsic results in at least 8 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr_bitwise2_and<expr_bit_and, V1, V2>::type simdpp::bit_and ( const any_vec< N, V1 > &  a,
const any_vec< N, V2 > &  b 
)

Computes bitwise AND of integer or floating-point vectors.

r0 = a0 & b0
...
rN = aN & bN
Todo:
: icost
template<unsigned N, class V1 , class V2 >
detail::get_expr_bitwise2_and<expr_bit_andnot, V1, V2>::type simdpp::bit_andnot ( const any_vec< N, V1 > &  a,
const any_vec< N, V2 > &  b 
)

Computes bitwise AND NOT of two integer or floating-point vectors.

r0 = a0 & ~b0
...
rN = aN & ~bN
Todo:
: icost
template<class R , class T >
R simdpp::bit_cast ( T  t)

Casts between unrelated types.

No changes to the stored values are performed.

Conversions between vector and non-vector types are not allowed.

Conversion from non-mask type to mask type is not allowed.

Conversion from mask type to a non-mask type is not a costless operation because masks may have different logical and physical layout (e.g., in some implementations one bit represents entire element in a vector).

Conversions between mask types are only allowed if the element size is the same.

template<unsigned N, class V >
detail::get_expr<V, expr_bit_not<V> >::empty simdpp::bit_not ( const any_vec< N, V > &  a)

Computes bitwise NOT of an integer or floating-point vector.

r = ~a
Todo:
icost
template<unsigned N, class V1 , class V2 >
detail::get_expr_bit_or<V1, V2>::type simdpp::bit_or ( const any_vec< N, V1 > &  a,
const any_vec< N, V2 > &  b 
)

Computes bitwise OR of integer vectors.

r0 = a0 | b0
...
rN = aN | bN
Todo:
icost
template<unsigned N, class V1 , class V2 >
detail::get_expr2<V1, V2, void>::empty simdpp::bit_xor ( const any_vec< N, V1 > &  a,
const any_vec< N, V2 > &  b 
)

Computes bitwise XOR of integer or floating-point vectors.

r0 = a0 ^ b0
...
rN = aN ^ bN
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 , class V3 >
detail::get_expr_blend<V1, V2, V3>::type simdpp::blend ( const any_vec< N, V1 > &  on,
const any_vec< N, V2 > &  off,
const any_vec< N, V3 > &  mask 
)

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xff ) ? on0 : off0
...
rN = (maskN == 0xff ) ? onN : offN
Todo:
icost
int16
128-bit version:
  • In SSE2-AVX this intrinsic results in at least 3 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int32
128-bit version:
  • In SSE2-AVX this intrinsic results in at least 3 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int64
128-bit version:
  • In SSE2-AVX this intrinsic results in at least 3 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float32
128-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 6 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float64
128-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 6 instructions.
  • Not vectorized in NEON and ALTIVEC.
template<unsigned N, class E >
float32<N, float32<N> > simdpp::ceil ( float32< N, E >  a)

Rounds the values of a vector towards positive infinity.

r0 = ceil(a0)
...
rN = ceil(aN)
128-bit version:
  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 13-15 instructions.
  • In NEON this intrinsic results in at least 11-13 instructions.
256-bit version:
  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 26-28 instructions.
  • In NEON this intrinsic results in at least 22-24 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
mask_int8<N, mask_int8<N> > simdpp::cmp_eq ( const any_int8< N, V1 > &  a,
const any_int8< N, V2 > &  b 
)

Compares 8-bit values for equality.

r0 = (a0 == b0) ? 0xff : 0x0
...
rN = (aN == bN) ? 0xff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
mask_int16<N, mask_int16<N> > simdpp::cmp_eq ( const any_int16< N, V1 > &  a,
const any_int16< N, V2 > &  b 
)

Compares 16-bit values for equality.

r0 = (a0 == b0) ? 0xffff : 0x0
...
rN = (aN == bN) ? 0xffff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
mask_int32<N, mask_int32<N> > simdpp::cmp_eq ( const any_int32< N, V1 > &  a,
const any_int32< N, V2 > &  b 
)

Compares the values of two int32x4 vectors for equality.

r0 = (a0 == b0) ? 0xffffffff : 0x0
...
rN = (aN == bN) ? 0xffffffff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
mask_int64<N, mask_int64<N> > simdpp::cmp_eq ( const any_int64< N, V1 > &  a,
const any_int64< N, V2 > &  b 
)

Compares the values of two int64x2 vectors for equality.

r0 = (a0 == b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN == bN) ? 0xffffffffffffffff : 0x0
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 5 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
  • In NEON this intrinsic results in at least 3 instructions.
  • In ALTIVEC this intrinsic results in at least 3-4 instructions.
256-bit version:
  • In SSE2-SSSE3 and AVX this intrinsic results in at least 10 instructions.
  • In XOP and SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 6 instructions.
  • In ALTIVEC this intrinsic results in at least 6-7 instructions.
template<unsigned N, class V1 , class V2 >
mask_float32<N, mask_float32<N> > simdpp::cmp_eq ( const any_float32< N, V1 > &  a,
const any_float32< N, V2 > &  b 
)

Compares the values of two float32x4 vectors for equality.

r0 = (a0 == b0) ? 0xffffffff : 0x0
...
rN = (aN == bN) ? 0xffffffff : 0x0
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
mask_float64<N, mask_float64<N> > simdpp::cmp_eq ( const any_float64< N, V1 > &  a,
const any_float64< N, V2 > &  b 
)

Compares the values of two float64x2 vectors for equality.

r0 = (a0 == b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN == bN) ? 0xffffffffffffffff : 0x0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_float32<N, mask_float32<N> > simdpp::cmp_ge ( float32< N, E1 >  a,
float32< N, E2 >  b 
)

Compares the values of two float32x4 vectors for greater-than or equal.

r0 = (a0 >= b0) ? 0xffffffff : 0x0
...
rN = (aN >= bN) ? 0xffffffff : 0x0
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_float64<N, mask_float64<N> > simdpp::cmp_ge ( float64< N, E1 >  a,
float64< N, E2 >  b 
)

Compares the values of two float64x2 vectors for greater-than or equal.

r0 = (a0 >= b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN >= bN) ? 0xffffffffffffffff : 0x0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_int8<N, mask_int8<N> > simdpp::cmp_gt ( int8< N, E1 >  a,
int8< N, E2 >  b 
)

Compares the values of two signed int8x16 vectors for greater-than.

r0 = (a0 > b0) ? 0xff : 0x0
...
rN = (aN > bN) ? 0xff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_int8<N, mask_int8<N> > simdpp::cmp_gt ( uint8< N, E1 >  a,
uint8< N, E2 >  b 
)

Compares the values of two unsigned int8x16 vectors for greater-than.

r0 = (a0 > b0) ? 0xff : 0x0
...
rN = (aN > bN) ? 0xff : 0x0
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6-7 instructions.
  • In AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_int16<N, mask_int16<N> > simdpp::cmp_gt ( int16< N, E1 >  a,
int16< N, E2 >  b 
)

Compares the values of two signed int16x8 vectors for greater-than.

r0 = (a0 > b0) ? 0xffff : 0x0
...
rN = (aN > bN) ? 0xffff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_int16<N, mask_int16<N> > simdpp::cmp_gt ( uint16< N, E1 >  a,
uint16< N, E2 >  b 
)

Compares the values of two unsigned int16x8 vectors for greater-than.

r0 = (a0 > b0) ? 0xffff : 0x0
...
rN = (aN > bN) ? 0xffff : 0x0
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6-7 instructions.
  • In AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_int32<N, mask_int32<N> > simdpp::cmp_gt ( int32< N, E1 >  a,
int32< N, E2 >  b 
)

Compares the values of two signed int32x4 vectors for greater-than.

r0 = (a0 > b0) ? 0xffffffff : 0x0
...
rN = (aN > bN) ? 0xffffffff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_int32<N, mask_int32<N> > simdpp::cmp_gt ( uint32< N, E1 >  a,
uint32< N, E2 >  b 
)

Compares the values of two unsigned int32x4 vectors for greater-than.

r0 = (a0 > b0) ? 0xffffffff : 0x0
...
rN = (aN > bN) ? 0xffffffff : 0x0
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6-7 instructions.
  • In AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_float32<N, mask_float32<N> > simdpp::cmp_gt ( float32< N, E1 >  a,
float32< N, E2 >  b 
)

Compares the values of two float32x4 vectors for greater-than.

r0 = (a0 > b0) ? 0xffffffff : 0x0
...
rN = (aN > bN) ? 0xffffffff : 0x0
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_float64<N, mask_float64<N> > simdpp::cmp_gt ( float64< N, E1 >  a,
float64< N, E2 >  b 
)

Compares the values of two float64x2 vectors for greater-than.

r0 = (a0 > b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN > bN) ? 0xffffffffffffffff : 0x0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_float32<N, mask_float32<N> > simdpp::cmp_le ( float32< N, E1 >  a,
float32< N, E2 >  b 
)

Compares the values of two float32x4 vectors for less-than or equal.

r0 = (a0 <= b0) ? 0xffffffff : 0x0
...
rN = (aN <= bN) ? 0xffffffff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_float64<N, mask_float64<N> > simdpp::cmp_le ( float64< N, E1 >  a,
float64< N, E2 >  b 
)

Compares the values of two float64x2 vectors for less-than or equal.

r0 = (a0 <= b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN <= bN) ? 0xffffffffffffffff : 0x0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_int8<N, mask_int8<N> > simdpp::cmp_lt ( int8< N, E1 >  a,
int8< N, E2 >  b 
)

Compares the values of two signed int8x16 vectors for less-than.

r0 = (a0 < b0) ? 0xff : 0x0
...
rN = (aN < bN) ? 0xff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_int8<N, mask_int8<N> > simdpp::cmp_lt ( uint8< N, E1 >  a,
uint8< N, E2 >  b 
)

Compares the values of two unsigned int8x16 vectors for less-than.

r0 = (a0 < b0) ? 0xff : 0x0
...
rN = (aN < bN) ? 0xff : 0x0
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6-7 instructions.
  • In AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_int16<N, mask_int16<N> > simdpp::cmp_lt ( int16< N, E1 >  a,
int16< N, E2 >  b 
)

Compares the values of two signed int16x8 vectors for less-than.

r0 = (a0 < b0) ? 0xffff : 0x0
...
rN = (aN < bN) ? 0xffff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_int16<N, mask_int16<N> > simdpp::cmp_lt ( uint16< N, E1 >  a,
uint16< N, E2 >  b 
)

Compares the values of two unsigned int16x8 vectors for less-than.

r0 = (a0 < b0) ? 0xffff : 0x0
...
rN = (aN < bN) ? 0xffff : 0x0
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6-7 instructions.
  • In AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_int32<N, mask_int32<N> > simdpp::cmp_lt ( int32< N, E1 >  a,
int32< N, E2 >  b 
)

Compares the values of two signed int32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_int32<N, mask_int32<N> > simdpp::cmp_lt ( uint32< N, E1 >  a,
uint32< N, E2 >  b 
)

Compares the values of two unsigned int32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6-7 instructions.
  • In AVX2 this intrinsic results in at least 3-4 instructions.
  • In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_float32<N, mask_float32<N> > simdpp::cmp_lt ( float32< N, E1 >  a,
float32< N, E2 >  b 
)

Compares the values of two float32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_float64<N, mask_float64<N> > simdpp::cmp_lt ( float64< N, E1 >  a,
float64< N, E2 >  b 
)

Compares the values of two float64x2 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffffffffffff : 0x0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
mask_int8<N, mask_int8<N> > simdpp::cmp_neq ( const any_int8< N, V1 > &  a,
const any_int8< N, V2 > &  b 
)

Compares the values of two int8x16 vectors for inequality.

r0 = (a0 != b0) ? 0xff : 0x0
...
rN = (aN != bN) ? 0xff : 0x0
128-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 4 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
mask_int16<N, mask_int16<N> > simdpp::cmp_neq ( const any_int16< N, V1 > &  a,
const any_int16< N, V2 > &  b 
)

Compares the values of two int16x8 vectors for inequality.

r0 = (a0 != b0) ? 0xffff : 0x0
...
rN = (aN != bN) ? 0xffff : 0x0
128-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 4 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
mask_int32<N, mask_int32<N> > simdpp::cmp_neq ( const any_int32< N, V1 > &  a,
const any_int32< N, V2 > &  b 
)

Compares the values of two int32x4 vectors for inequality.

r0 = (a0 != b0) ? 0xffffffff : 0x0
...
rN = (aN != bN) ? 0xffffffff : 0x0
128-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 4 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
mask_int64<N, mask_int64<N> > simdpp::cmp_neq ( const any_int64< N, V1 > &  a,
const any_int64< N, V2 > &  b 
)

Compares the values of two int64x2 vectors for inequality.

r0 = (a0 != b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN != bN) ? 0xffffffffffffffff : 0x0
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 5 instructions.
  • In SSE4.1 and AVX this intrinsic results in at least 2 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 3-5 instructions.
256-bit version:
  • In SSE2-SSSE3 and AVX this intrinsic results in at least 10 instructions.
  • In SSE4.1 and NEON this intrinsic results in at least 4 instructions.
  • In AVX2 and XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 8 instructions.
  • In ALTIVEC this intrinsic results in at least 6-8 instructions.
template<unsigned N, class V1 , class V2 >
mask_float32<N, mask_float32<N> > simdpp::cmp_neq ( const any_float32< N, V1 > &  a,
const any_float32< N, V2 > &  b 
)

Compares the values of two float32x4 vectors for inequality.

r0 = (a0 != b0) ? 0xffffffff : 0x0
...
rN = (aN != bN) ? 0xffffffff : 0x0
128-bit version:
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 4 instructions.
template<unsigned N, class V1 , class V2 >
mask_float64<N, mask_float64<N> > simdpp::cmp_neq ( const any_float64< N, V1 > &  a,
const any_float64< N, V2 > &  b 
)

Compares the values of two float64x2 vectors for inequality.

r0 = (a0 != b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN != bN) ? 0xffffffffffffffff : 0x0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
float32<N, float32<N> > simdpp::div ( float32< N, E1 >  a,
float32< N, E2 >  b 
)

Divides the values of two vectors.

r0 = a0 / b0
...
rN = aN / bN
  • In NEON this intrinsic results in at least 6 instructions.
  • In ALTIVEC this intrinsic results in at least 10 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 12 instructions.
  • In ALTIVEC this intrinsic results in at least 19 instructions.
template<unsigned N, class E1 , class E2 >
float64<N, float64<N> > simdpp::div ( float64< N, E1 >  a,
float64< N, E2 >  b 
)

Divides the values of two vectors.

r0 = a0 / b0
...
rN = aN / bN
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
template<unsigned P>
uint8x16 simdpp::div_p ( uint8x16  num,
uint8x16  den 
)

Divides one 8-bit unsigned number by another.

The precision of the operation is configurable: only P least significant bits of both numerator and denominator are considered.

r0 = num0 / den0
...
rN = numN / denN
128-bit version:
The operation costs at least 9 instructions per bit of precision.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 10 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
template<unsigned P>
uint16x8 simdpp::div_p ( uint16x8  num,
uint16x8  den 
)

Divides one 16-bit unsigned number by another.

The precision of the operation is configurable: only P least significant bits of both numerator and denominator are considered.

r0 = num0 / den0
...
rN = numN / denN
128-bit version:
The operation costs at least 9 instructions per bit of precision.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 10 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
template<unsigned id>
uint16_t simdpp::extract ( uint16x8  a)

Extracts the id-th element from int16x8 vector.

r = a[id]

This function may have very high latency.

  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
int16_t simdpp::extract ( int16x8  a)

Extracts the id-th element from int16x8 vector.

r = a[id]

This function may have very high latency.

  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
uint32_t simdpp::extract ( uint32x4  a)

Extracts the id-th element from int32x4 vector.

r = a[id]

This function may have very high latency.

  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
int32_t simdpp::extract ( int32x4  a)

Extracts the id-th element from int32x4 vector.

r = a[id]

This function may have very high latency.

  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
uint64_t simdpp::extract ( uint64x2  a)

Extracts an element from int64x2 vector.

r = a[id]

This function may have very high latency.

  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
  • In SSE4_1 this intrinsic results in at least 1 instructions.
  • In SSE2_32bit, SSE3_32bit and SSSE3_32bit this intrinsic results in at least 3-4 instructions.
  • In SSE4_1_32bit this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
int64_t simdpp::extract ( int64x2  a)

Extracts an element from int64x2 vector.

r = a[id]

This function may have very high latency.

  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
  • In SSE4_1 this intrinsic results in at least 1 instructions.
  • In SSE2_32bit, SSE3_32bit and SSSE3_32bit this intrinsic results in at least 3-4 instructions.
  • In SSE4_1_32bit this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
float simdpp::extract ( float32x4  a)

Extracts an element from float32x4 vector.

r = a[id]

This function may have very high latency.

  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
double simdpp::extract ( float64x2  a)

Extracts an element from float64x2 vector.

r = a[id]

This function may have very high latency.

  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
uint16_t simdpp::extract_bits ( uint8x16  a)

Extracts a specific bit from each byte of each element of an int8x16 vector.

The default template argument selects the bits from each byte in most efficient way.

r = (a[0] & 0x80 >> 7) | (a[1] & 0x80 >> 6) | ... | (a[15] & 0x80 << 8)
  • In SSE2-AVX2 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 7-9 instructions.
  • In ALTIVEC this intrinsic results in at least 9-11 instructions.
uint16_t simdpp::extract_bits_any ( uint8x16  a)
inline

Extracts a bit from each byte of each element of an int8x16 vector.

This operation is only sensible if each byte within the vector is either 0x00 or 0xff.

r = ((a[0] & 0x??) ? 0x01 : 0) |
((a[1] & 0x??) ? 0x02 : 0) |
...
((a[15] & 0x??) ? 0x8000 : 0)
  • In NEON this intrinsic results in at least 6-7 instructions.
  • In ALTIVEC this intrinsic results in at least 8-9 instructions.
template<unsigned N, class E >
float32<N, float32<N> > simdpp::floor ( float32< N, E >  a)

Rounds the values of a vector towards negative infinity.

r0 = floor(a0)
...
rN = floor(aN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 12-14 instructions.
  • In NEON this intrinsic results in at least 10-11 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 24-26 instructions.
  • In NEON this intrinsic results in at least 20-21 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 , class E3 >
float32<N, expr_fmadd<float32<N,E1>, float32<N,E2>, float32<N,E3> > > simdpp::fmadd ( float32< N, E1 >  a,
float32< N, E2 >  b,
float32< N, E3 >  c 
)

Performs a fused multiply-add operation.

r0 = a0 * b0 + c0
...
rN = aN * bN + cN

Implemented only on architectures with either X86_FMA3 or X86_FMA4 support.

template<unsigned N, class E1 , class E2 , class E3 >
float64<N, expr_fmadd<float64<N,E1>, float64<N,E2>, float64<N,E3> > > simdpp::fmadd ( float64< N, E1 >  a,
float64< N, E2 >  b,
float64< N, E3 >  c 
)

Performs a fused multiply-add operation.

r0 = a0 * b0 + c0
...
rN = aN * bN + cN

Implemented only on architectures with either X86_FMA3 or X86_FMA4 support.

template<unsigned N, class E1 , class E2 , class E3 >
float32<N, expr_fmsub<float32<N,E1>, float32<N,E2>, float32<N,E3> > > simdpp::fmsub ( float32< N, E1 >  a,
float32< N, E2 >  b,
float32< N, E3 >  c 
)

Performs a fused multiply-subtract operation.

r0 = a0 * b0 - c0
...
rN = aN * bN - cN

Implemented only on architectures with either X86_FMA3 or X86_FMA4 support.

template<unsigned N, class E1 , class E2 , class E3 >
float64<N, expr_fmsub<float64<N,E1>, float64<N,E2>, float64<N,E3> > > simdpp::fmsub ( float64< N, E1 >  a,
float64< N, E2 >  b,
float64< N, E3 >  c 
)

Performs a fused multiply-subtract operation.

r0 = a0 * b0 - c0
...
rN = aN * bN - cN

Implemented only on architectures with either X86_FMA3 or X86_FMA4 support.

Arch simdpp::get_arch_gcc_builtin_cpu_supports ( )
inline

Retrieves supported architecture using GCC __builtin_cpu_supports function.

Works only on x86.

Arch simdpp::get_arch_linux_cpuinfo ( )
inline

Retrieves supported architecture from Linux /proc/cpuinfo file.

Works on X86 and ARM.

template<unsigned N, class E >
mask_float32<N, mask_float32<N> > simdpp::isnan ( float32< N, E >  a)

Checks whether elements in a are IEEE754 NaN.

r0 = isnan(a0) ? 0xffffffff : 0
...
rN = isnan(aN) ? 0xffffffff : 0
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E >
mask_float64<N, mask_float64<N> > simdpp::isnan ( float64< N, E >  a)

Checks whether elements in a are IEEE754 NaN.

r0 = isnan(a0) ? 0xffffffffffffffff : 0
...
rN = isnan(aN) ? 0xffffffffffffffff : 0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
mask_float32<N, mask_float32<N> > simdpp::isnan2 ( float32< N, E1 >  a,
float32< N, E2 >  b 
)

Checks whether corresponding elements in either a or b are IEEE754 NaN.

r0 = isnan(a0) || isnan(b0) ? 0xffffffff : 0
...
rN = isnan(aN) || isnan(bN) ? 0xffffffff : 0
128-bit version:
  • In NEON and ALTIVEC this intrinsic results in at least 3 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 6 instructions.
template<unsigned N, class E1 , class E2 >
mask_float64<N, mask_float64<N> > simdpp::isnan2 ( float64< N, E1 >  a,
float64< N, E2 >  b 
)

Checks whether corresponding elements in either a or b are IEEE754 NaN.

r0 = isnan(a0) || isnan(b0) ? 0xffffffffffffffff : 0
...
rN = isnan(aN) || isnan(bN) ? 0xffffffffffffffff : 0
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
template<class V = expr_vec_load>
V simdpp::load ( const void *  p)

Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an aligned memory location.

128-bit version:
a[0..127] = *(p)

p must be aligned to 16 bytes.

256-bit version:
a[0..255] = *(p)

p must be aligned to 32 bytes.

  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
  • In AVX (integer vectors) this intrinsic results in at least 2 instructions.
template<unsigned N, class V >
void simdpp::load_packed2 ( any_vec< N, V > &  a,
any_vec< N, V > &  b,
const void *  p 
)

Loads values packed in pairs, de-interleaves them and stores the result into two vectors.

a = [ *(p), *(p+2), *(p+4), ... , *(p+M*2-2) ]
b = [ *(p+1), *(p+3), *(p+5), ... , *(p+M*2-1) ]

Here M is the number of elements in the vector

p must be aligned to the vector size in bytes

template<unsigned N, class V >
void simdpp::load_packed3 ( any_vec< N, V > &  a,
any_vec< N, V > &  b,
any_vec< N, V > &  c,
const void *  p 
)

Loads values packed in triplets, de-interleaves them and stores the result into three vectors.

a = [ *(p), *(p+3), *(p+6), ... , *(p+M*3-3) ]
b = [ *(p+1), *(p+4), *(p+7), ... , *(p+M*3-2) ]
c = [ *(p+2), *(p+5), *(p+8), ... , *(p+M*3-1) ]

Here M is the number of elements in the vector

p must be aligned to the vector size in bytes

template<unsigned N, class V >
void simdpp::load_packed4 ( any_vec< N, V > &  a,
any_vec< N, V > &  b,
any_vec< N, V > &  c,
any_vec< N, V > &  d,
const void *  p 
)

Loads values packed in quartets, de-interleaves them and stores the result into four vectors.

a = [ *(p), *(p+4), *(p+8), ... , *(p+M*4-4) ]
b = [ *(p+1), *(p+5), *(p+9), ... , *(p+M*4-3) ]
c = [ *(p+2), *(p+6), *(p+10), ... , *(p+M*4-2) ]
d = [ *(p+3), *(p+7), *(p+11), ... , *(p+M*4-1) ]

Here M is the number of elements in the vector

p must be aligned to the vector size in bytes

template<class V = expr_vec_load_splat>
V simdpp::load_splat ( const void *  p)

Loads a value from a memory location and broadcasts it to all elements of a vector.

r0 = *p
...
rN = *p

p must have the alignment of the element of the target vector.

template<class V = expr_vec_load_u>
V simdpp::load_u ( const void *  p)

Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location.

128-bit version:
a[0..127] = *(p)

p must be aligned to the element size. If p is aligned to 16 bytes only the referenced 16 byte block is accessed. Otherwise, memory within the smallest 16-byte aligned 32-byte block may be accessed.

  • In ALTIVEC this intrinsic results in at least 4 instructions.
256-bit version:
a[0..255] = *(p)

p must be aligned to 32 bytes.

  • In SSE2-SSE4.1 and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 6 instructions.

p must be aligned to the element size. If p is aligned to 32 bytes only the referenced 32 byte block is accessed. Otherwise, memory within the smallest 32-byte aligned 64-byte block may be accessed.

template<class V = expr_vec_make_const<double,1>>
V simdpp::make_float ( double  v0)

Creates a vector from floating-point values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time; thus, even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version
| 0 1 2 3 ... n |
r = [ v0 v0 v0 v0 ... v0 ]
2 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v0 v1 ... v1 ]
4 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v2 v3 ... v3 ]
8 parameters version
| 0 1 .. 7 8 ... n |
r = [ v0 v1 .. v7 v0 ... v7 ]
template<class V = expr_vec_make_const<double,2>>
V simdpp::make_float ( double  v0,
double  v1 
)

Creates a vector from floating-point values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time; thus, even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version
| 0 1 2 3 ... n |
r = [ v0 v0 v0 v0 ... v0 ]
2 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v0 v1 ... v1 ]
4 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v2 v3 ... v3 ]
8 parameters version
| 0 1 .. 7 8 ... n |
r = [ v0 v1 .. v7 v0 ... v7 ]
template<class V = expr_vec_make_const<double,4>>
V simdpp::make_float ( double  v0,
double  v1,
double  v2,
double  v3 
)

Creates a vector from floating-point values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time; thus, even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version
| 0 1 2 3 ... n |
r = [ v0 v0 v0 v0 ... v0 ]
2 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v0 v1 ... v1 ]
4 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v2 v3 ... v3 ]
8 parameters version
| 0 1 .. 7 8 ... n |
r = [ v0 v1 .. v7 v0 ... v7 ]
template<class V = expr_vec_make_const<double,8>>
V simdpp::make_float ( double  v0,
double  v1,
double  v2,
double  v3,
double  v4,
double  v5,
double  v6,
double  v7 
)

Creates a vector from floating-point values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time; thus, even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version
| 0 1 2 3 ... n |
r = [ v0 v0 v0 v0 ... v0 ]
2 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v0 v1 ... v1 ]
4 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v2 v3 ... v3 ]
8 parameters version
| 0 1 .. 7 8 ... n |
r = [ v0 v1 .. v7 v0 ... v7 ]
template<class V = expr_vec_make_const<int64_t,1>>
V simdpp::make_int ( int64_t  v0)

Creates a vector from signed integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time; thus, even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version
| 0 1 2 3 ... n |
r = [ v0 v0 v0 v0 ... v0 ]
2 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v0 v1 ... v1 ]
4 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v2 v3 ... v3 ]
8 parameters version
| 0 1 .. 7 8 ... n |
r = [ v0 v1 .. v7 v0 ... v7 ]
template<class V = expr_vec_make_const<int64_t,2>>
V simdpp::make_int ( int64_t  v0,
int64_t  v1 
)

Creates a vector from signed integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time; thus, even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version
| 0 1 2 3 ... n |
r = [ v0 v0 v0 v0 ... v0 ]
2 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v0 v1 ... v1 ]
4 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v2 v3 ... v3 ]
8 parameters version
| 0 1 .. 7 8 ... n |
r = [ v0 v1 .. v7 v0 ... v7 ]
template<class V = expr_vec_make_const<int64_t,4>>
V simdpp::make_int ( int64_t  v0,
int64_t  v1,
int64_t  v2,
int64_t  v3 
)

Creates a vector from signed integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time; thus, even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version
| 0 1 2 3 ... n |
r = [ v0 v0 v0 v0 ... v0 ]
2 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v0 v1 ... v1 ]
4 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v2 v3 ... v3 ]
8 parameters version
| 0 1 .. 7 8 ... n |
r = [ v0 v1 .. v7 v0 ... v7 ]
template<class V = expr_vec_make_const<int64_t,8>>
V simdpp::make_int ( int64_t  v0,
int64_t  v1,
int64_t  v2,
int64_t  v3,
int64_t  v4,
int64_t  v5,
int64_t  v6,
int64_t  v7 
)

Creates a vector from signed integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time; thus, even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version
| 0 1 2 3 ... n |
r = [ v0 v0 v0 v0 ... v0 ]
2 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v0 v1 ... v1 ]
4 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v2 v3 ... v3 ]
8 parameters version
| 0 1 .. 7 8 ... n |
r = [ v0 v1 .. v7 v0 ... v7 ]
template<class V = expr_vec_make_const<int64_t,16>>
V simdpp::make_int ( int64_t  v0,
int64_t  v1,
int64_t  v2,
int64_t  v3,
int64_t  v4,
int64_t  v5,
int64_t  v6,
int64_t  v7,
int64_t  v8,
int64_t  v9,
int64_t  v10,
int64_t  v11,
int64_t  v12,
int64_t  v13,
int64_t  v14,
int64_t  v15 
)

Creates a vector from signed integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time; thus, even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version
| 0 1 2 3 ... n |
r = [ v0 v0 v0 v0 ... v0 ]
2 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v0 v1 ... v1 ]
4 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v2 v3 ... v3 ]
8 parameters version
| 0 1 .. 7 8 ... n |
r = [ v0 v1 .. v7 v0 ... v7 ]
template<int s0, int s1, unsigned N>
uint8<N> simdpp::make_shuffle_bytes16_mask ( uint8< N > &  mask)

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of two adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector. In that case the mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero. In that case the mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])
...
r14 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+14] : b[s0+12])
r15 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+14] : b[s1+12])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, unsigned N>
uint8<N> simdpp::make_shuffle_bytes16_mask ( uint8< N > &  mask)

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of four adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector. In that case the mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero. In that case the mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
...
r12 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0+12] : b[s0+8])
r13 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1+12] : b[s1+8])
r14 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2+12] : b[s2+8])
r15 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3+12] : b[s3+8])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, unsigned N>
uint8<N> simdpp::make_shuffle_bytes16_mask ( uint8< N > &  mask)

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of eight adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,7] select elements from the first vector. Values [8,15] select elements from the second vector. In that case the mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero. In that case the mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0] : b[s0-8])
...
r7 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7] : b[s7-8])
r8 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0+8] : b[s0])
...
r15 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7+8] : b[s7])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, int s8, int s9, int s10, int s11, int s12, int s13, int s14, int s15, unsigned N>
uint8<N> simdpp::make_shuffle_bytes16_mask ( uint8< N > &  mask)

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,15] select elements from the first vector. Values [16,31] select elements from the second vector. In that case the mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero. In that case the mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 16 ? a[s0] : b[s0-16])
r1 = (s1 == -1) ? 0 : (s1 < 16 ? a[s1] : b[s1-16])
...
r15 = (s15 == -1) ? 0 : (s15 < 16 ? a[s15] : b[s15-16])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, unsigned N>
uint16<N> simdpp::make_shuffle_bytes16_mask ( uint16< N > &  mask)

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of two adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector. In that case the mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero. In that case the mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])
...
r6 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+6] : b[s0+4])
r7 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+6] : b[s1+4])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, unsigned N>
uint16<N> simdpp::make_shuffle_bytes16_mask ( uint16< N > &  mask)

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of four adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector. In that case the mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero. In that case the mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
r4 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0+4] : b[s0])
r5 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1+4] : b[s1])
r6 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2+4] : b[s2])
r7 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3+4] : b[s3])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, unsigned N>
uint16<N> simdpp::make_shuffle_bytes16_mask ( uint16< N > &  mask)

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,7] select elements from the first vector. Values [8,15] select elements from the second vector. In that case the mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero. In that case the mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0] : b[s0-8])
...
r7 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7] : b[s7-8])
r8 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0+8] : b[s0])
...
r15 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7+8] : b[s7])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, unsigned N>
uint32<N> simdpp::make_shuffle_bytes16_mask ( uint32< N > &  mask)

Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of two adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector. In that case the mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero. In that case the mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, unsigned N>
uint32<N> simdpp::make_shuffle_bytes16_mask ( uint32< N > &  mask)

Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector. In that case the mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero. In that case the mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, unsigned N>
uint64<N> simdpp::make_shuffle_bytes16_mask ( uint64< N > &  mask)

Makes a mask to shuffle an int64x2 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector. In that case the mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero. In that case the mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<class V = expr_vec_make_const<uint64_t,1>>
V simdpp::make_uint ( uint64_t  v0)

Creates a vector from unsigned integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time, so even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

Note that per C++ rules negative values are sign-extended to fill entire element before being converted to unsigned type thus e.g. it's safe to use -1 to fill element with ones.

1 parameter version
| 0 1 2 3 ... n |
r = [ v0 v0 v0 v0 ... v0 ]
2 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v0 v1 ... v1 ]
4 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v2 v3 ... v3 ]
8 parameters version
| 0 1 .. 7 8 ... n |
r = [ v0 v1 .. v7 v0 ... v7 ]
template<class V = expr_vec_make_const<uint64_t,2>>
V simdpp::make_uint ( uint64_t  v0,
uint64_t  v1 
)

Creates a vector from unsigned integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time, so even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

Note that per C++ rules negative values are sign-extended to fill entire element before being converted to unsigned type thus e.g. it's safe to use -1 to fill element with ones.

1 parameter version
| 0 1 2 3 ... n |
r = [ v0 v0 v0 v0 ... v0 ]
2 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v0 v1 ... v1 ]
4 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v2 v3 ... v3 ]
8 parameters version
| 0 1 .. 7 8 ... n |
r = [ v0 v1 .. v7 v0 ... v7 ]
template<class V = expr_vec_make_const<uint64_t,4>>
V simdpp::make_uint ( uint64_t  v0,
uint64_t  v1,
uint64_t  v2,
uint64_t  v3 
)

Creates a vector from unsigned integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time, so even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

Note that per C++ rules negative values are sign-extended to fill entire element before being converted to unsigned type thus e.g. it's safe to use -1 to fill element with ones.

1 parameter version
| 0 1 2 3 ... n |
r = [ v0 v0 v0 v0 ... v0 ]
2 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v0 v1 ... v1 ]
4 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v2 v3 ... v3 ]
8 parameters version
| 0 1 .. 7 8 ... n |
r = [ v0 v1 .. v7 v0 ... v7 ]
template<class V = expr_vec_make_const<uint64_t,8>>
V simdpp::make_uint ( uint64_t  v0,
uint64_t  v1,
uint64_t  v2,
uint64_t  v3,
uint64_t  v4,
uint64_t  v5,
uint64_t  v6,
uint64_t  v7 
)

Creates a vector from unsigned integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time, so even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

Note that per C++ rules negative values are sign-extended to fill entire element before being converted to unsigned type thus e.g. it's safe to use -1 to fill element with ones.

1 parameter version
| 0 1 2 3 ... n |
r = [ v0 v0 v0 v0 ... v0 ]
2 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v0 v1 ... v1 ]
4 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v2 v3 ... v3 ]
8 parameters version
| 0 1 .. 7 8 ... n |
r = [ v0 v1 .. v7 v0 ... v7 ]
template<class V = expr_vec_make_const<uint64_t,16>>
V simdpp::make_uint ( uint64_t  v0,
uint64_t  v1,
uint64_t  v2,
uint64_t  v3,
uint64_t  v4,
uint64_t  v5,
uint64_t  v6,
uint64_t  v7,
uint64_t  v8,
uint64_t  v9,
uint64_t  v10,
uint64_t  v11,
uint64_t  v12,
uint64_t  v13,
uint64_t  v14,
uint64_t  v15 
)

Creates a vector from unsigned integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time, so even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

Note that per C++ rules negative values are sign-extended to fill entire element before being converted to unsigned type thus e.g. it's safe to use -1 to fill element with ones.

1 parameter version
| 0 1 2 3 ... n |
r = [ v0 v0 v0 v0 ... v0 ]
2 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v0 v1 ... v1 ]
4 parameters version
| 0 1 2 3 ... n |
r = [ v0 v1 v2 v3 ... v3 ]
8 parameters version
| 0 1 .. 7 8 ... n |
r = [ v0 v1 .. v7 v0 ... v7 ]
template<unsigned N, class E1 , class E2 >
float32<N, float32<N> > simdpp::max ( float32< N, E1 >  a,
float32< N, E2 >  b 
)

Computes maxima of the values of two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = max(a0, b0)
...
rN = max(aN, bN)
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
int8<N, int8<N> > simdpp::max ( int8< N, E1 >  a,
int8< N, E2 >  b 
)

Computes maximum of the signed 8-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 4 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint8<N, uint8<N> > simdpp::max ( uint8< N, E1 >  a,
uint8< N, E2 >  b 
)

Computes maximum of the unsigned 8-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
float64<N, float64<N> > simdpp::max ( float64< N, E1 >  a,
float64< N, E2 >  b 
)

Computes maxima of the values of two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = max(a0, b0)
...
rN = max(aN, bN)
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
template<unsigned N, class E1 , class E2 >
int16<N, int16<N> > simdpp::max ( int16< N, E1 >  a,
int16< N, E2 >  b 
)

Computes maximum of the signed 16-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint16<N, uint16<N> > simdpp::max ( uint16< N, E1 >  a,
uint16< N, E2 >  b 
)

Computes maximum of the unsigned 16-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
int32<N, int32<N> > simdpp::max ( int32< N, E1 >  a,
int32< N, E2 >  b 
)

Computes maximum of the signed 32-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 4 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint32<N, uint32<N> > simdpp::max ( uint32< N, E1 >  a,
uint32< N, E2 >  b 
)

Computes maximum of the unsigned 32-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
float32<N, float32<N> > simdpp::min ( float32< N, E1 >  a,
float32< N, E2 >  b 
)

Computes minimum of the values in two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = min(a0, b0)
...
rN = min(aN, bN)
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
int8<N, int8<N> > simdpp::min ( int8< N, E1 >  a,
int8< N, E2 >  b 
)

Computes minimum of signed 8-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 4 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint8<N, uint8<N> > simdpp::min ( uint8< N, E1 >  a,
uint8< N, E2 >  b 
)

Computes minimum of the unsigned 8-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
float64<N, float64<N> > simdpp::min ( float64< N, E1 >  a,
float64< N, E2 >  b 
)

Computes minima of the values in two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = min(a0, b0)
...
rN = min(aN, bN)
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
int16<N, int16<N> > simdpp::min ( int16< N, E1 >  a,
int16< N, E2 >  b 
)

Computes minimum of the signed 16-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint16<N, uint16<N> > simdpp::min ( uint16< N, E1 >  a,
uint16< N, E2 >  b 
)

Computes minimum of the unsigned 16-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
int32<N, int32<N> > simdpp::min ( int32< N, E1 >  a,
int32< N, E2 >  b 
)

Computes minimum of the signed 32-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 4 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint32<N, uint32<N> > simdpp::min ( uint32< N, E1 >  a,
uint32< N, E2 >  b 
)

Computes minimum of the unsigned 32-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
  • In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask<V, void>::empty simdpp::move16_l ( const any_vec8< N, V > &  a)

Moves the elements in an int8x16 vector to the left by shift positions.

shift: pos:| 0 1 . 14 15 |
0 r = [ a0 a1 . a14 a15 ]
1 r = [ a1 a2 . a15 0 ]
2 r = [ a2 a3 . 0 0 ]
... .. .. .. ... .. .. ..
14 r = [ a14 a15 . 0 0 ]
15 r = [ a15 0 . 0 0 ]
16 r = [ 0 0 . 0 0 ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask<V, void>::empty simdpp::move16_r ( const any_vec8< N, V > &  a)

Moves the 8-bit elements in a vector to the right by shift positions.

shift: pos:| 0 1 . 14 15 |
0 r = [ a0 a1 . a14 a15 ]
1 r = [ 0 a0 . a13 a14 ]
2 r = [ 0 0 . a12 a13 ]
... .. .. .. ... .. .. ..
14 r = [ 0 0 . a0 a1 ]
15 r = [ 0 0 . 0 a0 ]
16 r = [ 0 0 . 0 0 ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask<V, void>::empty simdpp::move2_l ( const any_vec64< N, V > &  a)

Moves the 64-bit elements in a vector to the left by shift positions.

shift: pos:| 0 1 |
0 r = [ a0 a1 ]
1 r = [ a1 0 ]
2 r = [ 0 0 ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask<V, void>::empty simdpp::move2_r ( const any_vec64< N, V > &  a)

Moves the 64-bit elements in a vector to the right by shift positions.

shift: pos:| 0 1 |
0 r = [ a0 a1 ]
1 r = [ 0 a0 ]
2 r = [ 0 0 ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask<V, void>::empty simdpp::move4_l ( const any_vec32< N, V > &  a)

Moves the 32-bit elements in a vector to the left by shift positions.

shift: pos:| 0 1 2 3 |
0 r = [ a0 a1 a2 a3 ]
1 r = [ a1 a2 a3 0 ]
2 r = [ a2 a3 0 0 ]
3 r = [ a3 0 0 0 ]
4 r = [ 0 0 0 0 ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask<V, void>::empty simdpp::move4_r ( const any_vec32< N, V > &  a)

Moves the 32-bit elements in a vector to the right by shift positions.

shift: pos:| 0 1 2 3 |
0 r = [ a0 a1 a2 a3 ]
1 r = [ 0 a0 a1 a2 ]
2 r = [ 0 0 a0 a1 ]
3 r = [ 0 0 0 a0 ]
4 r = [ 0 0 0 0 ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask<V, void>::empty simdpp::move8_l ( const any_vec16< N, V > &  a)

Moves the 16-bit elements in a vector to the left by shift positions.

shift: pos:| 0 1 . 6 7 |
0 r = [ a0 a1 . a6 a7 ]
1 r = [ a1 a2 . a7 0 ]
2 r = [ a2 a3 . 0 0 ]
... .. .. .. ... .. ..
6 r = [ a6 a7 . 0 0 ]
7 r = [ a7 0 . 0 0 ]
8 r = [ 0 0 . 0 0 ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask<V, void>::empty simdpp::move8_r ( const any_vec16< N, V > &  a)

Moves the 16-bit elements in a vector to the right by shift positions.

shift: pos:| 0 1 . 6 7 |
0 r = [ a0 a1 . a6 a7 ]
1 r = [ 0 a0 . a5 a6 ]
2 r = [ 0 0 . a4 a5 ]
... .. .. .. ... .. ..
6 r = [ 0 0 . a0 a1 ]
7 r = [ 0 0 . 0 a0 ]
8 r = [ 0 0 . 0 0 ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
float32<N, expr_mul<float32<N,E1>, float32<N,E2> > > simdpp::mul ( float32< N, E1 >  a,
float32< N, E2 >  b 
)

Multiplies the values of two vectors.

r0 = a0 * b0
...
rN = aN * bN
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
float64<N, expr_mul<float64<N,E1>, float64<N,E2> > > simdpp::mul ( float64< N, E1 >  a,
float64< N, E2 >  b 
)

Multiplies the values of two vectors.

r0 = a0 * b0
...
rN = aN * bN
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
int16<N, expr_mul_hi<int16<N,E1>, int16<N,E2> > > simdpp::mul_hi ( int16< N, E1 >  a,
int16< N, E2 >  b 
)

Multiplies signed 16-bit values and returns the higher half of the result.

r0 = high(a0 * b0)
...
rN = high(aN * bN)
128-bit version:
  • In NEON and ALTIVEC this intrinsic results in at least 3 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 6 instructions.
template<unsigned N, class E1 , class E2 >
uint16<N, expr_mul_hi<uint16<N,E1>, uint16<N,E2> > > simdpp::mul_hi ( uint16< N, E1 >  a,
uint16< N, E2 >  b 
)

Multiplies unsigned 16-bit values and returns the higher half of the result.

r0 = high(a0 * b0)
...
rN = high(aN * bN)
128-bit version:
  • In NEON and ALTIVEC this intrinsic results in at least 3 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 6 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, expr_mul_lo<uint16<N, typename V1::expr_type>, uint16<N, typename V2::expr_type> > >::type simdpp::mul_lo ( const any_int16< N, V1 > &  a,
const any_int16< N, V2 > &  b 
)

Multiplies 16-bit values and returns the lower part of the multiplication.

r0 = low(a0 * b0)
...
rN = low(aN * bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, expr_mul_lo<uint32<N, typename V1::expr_type>, uint32<N, typename V2::expr_type> > >::type simdpp::mul_lo ( const any_int32< N, V1 > &  a,
const any_int32< N, V2 > &  b 
)

Multiplies 32-bit values and returns the lower half of the result.

r0 = low(a0 * b0)
...
rN = low(aN * bN)
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 6 instructions.
  • In ALTIVEC this intrinsic results in at least 8 instructions.
256-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 12 instructions.
  • In SSE4.1, AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 16 instructions.
template<unsigned N, class E1 , class E2 >
int32<N, expr_mull<int16<N,E1>, int16<N,E2> > > simdpp::mull ( int16< N, E1 >  a,
int16< N, E2 >  b 
)

Multiplies signed 16-bit values and expands the results to 32 bits.

128-bit version:
r0 = a0 * b0
...
rN = aN * bN
  • In SSE2-AVX and ALTIVEC this intrinsic results in at least 2-3 instructions.
256-bit version:

The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

  • In SSE2-AVX and ALTIVEC this intrinsic results in at least 4-6 instructions.
  • In AVX2 and NEON this intrinsic results in at least 2-3 instructions.
template<unsigned N, class E1 , class E2 >
uint32<N, expr_mull<uint16<N,E1>, uint16<N,E2> > > simdpp::mull ( uint16< N, E1 >  a,
uint16< N, E2 >  b 
)

Multiplies unsigned 16-bit values and expands the results to 32 bits.

128-bit version:
r0 = a0 * b0
...
rN = aN * bN
  • In SSE2-AVX2 and ALTIVEC this intrinsic results in at least 2-3 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX and ALTIVEC this intrinsic results in at least 4-6 instructions.
  • In AVX2 this intrinsic results in at least 2-3 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
    Note
    Use with mull_hi on the same arguments to save instructions.
template<unsigned N, class E1 , class E2 >
int64<N, expr_mull<int32<N,E1>, int32<N,E2> > > simdpp::mull ( int32< N, E1 >  a,
int32< N, E2 >  b 
)

Multiplies signed 32-bit values and expands the results to 64 bits.

r0 = a0 * b0
...
rN = aN * bN
128-bit version:
  • In SSE4.1-AVX this intrinsic results in at least 3 instructions.
  • Not implemented for SSE2-SSSE3 and ALTIVEC.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE4.1-AVX this intrinsic results in at least 6 instructions.
  • In AVX2 this intrinsic results in at least 3 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • Not implemented for SSE2-SSSE3 and ALTIVEC.
template<unsigned N, class E1 , class E2 >
uint64<N, expr_mull<uint32<N,E1>, uint32<N,E2> > > simdpp::mull ( uint32< N, E1 >  a,
uint32< N, E2 >  b 
)

Multiplies unsigned 32-bit values in the lower halves of the vectors and expands the results to 64 bits.

128-bit version:
r0 = a0 * b0
r1 = a1 * b1
  • In SSE2-AVX this intrinsic results in at least 3 instructions.
  • Not implemented for ALTIVEC.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 6 instructions.
  • In AVX2 this intrinsic results in at least 3 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • Not implemented for ALTIVEC.
template<unsigned N, class E >
int8<N, expr_neg<int8<N,E> > > simdpp::neg ( int8< N, E >  a)

Negates signed 8-bit values.

r0 = -a0
...
rN = -aN
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E >
float32<N, expr_neg<float32<N,E> > > simdpp::neg ( float32< N, E >  a)

Negates the values of a float32x4 vector.

r0 = -a0
...
rN = -aN
128-bit version:
  • In SSE2-AVX2 and ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-SSE4.1 and ALTIVEC this intrinsic results in at least 2-3 instructions.
  • In AVX-AVX2 and NEON this intrinsic results in at least 2 instructions.
template<unsigned N, class E >
int16<N, expr_neg<int16<N,E> > > simdpp::neg ( int16< N, E >  a)

Negates signed 16-bit values.

r0 = -a0
...
rN = -aN
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E >
float64<N, expr_neg<float64<N,E> > > simdpp::neg ( float64< N, E >  a)

Negates the values of a vector.

r0 = -a0
...
rN = -aN
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 1-2 instructions.
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2-3 instructions.
  • In AVX-AVX2 this intrinsic results in at least 1-2 instructions.
  • Not vectorized in NEON and ALTIVEC.
template<unsigned N, class E >
int32<N, expr_neg<int32<N,E> > > simdpp::neg ( int32< N, E >  a)

Negates signed 32-bit values.

r0 = -a0
...
rN = -aN
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E >
int64<N, expr_neg<int64<N,E> > > simdpp::neg ( int64< N, E >  a)

Negates signed 64-bit values.

r0 = -a0
...
rN = -aN
128-bit version:
  • In ALTIVEC this intrinsic results in at least 4-5 instructions.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 8-9 instructions.
template<unsigned s0, unsigned s1, unsigned N, class V >
detail::get_expr_nomask<V, void>::empty simdpp::permute2 ( const any_vec16< N, V > &  a)

Permutes the 16-bit values within sets of two consecutive elements of the vector.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
r2 = a[s0+2]
r3 = a[s1+2]
r4 = a[s0+4]
r5 = a[s1+4]
...
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 2 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 4 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1, unsigned N, class V >
detail::get_expr_nomask<V, void>::empty simdpp::permute2 ( const any_vec32< N, V > &  a)

Permutes the values of each set of two consecutive 32-bit values.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
r2 = a[s0+2]
r3 = a[s1+2]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = a[s0+6]
r7 = a[s1+6]
integer
128-bit version:
  • In NEON this intrinsic results in at least 2-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
floating-point
128-bit version:
  • In NEON this intrinsic results in at least 2-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1, unsigned N, class V >
detail::get_expr_nomask<V, void>::empty simdpp::permute2 ( const any_vec64< N, V > &  a)

Permutes the values of each set of two consecutive 64-bit values.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
256-bit version:
r2 = a[s0+2]
r3 = a[s1+2]
128-bit version:
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N, class V >
detail::get_expr_nomask<V, void>::empty simdpp::permute4 ( const any_vec16< N, V > &  a)

Permutes the 16-bit values within each 4 consecutive values of the vector.

The selector values must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
r4 = a[s0+4]
...
r7 = a[s3+4]
256-bit version:
r8 = a[s0+8]
...
r11 = a[s3+8]
r12 = a[s0+12]
...
r15 = a[s3+12]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 1-5 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 4 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-10 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N, class V >
detail::get_expr_nomask<V, void>::empty simdpp::permute4 ( const any_vec32< N, V > &  a)

Permutes the values of each set of four consecutive 32-bit values.

The selector values must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
256-bit version:
r4 = a[s0+4]
...
r7 = a[s3+4]
integer
128-bit version:
  • In NEON this intrinsic results in at least 1-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
floating-point
128-bit version:
  • In NEON this intrinsic results in at least 1-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N, class V >
detail::get_expr_nomask<V, void>::empty simdpp::permute4 ( const any_vec64< N, V > &  a)

Permutes the values of each set of four consecutive 64-bit values.

The selector values must be in range [0; 3].

r0 = a[s0]
r1 = a[s1]
r2 = a[s2]
r3 = a[s3]
integer
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
floating-point
  • In SSE2-AVX this intrinsic results in at least 1-2 instructions.
  • In NEON this intrinsic results in at least 1-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-4 instructions.
uint8x16 simdpp::permute_bytes16 ( uint8x16  a,
uint8x16  mask 
)
inline

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: bits 7-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
template<unsigned N>
uint8<N> simdpp::permute_bytes16 ( uint8< N >  a,
uint8< N >  mask 
)

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: bits 7-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
template<unsigned N>
uint16<N> simdpp::permute_bytes16 ( uint16< N >  a,
uint16< N >  mask 
)

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: bits 7-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
template<unsigned N>
uint32<N> simdpp::permute_bytes16 ( uint32< N >  a,
uint32< N >  mask 
)

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: bits 7-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
template<unsigned N>
uint64<N> simdpp::permute_bytes16 ( uint64< N >  a,
uint64< N >  mask 
)

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: bits 7-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
template<unsigned N>
float32<N> simdpp::permute_bytes16 ( float32< N >  a,
uint32< N >  mask 
)

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: bits 7-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
template<unsigned N>
float64<N> simdpp::permute_bytes16 ( float64< N >  a,
uint64< N >  mask 
)

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: bits 7-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
uint8x16 simdpp::permute_zbytes16 ( uint8x16  a,
uint8x16  mask 
)
inline

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: if bit 7 is set, the result byte is zeroed; bits 6-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX this intrinsic results in at least 2 instructions.
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N>
uint8<N> simdpp::permute_zbytes16 ( uint8< N >  a,
uint8< N >  mask 
)

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: if bit 7 is set, the result byte is zeroed; bits 6-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX this intrinsic results in at least 2 instructions.
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N>
uint16<N> simdpp::permute_zbytes16 ( uint16< N >  a,
uint16< N >  mask 
)

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: if bit 7 is set, the result byte is zeroed; bits 6-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX this intrinsic results in at least 2 instructions.
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N>
uint32<N> simdpp::permute_zbytes16 ( uint32< N >  a,
uint32< N >  mask 
)

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: if bit 7 is set, the result byte is zeroed; bits 6-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX this intrinsic results in at least 2 instructions.
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N>
uint64<N> simdpp::permute_zbytes16 ( uint64< N >  a,
uint64< N >  mask 
)

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: if bit 7 is set, the result byte is zeroed; bits 6-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX this intrinsic results in at least 2 instructions.
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N>
float32<N> simdpp::permute_zbytes16 ( float32< N >  a,
uint32< N >  mask 
)

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: if bit 7 is set, the result byte is zeroed; bits 6-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX this intrinsic results in at least 2 instructions.
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N>
float64<N> simdpp::permute_zbytes16 ( float64< N >  a,
uint64< N >  mask 
)

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: if bit 7 is set, the result byte is zeroed; bits 6-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3-AVX this intrinsic results in at least 2 instructions.
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N, class E >
float32<N, float32<N> > simdpp::rcp_e ( float32< N, E >  a)

Computes approximate reciprocal.

Relative error is as follows:

  • 1/2 ULP for NULL and NEON
  • ~1/2730 for SSE2
  • 1/4096 for ALTIVEC
  • 1/256 for NEON_FLT_SP
r0 = approx(1.0f / a0)
...
rN = approx(1.0f / aN)
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E >
float32<N, float32<N> > simdpp::rcp_rh ( float32< N, E >  a)

Computes one Newton-Raphson iteration for reciprocal.

x is the current estimate, a are the values to estimate reciprocal for.

r0 = x0 * (2 - x0*a0)
...
rN = xN * (2 - xN*aN)

Using this function, division can be implemented as follows:

// a/b
x = rcp_e(b);
x = rcp_rh(x, b);
x = rcp_rh(x, b);
return mul(a, x);

Precision can be controlled by selecting the number of rcp_rh steps.

128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
256-bit version:
  • In AVX-AVX2 this intrinsic results in at least 3-4 instructions.
  • In SSE2-SSE4.1 this intrinsic results in at least 6-7 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 4-5 instructions.
template<unsigned N, class E >
float32<N, float32<N> > simdpp::rsqrt_e ( float32< N, E >  a)

Computes approximate reciprocal square root.

Relative error is as follows:

  • 1/2 ULP for NULL and NEON
  • ~1/2730 for SSE2
  • 1/4096 for ALTIVEC
  • 1/256 for NEON_FLT_SP
r0 = approx(1 / sqrt(a0))
...
rN = approx(1 / sqrt(aN))
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E >
float32<N, float32<N> > simdpp::rsqrt_rh ( float32< N, E >  a)

Computes one Newton-Raphson iteration for inverse of square root.

x is the current estimate, a are the values to estimate the inverse square root for.

r0 = x0 * (3 - a0*x0*x0) * 0.5
...
rN = xN * (3 - aN*xN*xN) * 0.5
128-bit version:
  • In SSE2, SSE3, SSSE3 and SSE4.1 this intrinsic results in at least 5-7 instructions.
  • In NEON this intrinsic results in at least 3 instructions.
  • In ALTIVEC this intrinsic results in at least 4-6 instructions.
256-bit version:
  • In AVX-AVX2 this intrinsic results in at least 7 instructions.
  • In SSE2, SSE3, SSSE3 and SSE4.1 this intrinsic results in at least 10-12 instructions.
  • In NEON this intrinsic results in at least 6 instructions.
  • In ALTIVEC this intrinsic results in at least 8-10 instructions.
template<unsigned N, class E >
int8<N, int8<N> > simdpp::shift_l ( int8< N, E >  a,
unsigned  count 
)

Shifts 8-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count
128-bit version:
  • In SSE2-AVX this intrinsic results in at least 4-5 instructions.
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-4 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8-9 instructions.
  • In AVX2 this intrinsic results in at least 4-5 instructions.
  • In NEON this intrinsic results in at least 2-3 instructions.
  • In ALTIVEC this intrinsic results in at least 2-5 instructions.
template<unsigned N, class E >
uint8< N, uint8< N > > simdpp::shift_l ( uint8< N, E >  a,
unsigned  count 
)

Shifts 8-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count
128-bit version:
  • In SSE2-AVX this intrinsic results in at least 4-5 instructions.
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-4 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8-9 instructions.
  • In AVX2 this intrinsic results in at least 4-5 instructions.
  • In NEON this intrinsic results in at least 2-3 instructions.
  • In ALTIVEC this intrinsic results in at least 2-5 instructions.
template<unsigned N, class E >
int16<N, int16<N> > simdpp::shift_l ( int16< N, E >  a,
unsigned  count 
)

Shifts 16-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count
128-bit version:
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-4 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-3 instructions.
  • In ALTIVEC this intrinsic results in at least 2-5 instructions.
template<unsigned N, class E >
uint16< N, uint16< N > > simdpp::shift_l ( uint16< N, E >  a,
unsigned  count 
)

Shifts 16-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count
128-bit version:
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-4 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-3 instructions.
  • In ALTIVEC this intrinsic results in at least 2-5 instructions.
template<unsigned N, class E >
int32<N, int32<N> > simdpp::shift_l ( int32< N, E >  a,
unsigned  count 
)

Shifts 32-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count
128-bit version:
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-4 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-3 instructions.
  • In ALTIVEC this intrinsic results in at least 2-5 instructions.
template<unsigned N, class E >
uint32< N, uint32< N > > simdpp::shift_l ( uint32< N, E >  a,
unsigned  count 
)

Shifts 32-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count
128-bit version:
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-4 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-3 instructions.
  • In ALTIVEC this intrinsic results in at least 2-5 instructions.
template<unsigned N, class E >
int64<N, int64<N> > simdpp::shift_l ( int64< N, E >  a,
unsigned  count 
)

Shifts 64-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count
128-bit version:
  • In NEON this intrinsic results in at least 1-2 instructions.
  • Not implemented for ALTIVEC.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-3 instructions.
  • Not implemented for ALTIVEC.
template<unsigned N, class E >
uint64< N, uint64< N > > simdpp::shift_l ( uint64< N, E >  a,
unsigned  count 
)

Shifts 64-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count
128-bit version:
  • In NEON this intrinsic results in at least 1-2 instructions.
  • Not implemented for ALTIVEC.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-3 instructions.
  • Not implemented for ALTIVEC.
template<unsigned count, unsigned N, class E >
int8<N, int8<N> > simdpp::shift_l ( int8< N, E >  a)

Shifts 8-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 2-3 instructions.
  • In NEON this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 4-5 instructions.
  • In AVX2 and NEON this intrinsic results in at least 2-3 instructions.
template<unsigned count, unsigned N, class E >
uint8< N, uint8< N > > simdpp::shift_l ( uint8< N, E >  a)

Shifts 8-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 2-3 instructions.
  • In NEON this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 4-5 instructions.
  • In AVX2 and NEON this intrinsic results in at least 2-3 instructions.
template<unsigned count, unsigned N, class E >
int16<N, int16<N> > simdpp::shift_l ( int16< N, E >  a)

Shifts 16-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned count, unsigned N, class E >
uint16< N, uint16< N > > simdpp::shift_l ( uint16< N, E >  a)

Shifts 16-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned count, unsigned N, class E >
int32<N, int32<N> > simdpp::shift_l ( int32< N, E >  a)

Shifts 32-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned count, unsigned N, class E >
uint32< N, uint32< N > > simdpp::shift_l ( uint32< N, E >  a)

Shifts 32-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned count, unsigned N, class E >
int64<N, int64<N> > simdpp::shift_l ( int64< N, E >  a)

Shifts 64-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count
  • Not implemented for ALTIVEC.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • Not implemented for ALTIVEC.
template<unsigned count, unsigned N, class E >
uint64< N, uint64< N > > simdpp::shift_l ( uint64< N, E >  a)

Shifts 64-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count
  • Not implemented for ALTIVEC.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • Not implemented for ALTIVEC.
template<unsigned N, class E >
int8< N, int8< N > > simdpp::shift_r ( int8< N, E >  a,
unsigned  count 
)

Shifts signed 8-bit values right by count bits while shifting in the sign bit.

r0 = a0 >> count
...
rN = aN >> count
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 6 instructions.
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-4 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 6 instructions.
  • In NEON this intrinsic results in at least 2-3 instructions.
  • In ALTIVEC this intrinsic results in at least 2-5 instructions.
template<unsigned N, class E >
uint8< N, uint8< N > > simdpp::shift_r ( uint8< N, E >  a,
unsigned  count 
)

Shifts unsigned 8-bit values right by count bits while shifting in zeros.

r0 = a0 >> count
...
rN = aN >> count
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-4 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8-9 instructions.
  • In AVX2 this intrinsic results in at least 4-5 instructions.
  • In NEON this intrinsic results in at least 2-3 instructions.
  • In ALTIVEC this intrinsic results in at least 2-5 instructions.
template<unsigned N, class E >
int16< N, int16< N > > simdpp::shift_r ( int16< N, E >  a,
unsigned  count 
)

Shifts signed 16-bit values right by count bits while shifting in the sign bit.

r0 = a0 >> count
...
rN = aN >> count
128-bit version:
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-4 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-3 instructions.
  • In ALTIVEC this intrinsic results in at least 2-5 instructions.
template<unsigned N, class E >
uint16< N, uint16< N > > simdpp::shift_r ( uint16< N, E >  a,
unsigned  count 
)

Shifts unsigned 16-bit values right by count bits while shifting in zeros.

r0 = a0 >> count
...
rN = aN >> count
128-bit version:
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-4 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-3 instructions.
  • In ALTIVEC this intrinsic results in at least 2-5 instructions.
template<unsigned N, class E >
int32< N, int32< N > > simdpp::shift_r ( int32< N, E >  a,
unsigned  count 
)

Shifts signed 32-bit values right by count bits while shifting in the sign bit.

r0 = a0 >> count
...
rN = aN >> count
r0 = a0 >> count
...
rN = aN >> count
  • In NEON this intrinsic results in at least 2 instructions.
128-bit version:
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-4 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-3 instructions.
  • In ALTIVEC this intrinsic results in at least 2-5 instructions.
template<unsigned N, class E >
uint32< N, uint32< N > > simdpp::shift_r ( uint32< N, E >  a,
unsigned  count 
)

Shifts unsigned 32-bit values right by count bits while shifting in zeros.

r0 = a0 >> count
...
rN = aN >> count
128-bit version:
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-4 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-3 instructions.
  • In ALTIVEC this intrinsic results in at least 2-5 instructions.
template<unsigned N, class E >
int64< N, int64< N > > simdpp::shift_r ( int64< N, E >  a,
unsigned  count 
)

Shifts signed 64-bit values right by count bits while shifting in the sign bit.

r0 = a0 >> count
...
rN = aN >> count
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4-6 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • Not implemented for ALTIVEC.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 8-10 instructions.
  • In AVX2 this intrinsic results in at least 4-6 instructions.
  • In NEON this intrinsic results in at least 3 instructions.
  • Not implemented for ALTIVEC.
template<unsigned N, class E >
uint64< N, uint64< N > > simdpp::shift_r ( uint64< N, E >  a,
unsigned  count 
)

Shifts unsigned 64-bit values right by count bits while shifting in zeros.

r0 = a0 >> count
...
rN = aN >> count
128-bit version:
  • In NEON this intrinsic results in at least 1-2 instructions.
  • Not implemented for ALTIVEC.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-3 instructions.
  • Not implemented for ALTIVEC.
template<unsigned count, unsigned N, class E >
int8< N, int8< N > > simdpp::shift_r ( int8< N, E >  a)

Shifts signed 8-bit values right by count bits while shifting in the sign bit.

r0 = a0 >> count
...
rN = aN >> count
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 6 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 6 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned count, unsigned N, class E >
uint8< N, uint8< N > > simdpp::shift_r ( uint8< N, E >  a)

Shifts unsigned 8-bit values right by count bits while shifting in zeros.

r0 = a0 >> count
...
rN = aN >> count
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
  • In SSE2-AVX2 this intrinsic results in at least 2-3 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 4-5 instructions.
  • In AVX2 this intrinsic results in at least 2-3 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned count, unsigned N, class E >
int16< N, int16< N > > simdpp::shift_r ( int16< N, E >  a)

Shifts signed 16-bit values right by count bits while shifting in the sign bit.

r0 = a0 >> count
...
rN = aN >> count
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned count, unsigned N, class E >
uint16< N, uint16< N > > simdpp::shift_r ( uint16< N, E >  a)

Shifts unsigned 16-bit values right by count bits while shifting in zeros.

r0 = a0 >> count
...
rN = aN >> count
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned count, unsigned N, class E >
int32< N, int32< N > > simdpp::shift_r ( int32< N, E >  a)

Shifts signed 32-bit values right by count bits while shifting in the sign bit.

r0 = a0 >> count
...
rN = aN >> count
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned count, unsigned N, class E >
uint32< N, uint32< N > > simdpp::shift_r ( uint32< N, E >  a)

Shifts unsigned 32-bit values right by count bits while shifting in zeros.

r0 = a0 >> count
...
rN = aN >> count
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned count, unsigned N, class E >
int64< N, int64< N > > simdpp::shift_r ( int64< N, E >  a)

Shifts signed 64-bit values right by count bits while shifting in the sign bit.

r0 = a0 >> count
...
rN = aN >> count
128-bit version:
  • Not vectorized in SSE2-AVX2.
  • In SSE2-AVX2 this intrinsic results in at least 4-6 instructions.
  • Not implemented for ALTIVEC.
256-bit version:
  • Not vectorized in SSE2-AVX.
  • In SSE2-AVX this intrinsic results in at least 8-10 instructions.
  • In AVX2 this intrinsic results in at least 4-6 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • Not implemented for ALTIVEC.
template<unsigned count, unsigned N, class E >
uint64< N, uint64< N > > simdpp::shift_r ( uint64< N, E >  a)

Shifts unsigned 64-bit values right by count bits while shifting in zeros.

r0 = a0 >> count
...
rN = aN >> count
  • Not implemented for ALTIVEC.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • Not implemented for ALTIVEC.
template<unsigned s0, unsigned s1, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, void>::empty simdpp::shuffle1 ( const any_vec64< N, V1 > &  a,
const any_vec64< N, V2 > &  b 
)

Selects 64-bit values from two vectors.

The first value in each pair of values must come from a, the second - from b. The selector values must be in range [0; 1].

r0 = a[s0]
r1 = b[s1]
256-bit version:
r2 = a[s0+2]
r3 = b[s1+2]
floating-point
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
integer
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 1-2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned sa0, unsigned sa1, unsigned sb0, unsigned sb1, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, void>::empty simdpp::shuffle2 ( const any_vec32< N, V1 > &  a,
const any_vec32< N, V2 > &  b 
)

Selects 32-bit values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[sa0]
r1 = a[sa1]
r2 = b[sb0]
r3 = b[sb1]
256-bit version:
r4 = a[sa0+4]
r5 = a[sa1+4]
r6 = b[sb0+4]
r7 = b[sb1+4]
floating-point
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
  • In NEON this intrinsic results in at least 1-4 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
integer
128-bit version:
  • In NEON this intrinsic results in at least 1-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s0, unsigned s1, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, void>::empty simdpp::shuffle2 ( const any_vec32< N, V1 > &  a,
const any_vec32< N, V2 > &  b 
)

Selects 32-bit values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[s0]
r1 = a[s1]
r2 = b[s0]
r3 = b[s1]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = b[s0+4]
r7 = b[s1+4]
floating-point
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
  • In NEON this intrinsic results in at least 2-4 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
integer
128-bit version:
  • In NEON this intrinsic results in at least 2-4 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4-8 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
uint8x16 simdpp::shuffle_bytes16 ( uint8x16  a,
uint8x16  b,
uint8x16  mask 
)
inline

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select. Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 6 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 12 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 8 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N>
uint8<N> simdpp::shuffle_bytes16 ( uint8< N >  a,
uint8< N >  b,
uint8< N >  mask 
)

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select. Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 6 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 12 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 8 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N>
uint16<N> simdpp::shuffle_bytes16 ( uint16< N >  a,
uint16< N >  b,
uint16< N >  mask 
)

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select. Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 6 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 12 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 8 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N>
uint32<N> simdpp::shuffle_bytes16 ( uint32< N >  a,
uint32< N >  b,
uint32< N >  mask 
)

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select. Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 6 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 12 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 8 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N>
uint64<N> simdpp::shuffle_bytes16 ( uint64< N >  a,
uint64< N >  b,
uint64< N >  mask 
)

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select. Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 6 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 12 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 8 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N>
float32<N> simdpp::shuffle_bytes16 ( float32< N >  a,
float32< N >  b,
uint32< N >  mask 
)

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select. Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 6 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 12 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 8 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N>
float64<N> simdpp::shuffle_bytes16 ( float64< N >  a,
float64< N >  b,
uint64< N >  mask 
)

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select. Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 6 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 12 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 8 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
uint8x16 simdpp::shuffle_zbytes16 ( uint8x16  a,
uint8x16  b,
uint8x16  mask 
)
inline

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select. Bit 7, if set, results in the result byte being zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 9 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 18 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N>
uint8<N> simdpp::shuffle_zbytes16 ( uint8< N >  a,
uint8< N >  b,
uint8< N >  mask 
)

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select. Bit 7, if set, results in the result byte being zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 9 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 18 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N>
uint16<N> simdpp::shuffle_zbytes16 ( uint16< N >  a,
uint16< N >  b,
uint16< N >  mask 
)

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select. Bit 7, if set, results in the result byte being zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 9 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 18 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N>
uint32<N> simdpp::shuffle_zbytes16 ( uint32< N >  a,
uint32< N >  b,
uint32< N >  mask 
)

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select. Bit 7, if set, results in the result byte being zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 9 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 18 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N>
uint64<N> simdpp::shuffle_zbytes16 ( uint64< N >  a,
uint64< N >  b,
uint64< N >  mask 
)

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select. Bit 7, if set, results in the result byte being zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 9 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 18 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N>
float32<N> simdpp::shuffle_zbytes16 ( float32< N >  a,
float32< N >  b,
uint32< N >  mask 
)

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select. Bit 7, if set, results in the result byte being zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 9 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 18 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N>
float64<N> simdpp::shuffle_zbytes16 ( float64< N >  a,
float64< N >  b,
uint64< N >  mask 
)

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select. Bit 7, if set, results in the result byte being zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 9 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 1 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.
  • Not implemented for SSE2-SSE3.
  • In SSSE3 this intrinsic results in at least 18 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 12 instructions.
  • In AVX2 this intrinsic results in at least 6 instructions.
  • In XOP this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N, class E >
float32<N, float32<N> > simdpp::sign ( float32< N, E >  a)

Extracts sign bits from the values in float32x4 vector.

r0 = a0 & 0x80000000
...
rN = aN & 0x80000000
128-bit version:
  • In SSE2-SSE4.1, ALTIVEC and NEON this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-SSE4.1, ALTIVEC and NEON this intrinsic results in at least 2-3 instructions.
  • In AVX-AVX2 this intrinsic results in at least 1-2 instructions.
template<unsigned N, class E >
float64<N, float64<N> > simdpp::sign ( float64< N, E >  a)

Extracts sign bits from the values in float64x2 vector.

r0 = a0 & 0x8000000000000000
...
rN = aN & 0x8000000000000000
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 1-2 instructions.
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2-3 instructions.
  • In AVX-AVX2 this intrinsic results in at least 1-2 instructions.
  • Not vectorized in NEON and ALTIVEC.
template<class V = expr_vec_set_splat<int>>
V simdpp::splat ( int  x)

Loads a value from a register and broadcasts it to all elements of a vector.

The argument value is converted to the element of the resulting vector using standard conversions.

r0 = x
...
rN = x
template<class V = expr_vec_set_splat<unsigned>>
V simdpp::splat ( unsigned  x)

Loads a value from a register and broadcasts it to all elements of a vector.

The argument value is converted to the element of the resulting vector using standard conversions.

r0 = x
...
rN = x
template<class V = expr_vec_set_splat<int64_t>>
V simdpp::splat ( int64_t  x)

Loads a value from a register and broadcasts it to all elements of a vector.

The argument value is converted to the element of the resulting vector using standard conversions.

r0 = x
...
rN = x
template<class V = expr_vec_set_splat<uint64_t>>
V simdpp::splat ( uint64_t  x)

Loads a value from a register and broadcasts it to all elements of a vector.

The argument value is converted to the element of the resulting vector using standard conversions.

r0 = x
...
rN = x
template<class V = expr_vec_set_splat<float>>
V simdpp::splat ( float  x)

Loads a value from a register and broadcasts it to all elements of a vector.

The argument value is converted to the element of the resulting vector using standard conversions.

r0 = x
...
rN = x
template<class V = expr_vec_set_splat<double>>
V simdpp::splat ( double  x)

Loads a value from a register and broadcasts it to all elements of a vector.

The argument value is converted to the element of the resulting vector using standard conversions.

r0 = x
...
rN = x
template<unsigned s, unsigned N, class V >
detail::get_expr_nomask<V, void>::empty simdpp::splat ( const any_vec< N, V > &  a)

Broadcasts the specified element to all elements.

r0 = a[s]
r1 = a[s]
...
rN = a[s]
int8
128-bit version:
  • In SSE2-AVX this intrinsic results in at least 5 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int16
128-bit version:
  • In SSE2-AVX this intrinsic results in at least 5 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 6 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int32
256-bit version:
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int64
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
float32
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float64
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
template<unsigned s, unsigned N, class E >
int8<N, expr_splat16<s,int8<N,E> > > simdpp::splat16 ( int8< N, E >  a)

Broadcasts the specified 8-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
...
rN = a[s]
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 7 instructions.
  • In SSSE3-AVX this intrinsic results in at least 1-2 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-SSE3 this intrinsic results in at least 14 instructions.
  • In SSSE3-AVX this intrinsic results in at least 2-3 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned s, unsigned N, class E >
uint8<N, expr_splat16<s,uint8<N,E> > > simdpp::splat16 ( uint8< N, E >  a)

Broadcasts the specified 8-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
...
rN = a[s]
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 7 instructions.
  • In SSSE3-AVX this intrinsic results in at least 1-2 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-SSE3 this intrinsic results in at least 14 instructions.
  • In SSSE3-AVX this intrinsic results in at least 2-3 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned s, unsigned N, class E >
int64<N, expr_splat2<s,int64<N,E> > > simdpp::splat2 ( int64< N, E >  a)

Broadcasts the specified 64-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s, unsigned N, class E >
uint64<N, expr_splat2<s,uint64<N,E> > > simdpp::splat2 ( uint64< N, E >  a)

Broadcasts the specified 64-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned s, unsigned N, class E >
float64<N, expr_splat2<s,float64<N,E> > > simdpp::splat2 ( float64< N, E >  a)

Broadcasts the specified 64-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
template<unsigned s, unsigned N, class E >
int32<N, expr_splat4<s,int32<N,E> > > simdpp::splat4 ( int32< N, E >  a)

Broadcasts the specified 32-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
r2 = a[s]
r3 = a[s]
256-bit version:
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned s, unsigned N, class E >
uint32<N, expr_splat4<s,uint32<N,E> > > simdpp::splat4 ( uint32< N, E >  a)

Broadcasts the specified 32-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
r2 = a[s]
r3 = a[s]
256-bit version:
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned s, unsigned N, class E >
float32<N, expr_splat4<s,float32<N,E> > > simdpp::splat4 ( float32< N, E >  a)

Broadcasts the specified 32-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
r2 = a[s]
r3 = a[s]
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned s, unsigned N, class E >
int16<N, expr_splat8<s,int16<N,E> > > simdpp::splat8 ( int16< N, E >  a)

Broadcasts the specified 16-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
...
r7 = a[s]
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
  • In SSSE3-AVX this intrinsic results in at least 1-2 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
256-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-AVX this intrinsic results in at least 2-3 instructions.
  • In AVX2, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned s, unsigned N, class E >
uint16<N, expr_splat8<s,uint16<N,E> > > simdpp::splat8 ( uint16< N, E >  a)

Broadcasts the specified 16-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
...
r7 = a[s]
128-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 3 instructions.
  • In SSSE3-AVX this intrinsic results in at least 1-2 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
256-bit version:
  • In SSE2-SSE3 this intrinsic results in at least 6 instructions.
  • In SSSE3-AVX this intrinsic results in at least 2-3 instructions.
  • In AVX2, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
void simdpp::split ( uint8x32  a,
uint8x16 &  r1,
uint8x16 &  r2 
)
inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instructions.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
void simdpp::split ( uint16x16  a,
uint16x8 &  r1,
uint16x8 &  r2 
)
inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instructions.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
void simdpp::split ( uint32x8  a,
uint32x4 &  r1,
uint32x4 &  r2 
)
inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instructions.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
void simdpp::split ( uint64x4  a,
uint64x2 &  r1,
uint64x2 &  r2 
)
inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instructions.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
void simdpp::split ( int8x32  a,
int8x16 &  r1,
int8x16 &  r2 
)
inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instructions.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
void simdpp::split ( int16x16  a,
int16x8 &  r1,
int16x8 &  r2 
)
inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instructions.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
void simdpp::split ( int32x8  a,
int32x4 &  r1,
int32x4 &  r2 
)
inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
void simdpp::split ( int64x4  a,
int64x2 &  r1,
int64x2 &  r2 
)
inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
void simdpp::split ( float32x8  a,
float32x4 &  r1,
float32x4 &  r2 
)
inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
void simdpp::split ( float64x4  a,
float64x2 &  r1,
float64x2 &  r2 
)
inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N>
void simdpp::split ( uint8< N >  a,
uint8< N/2 > &  r1,
uint8< N/2 > &  r2 
)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N>
void simdpp::split ( uint16< N >  a,
uint16< N/2 > &  r1,
uint16< N/2 > &  r2 
)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N>
void simdpp::split ( uint32< N >  a,
uint32< N/2 > &  r1,
uint32< N/2 > &  r2 
)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N>
void simdpp::split ( uint64< N >  a,
uint64< N/2 > &  r1,
uint64< N/2 > &  r2 
)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N>
void simdpp::split ( int8< N >  a,
int8< N/2 > &  r1,
int8< N/2 > &  r2 
)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N>
void simdpp::split ( int16< N >  a,
int16< N/2 > &  r1,
int16< N/2 > &  r2 
)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N>
void simdpp::split ( int32< N >  a,
int32< N/2 > &  r1,
int32< N/2 > &  r2 
)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N>
void simdpp::split ( int64< N >  a,
int64< N/2 > &  r1,
int64< N/2 > &  r2 
)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N>
void simdpp::split ( float32< N >  a,
float32< N/2 > &  r1,
float32< N/2 > &  r2 
)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N>
void simdpp::split ( float64< N >  a,
float64< N/2 > &  r1,
float64< N/2 > &  r2 
)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N, class E1 >
float32<N, float32<N> > simdpp::sqrt ( float32< N, E1 >  a)

Computes square root.

r0 = sqrt(a0)
...
rN = sqrt(aN)
128-bit version:
  • In NEON this intrinsic results in at least 5 instructions.
  • In ALTIVEC this intrinsic results in at least 5-7 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 10 instructions.
  • In ALTIVEC this intrinsic results in at least 10-12 instructions.
template<unsigned N, class E1 >
float64<N, float64<N> > simdpp::sqrt ( float64< N, E1 >  a)

Computes square root.

r0 = sqrt(a0)
...
rN = sqrt(aN)
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
template<unsigned N, class V >
void simdpp::store ( void *  p,
const any_vec< N, V > &  a 
)

Stores a 128-bit or 256-bit integer vector to an aligned memory location.

128-bit version:
(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:
(p) = a[0..255]

p must be aligned to 32 bytes.

  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
  • In AVX (integer vectors) this intrinsic results in at least 2 instructions.
template<unsigned N, class V >
void simdpp::store_first ( void *  p,
const any_vec< N, V > &  a,
unsigned  n 
)

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

The function may write the entire block of 128 or 256 bits.

(p) = a0
(p+1) = a1
...
(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
template<unsigned N, class V >
void simdpp::store_last ( void *  p,
const any_vec< N, V > &  a,
unsigned  n 
)

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

The function may write the entire block of 128 or 256 bits.

(p+N-n) = a{N-n}
...
(p+N-2) = a{N-2}
(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
template<unsigned N, class V1 , class V2 >
void simdpp::store_packed2 ( void *  p,
const any_vec< N, V1 > &  a,
const any_vec< N, V2 > &  b 
)

Interleaves values from two vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+2), *(p+4), ... , *(p+M*2-2) ] = a
[ *(p+1), *(p+3), *(p+5), ... , *(p+M*2-1) ] = b

Here M is the number of elements in the vector

p must be aligned to the vector size in bytes

template<unsigned N, class V1 , class V2 , class V3 >
void simdpp::store_packed3 ( void *  p,
const any_vec< N, V1 > &  a,
const any_vec< N, V2 > &  b,
const any_vec< N, V3 > &  c 
)

Interleaves values from three vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+3), *(p+6), ... , *(p+M*3-3) ] = a
[ *(p+1), *(p+4), *(p+7), ... , *(p+M*3-2) ] = b
[ *(p+2), *(p+5), *(p+8), ... , *(p+M*3-1) ] = c

Here M is the number of elements in the vector

p must be aligned to the vector size in bytes

template<unsigned N, class V1 , class V2 , class V3 , class V4 >
void simdpp::store_packed4 ( void *  p,
const any_vec< N, V1 > &  a,
const any_vec< N, V2 > &  b,
const any_vec< N, V3 > &  c,
const any_vec< N, V4 > &  d 
)

Interleaves values from four vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+4), *(p+8), ... , *(p+M*4-4) ] = a
[ *(p+1), *(p+5), *(p+9), ... , *(p+M*4-3) ] = b
[ *(p+2), *(p+6), *(p+10), ... , *(p+M*4-2) ] = c
[ *(p+3), *(p+7), *(p+11), ... , *(p+M*4-1) ] = d

Here M is the number of elements in the vector

p must be aligned to the vector size in bytes

template<unsigned N, class V >
void simdpp::stream ( void *  p,
const any_vec< N, V > &  a 
)

Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible.

128-bit version:
(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:
(p) = a[0..255]

p must be aligned to 32 bytes.

  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
  • In AVX (integer vectors) this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
float32<N, expr_sub<float32<N,E1>, float32<N,E2> > > simdpp::sub ( float32< N, E1 >  a,
float32< N, E2 >  b 
)

Subtracts the values of two vectors.

r0 = a0 - b0
...
rN = aN - bN
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, expr_sub<uint8<N, typename V1::expr_type>, uint8<N, typename V2::expr_type> > >::type simdpp::sub ( const any_int8< N, V1 > &  a,
const any_int8< N, V2 > &  b 
)

Subtracts 8-bit integer values.

r0 = a0 - b0
...
rN = aN - bN
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
float64<N, expr_sub<float64<N,E1>, float64<N,E2> > > simdpp::sub ( float64< N, E1 >  a,
float64< N, E2 >  b 
)

Subtracts the values of two vectors.

r0 = a0 - b0
...
rN = aN - bN
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, expr_sub<uint16<N, typename V1::expr_type>, uint16<N, typename V2::expr_type> > >::type simdpp::sub ( const any_int16< N, V1 > &  a,
const any_int16< N, V2 > &  b 
)

Subtracts 16-bit integer values.

r0 = a0 - b0
...
rN = aN - bN
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, expr_sub<uint32<N, typename V1::expr_type>, uint32<N, typename V2::expr_type> > >::type simdpp::sub ( const any_int32< N, V1 > &  a,
const any_int32< N, V2 > &  b 
)

Subtracts 32-bit integer values.

r0 = a0 - b0
...
rN = aN - bN
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, expr_sub<uint64<N, typename V1::expr_type>, uint64<N, typename V2::expr_type> > >::type simdpp::sub ( const any_int64< N, V1 > &  a,
const any_int64< N, V2 > &  b 
)

Subtracts 64-bit integer values.

r0 = a0 - b0
...
rN = aN - bN
128-bit version:
  • In ALTIVEC this intrinsic results in at least 5-6 instructions.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 10-11 instructions.
template<unsigned N, class E1 , class E2 >
int8<N, expr_sub_sat<int8<N,E1>, int8<N,E2> > > simdpp::sub_sat ( int8< N, E1 >  a,
int8< N, E2 >  b 
)

Subtracts and saturates signed 8-bit integer values.

r0 = saturated(a0 - b0)
...
rN = saturated(aN - bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
int16<N, expr_sub_sat<int16<N,E1>, int16<N,E2> > > simdpp::sub_sat ( int16< N, E1 >  a,
int16< N, E2 >  b 
)

Subtracts and saturates signed 16-bit integer values.

r0 = saturated(a0 - b0)
...
rN = saturated(aN - bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint8<N, expr_sub_sat<uint8<N,E1>, uint8<N,E2> > > simdpp::sub_sat ( uint8< N, E1 >  a,
uint8< N, E2 >  b 
)

Subtracts and saturates unsigned 8-bit integer values.

r0 = saturated(a0 - b0)
...
rN = saturated(aN - bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint16<N, expr_sub_sat<uint16<N,E1>, uint16<N,E2> > > simdpp::sub_sat ( uint16< N, E1 >  a,
uint16< N, E2 >  b 
)

Subtracts and saturates unsigned 16-bit integer values.

r0 = saturated(a0 - b0)
...
rN = saturated(aN - bN)
256-bit version:
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float32x4 simdpp::to_float32 ( int32x4  a)
inline

Converts 32-bit integer values to 32-bit float values.

SSE specific:

If only inexact conversion can be performed, the current rounding mode is used.

NEON, ALTIVEC specific:

If only inexact conversion can be performed, round to nearest mode is used.

r0 = (float) a0
...
rN = (float) aN
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N>
float32<N> simdpp::to_float32 ( int32< N >  a)

Converts 32-bit integer values to 32-bit float values.

SSE specific:

If only inexact conversion can be performed, the current rounding mode is used.

NEON, ALTIVEC specific:

If only inexact conversion can be performed, round to nearest mode is used.

r0 = (float) a0
...
rN = (float) aN
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
float32x4 simdpp::to_float32 ( float64x4  a)
inline

Converts 64-bit float values to 32-bit float values.

SSE specific:

If only inexact conversion can be performed, the value is rounded according to the current rounding mode.

NEON specific:

If only inexact conversion can be performed, the value is truncated.

128-bit version:
r0 = (float) a0
r1 = (float) a1
r2 = 0.0f
r3 = 0.0f
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
r0 = (float) a0
...
r3 = (float) a3
r4 = 0.0f
...
r7 = 0.0f
  • In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.
  • Not vectorized in NEON and ALTIVEC.
float64x4 simdpp::to_float64 ( int32x4  a)
inline

Converts the 32-bit integer values to 64-bit float values.

SSE specific:

If only inexact conversion can be performed, the value is rounded according to the current rounding mode.

NEON specific:

If only inexact conversion can be performed, the value is rounded to the nearest representable value.

256-bit version:
r0 = (double) a0
...
r3 = (double) a3
  • In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.
  • Not vectorized in NEON and ALTIVEC.
float64x4 simdpp::to_float64 ( float32x4  a)
inline

Converts the 32-bit float values to 64-bit float values.

SSE specific:

If only inexact conversion can be performed, the value is rounded according to the current rounding mode.

NEON specific:

If only inexact conversion can be performed, the value is rounded to the nearest representable value.

256-bit version:
r0 = (double) a0
...
r3 = (double) a3
  • In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.
  • Not vectorized in NEON and ALTIVEC.
uint16x16 simdpp::to_int16 ( int8x16  a)
inline

Sign extends the 16 values of a signed int8x16 vector to 16-bits.

r0 = (int16_t) a0
...
r15 = (int16_t) a15
  • In SSE4.1-AVX this intrinsic results in at least 3 instructions.
  • In SSE2-SSSE3 this intrinsic results in at least 4 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint16x16 simdpp::to_int16 ( uint8x16  a)
inline

Extends the 16 values of an unsigned int8x16 vector to 16-bits.

r0 = (uint16_t) a0
...
r15 = (uint16_t) a15
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int32x8 simdpp::to_int32 ( int16x8  a)
inline

Sign extends the 8 values of a signed int16x8 vector to 32-bits.

r0 = (int32_t) a0
...
r7 = (int32_t) a7
  • In SSE4.1-AVX this intrinsic results in at least 3 instructions.
  • In SSE2-SSSE3 this intrinsic results in at least 4 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int32x4 simdpp::to_int32 ( float32x4  a)
inline

Converts the values of a float32x4 vector into signed int32_t representation using truncation if only an inexact conversion can be performed.

The behavior is undefined if the value can not be represented in the result type.

SSE specific: If the value can not be represented by int32_t, 0x80000000 is returned. TODO: NaN handling.

NEON, ALTIVEC specific: If the value can not be represented by int32_t, either 0x80000000 or 0x7fffffff is returned depending on the sign of the operand (saturation occurs). Conversion of NaNs results in 0.

r0 = (int32_t) a0
...
rN = (int32_t) aN
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
int32x4 simdpp::to_int32 ( float64x4  a)
inline

Converts the values of a float64x4 vector into int32_t representation using truncation.

The behavior is undefined if the value can not be represented in the result type.

SSE specific: If the value can not be represented by int32_t, 0x80000000 is returned

Todo:
NaN handling

NEON VFP specific: If the value can not be represented by int32_t, either 0x80000000 or 0x7fffffff is returned depending on the sign of the operand. Conversion of NaNs results in 0.

128-bit version:
  • Not vectorized in NEON and ALTIVEC.
r0 = (int32_t) a0
r1 = (int32_t) a1
r2 = (int32_t) a2
r3 = (int32_t) a3
  • In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.
template<unsigned N>
uint32<N> simdpp::to_int32x8 ( float32< N >  a)

Converts the values of a float32x4 vector into signed int32_t representation using truncation if only an inexact conversion can be performed.

The behavior is undefined if the value can not be represented in the result type.

SSE specific: If the value can not be represented by int32_t, 0x80000000 is returned. TODO: NaN handling.

NEON, ALTIVEC specific: If the value can not be represented by int32_t, either 0x80000000 or 0x7fffffff is returned depending on the sign of the operand (saturation occurs). Conversion of NaNs results in 0.

r0 = (int32_t) a0
...
rN = (int32_t) aN
256-bit version:
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
uint64x4 simdpp::to_int64 ( int32x4  a)
inline

Extends the values of a signed int32x4 vector to 64-bits.

r0 = (int64_t) a0
...
r3 = (int64_t) a3
  • In SSE2-SSSE3 this intrinsic results in at least 5 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 3 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 3-4 instructions.
uint64x4 simdpp::to_int64 ( uint32x4  a)
inline

Extends the values of an unsigned int32x4 vector to 64-bits.

r0 = (uint64_t) a0
...
r3 = (uint64_t) a3
  • In SSE2-AVX this intrinsic results in at least 3 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
void simdpp::transpose8 ( uint16x8 &  a0,
uint16x8 &  a1,
uint16x8 &  a2,
uint16x8 &  a3,
uint16x8 &  a4,
uint16x8 &  a5,
uint16x8 &  a6,
uint16x8 &  a7 
)
inline

Transposes an 8x8 16-bit matrix within eight int16x8 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ...; a7_0 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ...; a7_1 ]
...
r7 = [ a0_7; a1_7; a2_7; a3_7 ...; a7_7 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 24 instructions.
  • In NEON this intrinsic results in at least 12 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 48 instructions.
  • In AVX2 this intrinsic results in at least 24 instructions.
  • In NEON this intrinsic results in at least 24 instructions.
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
void simdpp::transpose8 ( int16x8 &  a0,
int16x8 &  a1,
int16x8 &  a2,
int16x8 &  a3,
int16x8 &  a4,
int16x8 &  a5,
int16x8 &  a6,
int16x8 &  a7 
)
inline
void simdpp::transpose8 ( uint16x16 &  a0,
uint16x16 &  a1,
uint16x16 &  a2,
uint16x16 &  a3,
uint16x16 &  a4,
uint16x16 &  a5,
uint16x16 &  a6,
uint16x16 &  a7 
)
inline
void simdpp::transpose8 ( int16x16 &  a0,
int16x16 &  a1,
int16x16 &  a2,
int16x16 &  a3,
int16x16 &  a4,
int16x16 &  a5,
int16x16 &  a6,
int16x16 &  a7 
)
inline
template<unsigned N, class E >
float32<N, float32<N> > simdpp::trunc ( float32< N, E >  a)

Rounds the values of a vector towards zero.

r0 = trunc(a0)
...
rN = trunc(aN)
128-bit version:
  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 7-9 instructions.
  • In NEON this intrinsic results in at least 5-6 instructions.
256-bit version:
  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 14-16 instructions.
  • In NEON this intrinsic results in at least 10-11 instructions.
  • In SSE4.1 and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint8<N, uint8<N> > simdpp::unzip16_hi ( uint8< N, E1 >  a,
uint8< N, E2 >  b 
)

De-interleaves the even(higher) elements of two int8x16 vectors.

| 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
r = [ a1 a3 a5 a7 a9 a11 a13 a15 b1 b3 b5 b7 b9 b11 b13 b15 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 6 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
  • In AVX2 this intrinsic results in at least 3 instructions.
template<unsigned N, class E1 , class E2 >
uint8<N, uint8<N> > simdpp::unzip16_lo ( uint8< N, E1 >  a,
uint8< N, E2 >  b 
)

De-interleaves the odd(lower) elements of two int8x16 vectors.

| 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
r = [ a0 a2 a4 a6 a8 a10 a12 a14 b0 b2 b4 b6 b8 b10 b12 b14 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 8-9 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In AVX2 this intrinsic results in at least 4-5 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N, class E1 , class E2 >
uint64<N, uint64<N> > simdpp::unzip2_hi ( uint64< N, E1 >  a,
uint64< N, E2 >  b 
)

De-interleaves the even(higher) elements of two int64x2 vectors.

| 0 1 |
r = [ a1 b1 ]
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
float64<N, float64<N> > simdpp::unzip2_hi ( float64< N, E1 >  a,
float64< N, E2 >  b 
)

De-interleaves the even(higher) elements of two float64x2 vectors.

| 0 1 |
r = [ a1 b1 ]
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • Not vectorized in NEON and ALTIVEC.
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint64<N, uint64<N> > simdpp::unzip2_lo ( uint64< N, E1 >  a,
uint64< N, E2 >  b 
)

De-interleaves the odd(lower) elements of two int64x2 vectors.

| 0 1 |
r = [ a0 b0 ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
float64<N, float64<N> > simdpp::unzip2_lo ( float64< N, E1 >  a,
float64< N, E2 >  b 
)

De-interleaves the odd(lower) elements of two float64x2 vectors.

| 0 1 |
r = [ a0 b0 ]
128-bit version:
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
template<unsigned N, class E1 , class E2 >
uint32<N, uint32<N> > simdpp::unzip4_hi ( uint32< N, E1 >  a,
uint32< N, E2 >  b 
)

De-interleaves the even(higher) elements of two int32x4 vectors.

| 0 1 2 3 |
r = [ a1 a3 b1 b3 ]
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
float32<N, float32<N> > simdpp::unzip4_hi ( float32< N, E1 >  a,
float32< N, E2 >  b 
)

De-interleaves the even(higher) elements of two float32x4 vectors.

| 0 1 2 3 |
r = [ a1 a3 b1 b3 ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint32<N, uint32<N> > simdpp::unzip4_lo ( uint32< N, E1 >  a,
uint32< N, E2 >  b 
)

De-interleaves the odd(lower) elements of two int32x4 vectors.

| 0 1 2 3 |
r = [ a0 a2 b0 b2 ]
128-bit version:
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
  • In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
template<unsigned N, class E1 , class E2 >
float32<N, float32<N> > simdpp::unzip4_lo ( float32< N, E1 >  a,
float32< N, E2 >  b 
)

De-interleaves the odd(lower) elements of two float32x4 vectors.

| 0 1 2 3 |
r = [ a0 a2 b0 b2 ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class E1 , class E2 >
uint16<N, uint16<N> > simdpp::unzip8_hi ( uint16< N, E1 >  a,
uint16< N, E2 >  b 
)

De-interleaves the even(higher) elements of two int16x8 vectors.

| 0 1 2 3 4 5 6 7 |
r = [ a1 a3 a5 a7 b1 b3 b5 b7 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 3 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 6 instructions.
  • In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
  • In AVX2 this intrinsic results in at least 3 instructions.
template<unsigned N, class E1 , class E2 >
uint16<N, uint16<N> > simdpp::unzip8_lo ( uint16< N, E1 >  a,
uint16< N, E2 >  b 
)

De-interleaves the odd(lower) elements of two int16x8 vectors.

| 0 1 2 3 4 5 6 7 |
r = [ a0 a2 a4 a6 b0 b2 b4 b6 ]
128-bit version:
  • In SSE2-SSSE3 this intrinsic results in at least 5 instructions.
  • In SSE4.1-AVX2 this intrinsic results in at least 4-5 instructions.
  • In ALTIVEC this intrinsic results in at least 1-2 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-SSSE3 this intrinsic results in at least 5 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 8-9 instructions.
  • In AVX2 this intrinsic results in at least 4-5 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-3 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, void>::empty simdpp::zip16_hi ( const any_vec8< N, V1 > &  a,
const any_vec8< N, V2 > &  b 
)

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |
r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, void>::empty simdpp::zip16_lo ( const any_vec8< N, V1 > &  a,
const any_vec8< N, V2 > &  b 
)

Interleaves the lower halves of two vectors.

| 0 1 2 3 4 5 ... N-2 N-1 |
r = [ a0 b0 a1 b1 a2 b2 ... a(N/2-1) b(N/2-1) ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, void>::empty simdpp::zip2_hi ( const any_vec64< N, V1 > &  a,
const any_vec64< N, V2 > &  b 
)

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |
r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, void>::empty simdpp::zip2_lo ( const any_vec64< N, V1 > &  a,
const any_vec64< N, V2 > &  b 
)

Interleaves the lower halves of two vectors.

| 0 1 2 3 4 5 ... N-2 N-1 |
r = [ a0 b0 a1 b1 a2 b2 ... a(N/2-1) b(N/2-1) ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, void>::empty simdpp::zip4_hi ( const any_vec32< N, V1 > &  a,
const any_vec32< N, V2 > &  b 
)

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |
r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, void>::empty simdpp::zip4_lo ( const any_vec32< N, V1 > &  a,
const any_vec32< N, V2 > &  b 
)

Interleaves the lower halves of two vectors.

| 0 1 2 3 4 5 ... N-2 N-1 |
r = [ a0 b0 a1 b1 a2 b2 ... a(N/2-1) b(N/2-1) ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, void>::empty simdpp::zip8_hi ( const any_vec16< N, V1 > &  a,
const any_vec16< N, V2 > &  b 
)

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |
r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask<V1, V2, void>::empty simdpp::zip8_lo ( const any_vec16< N, V1 > &  a,
const any_vec16< N, V2 > &  b 
)

Interleaves the lower halves of two vectors.

| 0 1 2 3 4 5 ... N-2 N-1 |
r = [ a0 b0 a1 b1 a2 b2 ... a(N/2-1) b(N/2-1) ]
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.