Functions
basic_int8x16	simdpp::zip_lo (basic_int8x16 a, basic_int8x16 b)
	Interleaves the lower halves of two vectors. More...

basic_int8x32	simdpp::zip_lo (basic_int8x32 a, basic_int8x32 b)

basic_int16x8	simdpp::zip_lo (basic_int16x8 a, basic_int16x8 b)

basic_int16x16	simdpp::zip_lo (basic_int16x16 a, basic_int16x16 b)

basic_int32x4	simdpp::zip_lo (basic_int32x4 a, basic_int32x4 b)

basic_int32x8	simdpp::zip_lo (basic_int32x8 a, basic_int32x8 b)

basic_int64x2	simdpp::zip_lo (basic_int64x2 a, basic_int64x2 b)

basic_int64x4	simdpp::zip_lo (basic_int64x4 a, basic_int64x4 b)

template<int s0, int s1>
basic_int8x16	simdpp::make_shuffle_bytes16_mask (basic_int8x16 &mask)
	Makes a mask to shuffle an int8x16 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1>
basic_int8x32	simdpp::make_shuffle_bytes16_mask (basic_int8x32 &mask)
	Makes a mask to shuffle an int8x16 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

float32x4	simdpp::zip_lo (float32x4 a, float32x4 b)
	Interleaves the lower halves of two vectors. More...

float32x8	simdpp::zip_lo (float32x8 a, float32x8 b)
	Interleaves the lower halves of two vectors. More...

float64x2	simdpp::zip_lo (float64x2 a, float64x2 b)
	Interleaves the lower halves of two vectors. More...

float64x4	simdpp::zip_lo (float64x4 a, float64x4 b)
	Interleaves the lower halves of two vectors. More...

basic_int8x16	simdpp::zip_hi (basic_int8x16 a, basic_int8x16 b)
	Interleaves the higher halves of two vectors. More...

basic_int8x32	simdpp::zip_hi (basic_int8x32 a, basic_int8x32 b)
	Interleaves the higher halves of two vectors. More...

basic_int16x8	simdpp::zip_hi (basic_int16x8 a, basic_int16x8 b)
	Interleaves the higher halves of two vectors. More...

basic_int16x16	simdpp::zip_hi (basic_int16x16 a, basic_int16x16 b)
	Interleaves the higher halves of two vectors. More...

basic_int32x4	simdpp::zip_hi (basic_int32x4 a, basic_int32x4 b)
	Interleaves the higher halves of two vectors. More...

basic_int32x8	simdpp::zip_hi (basic_int32x8 a, basic_int32x8 b)
	Interleaves the higher halves of two vectors. More...

basic_int64x2	simdpp::zip_hi (basic_int64x2 a, basic_int64x2 b)
	Interleaves the higher halves of two vectors. More...

basic_int64x4	simdpp::zip_hi (basic_int64x4 a, basic_int64x4 b)
	Interleaves the higher halves of two vectors. More...

float32x4	simdpp::zip_hi (float32x4 a, float32x4 b)
	Interleaves the higher halves of two vectors. More...

float32x8	simdpp::zip_hi (float32x8 a, float32x8 b)
	Interleaves the higher halves of two vectors. More...

float64x2	simdpp::zip_hi (float64x2 a, float64x2 b)
	Interleaves the higher halves of two vectors. More...

float64x4	simdpp::zip_hi (float64x4 a, float64x4 b)
	Interleaves the higher halves of two vectors. More...

template<unsigned shift>
basic_int8x16	simdpp::move_l (basic_int8x16 a)
	Moves the elements in an int8x16 vector to the left by shift positions. More...

template<unsigned shift>
basic_int8x32	simdpp::move_l (basic_int8x32 a)
	Moves the elements in an int8x16 vector to the left by shift positions. More...

template<unsigned shift>
basic_int16x8	simdpp::move_l (basic_int16x8 a)
	Moves the 16-bit elements in a vector to the left by shift positions. More...

template<unsigned shift>
basic_int16x16	simdpp::move_l (basic_int16x16 a)
	Moves the 16-bit elements in a vector to the left by shift positions. More...

template<unsigned shift>
basic_int32x4	simdpp::move_l (basic_int32x4 a)
	Moves the 32-bit elements in a vector to the left by shift positions. More...

template<unsigned shift>
basic_int32x8	simdpp::move_l (basic_int32x8 a)
	Moves the 32-bit elements in a vector to the left by shift positions. More...

template<unsigned shift>
basic_int64x2	simdpp::move_l (basic_int64x2 a)
	Moves the 64-bit elements in a vector to the left by shift positions. More...

template<unsigned shift>
basic_int64x4	simdpp::move_l (basic_int64x4 a)
	Moves the 64-bit elements in a vector to the left by shift positions. More...

template<unsigned shift>
float32x4	simdpp::move_l (float32x4 a)
	Moves the 32-bit elements in a vector to the left by shift positions. More...

template<unsigned shift>
float32x8	simdpp::move_l (float32x8 a)
	Moves the 32-bit elements in a vector to the left by shift positions. More...

template<unsigned shift>
float64x2	simdpp::move_l (float64x2 a)
	Moves the 64-bit elements in a vector to the left by shift positions. More...

template<unsigned shift>
float64x4	simdpp::move_l (float64x4 a)
	Moves the 64-bit elements in a vector to the left by shift positions. More...

template<unsigned shift>
basic_int8x16	simdpp::move_r (basic_int8x16 a)
	Moves the 8-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
basic_int8x32	simdpp::move_r (basic_int8x32 a)
	Moves the 8-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
basic_int16x8	simdpp::move_r (basic_int16x8 a)
	Moves the 16-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
basic_int16x16	simdpp::move_r (basic_int16x16 a)
	Moves the 16-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
basic_int32x4	simdpp::move_r (basic_int32x4 a)
	Moves the 32-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
basic_int32x8	simdpp::move_r (basic_int32x8 a)
	Moves the 32-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
basic_int64x2	simdpp::move_r (basic_int64x2 a)
	Moves the 64-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
basic_int64x4	simdpp::move_r (basic_int64x4 a)
	Moves the 64-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
float32x4	simdpp::move_r (float32x4 a)
	Moves the 32-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
float32x8	simdpp::move_r (float32x8 a)
	Moves the 32-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
float64x2	simdpp::move_r (float64x2 a)
	Moves the 64-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
float64x4	simdpp::move_r (float64x4 a)
	Moves the 64-bit elements in a vector to the right by shift positions. More...

template<unsigned s>
basic_int8x16	simdpp::broadcast (basic_int8x16 a)
	Broadcasts the specified 8-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
basic_int8x32	simdpp::broadcast (basic_int8x32 a)
	Broadcasts the specified 8-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
basic_int16x8	simdpp::broadcast (basic_int16x8 a)
	Broadcasts the specified 16-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
basic_int16x16	simdpp::broadcast (basic_int16x16 a)
	Broadcasts the specified 16-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
basic_int32x4	simdpp::broadcast (basic_int32x4 a)
	Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
basic_int32x8	simdpp::broadcast (basic_int32x8 a)
	Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
basic_int64x2	simdpp::broadcast (basic_int64x2 a)
	Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
basic_int64x4	simdpp::broadcast (basic_int64x4 a)
	Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
float32x4	simdpp::broadcast (float32x4 a)
	Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
float32x8	simdpp::broadcast (float32x8 a)
	Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
float64x2	simdpp::broadcast (float64x2 a)
	Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
float64x4	simdpp::broadcast (float64x4 a)
	Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
basic_int8x16	simdpp::broadcast_w (basic_int8x16 a)
	Broadcasts the specified 8-bit value to all elements within a 128-bit lane. More...

template<unsigned s>
basic_int8x32	simdpp::broadcast_w (basic_int8x32 a)
	Broadcasts the specified 8-bit value to all elements within a 128-bit lane. More...

template<unsigned s>
basic_int16x8	simdpp::broadcast_w (basic_int16x8 a)
	Broadcasts the specified 16-bit value to all elements within an int16x8 vector. More...

template<unsigned s>
basic_int16x16	simdpp::broadcast_w (basic_int16x16 a)
	Broadcasts the specified 16-bit value to all elements within an int16x8 vector. More...

template<unsigned s>
basic_int32x4	simdpp::broadcast_w (basic_int32x4 a)
	Broadcasts the specified 32-bit value to all elements within an int32x4 vector. More...

template<unsigned s>
basic_int32x8	simdpp::broadcast_w (basic_int32x8 a)
	Broadcasts the specified 32-bit value to all elements within an int32x4 vector. More...

template<unsigned s>
basic_int64x2	simdpp::broadcast_w (basic_int64x2 a)
	Broadcasts the specified 64-bit value to all elements within an int64x2 vector. More...

template<unsigned s>
basic_int64x4	simdpp::broadcast_w (basic_int64x4 a)
	Broadcasts the specified 64-bit value to all elements within an int64x2 vector. More...

template<unsigned s>
float32x4	simdpp::broadcast_w (float32x4 a)
	Broadcasts the specified 32-bit value to all elements within a float32x4 vector. More...

template<unsigned s>
float32x8	simdpp::broadcast_w (float32x8 a)
	Broadcasts the specified 32-bit value to all elements within a float32x4 vector. More...

template<unsigned s>
float64x2	simdpp::broadcast_w (float64x2 a)
	Broadcasts the specified 64-bit value to all elements within a float64x2 vector. More...

template<unsigned s>
float64x4	simdpp::broadcast_w (float64x4 a)
	Broadcasts the specified 64-bit value to all elements within a float64x2 vector. More...

template<unsigned shift>
basic_int8x16	simdpp::align (basic_int8x16 lower, basic_int8x16 upper)
	Extracts an int8x16 vector from two concatenated int8x16 vectors. More...

template<unsigned shift>
basic_int8x32	simdpp::align (basic_int8x32 lower, basic_int8x32 upper)
	Extracts an int8x16 vector from two concatenated int8x16 vectors. More...

template<unsigned shift>
basic_int16x8	simdpp::align (basic_int16x8 lower, basic_int16x8 upper)
	Extracts an int16x8 vector from two concatenated int16x8 vectors. More...

template<unsigned shift>
basic_int16x16	simdpp::align (basic_int16x16 lower, basic_int16x16 upper)
	Extracts an int16x8 vector from two concatenated int16x8 vectors. More...

template<unsigned shift>
basic_int32x4	simdpp::align (basic_int32x4 lower, basic_int32x4 upper)
	Extracts an int32x4 vector from two concatenated int32x4 vectors. More...

template<unsigned shift>
basic_int32x8	simdpp::align (basic_int32x8 lower, basic_int32x8 upper)
	Extracts an int32x4 vector from two concatenated int32x4 vectors. More...

template<unsigned shift>
basic_int64x2	simdpp::align (basic_int64x2 lower, basic_int64x2 upper)
	Extracts an int64x2 vector from two concatenated int64x2 vectors. More...

template<unsigned shift>
basic_int64x4	simdpp::align (basic_int64x4 lower, basic_int64x4 upper)
	Extracts an int64x2 vector from two concatenated int64x2 vectors. More...

template<unsigned shift>
float32x4	simdpp::align (float32x4 lower, float32x4 upper)
	Extracts a float32x4 vector from two concatenated float32x4 vectors. More...

template<unsigned shift>
float32x8	simdpp::align (float32x8 lower, float32x8 upper)
	Extracts a float32x4 vector from two concatenated float32x4 vectors. More...

template<unsigned shift>
float64x2	simdpp::align (float64x2 lower, float64x2 upper)
	Extracts a float64x2 vector from two concatenated float64x2 vectors. More...

template<unsigned shift>
float64x4	simdpp::align (float64x4 lower, float64x4 upper)
	Extracts a float64x2 vector from two concatenated float64x2 vectors. More...

basic_int8x16	simdpp::blend (basic_int8x16 on, basic_int8x16 off, basic_int8x16 mask)
	Composes a vector from two sources according to a mask. More...

basic_int8x16	simdpp::blend (basic_int8x16 on, basic_int8x16 off, mask_int8x16 mask)
	Composes a vector from two sources according to a mask. More...

basic_int8x32	simdpp::blend (basic_int8x32 on, basic_int8x32 off, basic_int8x32 mask)
	Composes a vector from two sources according to a mask. More...

basic_int8x32	simdpp::blend (basic_int8x32 on, basic_int8x32 off, mask_int8x32 mask)
	Composes a vector from two sources according to a mask. More...

basic_int16x8	simdpp::blend (basic_int16x8 on, basic_int16x8 off, basic_int16x8 mask)
	Composes a vector from two sources according to a mask. More...

basic_int16x16	simdpp::blend (basic_int16x16 on, basic_int16x16 off, basic_int16x16 mask)
	Composes a vector from two sources according to a mask. More...

basic_int16x8	simdpp::blend (basic_int16x8 on, basic_int16x8 off, mask_int16x8 mask)
	Composes a vector from two sources according to a mask. More...

basic_int16x16	simdpp::blend (basic_int16x16 on, basic_int16x16 off, mask_int16x16 mask)
	Composes a vector from two sources according to a mask. More...

basic_int32x4	simdpp::blend (basic_int32x4 on, basic_int32x4 off, basic_int32x4 mask)
	Composes a vector from two sources according to a mask. More...

basic_int32x8	simdpp::blend (basic_int32x8 on, basic_int32x8 off, basic_int32x8 mask)
	Composes a vector from two sources according to a mask. More...

basic_int32x4	simdpp::blend (basic_int32x4 on, basic_int32x4 off, mask_int32x4 mask)
	Composes a vector from two sources according to a mask. More...

basic_int32x8	simdpp::blend (basic_int32x8 on, basic_int32x8 off, mask_int32x8 mask)
	Composes a vector from two sources according to a mask. More...

basic_int64x2	simdpp::blend (basic_int64x2 on, basic_int64x2 off, basic_int64x2 mask)
	Composes a vector from two sources according to a mask. More...

basic_int64x4	simdpp::blend (basic_int64x4 on, basic_int64x4 off, basic_int64x4 mask)
	Composes a vector from two sources according to a mask. More...

basic_int64x2	simdpp::blend (basic_int64x2 on, basic_int64x2 off, mask_int64x2 mask)
	Composes a vector from two sources according to a mask. More...

basic_int64x4	simdpp::blend (basic_int64x4 on, basic_int64x4 off, mask_int64x4 mask)
	Composes a vector from two sources according to a mask. More...

float32x4	simdpp::blend (float32x4 on, float32x4 off, float32x4 mask)
	Composes a vector from two sources according to a mask. More...

float32x4	simdpp::blend (float32x4 on, float32x4 off, int128 mask)
	Composes a vector from two sources according to a mask. More...

float32x8	simdpp::blend (float32x8 on, float32x8 off, float32x8 mask)
	Composes a vector from two sources according to a mask. More...

float32x8	simdpp::blend (float32x8 on, float32x8 off, int256 mask)
	Composes a vector from two sources according to a mask. More...

float32x4	simdpp::blend (float32x4 on, float32x4 off, mask_float32x4 mask)
	Composes a vector from two sources according to a mask. More...

float32x8	simdpp::blend (float32x8 on, float32x8 off, mask_float32x8 mask)
	Composes a vector from two sources according to a mask. More...

float64x2	simdpp::blend (float64x2 on, float64x2 off, float64x2 mask)
	Composes a vector from two sources according to a mask. More...

float64x2	simdpp::blend (float64x2 on, float64x2 off, int128 mask)
	Composes a vector from two sources according to a mask. More...

float64x4	simdpp::blend (float64x4 on, float64x4 off, float64x4 mask)
	Composes a vector from two sources according to a mask. More...

float64x4	simdpp::blend (float64x4 on, float64x4 off, int256 mask)
	Composes a vector from two sources according to a mask. More...

float64x2	simdpp::blend (float64x2 on, float64x2 off, mask_float64x2 mask)
	Composes a vector from two sources according to a mask. More...

float64x4	simdpp::blend (float64x4 on, float64x4 off, mask_float64x4 mask)
	Composes a vector from two sources according to a mask. More...

basic_int8x16	simdpp::unzip_lo (basic_int8x16 a, basic_int8x16 b)
	De-interleaves the odd(lower) elements of two int8x16 vectors. More...

basic_int8x32	simdpp::unzip_lo (basic_int8x32 a, basic_int8x32 b)
	De-interleaves the odd(lower) elements of two int8x16 vectors. More...

basic_int16x8	simdpp::unzip_lo (basic_int16x8 a, basic_int16x8 b)
	De-interleaves the odd(lower) elements of two int16x8 vectors. More...

basic_int16x16	simdpp::unzip_lo (basic_int16x16 a, basic_int16x16 b)
	De-interleaves the odd(lower) elements of two int16x8 vectors. More...

basic_int32x4	simdpp::unzip_lo (basic_int32x4 a, basic_int32x4 b)
	De-interleaves the odd(lower) elements of two int32x4 vectors. More...

basic_int32x8	simdpp::unzip_lo (basic_int32x8 a, basic_int32x8 b)
	De-interleaves the odd(lower) elements of two int32x4 vectors. More...

basic_int64x2	simdpp::unzip_lo (basic_int64x2 a, basic_int64x2 b)
	De-interleaves the odd(lower) elements of two int64x2 vectors. More...

basic_int64x4	simdpp::unzip_lo (basic_int64x4 a, basic_int64x4 b)
	De-interleaves the odd(lower) elements of two int64x2 vectors. More...

float32x4	simdpp::unzip_lo (float32x4 a, float32x4 b)
	De-interleaves the odd(lower) elements of two float32x4 vectors. More...

float32x8	simdpp::unzip_lo (float32x8 a, float32x8 b)
	De-interleaves the odd(lower) elements of two float32x4 vectors. More...

float64x2	simdpp::unzip_lo (float64x2 a, float64x2 b)
	De-interleaves the odd(lower) elements of two float64x2 vectors. More...

float64x4	simdpp::unzip_lo (float64x4 a, float64x4 b)
	De-interleaves the odd(lower) elements of two float64x2 vectors. More...

basic_int8x16	simdpp::unzip_hi (basic_int8x16 a, basic_int8x16 b)
	De-interleaves the even(higher) elements of two int8x16 vectors. More...

basic_int8x32	simdpp::unzip_hi (basic_int8x32 a, basic_int8x32 b)
	De-interleaves the even(higher) elements of two int8x16 vectors. More...

basic_int16x8	simdpp::unzip_hi (basic_int16x8 a, basic_int16x8 b)
	De-interleaves the even(higher) elements of two int16x8 vectors. More...

basic_int16x16	simdpp::unzip_hi (basic_int16x16 a, basic_int16x16 b)
	De-interleaves the even(higher) elements of two int16x8 vectors. More...

basic_int32x4	simdpp::unzip_hi (basic_int32x4 a, basic_int32x4 b)
	De-interleaves the even(higher) elements of two int32x4 vectors. More...

basic_int32x8	simdpp::unzip_hi (basic_int32x8 a, basic_int32x8 b)
	De-interleaves the even(higher) elements of two int32x4 vectors. More...

basic_int64x2	simdpp::unzip_hi (basic_int64x2 a, basic_int64x2 b)
	De-interleaves the even(higher) elements of two int64x2 vectors. More...

basic_int64x4	simdpp::unzip_hi (basic_int64x4 a, basic_int64x4 b)
	De-interleaves the even(higher) elements of two int64x2 vectors. More...

float32x4	simdpp::unzip_hi (float32x4 a, float32x4 b)
	De-interleaves the even(higher) elements of two float32x4 vectors. More...

float32x8	simdpp::unzip_hi (float32x8 a, float32x8 b)
	De-interleaves the even(higher) elements of two float32x4 vectors. More...

float64x2	simdpp::unzip_hi (float64x2 a, float64x2 b)
	De-interleaves the even(higher) elements of two float64x2 vectors. More...

float64x4	simdpp::unzip_hi (float64x4 a, float64x4 b)
	De-interleaves the even(higher) elements of two float64x2 vectors. More...

int128	simdpp::permute_bytes16 (int128 a, int128 mask)
	Selects bytes from a vector according to a mask. More...

float32x4	simdpp::permute_bytes16 (float32x4 a, int128 mask)
	Selects bytes from a vector according to a mask. More...

float64x2	simdpp::permute_bytes16 (float64x2 a, int128 mask)
	Selects bytes from a vector according to a mask. More...

int256	simdpp::permute_bytes16 (int256 a, int256 mask)
	Selects bytes from a vector according to a mask. More...

float32x8	simdpp::permute_bytes16 (float32x8 a, int256 mask)
	Selects bytes from a vector according to a mask. More...

float64x4	simdpp::permute_bytes16 (float64x4 a, int256 mask)
	Selects bytes from a vector according to a mask. More...

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
int128	simdpp::permute (basic_int16x8 a)
	Permutes the 16-bit values within each 4 consecutive values of the vector. More...

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
basic_int16x16	simdpp::permute (basic_int16x16 a)
	Permutes the 16-bit values within each 4 consecutive values of the vector. More...

Detailed Description

Function Documentation

template<unsigned shift>

basic_int8x16 simdpp::align	(	basic_int8x16	lower,
		basic_int8x16	upper
	)

Extracts an int8x16 vector from two concatenated int8x16 vectors.

shift:  pos:| 0   1    .  14  15  |
 0      r = [ l0  l1   .  l14 l15 ]
 1      r = [ l1  l2   .  l15 u0  ]
 2      r = [ l2  l3   .  u0  u1  ]
  ...    ..   .. ..  ... .. ..
 15     r = [ l15 u0   .  u13 u14 ]
 16     r = [ u0  u1   .  u14 u15 ]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int8x32 simdpp::align	(	basic_int8x32	lower,
		basic_int8x32	upper
	)

Extracts an int8x16 vector from two concatenated int8x16 vectors.

shift:  pos:| 0   1    .  14  15  |
 0      r = [ l0  l1   .  l14 l15 ]
 1      r = [ l1  l2   .  l15 u0  ]
 2      r = [ l2  l3   .  u0  u1  ]
  ...    ..   .. ..  ... .. ..
 15     r = [ l15 u0   .  u13 u14 ]
 16     r = [ u0  u1   .  u14 u15 ]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int16x8 simdpp::align	(	basic_int16x8	lower,
		basic_int16x8	upper
	)

Extracts an int16x8 vector from two concatenated int16x8 vectors.

shift:  pos:| 0  1    .  6  7  |
 0      r = [ l0 l1   .  l6 l7 ]
 1      r = [ l1 l2   .  l7 u0 ]
 2      r = [ l2 l3   .  u0 u1 ]
  ...    ..   .. ..  ... .. ..
 7      r = [ l7 u0   .  u5 u6 ]
 8      r = [ u0 u1   .  u6 u7 ]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int16x16 simdpp::align	(	basic_int16x16	lower,
		basic_int16x16	upper
	)

Extracts an int16x8 vector from two concatenated int16x8 vectors.

shift:  pos:| 0  1    .  6  7  |
 0      r = [ l0 l1   .  l6 l7 ]
 1      r = [ l1 l2   .  l7 u0 ]
 2      r = [ l2 l3   .  u0 u1 ]
  ...    ..   .. ..  ... .. ..
 7      r = [ l7 u0   .  u5 u6 ]
 8      r = [ u0 u1   .  u6 u7 ]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int32x4 simdpp::align	(	basic_int32x4	lower,
		basic_int32x4	upper
	)

Extracts an int32x4 vector from two concatenated int32x4 vectors.

shift:  pos:| 0  1  2  3  |
 0      r = [ l0 l1 l2 l3 ]
 1      r = [ l1 l2 l3 u0 ]
 2      r = [ l2 l3 u0 u1 ]
 3      r = [ l3 u0 u1 u2 ]
 4      r = [ u0 u1 u2 u3 ]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int32x8 simdpp::align	(	basic_int32x8	lower,
		basic_int32x8	upper
	)

Extracts an int32x4 vector from two concatenated int32x4 vectors.

shift:  pos:| 0  1  2  3  |
 0      r = [ l0 l1 l2 l3 ]
 1      r = [ l1 l2 l3 u0 ]
 2      r = [ l2 l3 u0 u1 ]
 3      r = [ l3 u0 u1 u2 ]
 4      r = [ u0 u1 u2 u3 ]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int64x2 simdpp::align	(	basic_int64x2	lower,
		basic_int64x2	upper
	)

Extracts an int64x2 vector from two concatenated int64x2 vectors.

shift:  pos:| 0  1  |
 0      r = [ l0 l1 ]
 1      r = [ l1 u0 ]
 2      r = [ u0 u1 ]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int64x4 simdpp::align	(	basic_int64x4	lower,
		basic_int64x4	upper
	)

Extracts an int64x2 vector from two concatenated int64x2 vectors.

shift:  pos:| 0  1  |
 0      r = [ l0 l1 ]
 1      r = [ l1 u0 ]
 2      r = [ u0 u1 ]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

float32x4 simdpp::align	(	float32x4	lower,
		float32x4	upper
	)

Extracts a float32x4 vector from two concatenated float32x4 vectors.

shift:  pos:| 0  1  2  3  |
 0      r = [ l0 l1 l2 l3 ]
 1      r = [ l1 l2 l3 u0 ]
 2      r = [ l2 l3 u0 u1 ]
 3      r = [ l3 u0 u1 u2 ]
 4      r = [ u0 u1 u2 u3 ]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

float32x8 simdpp::align	(	float32x8	lower,
		float32x8	upper
	)

Extracts a float32x4 vector from two concatenated float32x4 vectors.

shift:  pos:| 0  1  2  3  |
 0      r = [ l0 l1 l2 l3 ]
 1      r = [ l1 l2 l3 u0 ]
 2      r = [ l2 l3 u0 u1 ]
 3      r = [ l3 u0 u1 u2 ]
 4      r = [ u0 u1 u2 u3 ]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

float64x2 simdpp::align	(	float64x2	lower,
		float64x2	upper
	)

Extracts a float64x2 vector from two concatenated float64x2 vectors.

shift:  pos:| 0  1  |
 0      r = [ l0 l1 ]
 1      r = [ l1 u0 ]
 2      r = [ u0 u1 ]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.
Not vectorized in NEON and ALTIVEC.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

template<unsigned shift>

float64x4 simdpp::align	(	float64x4	lower,
		float64x4	upper
	)

Extracts a float64x2 vector from two concatenated float64x2 vectors.

shift:  pos:| 0  1  |
 0      r = [ l0 l1 ]
 1      r = [ l1 u0 ]
 2      r = [ u0 u1 ]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.
Not vectorized in NEON and ALTIVEC.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

basic_int8x16 simdpp::blend	(	basic_int8x16	on,
		basic_int8x16	off,
		basic_int8x16	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xff ) ? on0 : off0
...
rN = (maskN == 0xff ) ? onN : offN

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.
In XOP this intrinsic results in at least 1 instruction.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In XOP this intrinsic results in at least 2 instructions.

basic_int8x16 simdpp::blend	(	basic_int8x16	on,
		basic_int8x16	off,
		mask_int8x16	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xff ) ? on0 : off0
...
rN = (maskN == 0xff ) ? onN : offN

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.
In XOP this intrinsic results in at least 1 instruction.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In XOP this intrinsic results in at least 2 instructions.

basic_int8x32 simdpp::blend	(	basic_int8x32	on,
		basic_int8x32	off,
		basic_int8x32	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xff ) ? on0 : off0
...
rN = (maskN == 0xff ) ? onN : offN

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.
In XOP this intrinsic results in at least 1 instruction.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In XOP this intrinsic results in at least 2 instructions.

basic_int8x32 simdpp::blend	(	basic_int8x32	on,
		basic_int8x32	off,
		mask_int8x32	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xff ) ? on0 : off0
...
rN = (maskN == 0xff ) ? onN : offN

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.
In XOP this intrinsic results in at least 1 instruction.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In XOP this intrinsic results in at least 2 instructions.

basic_int16x8 simdpp::blend	(	basic_int16x8	on,
		basic_int16x8	off,
		basic_int16x8	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffff ) ? on0 : off0
...
rN = (maskN == 0xffff ) ? onN : offN

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int16x16 simdpp::blend	(	basic_int16x16	on,
		basic_int16x16	off,
		basic_int16x16	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffff ) ? on0 : off0
...
rN = (maskN == 0xffff ) ? onN : offN

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int16x8 simdpp::blend	(	basic_int16x8	on,
		basic_int16x8	off,
		mask_int16x8	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffff ) ? on0 : off0
...
rN = (maskN == 0xffff ) ? onN : offN

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int16x16 simdpp::blend	(	basic_int16x16	on,
		basic_int16x16	off,
		mask_int16x16	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffff ) ? on0 : off0
...
rN = (maskN == 0xffff ) ? onN : offN

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int32x4 simdpp::blend	(	basic_int32x4	on,
		basic_int32x4	off,
		basic_int32x4	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffff ) ? onN : offN

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int32x8 simdpp::blend	(	basic_int32x8	on,
		basic_int32x8	off,
		basic_int32x8	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffff ) ? onN : offN

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int32x4 simdpp::blend	(	basic_int32x4	on,
		basic_int32x4	off,
		mask_int32x4	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffff ) ? onN : offN

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int32x8 simdpp::blend	(	basic_int32x8	on,
		basic_int32x8	off,
		mask_int32x8	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffff ) ? onN : offN

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int64x2 simdpp::blend	(	basic_int64x2	on,
		basic_int64x2	off,
		basic_int64x2	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffffffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffffffffffff ) ? onN : offN

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int64x4 simdpp::blend	(	basic_int64x4	on,
		basic_int64x4	off,
		basic_int64x4	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffffffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffffffffffff ) ? onN : offN

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int64x2 simdpp::blend	(	basic_int64x2	on,
		basic_int64x2	off,
		mask_int64x2	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffffffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffffffffffff ) ? onN : offN

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int64x4 simdpp::blend	(	basic_int64x4	on,
		basic_int64x4	off,
		mask_int64x4	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffffffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffffffffffff ) ? onN : offN

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x4 simdpp::blend	(	float32x4	on,
		float32x4	off,
		float32x4	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffff ) ? onN : offN

128-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x4 simdpp::blend	(	float32x4	on,
		float32x4	off,
		int128	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffff ) ? onN : offN

128-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x8 simdpp::blend	(	float32x8	on,
		float32x8	off,
		float32x8	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffff ) ? onN : offN

128-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x8 simdpp::blend	(	float32x8	on,
		float32x8	off,
		int256	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffff ) ? onN : offN

128-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x4 simdpp::blend	(	float32x4	on,
		float32x4	off,
		mask_float32x4	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffff ) ? onN : offN

128-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x8 simdpp::blend	(	float32x8	on,
		float32x8	off,
		mask_float32x8	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffff ) ? onN : offN

128-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float64x2 simdpp::blend	(	float64x2	on,
		float64x2	off,
		float64x2	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffffffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffffffffffff ) ? onN : offN

128-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.
Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 6 instructions.
Not vectorized in NEON and ALTIVEC.

float64x2 simdpp::blend	(	float64x2	on,
		float64x2	off,
		int128	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffffffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffffffffffff ) ? onN : offN

128-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.
Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 6 instructions.
Not vectorized in NEON and ALTIVEC.

float64x4 simdpp::blend	(	float64x4	on,
		float64x4	off,
		float64x4	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffffffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffffffffffff ) ? onN : offN

128-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.
Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 6 instructions.
Not vectorized in NEON and ALTIVEC.

float64x4 simdpp::blend	(	float64x4	on,
		float64x4	off,
		int256	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffffffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffffffffffff ) ? onN : offN

128-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.
Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 6 instructions.
Not vectorized in NEON and ALTIVEC.

float64x2 simdpp::blend	(	float64x2	on,
		float64x2	off,
		mask_float64x2	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffffffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffffffffffff ) ? onN : offN

128-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.
Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 6 instructions.
Not vectorized in NEON and ALTIVEC.

float64x4 simdpp::blend	(	float64x4	on,
		float64x4	off,
		mask_float64x4	mask
	)

inline

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xffffffffffffffff ) ? on0 : off0
...
rN = (maskN == 0xffffffffffffffff ) ? onN : offN

128-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.
Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 6 instructions.
Not vectorized in NEON and ALTIVEC.

template<unsigned s>

basic_int8x16 simdpp::broadcast ( basic_int8x16 a )

Broadcasts the specified 8-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
...
rN = a[s]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 7 instructions.
In SSSE3-AVX this intrinsic results in at least 1-2 instructions.
In AVX2 this intrinsic results in at least 2 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 14 instructions.
In SSSE3-AVX this intrinsic results in at least 2-3 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s>

basic_int8x32 simdpp::broadcast ( basic_int8x32 a )

Broadcasts the specified 8-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
...
rN = a[s]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 7 instructions.
In SSSE3-AVX this intrinsic results in at least 1-2 instructions.
In AVX2 this intrinsic results in at least 2 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 14 instructions.
In SSSE3-AVX this intrinsic results in at least 2-3 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s>

basic_int16x8 simdpp::broadcast ( basic_int16x8 a )

Broadcasts the specified 16-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
...
r7 = a[s]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.
In SSSE3-AVX this intrinsic results in at least 1-2 instructions.
In AVX2 this intrinsic results in at least 2 instructions.

256-bit version:

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX this intrinsic results in at least 2-3 instructions.
In AVX2, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s>

basic_int16x16 simdpp::broadcast ( basic_int16x16 a )

Broadcasts the specified 16-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
...
r7 = a[s]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.
In SSSE3-AVX this intrinsic results in at least 1-2 instructions.
In AVX2 this intrinsic results in at least 2 instructions.

256-bit version:

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX this intrinsic results in at least 2-3 instructions.
In AVX2, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s>

basic_int32x4 simdpp::broadcast ( basic_int32x4 a )

Broadcasts the specified 32-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
r2 = a[s]
r3 = a[s]

256-bit version:

In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s>

basic_int32x8 simdpp::broadcast ( basic_int32x8 a )

Broadcasts the specified 32-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
r2 = a[s]
r3 = a[s]

256-bit version:

In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s>

basic_int64x2 simdpp::broadcast ( basic_int64x2 a )

Broadcasts the specified 64-bit value to all elements within 128-bit lanes.

r0 = a[s]

r1 = a[s]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s>

basic_int64x4 simdpp::broadcast ( basic_int64x4 a )

Broadcasts the specified 64-bit value to all elements within 128-bit lanes.

r0 = a[s]

r1 = a[s]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s>

float32x4 simdpp::broadcast ( float32x4 a )

Broadcasts the specified 32-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
r2 = a[s]
r3 = a[s]

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s>

float32x8 simdpp::broadcast ( float32x8 a )

Broadcasts the specified 32-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
r2 = a[s]
r3 = a[s]

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s>

float64x2 simdpp::broadcast ( float64x2 a )

Broadcasts the specified 64-bit value to all elements within 128-bit lanes.

r0 = a[s]

r1 = a[s]

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

template<unsigned s>

float64x4 simdpp::broadcast ( float64x4 a )

Broadcasts the specified 64-bit value to all elements within 128-bit lanes.

r0 = a[s]

r1 = a[s]

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

template<unsigned s>

basic_int8x16 simdpp::broadcast_w ( basic_int8x16 a )

Broadcasts the specified 8-bit value to all elements within a 128-bit lane.

r0 = a[s]
r1 = a[s]
...
rN = a[s]

128-bit version:

In SSE2-AVX this intrinsic results in at least 5 instructions.
In AVX2 this intrinsic results in at least 2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s>

basic_int8x32 simdpp::broadcast_w ( basic_int8x32 a )

Broadcasts the specified 8-bit value to all elements within a 128-bit lane.

r0 = a[s]
r1 = a[s]
...
rN = a[s]

128-bit version:

In SSE2-AVX this intrinsic results in at least 5 instructions.
In AVX2 this intrinsic results in at least 2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s>

basic_int16x8 simdpp::broadcast_w ( basic_int16x8 a )

Broadcasts the specified 16-bit value to all elements within an int16x8 vector.

r0 = a[s]
r1 = a[s]
...
r7 = a[s]

128-bit version:

In SSE2-AVX this intrinsic results in at least 5 instructions.
In AVX2 this intrinsic results in at least 2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s>

basic_int16x16 simdpp::broadcast_w ( basic_int16x16 a )

Broadcasts the specified 16-bit value to all elements within an int16x8 vector.

r0 = a[s]
r1 = a[s]
...
r7 = a[s]

128-bit version:

In SSE2-AVX this intrinsic results in at least 5 instructions.
In AVX2 this intrinsic results in at least 2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s>

basic_int32x4 simdpp::broadcast_w ( basic_int32x4 a )

Broadcasts the specified 32-bit value to all elements within an int32x4 vector.

r0 = a[s]
r1 = a[s]
r2 = a[s]
r3 = a[s]

256-bit version:

In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s>

basic_int32x8 simdpp::broadcast_w ( basic_int32x8 a )

Broadcasts the specified 32-bit value to all elements within an int32x4 vector.

r0 = a[s]
r1 = a[s]
r2 = a[s]
r3 = a[s]

256-bit version:

In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s>

basic_int64x2 simdpp::broadcast_w ( basic_int64x2 a )

Broadcasts the specified 64-bit value to all elements within an int64x2 vector.

r0 = a[s]

r1 = a[s]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

template<unsigned s>

basic_int64x4 simdpp::broadcast_w ( basic_int64x4 a )

Broadcasts the specified 64-bit value to all elements within an int64x2 vector.

r0 = a[s]

r1 = a[s]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

template<unsigned s>

float32x4 simdpp::broadcast_w ( float32x4 a )

Broadcasts the specified 32-bit value to all elements within a float32x4 vector.

r0 = a[s]
r1 = a[s]
r2 = a[s]
r3 = a[s]

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s>

float32x8 simdpp::broadcast_w ( float32x8 a )

Broadcasts the specified 32-bit value to all elements within a float32x4 vector.

r0 = a[s]
r1 = a[s]
r2 = a[s]
r3 = a[s]

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s>

float64x2 simdpp::broadcast_w ( float64x2 a )

Broadcasts the specified 64-bit value to all elements within a float64x2 vector.

r0 = a[s]

r1 = a[s]

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

template<unsigned s>

float64x4 simdpp::broadcast_w ( float64x4 a )

Broadcasts the specified 64-bit value to all elements within a float64x2 vector.

r0 = a[s]

r1 = a[s]

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

template<int s0, int s1>

basic_int8x16 simdpp::make_shuffle_bytes16_mask ( basic_int8x16 & mask )

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of two adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; the mask can then only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; the mask can then only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])
...
r14 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+14] : b[s0+12])
r15 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+14] : b[s1+12])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1>

basic_int8x32 simdpp::make_shuffle_bytes16_mask ( basic_int8x32 & mask )

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of two adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; the mask can then only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; the mask can then only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])
...
r14 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+14] : b[s0+12])
r15 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+14] : b[s1+12])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<unsigned shift>

basic_int8x16 simdpp::move_l ( basic_int8x16 a )

Moves the elements in an int8x16 vector to the left by shift positions.

shift:  pos:| 0   1    .  14  15  |
    r = [ a0  a1   .  a14 a15 ]
    r = [ a1  a2   .  a15  0  ]
    r = [ a2  a3   .   0   0  ]
  ...    ..   .. ..   ...  ..  .. ..
   r = [ a15  0   .   0   0  ]
   r = [  0   0   .   0   0  ]
   r = [  0   0   .   0   0  ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int8x32 simdpp::move_l ( basic_int8x32 a )

Moves the elements in an int8x16 vector to the left by shift positions.

shift:  pos:| 0   1    .  14  15  |
    r = [ a0  a1   .  a14 a15 ]
    r = [ a1  a2   .  a15  0  ]
    r = [ a2  a3   .   0   0  ]
  ...    ..   .. ..   ...  ..  .. ..
   r = [ a15  0   .   0   0  ]
   r = [  0   0   .   0   0  ]
   r = [  0   0   .   0   0  ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int16x8 simdpp::move_l ( basic_int16x8 a )

Moves the 16-bit elements in a vector to the left by shift positions.

shift:  pos:| 0  1   . 6  7  |
 0      r = [ a0 a1  . a6 a7 ]
 1      r = [ a1 a2  . a7  0 ]
 2      r = [ a2 a3  .  0  0 ]
  ...    ..   .. .. ... .. ..
 6      r = [ a6 a7  .  0  0 ]
 7      r = [ a7  0  .  0  0 ]
 8      r = [  0  0  .  0  0 ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int16x16 simdpp::move_l ( basic_int16x16 a )

Moves the 16-bit elements in a vector to the left by shift positions.

shift:  pos:| 0  1   . 6  7  |
 0      r = [ a0 a1  . a6 a7 ]
 1      r = [ a1 a2  . a7  0 ]
 2      r = [ a2 a3  .  0  0 ]
  ...    ..   .. .. ... .. ..
 6      r = [ a6 a7  .  0  0 ]
 7      r = [ a7  0  .  0  0 ]
 8      r = [  0  0  .  0  0 ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int32x4 simdpp::move_l ( basic_int32x4 a )

Moves the 32-bit elements in a vector to the left by shift positions.

shift:  pos:| 0  1  2  3  |
 0      r = [ a0 a1 a2 a3 ]
 1      r = [ a1 a2 a3  0 ]
 2      r = [ a2 a3  0  0 ]
 3      r = [ a3  0  0  0 ]
 4      r = [  0  0  0  0 ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int32x8 simdpp::move_l ( basic_int32x8 a )

Moves the 32-bit elements in a vector to the left by shift positions.

shift:  pos:| 0  1  2  3  |
 0      r = [ a0 a1 a2 a3 ]
 1      r = [ a1 a2 a3  0 ]
 2      r = [ a2 a3  0  0 ]
 3      r = [ a3  0  0  0 ]
 4      r = [  0  0  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int64x2 simdpp::move_l ( basic_int64x2 a )

Moves the 64-bit elements in a vector to the left by shift positions.

shift:  pos:| 0  1  |
 0      r = [ a0 a1 ]
 1      r = [ a1  0 ]
 2      r = [  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int64x4 simdpp::move_l ( basic_int64x4 a )

Moves the 64-bit elements in a vector to the left by shift positions.

shift:  pos:| 0  1  |
 0      r = [ a0 a1 ]
 1      r = [ a1  0 ]
 2      r = [  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

float32x4 simdpp::move_l ( float32x4 a )

Moves the 32-bit elements in a vector to the left by shift positions.

shift:  pos:| 0  1  2  3  |
 0      r = [ a0 a1 a2 a3 ]
 1      r = [ a1 a2 a3  0 ]
 2      r = [ a2 a3  0  0 ]
 3      r = [ a3  0  0  0 ]
 4      r = [  0  0  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

float32x8 simdpp::move_l ( float32x8 a )

Moves the 32-bit elements in a vector to the left by shift positions.

shift:  pos:| 0  1  2  3  |
 0      r = [ a0 a1 a2 a3 ]
 1      r = [ a1 a2 a3  0 ]
 2      r = [ a2 a3  0  0 ]
 3      r = [ a3  0  0  0 ]
 4      r = [  0  0  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

float64x2 simdpp::move_l ( float64x2 a )

Moves the 64-bit elements in a vector to the left by shift positions.

shift:  pos:| 0  1  |
 0      r = [ a0 a1 ]
 1      r = [ a1  0 ]
 2      r = [  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

float64x4 simdpp::move_l ( float64x4 a )

Moves the 64-bit elements in a vector to the left by shift positions.

shift:  pos:| 0  1  |
 0      r = [ a0 a1 ]
 1      r = [ a1  0 ]
 2      r = [  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int8x16 simdpp::move_r ( basic_int8x16 a )

Moves the 8-bit elements in a vector to the right by shift positions.

shift:  pos:| 0   1    .  14  15  |
 0      r = [ a0  a1   .  a14 a15 ]
 1      r = [  0  a0   .  a13 a14 ]
 2      r = [  0   0   .  a12 a13 ]
  ...    ..   .. ..   ...  ..  .. ..
 14     r = [  0   0   .  a0  a1  ]
 15     r = [  0   0   .   0  a0  ]
 16     r = [  0   0   .   0   0  ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int8x32 simdpp::move_r ( basic_int8x32 a )

Moves the 8-bit elements in a vector to the right by shift positions.

shift:  pos:| 0   1    .  14  15  |
 0      r = [ a0  a1   .  a14 a15 ]
 1      r = [  0  a0   .  a13 a14 ]
 2      r = [  0   0   .  a12 a13 ]
  ...    ..   .. ..   ...  ..  .. ..
 14     r = [  0   0   .  a0  a1  ]
 15     r = [  0   0   .   0  a0  ]
 16     r = [  0   0   .   0   0  ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int16x8 simdpp::move_r ( basic_int16x8 a )

Moves the 16-bit elements in a vector to the right by shift positions.

shift:  pos:| 0  1   . 6  7  |
 0      r = [ a0 a1  . a6 a7 ]
 1      r = [  0 a0  . a5 a6 ]
 2      r = [  0  0  . a4 a5 ]
  ...    ..   .. .. ... .. ..
 6      r = [  0  0  . a0 a1 ]
 7      r = [  0  0  .  0 a0 ]
 8      r = [  0  0  .  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int16x16 simdpp::move_r ( basic_int16x16 a )

Moves the 16-bit elements in a vector to the right by shift positions.

shift:  pos:| 0  1   . 6  7  |
 0      r = [ a0 a1  . a6 a7 ]
 1      r = [  0 a0  . a5 a6 ]
 2      r = [  0  0  . a4 a5 ]
  ...    ..   .. .. ... .. ..
 6      r = [  0  0  . a0 a1 ]
 7      r = [  0  0  .  0 a0 ]
 8      r = [  0  0  .  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int32x4 simdpp::move_r ( basic_int32x4 a )

Moves the 32-bit elements in a vector to the right by shift positions.

shift:  pos:| 0  1  2  3  |
 0      r = [ a0 a1 a2 a3 ]
 1      r = [  0 a0 a1 a2 ]
 2      r = [  0  0 a0 a1 ]
 3      r = [  0  0  0 a0 ]
 4      r = [  0  0  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int32x8 simdpp::move_r ( basic_int32x8 a )

Moves the 32-bit elements in a vector to the right by shift positions.

shift:  pos:| 0  1  2  3  |
 0      r = [ a0 a1 a2 a3 ]
 1      r = [  0 a0 a1 a2 ]
 2      r = [  0  0 a0 a1 ]
 3      r = [  0  0  0 a0 ]
 4      r = [  0  0  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int64x2 simdpp::move_r ( basic_int64x2 a )

Moves the 64-bit elements in a vector to the right by shift positions.

shift:  pos:| 0  1  |
 0      r = [ a0 a1 ]
 1      r = [  0 a0 ]
 2      r = [  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

basic_int64x4 simdpp::move_r ( basic_int64x4 a )

Moves the 64-bit elements in a vector to the right by shift positions.

shift:  pos:| 0  1  |
 0      r = [ a0 a1 ]
 1      r = [  0 a0 ]
 2      r = [  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

float32x4 simdpp::move_r ( float32x4 a )

Moves the 32-bit elements in a vector to the right by shift positions.

shift:  pos:| 0  1  2  3  |
 0      r = [ a0 a1 a2 a3 ]
 1      r = [  0 a0 a1 a2 ]
 2      r = [  0  0 a0 a1 ]
 3      r = [  0  0  0 a0 ]
 4      r = [  0  0  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

float32x8 simdpp::move_r ( float32x8 a )

Moves the 32-bit elements in a vector to the right by shift positions.

shift:  pos:| 0  1  2  3  |
 0      r = [ a0 a1 a2 a3 ]
 1      r = [  0 a0 a1 a2 ]
 2      r = [  0  0 a0 a1 ]
 3      r = [  0  0  0 a0 ]
 4      r = [  0  0  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

float64x2 simdpp::move_r ( float64x2 a )

Moves the 64-bit elements in a vector to the right by shift positions.

shift:  pos:| 0  1  |
 0      r = [ a0 a1 ]
 1      r = [  0 a0 ]
 2      r = [  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift>

float64x4 simdpp::move_r ( float64x4 a )

Moves the 64-bit elements in a vector to the right by shift positions.

shift:  pos:| 0  1  |
 0      r = [ a0 a1 ]
 1      r = [  0 a0 ]
 2      r = [  0  0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>

int128 simdpp::permute ( basic_int16x8 a )

Permutes the 16-bit values within each 4 consecutive values of the vector.

The selector values must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
r4 = a[s0+4]
...
r7 = a[s3+4]
256-bit version:
r8 = a[s0+8]
...
r11 = a[s3+8]
r12 = a[s0+12]
...
r15 = a[s3+12]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 1-5 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 4 instructions.
In AVX2 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-10 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>

basic_int16x16 simdpp::permute ( basic_int16x16 a )

Permutes the 16-bit values within each 4 consecutive values of the vector.

The selector values must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
r4 = a[s0+4]
...
r7 = a[s3+4]
256-bit version:
r8 = a[s0+8]
...
r11 = a[s3+8]
r12 = a[s0+12]
...
r15 = a[s3+12]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 1-5 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 4 instructions.
In AVX2 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-10 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

int128 simdpp::permute_bytes16	(	int128	a,
		int128	mask
	)

inline

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: bits 7-4 must be zero, otherwise the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.

256-bit version:: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.

float32x4 simdpp::permute_bytes16	(	float32x4	a,
		int128	mask
	)

inline

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: bits 7-4 must be zero, otherwise the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.

256-bit version:: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.

float64x2 simdpp::permute_bytes16	(	float64x2	a,
		int128	mask
	)

inline

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: bits 7-4 must be zero, otherwise the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.

256-bit version:: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.

int256 simdpp::permute_bytes16	(	int256	a,
		int256	mask
	)

inline

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: bits 7-4 must be zero, otherwise the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.

256-bit version:: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.

float32x8 simdpp::permute_bytes16	(	float32x8	a,
		int256	mask
	)

inline

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: bits 7-4 must be zero, otherwise the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.

256-bit version:: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.

float64x4 simdpp::permute_bytes16	(	float64x4	a,
		int256	mask
	)

inline

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: bits 7-4 must be zero, otherwise the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.

256-bit version:: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.

basic_int8x16 simdpp::unzip_hi	(	basic_int8x16	a,
		basic_int8x16	b
	)

inline

De-interleaves the odd-indexed (higher) elements of two int8x16 vectors.

| 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |

r = [ a1 a3 a5 a7 a9 a11 a13 a15 b1 b3 b5 b7 b9 b11 b13 b15 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 3 instructions.

basic_int8x32 simdpp::unzip_hi	(	basic_int8x32	a,
		basic_int8x32	b
	)

inline

De-interleaves the odd-indexed (higher) elements of two int8x16 vectors.

| 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |

r = [ a1 a3 a5 a7 a9 a11 a13 a15 b1 b3 b5 b7 b9 b11 b13 b15 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 3 instructions.

basic_int16x8 simdpp::unzip_hi	(	basic_int16x8	a,
		basic_int16x8	b
	)

inline

De-interleaves the odd-indexed (higher) elements of two int16x8 vectors.

| 0 1 2 3 4 5 6 7 |

r = [ a1 a3 a5 a7 b1 b3 b5 b7 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 3 instructions.

basic_int16x16 simdpp::unzip_hi	(	basic_int16x16	a,
		basic_int16x16	b
	)

inline

De-interleaves the odd-indexed (higher) elements of two int16x8 vectors.

| 0 1 2 3 4 5 6 7 |

r = [ a1 a3 a5 a7 b1 b3 b5 b7 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 3 instructions.

basic_int32x4 simdpp::unzip_hi	(	basic_int32x4	a,
		basic_int32x4	b
	)

inline

De-interleaves the odd-indexed (higher) elements of two int32x4 vectors.

| 0 1 2 3 |

r = [ a1 a3 b1 b3 ]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In ALTIVEC this intrinsic results in at least 2-3 instructions.
In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.

basic_int32x8 simdpp::unzip_hi	(	basic_int32x8	a,
		basic_int32x8	b
	)

inline

De-interleaves the odd-indexed (higher) elements of two int32x4 vectors.

| 0 1 2 3 |

r = [ a1 a3 b1 b3 ]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In ALTIVEC this intrinsic results in at least 2-3 instructions.
In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.

basic_int64x2 simdpp::unzip_hi	(	basic_int64x2	a,
		basic_int64x2	b
	)

inline

De-interleaves the odd-indexed (higher) elements of two int64x2 vectors.

| 0 1 |

r = [ a1 b1 ]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In ALTIVEC this intrinsic results in at least 2-3 instructions.
In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.

basic_int64x4 simdpp::unzip_hi	(	basic_int64x4	a,
		basic_int64x4	b
	)

inline

De-interleaves the odd-indexed (higher) elements of two int64x2 vectors.

| 0 1 |

r = [ a1 b1 ]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In ALTIVEC this intrinsic results in at least 2-3 instructions.
In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.

float32x4 simdpp::unzip_hi	(	float32x4	a,
		float32x4	b
	)

inline

De-interleaves the odd-indexed (higher) elements of two float32x4 vectors.

| 0 1 2 3 |

r = [ a1 a3 b1 b3 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x8 simdpp::unzip_hi	(	float32x8	a,
		float32x8	b
	)

inline

De-interleaves the odd-indexed (higher) elements of two float32x4 vectors.

| 0 1 2 3 |

r = [ a1 a3 b1 b3 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float64x2 simdpp::unzip_hi	(	float64x2	a,
		float64x2	b
	)

inline

De-interleaves the odd-indexed (higher) elements of two float64x2 vectors.

| 0 1 |

r = [ a1 b1 ]

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

Not vectorized in NEON and ALTIVEC.
In SSE2-AVX this intrinsic results in at least 2 instructions.

float64x4 simdpp::unzip_hi	(	float64x4	a,
		float64x4	b
	)

inline

De-interleaves the odd-indexed (higher) elements of two float64x2 vectors.

| 0 1 |

r = [ a1 b1 ]

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

Not vectorized in NEON and ALTIVEC.
In SSE2-AVX this intrinsic results in at least 2 instructions.

basic_int8x16 simdpp::unzip_lo	(	basic_int8x16	a,
		basic_int8x16	b
	)

inline

De-interleaves the even-indexed (lower) elements of two int8x16 vectors.

| 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |

r = [ a0 a2 a4 a6 a8 a10 a12 a14 b0 b2 b4 b6 b8 b10 b12 b14 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 8-9 instructions.
In NEON this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 4-5 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

basic_int8x32 simdpp::unzip_lo	(	basic_int8x32	a,
		basic_int8x32	b
	)

inline

De-interleaves the even-indexed (lower) elements of two int8x16 vectors.

| 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |

r = [ a0 a2 a4 a6 a8 a10 a12 a14 b0 b2 b4 b6 b8 b10 b12 b14 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 8-9 instructions.
In NEON this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 4-5 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

basic_int16x8 simdpp::unzip_lo	(	basic_int16x8	a,
		basic_int16x8	b
	)

inline

De-interleaves the even-indexed (lower) elements of two int16x8 vectors.

| 0 1 2 3 4 5 6 7 |

r = [ a0 a2 a4 a6 b0 b2 b4 b6 ]

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 5 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 4-5 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSSE3 this intrinsic results in at least 5 instructions.
In SSE4.1-AVX this intrinsic results in at least 8-9 instructions.
In AVX2 this intrinsic results in at least 4-5 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

basic_int16x16 simdpp::unzip_lo	(	basic_int16x16	a,
		basic_int16x16	b
	)

inline

De-interleaves the even-indexed (lower) elements of two int16x8 vectors.

| 0 1 2 3 4 5 6 7 |

r = [ a0 a2 a4 a6 b0 b2 b4 b6 ]

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 5 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 4-5 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSSE3 this intrinsic results in at least 5 instructions.
In SSE4.1-AVX this intrinsic results in at least 8-9 instructions.
In AVX2 this intrinsic results in at least 4-5 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

basic_int32x4 simdpp::unzip_lo	(	basic_int32x4	a,
		basic_int32x4	b
	)

inline

De-interleaves the even-indexed (lower) elements of two int32x4 vectors.

| 0 1 2 3 |

r = [ a0 a2 b0 b2 ]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

basic_int32x8 simdpp::unzip_lo	(	basic_int32x8	a,
		basic_int32x8	b
	)

inline

De-interleaves the even-indexed (lower) elements of two int32x4 vectors.

| 0 1 2 3 |

r = [ a0 a2 b0 b2 ]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

basic_int64x2 simdpp::unzip_lo	(	basic_int64x2	a,
		basic_int64x2	b
	)

inline

De-interleaves the even-indexed (lower) elements of two int64x2 vectors.

| 0 1 |

r = [ a0 b0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int64x4 simdpp::unzip_lo	(	basic_int64x4	a,
		basic_int64x4	b
	)

inline

De-interleaves the even-indexed (lower) elements of two int64x2 vectors.

| 0 1 |

r = [ a0 b0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x4 simdpp::unzip_lo	(	float32x4	a,
		float32x4	b
	)

inline

De-interleaves the even-indexed (lower) elements of two float32x4 vectors.

| 0 1 2 3 |

r = [ a0 a2 b0 b2 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x8 simdpp::unzip_lo	(	float32x8	a,
		float32x8	b
	)

inline

De-interleaves the even-indexed (lower) elements of two float32x4 vectors.

| 0 1 2 3 |

r = [ a0 a2 b0 b2 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float64x2 simdpp::unzip_lo	(	float64x2	a,
		float64x2	b
	)

inline

De-interleaves the even-indexed (lower) elements of two float64x2 vectors.

| 0 1 |

r = [ a0 b0 ]

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

float64x4 simdpp::unzip_lo	(	float64x4	a,
		float64x4	b
	)

inline

De-interleaves the even-indexed (lower) elements of two float64x2 vectors.

| 0 1 |

r = [ a0 b0 ]

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

basic_int8x16 simdpp::zip_hi	(	basic_int8x16	a,
		basic_int8x16	b
	)

inline

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |

r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int8x32 simdpp::zip_hi	(	basic_int8x32	a,
		basic_int8x32	b
	)

inline

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |

r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int16x8 simdpp::zip_hi	(	basic_int16x8	a,
		basic_int16x8	b
	)

inline

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |

r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int16x16 simdpp::zip_hi	(	basic_int16x16	a,
		basic_int16x16	b
	)

inline

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |

r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int32x4 simdpp::zip_hi	(	basic_int32x4	a,
		basic_int32x4	b
	)

inline

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |

r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int32x8 simdpp::zip_hi	(	basic_int32x8	a,
		basic_int32x8	b
	)

inline

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |

r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int64x2 simdpp::zip_hi	(	basic_int64x2	a,
		basic_int64x2	b
	)

inline

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |

r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int64x4 simdpp::zip_hi	(	basic_int64x4	a,
		basic_int64x4	b
	)

inline

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |

r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x4 simdpp::zip_hi	(	float32x4	a,
		float32x4	b
	)

inline

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |

r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction were applied to each of them separately.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x8 simdpp::zip_hi	(	float32x8	a,
		float32x8	b
	)

inline

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |

r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction were applied to each of them separately.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float64x2 simdpp::zip_hi	(	float64x2	a,
		float64x2	b
	)

inline

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |

r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction were applied to each of them separately.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float64x4 simdpp::zip_hi	(	float64x4	a,
		float64x4	b
	)

inline

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |

r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction were applied to each of them separately.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int8x16 simdpp::zip_lo	(	basic_int8x16	a,
		basic_int8x16	b
	)

inline

Interleaves the lower halves of two vectors.

| 0 1 2 3 4 5 ... N-2 N-1 |

r = [ a0 b0 a1 b1 a2 b2 ... a(N/2-1) b(N/2-1) ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction were applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

basic_int8x32 simdpp::zip_lo	(	basic_int8x32	a,
		basic_int8x32	b
	)

inline

basic_int16x8 simdpp::zip_lo	(	basic_int16x8	a,
		basic_int16x8	b
	)

inline

basic_int16x16 simdpp::zip_lo	(	basic_int16x16	a,
		basic_int16x16	b
	)

inline

basic_int32x4 simdpp::zip_lo	(	basic_int32x4	a,
		basic_int32x4	b
	)

inline

basic_int32x8 simdpp::zip_lo	(	basic_int32x8	a,
		basic_int32x8	b
	)

inline

basic_int64x2 simdpp::zip_lo	(	basic_int64x2	a,
		basic_int64x2	b
	)

inline

basic_int64x4 simdpp::zip_lo	(	basic_int64x4	a,
		basic_int64x4	b
	)

inline

float32x4 simdpp::zip_lo	(	float32x4	a,
		float32x4	b
	)

inline

Interleaves the lower halves of two vectors.

| 0 1 2 3 4 5 ... N-2 N-1 |

r = [ a0 b0 a1 b1 a2 b2 ... a(N/2-1) b(N/2-1) ]

256-bit version: In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions. The lower and higher 128-bit halves are processed as if a 128-bit instruction were applied to each of them separately.

float32x8 simdpp::zip_lo	(	float32x8	a,
		float32x8	b
	)

inline

Interleaves the lower halves of two vectors.

| 0 1 2 3 4 5 ... N-2 N-1 |

r = [ a0 b0 a1 b1 a2 b2 ... a(N/2-1) b(N/2-1) ]

256-bit version: In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions. The lower and higher 128-bit halves are processed as if a 128-bit instruction were applied to each of them separately.

float64x2 simdpp::zip_lo	(	float64x2	a,
		float64x2	b
	)

inline

Interleaves the lower halves of two vectors.

| 0 1 2 3 4 5 ... N-2 N-1 |

r = [ a0 b0 a1 b1 a2 b2 ... a(N/2-1) b(N/2-1) ]

256-bit version: In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions. The lower and higher 128-bit halves are processed as if a 128-bit instruction were applied to each of them separately.

float64x4 simdpp::zip_lo	(	float64x4	a,
		float64x4	b
	)

inline

Interleaves the lower halves of two vectors.

| 0 1 2 3 4 5 ... N-2 N-1 |

r = [ a0 b0 a1 b1 a2 b2 ... a(N/2-1) b(N/2-1) ]

256-bit version: In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions. The lower and higher 128-bit halves are processed as if a 128-bit instruction were applied to each of them separately.

Functions

Detailed Description

Function Documentation