Namespaces
	altivec

	neon

	sse

Classes
class	aligned_allocator
	An allocator that allocates memory with stricter alignment requirements than the defaults. More...

struct	expr_bit_and

struct	expr_bit_andnot

struct	expr_bit_not

struct	expr_bit_or

struct	expr_bit_xor

struct	expr_blend

struct	expr_splat2

struct	expr_splat4

struct	expr_splat8

struct	expr_splat16

struct	expr_vec_construct

struct	expr_vec_load_splat

struct	expr_vec_set_splat

struct	expr_vec_make_const

struct	expr_vec_load

struct	expr_vec_load_u

struct	expr_add

struct	expr_add_sat

struct	expr_sub

struct	expr_sub_sat

struct	expr_abs

struct	expr_neg

struct	expr_mul

struct	expr_mul_lo

struct	expr_mul_hi

struct	expr_mull

struct	expr_fmadd

struct	expr_fmsub

struct	expr_imm_shift_l

struct	expr_imm_shift_r

struct	any_vec
	Represents any vector that has B bytes of data. More...

struct	any_vec8

struct	any_vec16

struct	any_vec32

struct	any_vec64

struct	any_float32

struct	any_float64

struct	any_int8

struct	any_int16

struct	any_int32

struct	any_int64

class	float32< N, void >
	Class representing a float32 vector of arbitrary length. More...

class	mask_float32< N, void >
	Class representing a mask for 32-bit floating-point vector of arbitrary length. More...

class	float32< 4, void >
	Class representing float32x4 vector. More...

class	mask_float32< 4, void >
	Class representing possibly optimized mask data for 4x 32-bit floating-point vector. More...

class	float32< 8, void >
	Class representing float32x8 vector. More...

class	mask_float32< 8, void >
	Class representing possibly optimized mask data for 8x 32-bit floating-point vector. More...

class	float64< N, void >
	Class representing a float64 vector of arbitrary length. More...

class	mask_float64< N, void >
	Class representing a mask for 64-bit floating-point vector of arbitrary length. More...

class	float64< 2, void >

class	mask_float64< 2, void >
	Class representing possibly optimized mask data for 2x 64-bit floating point vector. More...

class	float64< 4, void >

class	mask_float64< 4, void >
	Class representing possibly optimized mask data for 4x 64-bit floating point vector. More...

class	float32

class	mask_float32

class	float64

class	mask_float64

class	int8

class	uint8

class	mask_int8

class	int16

class	uint16

class	mask_int16

class	int32

class	uint32

class	mask_int32

class	int64

class	uint64

class	mask_int64

class	int16< N, void >
	Class representing a signed int16 vector of arbitrary length. More...

class	uint16< N, void >
	Class representing an unsigned int16 vector of arbitrary length. More...

class	mask_int16< N, void >
	Class representing a mask for 16-bit integer vector of arbitrary length. More...

class	int16< 16, void >
	Class representing 16x 16-bit signed integer vector. More...

class	uint16< 16, void >
	Class representing 16x 16-bit unsigned integer vector. More...

class	mask_int16< 16, void >
	Class representing possibly optimized mask data for 16x 16-bit integer vector. More...

class	int16< 8, void >
	Class representing 8x 16-bit signed integer vector. More...

class	uint16< 8, void >
	Class representing 8x 16-bit unsigned integer vector. More...

class	mask_int16< 8, void >
	Class representing possibly optimized mask data for 8x 16-bit integer vector. More...

class	int32< N, void >
	Class representing a signed int32 vector of arbitrary length. More...

class	uint32< N, void >
	Class representing an unsigned int32 vector of arbitrary length. More...

class	mask_int32< N, void >
	Class representing a mask for 32-bit integer vector of arbitrary length. More...

class	int32< 4, void >
	Class representing 4x 32-bit signed integer vector. More...

class	uint32< 4, void >
	Class representing 4x 32-bit unsigned integer vector. More...

class	mask_int32< 4, void >
	Class representing possibly optimized mask data for 4x 32-bit integer vector. More...

class	int32< 8, void >
	Class representing 8x 32-bit signed integer vector. More...

class	uint32< 8, void >
	Class representing 8x 32-bit unsigned integer vector. More...

class	mask_int32< 8, void >
	Class representing possibly optimized mask data for 8x 32-bit integer vector. More...

class	int64< N, void >
	Class representing a signed int64 vector of arbitrary length. More...

class	uint64< N, void >
	Class representing an unsigned int64 vector of arbitrary length. More...

class	mask_int64< N, void >
	Class representing a mask for 64-bit integer vector of arbitrary length. More...

class	int64< 2, void >
	Class representing 2x 64-bit signed integer vector. More...

class	uint64< 2, void >
	Class representing 2x 64-bit unsigned integer vector. More...

class	mask_int64< 2, void >
	Class representing possibly optimized mask data for 2x 64-bit integer vector. More...

class	int64< 4, void >
	Class representing 4x 64-bit signed integer vector. More...

class	uint64< 4, void >
	Class representing 4x 64-bit unsigned integer vector. More...

class	mask_int64< 4, void >
	Class representing possibly optimized mask data for 4x 64-bit integer vector. More...

class	int8< N, void >
	Class representing a signed int8 vector of arbitrary length. More...

class	uint8< N, void >
	Class representing an unsigned int8 vector of arbitrary length. More...

class	mask_int8< N, void >
	Class representing a mask for 8-bit integer vector of arbitrary length. More...

class	int8< 16, void >
	Class representing 16x 8-bit signed integer vector. More...

class	uint8< 16, void >
	Class representing 16x 8-bit unsigned integer vector. More...

class	mask_int8< 16, void >
	Class representing possibly optimized mask data for 16x 8-bit integer vector. More...

class	int8< 32, void >
	Class representing 32x 8-bit signed integer vector. More...

class	uint8< 32, void >
	Class representing 32x 8-bit unsigned integer vector. More...

class	mask_int8< 32, void >
	Class representing possibly optimized mask data for 32x 8-bit integer vector. More...

struct	is_vector
	Allows detection whether specific type is a simdpp vector. More...

struct	is_vector< float32< N, E > >

struct	is_vector< float64< N, E > >

struct	is_vector< int8< N, E > >

struct	is_vector< int16< N, E > >

struct	is_vector< int32< N, E > >

struct	is_vector< int64< N, E > >

struct	is_vector< uint8< N, E > >

struct	is_vector< uint16< N, E > >

struct	is_vector< uint32< N, E > >

struct	is_vector< uint64< N, E > >

struct	is_vector< mask_int8< N, E > >

struct	is_vector< mask_int16< N, E > >

struct	is_vector< mask_int32< N, E > >

struct	is_vector< mask_int64< N, E > >

struct	is_vector< mask_float32< N, E > >

struct	is_vector< mask_float64< N, E > >

struct	is_mask
	Allows detection whether specific type is a simdpp mask. More...

struct	is_mask< mask_int8< N, E > >

struct	is_mask< mask_int16< N, E > >

struct	is_mask< mask_int32< N, E > >

struct	is_mask< mask_int64< N, E > >

struct	is_mask< mask_float32< N, E > >

struct	is_mask< mask_float64< N, E > >

Typedefs
using	GetArchCb = std::function< Arch()>

using	float32x4 = float32< 4 >

using	float32x8 = float32< 8 >

using	mask_float32x4 = mask_float32< 4 >

using	mask_float32x8 = mask_float32< 8 >

using	float64x2 = float64< 2 >

using	float64x4 = float64< 4 >

using	mask_float64x2 = mask_float64< 2 >

using	mask_float64x4 = mask_float64< 4 >

using	int8x16 = int8< 16 >

using	int8x32 = int8< 32 >

using	uint8x16 = uint8< 16 >

using	uint8x32 = uint8< 32 >

using	mask_int8x16 = mask_int8< 16 >

using	mask_int8x32 = mask_int8< 32 >

using	int16x8 = int16< 8 >

using	int16x16 = int16< 16 >

using	uint16x8 = uint16< 8 >

using	uint16x16 = uint16< 16 >

using	mask_int16x8 = mask_int16< 8 >

using	mask_int16x16 = mask_int16< 16 >

using	int32x4 = int32< 4 >

using	int32x8 = int32< 8 >

using	uint32x4 = uint32< 4 >

using	uint32x8 = uint32< 8 >

using	mask_int32x4 = mask_int32< 4 >

using	mask_int32x8 = mask_int32< 8 >

using	int64x2 = int64< 2 >

using	int64x4 = int64< 4 >

using	uint64x2 = uint64< 2 >

using	uint64x4 = uint64< 4 >

using	mask_int64x2 = mask_int64< 2 >

using	mask_int64x4 = mask_int64< 4 >

using	float32v = float32< SIMDPP_FAST_FLOAT32_SIZE >

using	mask_float32v = mask_float32< SIMDPP_FAST_FLOAT32_SIZE >

using	float64v = float64< SIMDPP_FAST_FLOAT64_SIZE >

using	mask_float64v = mask_float64< SIMDPP_FAST_FLOAT64_SIZE >

using	int8v = int8< SIMDPP_FAST_INT8_SIZE >

using	uint8v = uint8< SIMDPP_FAST_INT8_SIZE >

using	mask_int8v = mask_int8< SIMDPP_FAST_INT8_SIZE >

using	int16v = int16< SIMDPP_FAST_INT16_SIZE >

using	uint16v = uint16< SIMDPP_FAST_INT16_SIZE >

using	mask_int16v = mask_int16< SIMDPP_FAST_INT16_SIZE >

using	int32v = int32< SIMDPP_FAST_INT32_SIZE >

using	uint32v = uint32< SIMDPP_FAST_INT32_SIZE >

using	mask_int32v = mask_int32< SIMDPP_FAST_INT32_SIZE >

using	int64v = int64< SIMDPP_FAST_INT64_SIZE >

using	uint64v = uint64< SIMDPP_FAST_INT64_SIZE >

using	mask_int64v = mask_int64< SIMDPP_FAST_INT64_SIZE >

using	mask_float32v2 = mask_float32< SIMDPP_FAST_FLOAT32_SIZE *2 >

using	float64v2 = float64< SIMDPP_FAST_FLOAT64_SIZE *2 >

using	mask_float64v2 = mask_float64< SIMDPP_FAST_FLOAT64_SIZE *2 >

using	int8v2 = int8< SIMDPP_FAST_INT8_SIZE *2 >

using	uint8v2 = uint8< SIMDPP_FAST_INT8_SIZE *2 >

using	mask_int8v2 = mask_int8< SIMDPP_FAST_INT8_SIZE *2 >

using	int16v2 = int16< SIMDPP_FAST_INT16_SIZE *2 >

using	uint16v2 = uint16< SIMDPP_FAST_INT16_SIZE *2 >

using	mask_int16v2 = mask_int16< SIMDPP_FAST_INT16_SIZE *2 >

using	int32v2 = int32< SIMDPP_FAST_INT32_SIZE *2 >

using	uint32v2 = uint32< SIMDPP_FAST_INT32_SIZE *2 >

using	mask_int32v2 = mask_int32< SIMDPP_FAST_INT32_SIZE *2 >

using	int64v2 = int64< SIMDPP_FAST_INT64_SIZE *2 >

using	uint64v2 = uint64< SIMDPP_FAST_INT64_SIZE *2 >

using	mask_int64v2 = mask_int64< SIMDPP_FAST_INT64_SIZE *2 >

using	float32v4 = float32< SIMDPP_FAST_FLOAT32_SIZE *4 >

using	mask_float32v4 = mask_float32< SIMDPP_FAST_FLOAT32_SIZE *4 >

using	float64v4 = float64< SIMDPP_FAST_FLOAT64_SIZE *4 >

using	mask_float64v4 = mask_float64< SIMDPP_FAST_FLOAT64_SIZE *4 >

using	int8v4 = int8< SIMDPP_FAST_INT8_SIZE *4 >

using	uint8v4 = uint8< SIMDPP_FAST_INT8_SIZE *4 >

using	mask_int8v4 = mask_int8< SIMDPP_FAST_INT8_SIZE *4 >

using	int16v4 = int16< SIMDPP_FAST_INT16_SIZE *4 >

using	uint16v4 = uint16< SIMDPP_FAST_INT16_SIZE *4 >

using	mask_int16v4 = mask_int16< SIMDPP_FAST_INT16_SIZE *4 >

using	int32v4 = int32< SIMDPP_FAST_INT32_SIZE *4 >

using	uint32v4 = uint32< SIMDPP_FAST_INT32_SIZE *4 >

using	mask_int32v4 = mask_int32< SIMDPP_FAST_INT32_SIZE *4 >

using	int64v4 = int64< SIMDPP_FAST_INT64_SIZE *4 >

using	uint64v4 = uint64< SIMDPP_FAST_INT64_SIZE *4 >

using	mask_int64v4 = mask_int64< SIMDPP_FAST_INT64_SIZE *4 >

Enumerations
enum	Arch : std::uint32_t { Arch::NONE_NULL = 0, Arch::X86_SSE2 = 1 << 1, Arch::X86_SSE3 = 1 << 2, Arch::X86_SSSE3 = 1 << 3, Arch::X86_SSE4_1 = 1 << 4, Arch::X86_AVX = 1 << 5, Arch::X86_AVX2 = 1 << 6, Arch::X86_FMA3 = 1 << 7, Arch::X86_FMA4 = 1 << 8, Arch::X86_XOP = 1 << 9, Arch::ARM_NEON = 1 << 0, Arch::ARM_NEON_FLT_SP = 1 << 1, Arch::POWER_ALTIVEC = 1 << 0 }
	Identifies supported instruction set. More...

Functions
void	transpose2 (uint16x8 &a0, uint16x8 &a1)
	Transposes four 2x2 16-bit matrices within two int16x8 vectors. More...

void	transpose2 (int16x8 &a0, int16x8 &a1)

void	transpose2 (uint16x16 &a0, uint16x16 &a1)

void	transpose2 (int16x16 &a0, int16x16 &a1)

void	transpose8 (uint8x16 &a0, uint8x16 &a1, uint8x16 &a2, uint8x16 &a3, uint8x16 &a4, uint8x16 &a5, uint8x16 &a6, uint8x16 &a7)
	Transposes two 8x8 8-bit matrices within eight int8x16 vectors. More...

void	transpose8 (int8x16 &a0, int8x16 &a1, int8x16 &a2, int8x16 &a3, int8x16 &a4, int8x16 &a5, int8x16 &a6, int8x16 &a7)

void	transpose8 (uint8x32 &a0, uint8x32 &a1, uint8x32 &a2, uint8x32 &a3, uint8x32 &a4, uint8x32 &a5, uint8x32 &a6, uint8x32 &a7)

void	transpose8 (int8x32 &a0, int8x32 &a1, int8x32 &a2, int8x32 &a3, int8x32 &a4, int8x32 &a5, int8x32 &a6, int8x32 &a7)

void	transpose8 (uint16x8 &a0, uint16x8 &a1, uint16x8 &a2, uint16x8 &a3, uint16x8 &a4, uint16x8 &a5, uint16x8 &a6, uint16x8 &a7)
	Transposes an 8x8 16-bit matrix within eight int16x8 vectors. More...

void	transpose8 (int16x8 &a0, int16x8 &a1, int16x8 &a2, int16x8 &a3, int16x8 &a4, int16x8 &a5, int16x8 &a6, int16x8 &a7)

void	transpose8 (uint16x16 &a0, uint16x16 &a1, uint16x16 &a2, uint16x16 &a3, uint16x16 &a4, uint16x16 &a5, uint16x16 &a6, uint16x16 &a7)

void	transpose8 (int16x16 &a0, int16x16 &a1, int16x16 &a2, int16x16 &a3, int16x16 &a4, int16x16 &a5, int16x16 &a6, int16x16 &a7)

template<unsigned shift, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, void >::empty	align16 (const any_vec8< N, V1 > &lower, const any_vec8< N, V2 > &upper)
	Extracts an int8x16 vector from two concatenated int8x16 vectors. More...

template<unsigned shift, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, void >::empty	align8 (const any_vec16< N, V1 > &lower, const any_vec16< N, V2 > &upper)
	Extracts an int16x8 vector from two concatenated int16x8 vectors. More...

template<unsigned shift, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, void >::empty	align4 (const any_vec32< N, V1 > &lower, const any_vec32< N, V2 > &upper)
	Extracts an int32x4 vector from two concatenated int32x4 vectors. More...

template<unsigned shift, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, void >::empty	align2 (const any_vec64< N, V1 > &lower, const any_vec64< N, V2 > &upper)
	Extracts an int64x2 vector from two concatenated int64x2 vectors. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2< V1, V2, void >::empty	bit_xor (const any_vec< N, V1 > &a, const any_vec< N, V2 > &b)
	Computes bitwise XOR of integer or floating-point vectors. More...

void	prefetch_read (const void *ptr)
	Prefetches data to the lowest level cache for reading. More...

void	prefetch_write (const void *ptr)
	Prefetches data to the lowest level cache for writing. More...

template<class R , class T >
R	bit_cast (T t)
	Casts between unrelated types. More...

template<unsigned N, class V1 , class V2 >
mask_int8< N, mask_int8< N > >	cmp_eq (const any_int8< N, V1 > &a, const any_int8< N, V2 > &b)
	Compares 8-bit values for equality. More...

template<unsigned N, class V1 , class V2 >
mask_int16< N, mask_int16< N > >	cmp_eq (const any_int16< N, V1 > &a, const any_int16< N, V2 > &b)
	Compares 16-bit values for equality. More...

template<unsigned N, class V1 , class V2 >
mask_int32< N, mask_int32< N > >	cmp_eq (const any_int32< N, V1 > &a, const any_int32< N, V2 > &b)
	Compares the values of two int32x4 vectors for equality. More...

template<unsigned N, class V1 , class V2 >
mask_int64< N, mask_int64< N > >	cmp_eq (const any_int64< N, V1 > &a, const any_int64< N, V2 > &b)
	Compares the values of two int64x2 vectors for equality. More...

template<unsigned N, class V1 , class V2 >
mask_float32< N, mask_float32 < N > >	cmp_eq (const any_float32< N, V1 > &a, const any_float32< N, V2 > &b)
	Compares the values of two float32x4 vectors for equality. More...

template<unsigned N, class V1 , class V2 >
mask_float64< N, mask_float64 < N > >	cmp_eq (const any_float64< N, V1 > &a, const any_float64< N, V2 > &b)
	Compares the values of two float64x2 vectors for equality. More...

template<unsigned N, class E1 , class E2 >
mask_float64< N, mask_float64 < N > >	cmp_ge (float64< N, E1 > a, float64< N, E2 > b)
	Compares the values of two float64x2 vectors for greater-than. More...

template<unsigned N, class E1 , class E2 >
mask_int8< N, mask_int8< N > >	cmp_gt (int8< N, E1 > a, int8< N, E2 > b)
	Compares the values of two signed int8x16 vectors for greater-than. More...

template<unsigned N, class E1 , class E2 >
mask_int8< N, mask_int8< N > >	cmp_gt (uint8< N, E1 > a, uint8< N, E2 > b)
	Compares the values of two unsigned int8x16 vectors for greater-than. More...

template<unsigned N, class E1 , class E2 >
mask_int16< N, mask_int16< N > >	cmp_gt (int16< N, E1 > a, int16< N, E2 > b)
	Compares the values of two signed int16x8 vectors for greater-than. More...

template<unsigned N, class E1 , class E2 >
mask_int16< N, mask_int16< N > >	cmp_gt (uint16< N, E1 > a, uint16< N, E2 > b)
	Compares the values of two unsigned int16x8 vectors for greater-than. More...

template<unsigned N, class V1 , class V2 >
mask_int8< N, mask_int8< N > >	cmp_neq (const any_int8< N, V1 > &a, const any_int8< N, V2 > &b)
	Compares the values of two int8x16 vectors for inequality. More...

template<unsigned N, class V1 , class V2 >
mask_int16< N, mask_int16< N > >	cmp_neq (const any_int16< N, V1 > &a, const any_int16< N, V2 > &b)
	Compares the values of two int16x8 vectors for inequality. More...

template<unsigned N, class V1 , class V2 >
mask_int32< N, mask_int32< N > >	cmp_neq (const any_int32< N, V1 > &a, const any_int32< N, V2 > &b)
	Compares the values of two int32x4 vectors for inequality. More...

template<unsigned N, class V1 , class V2 >
mask_int64< N, mask_int64< N > >	cmp_neq (const any_int64< N, V1 > &a, const any_int64< N, V2 > &b)
	Compares the values of two int64x2 vectors for inequality. More...

template<unsigned N, class V1 , class V2 >
mask_float32< N, mask_float32 < N > >	cmp_neq (const any_float32< N, V1 > &a, const any_float32< N, V2 > &b)
	Compares the values of two float32x4 vectors for inequality. More...

template<unsigned N, class V1 , class V2 >
mask_float64< N, mask_float64 < N > >	cmp_neq (const any_float64< N, V1 > &a, const any_float64< N, V2 > &b)
	Compares the values of two float64x2 vectors for inequality. More...

template<unsigned id>
float	extract (float32x4 a)
	Extracts an element from float32x4 vector. More...

template<unsigned id>
double	extract (float64x2 a)
	Extracts an element from float64x2 vector. More...

uint16_t	extract_bits_any (uint8x16 a)
	Extracts a bit from each byte of each element of an int8x16 vector. More...

template<unsigned id>
uint16_t	extract_bits (uint8x16 a)
	Extracts a specific bit from each byte of each element of an int8x16 vector. More...

template<unsigned N, class E >
float32< N, expr_abs< float32 < N, E > > >	abs (float32< N, E > a)
	Computes absolute value of floating point values. More...

template<unsigned N, class E >
float64< N, expr_abs< float64 < N, E > > >	abs (float64< N, E > a)
	Computes absolute value of floating point values. More...

template<unsigned N, class E1 , class E2 >
float32< N, expr_add< float32 < N, E1 >, float32< N, E2 > > >	add (float32< N, E1 > a, float32< N, E2 > b)
	Adds the values of two vectors. More...

template<unsigned N, class E1 , class E2 >
float64< N, expr_add< float64 < N, E1 >, float64< N, E2 > > >	add (float64< N, E1 > a, float64< N, E2 > b)
	Adds the values of two vectors. More...

template<unsigned N, class E >
float32< N, float32< N > >	ceil (float32< N, E > a)
	Rounds the values of a vector towards positive infinity. More...

template<unsigned N, class E1 , class E2 >
float32< N, float32< N > >	div (float32< N, E1 > a, float32< N, E2 > b)
	Divides the values of two vectors. More...

template<unsigned N, class E1 , class E2 >
float64< N, float64< N > >	div (float64< N, E1 > a, float64< N, E2 > b)
	Divides the values of two vectors. More...

template<unsigned N, class E >
float32< N, float32< N > >	floor (float32< N, E > a)
	Rounds the values of a vector towards negative infinity. More...

template<unsigned N, class E >
mask_float32< N, mask_float32 < N > >	isnan (float32< N, E > a)
	Checks whether elements in a are IEEE754 NaN. More...

template<unsigned N, class E >
mask_float64< N, mask_float64 < N > >	isnan (float64< N, E > a)
	Checks whether elements in a are IEEE754 NaN. More...

template<unsigned N, class E1 , class E2 >
mask_float32< N, mask_float32 < N > >	isnan2 (float32< N, E1 > a, float32< N, E2 > b)
	Checks whether corresponding elements in either a or b are IEEE754 NaN. More...

template<unsigned N, class E1 , class E2 >
mask_float64< N, mask_float64 < N > >	isnan2 (float64< N, E1 > a, float64< N, E2 > b)
	Checks whether corresponding elements in either a or b are IEEE754 NaN. More...

template<unsigned N, class E1 , class E2 >
float32< N, float32< N > >	max (float32< N, E1 > a, float32< N, E2 > b)
	Computes maxima of the values of two vectors. More...

template<unsigned N, class E1 , class E2 >
float32< N, float32< N > >	min (float32< N, E1 > a, float32< N, E2 > b)
	Computes minimum of the values in two vectors. More...

template<unsigned N, class E1 , class E2 >
float64< N, float64< N > >	min (float64< N, E1 > a, float64< N, E2 > b)
	Computes minima of the values in two vectors. More...

template<unsigned N, class E1 , class E2 >
float32< N, expr_mul< float32 < N, E1 >, float32< N, E2 > > >	mul (float32< N, E1 > a, float32< N, E2 > b)
	Multiplies the values of two vectors. More...

template<unsigned N, class E1 , class E2 >
float64< N, expr_mul< float64 < N, E1 >, float64< N, E2 > > >	mul (float64< N, E1 > a, float64< N, E2 > b)
	Multiplies the values of two vectors. More...

template<unsigned N, class E >
float32< N, expr_neg< float32 < N, E > > >	neg (float32< N, E > a)
	Negates the values of a float32x4 vector. More...

template<unsigned N, class E >
float64< N, expr_neg< float64 < N, E > > >	neg (float64< N, E > a)
	Negates the values of a vector. More...

template<unsigned N, class E >
float32< N, float32< N > >	rcp_e (float32< N, E > a)
	Computes approximate reciprocal. More...

template<unsigned N, class E >
float32< N, float32< N > >	rcp_rh (float32< N, E > a)
	Computes one Newton-Raphson iteration for reciprocal. More...

template<unsigned N, class E >
float32< N, float32< N > >	rsqrt_e (float32< N, E > a)
	Computes approximate reciprocal square root. More...

template<unsigned N, class E >
float32< N, float32< N > >	rsqrt_rh (float32< N, E > a)
	Computes one Newton-Raphson iteration for inverse of square root. More...

template<unsigned N, class E >
float32< N, float32< N > >	sign (float32< N, E > a)
	Extracts sign bits from the values in float32x4 vector. More...

template<unsigned N, class E >
float64< N, float64< N > >	sign (float64< N, E > a)
	Extracts sign bits from the values in float64x2 vector. More...

template<unsigned N, class E1 >
float32< N, float32< N > >	sqrt (float32< N, E1 > a)
	Computes square root. More...

template<unsigned N, class E1 >
float64< N, float64< N > >	sqrt (float64< N, E1 > a)
	Computes square root. More...

template<unsigned N, class E1 , class E2 >
float32< N, expr_sub< float32 < N, E1 >, float32< N, E2 > > >	sub (float32< N, E1 > a, float32< N, E2 > b)
	Subtracts the values of two vectors. More...

template<unsigned N, class E1 , class E2 >
float64< N, expr_sub< float64 < N, E1 >, float64< N, E2 > > >	sub (float64< N, E1 > a, float64< N, E2 > b)
	Subtracts the values of two vectors. More...

template<unsigned N, class E >
float32< N, float32< N > >	trunc (float32< N, E > a)
	Rounds the values of a vector towards zero. More...

template<unsigned N, class E >
uint8< N, expr_abs< int8< N, E > > >	abs (int8< N, E > a)
	Computes absolute value of 8-bit integer values. More...

template<unsigned N, class E >
uint16< N, expr_abs< int16< N, E > > >	abs (int16< N, E > a)
	Computes absolute value of 16-bit integer values. More...

template<unsigned N, class E >
uint32< N, expr_abs< int32< N, E > > >	abs (int32< N, E > a)
	Computes absolute value of 32-bit integer values. More...

template<unsigned N, class E >
uint64< N, expr_abs< int64< N, E > > >	abs (int64< N, E > a)
	Computes absolute value of 64-bit integer values. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, expr_add< uint8< N, typename V1::expr_type > , uint8< N, typename V2::expr_type > > >::type	add (const any_int8< N, V1 > &a, const any_int8< N, V2 > &b)
	Adds 8-bit integer values. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, expr_add< uint16< N, typename V1::expr_type > , uint16< N, typename V2::expr_type > > >::type	add (const any_int16< N, V1 > &a, const any_int16< N, V2 > &b)
	Adds 16-bit integer values. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, expr_add< uint32< N, typename V1::expr_type > , uint32< N, typename V2::expr_type > > >::type	add (const any_int32< N, V1 > &a, const any_int32< N, V2 > &b)
	Adds 32-bit integer values. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, expr_add< uint64< N, typename V1::expr_type > , uint64< N, typename V2::expr_type > > >::type	add (const any_int64< N, V1 > &a, const any_int64< N, V2 > &b)
	Adds 64-bit integer values. More...

template<unsigned N, class E1 , class E2 >
int8< N, expr_add_sat< int8< N, E1 >, int8< N, E2 > > >	add_sat (int8< N, E1 > a, int8< N, E2 > b)
	Adds and saturates signed 8-bit integer values. More...

template<unsigned N, class E1 , class E2 >
int16< N, expr_add_sat< int16 < N, E1 >, int16< N, E2 > > >	add_sat (int16< N, E1 > a, int16< N, E2 > b)
	Adds and saturates signed 16-bit integer values. More...

template<unsigned N, class E1 , class E2 >
uint8< N, expr_add_sat< uint8 < N, E1 >, uint8< N, E2 > > >	add_sat (uint8< N, E1 > a, uint8< N, E2 > b)
	Adds and saturates unsigned 8-bit integer values. More...

template<unsigned N, class E1 , class E2 >
uint16< N, expr_add_sat < uint16< N, E1 >, uint16< N, E2 > > >	add_sat (uint16< N, E1 > a, uint16< N, E2 > b)
	Adds and saturates unsigned 16-bit integer values. More...

template<unsigned N, class E1 , class E2 >
uint8< N, uint8< N > >	avg (uint8< N, E1 > a, uint8< N, E2 > b)
	Computes rounded average of the unsigned 8-bit values. More...

template<unsigned N, class E1 , class E2 >
int8< N, int8< N > >	avg (int8< N, E1 > a, int8< N, E2 > b)
	Computes rounded average of signed 8-bit values. More...

template<unsigned N, class E1 , class E2 >
uint16< N, uint16< N > >	avg (uint16< N, E1 > a, uint16< N, E2 > b)
	Computes rounded average of unsigned 16-bit values. More...

template<unsigned N, class E1 , class E2 >
int16< N, int16< N > >	avg (int16< N, E1 > a, int16< N, E2 > b)
	Computes rounded average of signed 16-bit values. More...

template<unsigned N, class E1 , class E2 >
uint32< N, uint32< N > >	avg (uint32< N, E1 > a, uint32< N, E2 > b)
	Computes rounded average of unsigned 32-bit values. More...

template<unsigned N, class E1 , class E2 >
int32< N, int32< N > >	avg (int32< N, E1 > a, int32< N, E2 > b)
	Computes rounded average of signed 32-bit values. More...

template<unsigned N, class E1 , class E2 >
uint8< N, uint8< N > >	avg_trunc (uint8< N, E1 > a, uint8< N, E2 > b)
	Computes truncated average of the unsigned 8-bit values. More...

template<unsigned N, class E1 , class E2 >
int8< N, int8< N > >	avg_trunc (int8< N, E1 > a, int8< N, E2 > b)
	Computes truncated average of signed 8-bit values. More...

template<unsigned N, class E1 , class E2 >
uint16< N, uint16< N > >	avg_trunc (uint16< N, E1 > a, uint16< N, E2 > b)
	Computes truncated average of unsigned 16-bit values. More...

template<unsigned N, class E1 , class E2 >
int16< N, int16< N > >	avg_trunc (int16< N, E1 > a, int16< N, E2 > b)
	Computes truncated average of signed 16-bit values. More...

template<unsigned N, class E1 , class E2 >
uint32< N, uint32< N > >	avg_trunc (uint32< N, E1 > a, uint32< N, E2 > b)
	Computes truncated average of unsigned 32-bit values. More...

template<unsigned N, class E1 , class E2 >
int32< N, int32< N > >	avg_trunc (int32< N, E1 > a, int32< N, E2 > b)
	Computes truncated average of signed 32-bit values. More...

template<unsigned N, class E1 , class E2 >
int8< N, int8< N > >	max (int8< N, E1 > a, int8< N, E2 > b)
	Computes maximum of the signed 8-bit values. More...

template<unsigned N, class E1 , class E2 >
uint8< N, uint8< N > >	max (uint8< N, E1 > a, uint8< N, E2 > b)
	Computes maximum of the unsigned 8-bit values. More...

template<unsigned N, class E1 , class E2 >
int16< N, int16< N > >	max (int16< N, E1 > a, int16< N, E2 > b)
	Computes maximum of the signed 16-bit values. More...

template<unsigned N, class E1 , class E2 >
uint16< N, uint16< N > >	max (uint16< N, E1 > a, uint16< N, E2 > b)
	Computes maximum of the unsigned 16-bit values. More...

template<unsigned N, class E1 , class E2 >
int32< N, int32< N > >	max (int32< N, E1 > a, int32< N, E2 > b)
	Computes maximum of the signed 32-bit values. More...

template<unsigned N, class E1 , class E2 >
uint32< N, uint32< N > >	max (uint32< N, E1 > a, uint32< N, E2 > b)
	Computes maximum of the unsigned 32-bit values. More...

template<unsigned N, class E1 , class E2 >
int8< N, int8< N > >	min (int8< N, E1 > a, int8< N, E2 > b)
	Computes minimum of signed 8-bit values. More...

template<unsigned N, class E1 , class E2 >
uint8< N, uint8< N > >	min (uint8< N, E1 > a, uint8< N, E2 > b)
	Computes minimum of the unsigned 8-bit values. More...

template<unsigned N, class E1 , class E2 >
int16< N, int16< N > >	min (int16< N, E1 > a, int16< N, E2 > b)
	Computes minimum of the signed 16-bit values. More...

template<unsigned N, class E1 , class E2 >
uint16< N, uint16< N > >	min (uint16< N, E1 > a, uint16< N, E2 > b)
	Computes minimum of the unsigned 16-bit values. More...

template<unsigned N, class E1 , class E2 >
int32< N, int32< N > >	min (int32< N, E1 > a, int32< N, E2 > b)
	Computes minimum of the signed 32-bit values. More...

template<unsigned N, class E1 , class E2 >
uint32< N, uint32< N > >	min (uint32< N, E1 > a, uint32< N, E2 > b)
	Computes minimum of the unsigned 32-bit values. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, expr_mul_lo< uint16< N, typename V1::expr_type > , uint16< N, typename V2::expr_type > > >::type	mul_lo (const any_int16< N, V1 > &a, const any_int16< N, V2 > &b)
	Multiplies 16-bit values and returns the lower part of the multiplication. More...

template<unsigned N, class E1 , class E2 >
int16< N, expr_mul_hi< int16 < N, E1 >, int16< N, E2 > > >	mul_hi (int16< N, E1 > a, int16< N, E2 > b)
	Multiplies signed 16-bit values and returns the higher half of the result. More...

template<unsigned N, class E1 , class E2 >
uint16< N, expr_mul_hi< uint16 < N, E1 >, uint16< N, E2 > > >	mul_hi (uint16< N, E1 > a, uint16< N, E2 > b)
	Multiplies unsigned 16-bit values and returns the higher half of the result. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, expr_mul_lo< uint32< N, typename V1::expr_type > , uint32< N, typename V2::expr_type > > >::type	mul_lo (const any_int32< N, V1 > &a, const any_int32< N, V2 > &b)
	Multiplies 32-bit values and returns the lower half of the result. More...

template<unsigned N, class E >
int8< N, expr_neg< int8< N, E > > >	neg (int8< N, E > a)
	Negates signed 8-bit values. More...

template<unsigned N, class E >
int16< N, expr_neg< int16< N, E > > >	neg (int16< N, E > a)
	Negates signed 16-bit values. More...

template<unsigned N, class E >
int32< N, expr_neg< int32< N, E > > >	neg (int32< N, E > a)
	Negates signed 32-bit values. More...

template<unsigned N, class E >
int64< N, expr_neg< int64< N, E > > >	neg (int64< N, E > a)
	Negates signed 64-bit values. More...

template<unsigned N, class E >
int8< N, int8< N > >	shift_r (int8< N, E > a, unsigned count)
	Shifts signed 8-bit values right by count bits while shifting in the sign bit. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, expr_sub< uint8< N, typename V1::expr_type > , uint8< N, typename V2::expr_type > > >::type	sub (const any_int8< N, V1 > &a, const any_int8< N, V2 > &b)
	Subtracts 8-bit integer values. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, expr_sub< uint16< N, typename V1::expr_type > , uint16< N, typename V2::expr_type > > >::type	sub (const any_int16< N, V1 > &a, const any_int16< N, V2 > &b)
	Subtracts 16-bit integer values. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, expr_sub< uint32< N, typename V1::expr_type > , uint32< N, typename V2::expr_type > > >::type	sub (const any_int32< N, V1 > &a, const any_int32< N, V2 > &b)
	Subtracts 32-bit integer values. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, expr_sub< uint64< N, typename V1::expr_type > , uint64< N, typename V2::expr_type > > >::type	sub (const any_int64< N, V1 > &a, const any_int64< N, V2 > &b)
	Subtracts 64-bit integer values. More...

template<unsigned N, class E1 , class E2 >
int8< N, expr_sub_sat< int8< N, E1 >, int8< N, E2 > > >	sub_sat (int8< N, E1 > a, int8< N, E2 > b)
	Subtracts and saturates signed 8-bit integer values. More...

template<unsigned N, class E1 , class E2 >
int16< N, expr_sub_sat< int16 < N, E1 >, int16< N, E2 > > >	sub_sat (int16< N, E1 > a, int16< N, E2 > b)
	Subtracts and saturates signed 16-bit integer values. More...

template<unsigned N, class E1 , class E2 >
uint8< N, expr_sub_sat< uint8 < N, E1 >, uint8< N, E2 > > >	sub_sat (uint8< N, E1 > a, uint8< N, E2 > b)
	Subtracts and saturates unsigned 8-bit integer values. More...

template<unsigned N, class E1 , class E2 >
uint16< N, expr_sub_sat < uint16< N, E1 >, uint16< N, E2 > > >	sub_sat (uint16< N, E1 > a, uint16< N, E2 > b)
	Subtracts and saturates unsigned 16-bit integer values. More...

template<unsigned id>
uint8x16	insert (uint8x16 a, uint8_t x)
	Inserts an element into int8x16 vector at the position identified by id. More...

template<unsigned id>
uint16x8	insert (uint16x8 a, uint16_t x)
	Inserts an element into int16x8 vector at the position identified by id. More...

template<unsigned id>
uint32x4	insert (uint32x4 a, uint32_t x)
	Inserts an element into int32x4 vector at the position identified by id. More...

template<unsigned id>
uint64x2	insert (uint64x2 a, uint64_t x)
	Inserts an element into int64x2 vector at the position identified by id. More...

template<unsigned id>
float32x4	insert (float32x4 a, float x)
	Inserts an element into float32x4 vector at the position identified by id. More...

template<unsigned id>
float64x2	insert (float64x2 a, double x)
	Inserts an element into float64x2 vector at the position identified by id. More...

template<class V = expr_vec_load>
V	load (const void *p)
	Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an aligned memory location. More...

template<unsigned N, class V >
void	load_packed2 (any_vec< N, V > &a, any_vec< N, V > &b, const void *p)
	Loads values packed in pairs, de-interleaves them and stores the result into two vectors. More...

template<unsigned N, class V >
void	load_packed3 (any_vec< N, V > &a, any_vec< N, V > &b, any_vec< N, V > &c, const void *p)
	Loads values packed in triplets, de-interleaves them and stores the result into three vectors. More...

template<unsigned N, class V >
void	load_packed4 (any_vec< N, V > &a, any_vec< N, V > &b, any_vec< N, V > &c, any_vec< N, V > &d, const void *p)
	Loads values packed in quartets, de-interleaves them and stores the result into four vectors. More...

template<class V = expr_vec_load_splat>
V	load_splat (const void *p)
	Loads a value from a memory location and broadcasts it to all elements of a vector. More...

template<class V = expr_vec_load_u>
V	load_u (const void *p)
	Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...

template<unsigned s0, unsigned s1, unsigned N, class V >
detail::get_expr_nomask< V, void >::empty	permute2 (const any_vec16< N, V > &a)
	Permutes the 16-bit values within sets of two consecutive elements of the vector. More...

template<unsigned s0, unsigned s1, unsigned N, class V >
detail::get_expr_nomask< V, void >::empty	permute2 (const any_vec32< N, V > &a)
	Permutes the 32-bit values within sets of two consecutive elements of the vector. More...

template<unsigned s0, unsigned s1, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, void >::empty	shuffle1 (const any_vec64< N, V1 > &a, const any_vec64< N, V2 > &b)
	Selects 64-bit values from two vectors. More...

template<unsigned N, class V >
void	store (void *p, const any_vec< N, V > &a)
	Stores a 128-bit or 256-bit integer vector to an aligned memory location. More...

template<unsigned N, class V >
void	store_first (void *p, const any_vec< N, V > &a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

template<unsigned N, class V >
void	store_last (void *p, const any_vec< N, V > &a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

template<unsigned N, class V1 , class V2 >
void	store_packed2 (void *p, const any_vec< N, V1 > &a, const any_vec< N, V2 > &b)
	Interleaves values from two vectors and stores the result into successive locations starting from p. More...

template<unsigned N, class V1 , class V2 , class V3 >
void	store_packed3 (void *p, const any_vec< N, V1 > &a, const any_vec< N, V2 > &b, const any_vec< N, V3 > &c)
	Interleaves values from three vectors and stores the result into successive locations starting from p. More...

template<unsigned N, class V1 , class V2 , class V3 , class V4 >
void	store_packed4 (void *p, const any_vec< N, V1 > &a, const any_vec< N, V2 > &b, const any_vec< N, V3 > &c, const any_vec< N, V4 > &d)
	Interleaves values from four vectors and stores the result into successive locations starting from p. More...

template<unsigned N, class V >
void	stream (void *p, const any_vec< N, V > &a)
	Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More...

uint16x16	to_int16 (int8x16 a)
	Sign extends the 16 values of a signed int8x16 vector to 16-bits. More...

uint16x16	to_int16 (uint8x16 a)
	Extends the 16 values of an unsigned int8x16 vector to 16 bits. More...

int32x8	to_int32 (int16x8 a)
	Sign extends the 8 values of a signed int16x8 vector to 32 bits. More...

template<unsigned N, class E1 , class E2 >
uint8< N, uint8< N > >	unzip16_lo (uint8< N, E1 > a, uint8< N, E2 > b)
	De-interleaves the odd (lower) elements of two int8x16 vectors. More...

template<unsigned N, class E1 , class E2 >
uint16< N, uint16< N > >	unzip8_lo (uint16< N, E1 > a, uint16< N, E2 > b)
	De-interleaves the odd (lower) elements of two int16x8 vectors. More...

template<unsigned N, class E1 , class E2 >
uint32< N, uint32< N > >	unzip4_lo (uint32< N, E1 > a, uint32< N, E2 > b)
	De-interleaves the odd (lower) elements of two int32x4 vectors. More...

template<unsigned N, class E1 , class E2 >
uint64< N, uint64< N > >	unzip2_lo (uint64< N, E1 > a, uint64< N, E2 > b)
	De-interleaves the odd (lower) elements of two int64x2 vectors. More...

template<unsigned N, class E1 , class E2 >
float32< N, float32< N > >	unzip4_lo (float32< N, E1 > a, float32< N, E2 > b)
	De-interleaves the odd (lower) elements of two float32x4 vectors. More...

template<unsigned N, class E1 , class E2 >
float64< N, float64< N > >	unzip2_lo (float64< N, E1 > a, float64< N, E2 > b)
	De-interleaves the odd (lower) elements of two float64x2 vectors. More...

Arch	get_arch_gcc_builtin_cpu_supports ()
	Retrieves supported architecture using GCC __builtin_cpu_supports function. More...

Arch	get_arch_linux_cpuinfo ()
	Retrieves supported architecture from Linux /proc/cpuinfo file. More...

Arch	this_compile_arch ()
	Returns the instruction set flags that will be required by the currently compiled code. More...


void	transpose2 (uint32x4 &a0, uint32x4 &a1)
	Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...

void	transpose2 (int32x4 &a0, int32x4 &a1)
	Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...

void	transpose2 (uint32x8 &a0, uint32x8 &a1)
	Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...

void	transpose2 (int32x8 &a0, int32x8 &a1)
	Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...


void	transpose2 (uint64x2 &a0, uint64x2 &a1)
	Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...

void	transpose2 (int64x2 &a0, int64x2 &a1)
	Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...

void	transpose2 (uint64x4 &a0, uint64x4 &a1)
	Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...

void	transpose2 (int64x4 &a0, int64x4 &a1)
	Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...


void	transpose2 (float32x4 &a0, float32x4 &a1)
	Transposes two 2x2 32-bit matrices within two float32x4 vectors. More...

void	transpose2 (float32x8 &a0, float32x8 &a1)
	Transposes two 2x2 32-bit matrices within two float32x4 vectors. More...


void	transpose2 (float64x2 &a0, float64x2 &a1)
	Transposes a 2x2 64-bit matrix within two float64x2 vectors. More...

void	transpose2 (float64x4 &a0, float64x4 &a1)
	Transposes a 2x2 64-bit matrix within two float64x2 vectors. More...


void	transpose4 (uint32x4 &a0, uint32x4 &a1, uint32x4 &a2, uint32x4 &a3)
	Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...

void	transpose4 (int32x4 &a0, int32x4 &a1, int32x4 &a2, int32x4 &a3)
	Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...

void	transpose4 (int32x8 &a0, int32x8 &a1, int32x8 &a2, int32x8 &a3)
	Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...


void	transpose4 (uint8x16 &a0, uint8x16 &a1, uint8x16 &a2, uint8x16 &a3)
	Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...

void	transpose4 (int8x16 &a0, int8x16 &a1, int8x16 &a2, int8x16 &a3)
	Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...

void	transpose4 (uint32x8 &a0, uint32x8 &a1, uint32x8 &a2, uint32x8 &a3)
	Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...

void	transpose4 (uint8x32 &a0, uint8x32 &a1, uint8x32 &a2, uint8x32 &a3)
	Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...

void	transpose4 (int8x32 &a0, int8x32 &a1, int8x32 &a2, int8x32 &a3)
	Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...


void	transpose4 (uint16x8 &a0, uint16x8 &a1, uint16x8 &a2, uint16x8 &a3)
	Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...

void	transpose4 (int16x8 &a0, int16x8 &a1, int16x8 &a2, int16x8 &a3)
	Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...

void	transpose4 (uint16x16 &a0, uint16x16 &a1, uint16x16 &a2, uint16x16 &a3)
	Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...

void	transpose4 (int16x16 &a0, int16x16 &a1, int16x16 &a2, int16x16 &a3)
	Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...


void	transpose4 (float32x4 &a0, float32x4 &a1, float32x4 &a2, float32x4 &a3)
	Transposes a 4x4 32-bit matrix within four float32x4 vectors. More...

void	transpose4 (float32x8 &a0, float32x8 &a1, float32x8 &a2, float32x8 &a3)
	Transposes a 4x4 32-bit matrix within four float32x4 vectors. More...


template<unsigned N, class V1 , class V2 >
detail::get_expr_bitwise2_and < expr_bit_and, V1, V2 >::type	bit_and (const any_vec< N, V1 > &a, const any_vec< N, V2 > &b)
	Computes bitwise AND of integer or floating-point vectors. More...


template<unsigned N, class V1 , class V2 >
detail::get_expr_bitwise2_and < expr_bit_andnot, V1, V2 > ::type	bit_andnot (const any_vec< N, V1 > &a, const any_vec< N, V2 > &b)
	Computes bitwise AND NOT of two integer or floating-point vectors. More...


template<unsigned N, class V >
detail::get_expr< V, expr_bit_not< V > >::empty	bit_not (const any_vec< N, V > &a)
	Computes bitwise NOT of an integer or floating-point vector. More...


template<unsigned N, class V1 , class V2 >
detail::get_expr_bit_or< V1, V2 >::type	bit_or (const any_vec< N, V1 > &a, const any_vec< N, V2 > &b)
	Computes bitwise OR of integer vectors. More...


template<unsigned N, class V1 , class V2 , class V3 >
detail::get_expr_blend< V1, V2, V3 >::type	blend (const any_vec< N, V1 > &on, const any_vec< N, V2 > &off, const any_vec< N, V3 > &mask)
	Composes a vector from two sources according to a mask. More...


template<unsigned N, class E1 , class E2 >
mask_float32< N, mask_float32 < N > >	cmp_ge (float32< N, E1 > a, float32< N, E2 > b)
	Compares the values of two float32x4 vectors for greater-than or equal. More...


template<unsigned N, class E1 , class E2 >
mask_int32< N, mask_int32< N > >	cmp_gt (int32< N, E1 > a, int32< N, E2 > b)
	Compares the values of two signed int32x4 vectors for greater-than. More...

template<unsigned N, class E1 , class E2 >
mask_int32< N, mask_int32< N > >	cmp_gt (uint32< N, E1 > a, uint32< N, E2 > b)
	Compares the values of two unsigned int32x4 vectors for greater-than. More...

template<unsigned N, class E1 , class E2 >
mask_float32< N, mask_float32 < N > >	cmp_gt (float32< N, E1 > a, float32< N, E2 > b)
	Compares the values of two float32x4 vectors for greater-than. More...

template<unsigned N, class E1 , class E2 >
mask_float64< N, mask_float64 < N > >	cmp_gt (float64< N, E1 > a, float64< N, E2 > b)
	Compares the values of two float64x2 vectors for greater-than. More...


template<unsigned N, class E1 , class E2 >
mask_float32< N, mask_float32 < N > >	cmp_le (float32< N, E1 > a, float32< N, E2 > b)
	Compares the values of two float32x4 vectors for less-than or equal. More...


template<unsigned N, class E1 , class E2 >
mask_float64< N, mask_float64 < N > >	cmp_le (float64< N, E1 > a, float64< N, E2 > b)
	Compares the values of two float64x2 vectors for less-than or equal. More...


template<unsigned N, class E1 , class E2 >
mask_int8< N, mask_int8< N > >	cmp_lt (int8< N, E1 > a, int8< N, E2 > b)
	Compares the values of two signed int8x16 vectors for less-than. More...

template<unsigned N, class E1 , class E2 >
mask_int8< N, mask_int8< N > >	cmp_lt (uint8< N, E1 > a, uint8< N, E2 > b)
	Compares the values of two unsigned int8x16 vectors for less-than. More...

template<unsigned N, class E1 , class E2 >
mask_int16< N, mask_int16< N > >	cmp_lt (int16< N, E1 > a, int16< N, E2 > b)
	Compares the values of two signed int16x8 vectors for less-than. More...

template<unsigned N, class E1 , class E2 >
mask_int16< N, mask_int16< N > >	cmp_lt (uint16< N, E1 > a, uint16< N, E2 > b)
	Compares the values of two unsigned int16x8 vectors for less-than. More...

template<unsigned N, class E1 , class E2 >
mask_int32< N, mask_int32< N > >	cmp_lt (int32< N, E1 > a, int32< N, E2 > b)
	Compares the values of two signed int32x4 vectors for less-than. More...

template<unsigned N, class E1 , class E2 >
mask_int32< N, mask_int32< N > >	cmp_lt (uint32< N, E1 > a, uint32< N, E2 > b)
	Compares the values of two unsigned int32x4 vectors for less-than. More...

template<unsigned N, class E1 , class E2 >
mask_float32< N, mask_float32 < N > >	cmp_lt (float32< N, E1 > a, float32< N, E2 > b)
	Compares the values of two float32x4 vectors for less-than. More...

template<unsigned N, class E1 , class E2 >
mask_float64< N, mask_float64 < N > >	cmp_lt (float64< N, E1 > a, float64< N, E2 > b)
	Compares the values of two float64x2 vectors for less-than. More...


template<unsigned id>
uint8_t	extract (uint8x16 a)
	Extracts the id-th element from int8x16 vector. More...

template<unsigned id>
int8_t	extract (int8x16 a)
	Extracts the id-th element from int8x16 vector. More...


template<unsigned id>
uint16_t	extract (uint16x8 a)
	Extracts the id-th element from int16x8 vector. More...

template<unsigned id>
int16_t	extract (int16x8 a)
	Extracts the id-th element from int16x8 vector. More...


template<unsigned id>
uint32_t	extract (uint32x4 a)
	Extracts the id-th element from int32x4 vector. More...

template<unsigned id>
int32_t	extract (int32x4 a)
	Extracts the id-th element from int32x4 vector. More...


template<unsigned id>
uint64_t	extract (uint64x2 a)
	Extracts an element from int64x2 vector. More...

template<unsigned id>
int64_t	extract (int64x2 a)
	Extracts an element from int64x2 vector. More...


void	split (uint8x32 a, uint8x16 &r1, uint8x16 &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

void	split (uint16x16 a, uint16x8 &r1, uint16x8 &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

void	split (uint32x8 a, uint32x4 &r1, uint32x4 &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

void	split (uint64x4 a, uint64x2 &r1, uint64x2 &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

void	split (int8x32 a, int8x16 &r1, int8x16 &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

void	split (int16x16 a, int16x8 &r1, int16x8 &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

void	split (int32x8 a, int32x4 &r1, int32x4 &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

void	split (int64x4 a, int64x2 &r1, int64x2 &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

void	split (float32x8 a, float32x4 &r1, float32x4 &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

void	split (float64x4 a, float64x2 &r1, float64x2 &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

template<unsigned N>
void	split (uint8< N > a, uint8< N/2 > &r1, uint8< N/2 > &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

template<unsigned N>
void	split (uint16< N > a, uint16< N/2 > &r1, uint16< N/2 > &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

template<unsigned N>
void	split (uint32< N > a, uint32< N/2 > &r1, uint32< N/2 > &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

template<unsigned N>
void	split (uint64< N > a, uint64< N/2 > &r1, uint64< N/2 > &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

template<unsigned N>
void	split (int8< N > a, int8< N/2 > &r1, int8< N/2 > &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

template<unsigned N>
void	split (int16< N > a, int16< N/2 > &r1, int16< N/2 > &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

template<unsigned N>
void	split (int32< N > a, int32< N/2 > &r1, int32< N/2 > &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

template<unsigned N>
void	split (int64< N > a, int64< N/2 > &r1, int64< N/2 > &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

template<unsigned N>
void	split (float32< N > a, float32< N/2 > &r1, float32< N/2 > &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...

template<unsigned N>
void	split (float64< N > a, float64< N/2 > &r1, float64< N/2 > &r2)
	Splits a 256-bit vector into two 128-bit vectors. More...


template<unsigned N, class E1 , class E2 , class E3 >
float32< N, expr_fmadd < float32< N, E1 >, float32< N, E2 >, float32< N, E3 > > >	fmadd (float32< N, E1 > a, float32< N, E2 > b, float32< N, E3 > c)
	Performs a fused multiply-add operation. More...

template<unsigned N, class E1 , class E2 , class E3 >
float64< N, expr_fmadd < float64< N, E1 >, float64< N, E2 >, float64< N, E3 > > >	fmadd (float64< N, E1 > a, float64< N, E2 > b, float64< N, E3 > c)
	Performs a fused multiply-add operation. More...


template<unsigned N, class E1 , class E2 , class E3 >
float32< N, expr_fmsub < float32< N, E1 >, float32< N, E2 >, float32< N, E3 > > >	fmsub (float32< N, E1 > a, float32< N, E2 > b, float32< N, E3 > c)
	Performs a fused multiply-subtract operation. More...

template<unsigned N, class E1 , class E2 , class E3 >
float64< N, expr_fmsub < float64< N, E1 >, float64< N, E2 >, float64< N, E3 > > >	fmsub (float64< N, E1 > a, float64< N, E2 > b, float64< N, E3 > c)
	Performs a fused multiply-subtract operation. More...


template<unsigned N, class E1 , class E2 >
float64< N, float64< N > >	max (float64< N, E1 > a, float64< N, E2 > b)
	Computes maxima of the values of two vectors. More...


template<unsigned P>
uint8x16	div_p (uint8x16 num, uint8x16 den)
	Divides one 8-bit unsigned number by another. More...

template<unsigned P>
uint16x8	div_p (uint16x8 num, uint16x8 den)
	Divides one 16-bit unsigned number by another. More...


template<unsigned N, class E1 , class E2 >
int32< N, expr_mull< int16< N, E1 >, int16< N, E2 > > >	mull (int16< N, E1 > a, int16< N, E2 > b)
	Multiplies signed 16-bit values and expands the results to 32 bits. More...

template<unsigned N, class E1 , class E2 >
uint32< N, expr_mull< uint16 < N, E1 >, uint16< N, E2 > > >	mull (uint16< N, E1 > a, uint16< N, E2 > b)
	Multiplies unsigned 16-bit values and expands the results to 32 bits. More...

template<unsigned N, class E1 , class E2 >
int64< N, expr_mull< int32< N, E1 >, int32< N, E2 > > >	mull (int32< N, E1 > a, int32< N, E2 > b)
	Multiplies signed 32-bit values and expands the results to 64 bits. More...

template<unsigned N, class E1 , class E2 >
uint64< N, expr_mull< uint32 < N, E1 >, uint32< N, E2 > > >	mull (uint32< N, E1 > a, uint32< N, E2 > b)
	Multiplies unsigned 32-bit values in the lower halves of the vectors and expands the results to 64 bits. More...


template<unsigned N, class E >
int8< N, int8< N > >	shift_l (int8< N, E > a, unsigned count)
	Shifts 8-bit values left by count bits while shifting in zeros. More...

template<unsigned N, class E >
uint8< N, uint8< N > >	shift_l (uint8< N, E > a, unsigned count)
	Shifts 8-bit values left by count bits while shifting in zeros. More...


template<unsigned N, class E >
int16< N, int16< N > >	shift_l (int16< N, E > a, unsigned count)
	Shifts 16-bit values left by count bits while shifting in zeros. More...

template<unsigned N, class E >
uint16< N, uint16< N > >	shift_l (uint16< N, E > a, unsigned count)
	Shifts 16-bit values left by count bits while shifting in zeros. More...


template<unsigned N, class E >
int32< N, int32< N > >	shift_l (int32< N, E > a, unsigned count)
	Shifts 32-bit values left by count bits while shifting in zeros. More...

template<unsigned N, class E >
uint32< N, uint32< N > >	shift_l (uint32< N, E > a, unsigned count)
	Shifts 32-bit values left by count bits while shifting in zeros. More...


template<unsigned N, class E >
int64< N, int64< N > >	shift_l (int64< N, E > a, unsigned count)
	Shifts 64-bit values left by count bits while shifting in zeros. More...

template<unsigned N, class E >
uint64< N, uint64< N > >	shift_l (uint64< N, E > a, unsigned count)
	Shifts 64-bit values left by count bits while shifting in zeros. More...


template<unsigned count, unsigned N, class E >
int8< N, int8< N > >	shift_l (int8< N, E > a)
	Shifts 8-bit values left by count bits while shifting in zeros. More...

template<unsigned count, unsigned N, class E >
uint8< N, uint8< N > >	shift_l (uint8< N, E > a)
	Shifts 8-bit values left by count bits while shifting in zeros. More...


template<unsigned count, unsigned N, class E >
int16< N, int16< N > >	shift_l (int16< N, E > a)
	Shifts 16-bit values left by count bits while shifting in zeros. More...

template<unsigned count, unsigned N, class E >
uint16< N, uint16< N > >	shift_l (uint16< N, E > a)
	Shifts 16-bit values left by count bits while shifting in zeros. More...


template<unsigned count, unsigned N, class E >
int32< N, int32< N > >	shift_l (int32< N, E > a)
	Shifts 32-bit values left by count bits while shifting in zeros. More...

template<unsigned count, unsigned N, class E >
uint32< N, uint32< N > >	shift_l (uint32< N, E > a)
	Shifts 32-bit values left by count bits while shifting in zeros. More...


template<unsigned count, unsigned N, class E >
int64< N, int64< N > >	shift_l (int64< N, E > a)
	Shifts 64-bit values left by count bits while shifting in zeros. More...

template<unsigned count, unsigned N, class E >
uint64< N, uint64< N > >	shift_l (uint64< N, E > a)
	Shifts 64-bit values left by count bits while shifting in zeros. More...


template<unsigned N, class E >
uint8< N, uint8< N > >	shift_r (uint8< N, E > a, unsigned count)
	Shifts unsigned 8-bit values right by count bits while shifting in zeros. More...

template<unsigned N, class E >
int16< N, int16< N > >	shift_r (int16< N, E > a, unsigned count)
	Shifts signed 16-bit values right by count bits while shifting in the sign bit. More...

template<unsigned N, class E >
uint16< N, uint16< N > >	shift_r (uint16< N, E > a, unsigned count)
	Shifts unsigned 16-bit values right by count bits while shifting in zeros. More...

template<unsigned N, class E >
int32< N, int32< N > >	shift_r (int32< N, E > a, unsigned count)
	Shifts signed 32-bit values right by count bits while shifting in the sign bit. More...

template<unsigned N, class E >
uint32< N, uint32< N > >	shift_r (uint32< N, E > a, unsigned count)
	Shifts unsigned 32-bit values right by count bits while shifting in zeros. More...

template<unsigned N, class E >
int64< N, int64< N > >	shift_r (int64< N, E > a, unsigned count)
	Shifts signed 64-bit values right by count bits while shifting in the sign bit. More...

template<unsigned N, class E >
uint64< N, uint64< N > >	shift_r (uint64< N, E > a, unsigned count)
	Shifts unsigned 64-bit values right by count bits while shifting in zeros. More...

template<unsigned count, unsigned N, class E >
int8< N, int8< N > >	shift_r (int8< N, E > a)
	Shifts signed 8-bit values right by count bits while shifting in the sign bit. More...

template<unsigned count, unsigned N, class E >
uint8< N, uint8< N > >	shift_r (uint8< N, E > a)
	Shifts unsigned 8-bit values right by count bits while shifting in zeros. More...

template<unsigned count, unsigned N, class E >
int16< N, int16< N > >	shift_r (int16< N, E > a)
	Shifts signed 16-bit values right by count bits while shifting in the sign bit. More...

template<unsigned count, unsigned N, class E >
uint16< N, uint16< N > >	shift_r (uint16< N, E > a)
	Shifts unsigned 16-bit values right by count bits while shifting in zeros. More...

template<unsigned count, unsigned N, class E >
int32< N, int32< N > >	shift_r (int32< N, E > a)
	Shifts signed 32-bit values right by count bits while shifting in the sign bit. More...

template<unsigned count, unsigned N, class E >
uint32< N, uint32< N > >	shift_r (uint32< N, E > a)
	Shifts unsigned 32-bit values right by count bits while shifting in zeros. More...

template<unsigned count, unsigned N, class E >
int64< N, int64< N > >	shift_r (int64< N, E > a)
	Shifts signed 64-bit values right by count bits while shifting in the sign bit. More...

template<unsigned count, unsigned N, class E >
uint64< N, uint64< N > >	shift_r (uint64< N, E > a)
	Shifts unsigned 64-bit values right by count bits while shifting in zeros. More...


template<class E1 , class E2 >
uint8x32	combine (uint8< 16, E1 > a, uint8< 16, E2 > b)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<class E1 , class E2 >
uint16x16	combine (uint16< 8, E1 > a, uint16< 8, E2 > b)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<class E1 , class E2 >
uint32x8	combine (uint32< 4, E1 > a, uint32< 4, E2 > b)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<class E1 , class E2 >
uint64x4	combine (uint64< 2, E1 > a, uint64< 2, E2 > b)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<class E1 , class E2 >
int16x16	combine (int16< 8, E1 > a, int16< 8, E2 > b)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<class E1 , class E2 >
int32x8	combine (int32< 4, E1 > a, int32< 4, E2 > b)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<class E1 , class E2 >
int64x4	combine (int64< 2, E1 > a, int64< 2, E2 > b)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<class E1 , class E2 >
float32x8	combine (float32< 4, E1 > a, float32< 4, E2 > b)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<class E1 , class E2 >
float64x4	combine (float64< 2, E1 > a, float64< 2, E2 > b)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<unsigned N, class E1 , class E2 >
uint8< N *2 >	combine (uint8< N, E1 > a1, uint8< N, E2 > a2)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<unsigned N, class E1 , class E2 >
uint16< N *2 >	combine (uint16< N, E1 > a1, uint16< N, E2 > a2)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<unsigned N, class E1 , class E2 >
uint32< N *2 >	combine (uint32< N, E1 > a1, uint32< N, E2 > a2)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<unsigned N, class E1 , class E2 >
uint64< N *2 >	combine (uint64< N, E1 > a1, uint64< N, E2 > a2)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<unsigned N, class E1 , class E2 >
int8< N *2 >	combine (int8< N, E1 > a1, int8< N, E2 > a2)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<unsigned N, class E1 , class E2 >
int16< N *2 >	combine (int16< N, E1 > a1, int16< N, E2 > a2)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<unsigned N, class E1 , class E2 >
int32< N *2 >	combine (int32< N, E1 > a1, int32< N, E2 > a2)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<unsigned N, class E1 , class E2 >
int64< N *2 >	combine (int64< N, E1 > a1, int64< N, E2 > a2)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<unsigned N, class E1 , class E2 >
float32< N *2 >	combine (float32< N, E1 > a1, float32< N, E2 > a2)
	Combines two 128-bit vectors into a 256-bit vector. More...

template<unsigned N, class E1 , class E2 >
float64< N *2 >	combine (float64< N, E1 > a1, float64< N, E2 > a2)
	Combines two 128-bit vectors into a 256-bit vector. More...


template<class V = expr_vec_make_const<double,1>>
V	make_float (double v0)
	Creates a vector from floating-point values known at compile-time. More...

template<class V = expr_vec_make_const<double,2>>
V	make_float (double v0, double v1)
	Creates a vector from floating-point values known at compile-time. More...

template<class V = expr_vec_make_const<double,4>>
V	make_float (double v0, double v1, double v2, double v3)
	Creates a vector from floating-point values known at compile-time. More...

template<class V = expr_vec_make_const<double,8>>
V	make_float (double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7)
	Creates a vector from floating-point values known at compile-time. More...


template<class V = expr_vec_make_const<int64_t,1>>
V	make_int (int64_t v0)
	Creates a vector from signed integer values known at compile-time. More...

template<class V = expr_vec_make_const<int64_t,2>>
V	make_int (int64_t v0, int64_t v1)
	Creates a vector from signed integer values known at compile-time. More...

template<class V = expr_vec_make_const<int64_t,4>>
V	make_int (int64_t v0, int64_t v1, int64_t v2, int64_t v3)
	Creates a vector from signed integer values known at compile-time. More...

template<class V = expr_vec_make_const<int64_t,8>>
V	make_int (int64_t v0, int64_t v1, int64_t v2, int64_t v3, int64_t v4, int64_t v5, int64_t v6, int64_t v7)
	Creates a vector from signed integer values known at compile-time. More...

template<class V = expr_vec_make_const<int64_t,16>>
V	make_int (int64_t v0, int64_t v1, int64_t v2, int64_t v3, int64_t v4, int64_t v5, int64_t v6, int64_t v7, int64_t v8, int64_t v9, int64_t v10, int64_t v11, int64_t v12, int64_t v13, int64_t v14, int64_t v15)
	Creates a vector from signed integer values known at compile-time. More...


template<int s0, int s1, unsigned N>
uint8< N >	make_shuffle_bytes16_mask (uint8< N > &mask)
	Makes a mask to shuffle an int8x16 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1, int s2, int s3, unsigned N>
uint8< N >	make_shuffle_bytes16_mask (uint8< N > &mask)
	Makes a mask to shuffle an int8x16 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, unsigned N>
uint8< N >	make_shuffle_bytes16_mask (uint8< N > &mask)
	Makes a mask to shuffle an int8x16 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, int s8, int s9, int s10, int s11, int s12, int s13, int s14, int s15, unsigned N>
uint8< N >	make_shuffle_bytes16_mask (uint8< N > &mask)
	Makes a mask to shuffle an int8x16 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1, unsigned N>
uint16< N >	make_shuffle_bytes16_mask (uint16< N > &mask)
	Makes a mask to shuffle an int16x8 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1, int s2, int s3, unsigned N>
uint16< N >	make_shuffle_bytes16_mask (uint16< N > &mask)
	Makes a mask to shuffle an int16x8 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, unsigned N>
uint16< N >	make_shuffle_bytes16_mask (uint16< N > &mask)
	Makes a mask to shuffle an int16x8 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1, unsigned N>
uint32< N >	make_shuffle_bytes16_mask (uint32< N > &mask)
	Makes a mask to shuffle an int32x4 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1, int s2, int s3, unsigned N>
uint32< N >	make_shuffle_bytes16_mask (uint32< N > &mask)
	Makes a mask to shuffle an int32x4 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1, unsigned N>
uint64< N >	make_shuffle_bytes16_mask (uint64< N > &mask)
	Makes a mask to shuffle an int64x2 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...


template<class V = expr_vec_make_const<uint64_t,1>>
V	make_uint (uint64_t v0)
	Creates a vector from unsigned integer values known at compile-time. More...

template<class V = expr_vec_make_const<uint64_t,2>>
V	make_uint (uint64_t v0, uint64_t v1)
	Creates a vector from unsigned integer values known at compile-time. More...

template<class V = expr_vec_make_const<uint64_t,4>>
V	make_uint (uint64_t v0, uint64_t v1, uint64_t v2, uint64_t v3)
	Creates a vector from unsigned integer values known at compile-time. More...

template<class V = expr_vec_make_const<uint64_t,8>>
V	make_uint (uint64_t v0, uint64_t v1, uint64_t v2, uint64_t v3, uint64_t v4, uint64_t v5, uint64_t v6, uint64_t v7)
	Creates a vector from unsigned integer values known at compile-time. More...

template<class V = expr_vec_make_const<uint64_t,16>>
V	make_uint (uint64_t v0, uint64_t v1, uint64_t v2, uint64_t v3, uint64_t v4, uint64_t v5, uint64_t v6, uint64_t v7, uint64_t v8, uint64_t v9, uint64_t v10, uint64_t v11, uint64_t v12, uint64_t v13, uint64_t v14, uint64_t v15)
	Creates a vector from unsigned integer values known at compile-time. More...


template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask< V, void >::empty	move16_l (const any_vec8< N, V > &a)
	Moves the elements in an int8x16 vector to the left by shift positions. More...

template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask< V, void >::empty	move8_l (const any_vec16< N, V > &a)
	Moves the 16-bit elements in a vector to the left by shift positions. More...

template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask< V, void >::empty	move4_l (const any_vec32< N, V > &a)
	Moves the 32-bit elements in a vector to the left by shift positions. More...

template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask< V, void >::empty	move2_l (const any_vec64< N, V > &a)
	Moves the 64-bit elements in a vector to the left by shift positions. More...


template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask< V, void >::empty	move16_r (const any_vec8< N, V > &a)
	Moves the 8-bit elements in a vector to the right by shift positions. More...

template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask< V, void >::empty	move8_r (const any_vec16< N, V > &a)
	Moves the 16-bit elements in a vector to the right by shift positions. More...

template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask< V, void >::empty	move4_r (const any_vec32< N, V > &a)
	Moves the 32-bit elements in a vector to the right by shift positions. More...

template<unsigned shift, unsigned N, class V >
detail::get_expr_nomask< V, void >::empty	move2_r (const any_vec64< N, V > &a)
	Moves the 64-bit elements in a vector to the right by shift positions. More...


template<unsigned s0, unsigned s1, unsigned N, class V >
detail::get_expr_nomask< V, void >::empty	permute2 (const any_vec64< N, V > &a)
	Permutes the values of each set of two consecutive 64-bit values. More...


template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N, class V >
detail::get_expr_nomask< V, void >::empty	permute4 (const any_vec16< N, V > &a)
	Permutes the 16-bit values within each 4 consecutive values of the vector. More...

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N, class V >
detail::get_expr_nomask< V, void >::empty	permute4 (const any_vec32< N, V > &a)
	Permutes the values of each set of four consecutive 32-bit values. More...

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N, class V >
detail::get_expr_nomask< V, void >::empty	permute4 (const any_vec64< N, V > &a)
	Permutes the values of each set of four consecutive 64-bit values. More...


uint8x16	permute_bytes16 (uint8x16 a, uint8x16 mask)
	Selects bytes from a vector according to a mask. More...

template<unsigned N>
uint8< N >	permute_bytes16 (uint8< N > a, uint8< N > mask)
	Selects bytes from a vector according to a mask. More...

template<unsigned N>
uint16< N >	permute_bytes16 (uint16< N > a, uint16< N > mask)
	Selects bytes from a vector according to a mask. More...

template<unsigned N>
uint32< N >	permute_bytes16 (uint32< N > a, uint32< N > mask)
	Selects bytes from a vector according to a mask. More...

template<unsigned N>
uint64< N >	permute_bytes16 (uint64< N > a, uint64< N > mask)
	Selects bytes from a vector according to a mask. More...

template<unsigned N>
float32< N >	permute_bytes16 (float32< N > a, uint32< N > mask)
	Selects bytes from a vector according to a mask. More...

template<unsigned N>
float64< N >	permute_bytes16 (float64< N > a, uint64< N > mask)
	Selects bytes from a vector according to a mask. More...


uint8x16	permute_zbytes16 (uint8x16 a, uint8x16 mask)
	Selects bytes from a vector according to a mask, optionally selecting zero. More...

template<unsigned N>
uint8< N >	permute_zbytes16 (uint8< N > a, uint8< N > mask)
	Selects bytes from a vector according to a mask, optionally selecting zero. More...

template<unsigned N>
uint16< N >	permute_zbytes16 (uint16< N > a, uint16< N > mask)
	Selects bytes from a vector according to a mask, optionally selecting zero. More...

template<unsigned N>
uint32< N >	permute_zbytes16 (uint32< N > a, uint32< N > mask)
	Selects bytes from a vector according to a mask, optionally selecting zero. More...

template<unsigned N>
uint64< N >	permute_zbytes16 (uint64< N > a, uint64< N > mask)
	Selects bytes from a vector according to a mask, optionally selecting zero. More...

template<unsigned N>
float32< N >	permute_zbytes16 (float32< N > a, uint32< N > mask)
	Selects bytes from a vector according to a mask, optionally selecting zero. More...

template<unsigned N>
float64< N >	permute_zbytes16 (float64< N > a, uint64< N > mask)
	Selects bytes from a vector according to a mask, optionally selecting zero. More...


template<class V = expr_vec_set_splat<int>>
V	splat (int x)
	Loads a value from a register and broadcasts it to all elements of a vector. More...

template<class V = expr_vec_set_splat<unsigned>>
V	splat (unsigned x)
	Loads a value from a register and broadcasts it to all elements of a vector. More...

template<class V = expr_vec_set_splat<int64_t>>
V	splat (int64_t x)
	Loads a value from a register and broadcasts it to all elements of a vector. More...

template<class V = expr_vec_set_splat<uint64_t>>
V	splat (uint64_t x)
	Loads a value from a register and broadcasts it to all elements of a vector. More...

template<class V = expr_vec_set_splat<float>>
V	splat (float x)
	Loads a value from a register and broadcasts it to all elements of a vector. More...

template<class V = expr_vec_set_splat<double>>
V	splat (double x)
	Loads a value from a register and broadcasts it to all elements of a vector. More...


template<unsigned sa0, unsigned sa1, unsigned sb0, unsigned sb1, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, void >::empty	shuffle2 (const any_vec32< N, V1 > &a, const any_vec32< N, V2 > &b)
	Selects 32-bit values from two vectors. More...

template<unsigned s0, unsigned s1, unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, void >::empty	shuffle2 (const any_vec32< N, V1 > &a, const any_vec32< N, V2 > &b)
	Selects 32-bit values from two vectors. More...


uint8x16	shuffle_bytes16 (uint8x16 a, uint8x16 b, uint8x16 mask)
	Selects bytes from two vectors according to a mask. More...

template<unsigned N>
uint8< N >	shuffle_bytes16 (uint8< N > a, uint8< N > b, uint8< N > mask)
	Selects bytes from two vectors according to a mask. More...

template<unsigned N>
uint16< N >	shuffle_bytes16 (uint16< N > a, uint16< N > b, uint16< N > mask)
	Selects bytes from two vectors according to a mask. More...

template<unsigned N>
uint32< N >	shuffle_bytes16 (uint32< N > a, uint32< N > b, uint32< N > mask)
	Selects bytes from two vectors according to a mask. More...

template<unsigned N>
uint64< N >	shuffle_bytes16 (uint64< N > a, uint64< N > b, uint64< N > mask)
	Selects bytes from two vectors according to a mask. More...

template<unsigned N>
float32< N >	shuffle_bytes16 (float32< N > a, float32< N > b, uint32< N > mask)
	Selects bytes from two vectors according to a mask. More...

template<unsigned N>
float64< N >	shuffle_bytes16 (float64< N > a, float64< N > b, uint64< N > mask)
	Selects bytes from two vectors according to a mask. More...


uint8x16	shuffle_zbytes16 (uint8x16 a, uint8x16 b, uint8x16 mask)
	Selects bytes from two vectors according to a mask, optionally selecting zero. More...

template<unsigned N>
uint8< N >	shuffle_zbytes16 (uint8< N > a, uint8< N > b, uint8< N > mask)
	Selects bytes from two vectors according to a mask, optionally selecting zero. More...

template<unsigned N>
uint16< N >	shuffle_zbytes16 (uint16< N > a, uint16< N > b, uint16< N > mask)
	Selects bytes from two vectors according to a mask, optionally selecting zero. More...

template<unsigned N>
uint32< N >	shuffle_zbytes16 (uint32< N > a, uint32< N > b, uint32< N > mask)
	Selects bytes from two vectors according to a mask, optionally selecting zero. More...

template<unsigned N>
uint64< N >	shuffle_zbytes16 (uint64< N > a, uint64< N > b, uint64< N > mask)
	Selects bytes from two vectors according to a mask, optionally selecting zero. More...

template<unsigned N>
float32< N >	shuffle_zbytes16 (float32< N > a, float32< N > b, uint32< N > mask)
	Selects bytes from two vectors according to a mask, optionally selecting zero. More...

template<unsigned N>
float64< N >	shuffle_zbytes16 (float64< N > a, float64< N > b, uint64< N > mask)
	Selects bytes from two vectors according to a mask, optionally selecting zero. More...


template<unsigned s, unsigned N, class V >
detail::get_expr_nomask< V, void >::empty	splat (const any_vec< N, V > &a)
	Broadcasts the specified element to all elements. More...


template<unsigned s, unsigned N, class E >
int8< N, expr_splat16< s, int8 < N, E > > >	splat16 (int8< N, E > a)
	Broadcasts the specified 8-bit value to all elements within 128-bit lanes. More...

template<unsigned s, unsigned N, class E >
uint8< N, expr_splat16< s, uint8< N, E > > >	splat16 (uint8< N, E > a)
	Broadcasts the specified 8-bit value to all elements within 128-bit lanes. More...


template<unsigned s, unsigned N, class E >
int16< N, expr_splat8< s, int16< N, E > > >	splat8 (int16< N, E > a)
	Broadcasts the specified 16-bit value to all elements within 128-bit lanes. More...

template<unsigned s, unsigned N, class E >
uint16< N, expr_splat8< s, uint16< N, E > > >	splat8 (uint16< N, E > a)
	Broadcasts the specified 16-bit value to all elements within 128-bit lanes. More...


template<unsigned s, unsigned N, class E >
int32< N, expr_splat4< s, int32< N, E > > >	splat4 (int32< N, E > a)
	Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...

template<unsigned s, unsigned N, class E >
uint32< N, expr_splat4< s, uint32< N, E > > >	splat4 (uint32< N, E > a)
	Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...


template<unsigned s, unsigned N, class E >
int64< N, expr_splat2< s, int64< N, E > > >	splat2 (int64< N, E > a)
	Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...

template<unsigned s, unsigned N, class E >
uint64< N, expr_splat2< s, uint64< N, E > > >	splat2 (uint64< N, E > a)
	Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...


template<unsigned s, unsigned N, class E >
float32< N, expr_splat4< s, float32< N, E > > >	splat4 (float32< N, E > a)
	Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...


template<unsigned s, unsigned N, class E >
float64< N, expr_splat2< s, float64< N, E > > >	splat2 (float64< N, E > a)
	Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...


float32x4	to_float32 (int32x4 a)
	Converts 32-bit integer values to 32-bit float values. More...

template<unsigned N>
float32< N >	to_float32 (int32< N > a)
	Converts 32-bit integer values to 32-bit float values. More...


float32x4	to_float32 (float64x4 a)
	Converts 64-bit float values to 32-bit float values. More...


float64x4	to_float64 (int32x4 a)
	Converts the 32-bit integer values to 64-bit float values. More...


float64x4	to_float64 (float32x4 a)
	Converts the 32-bit float values to 64-bit float values. More...


int32x4	to_int32 (float32x4 a)
	Converts the values of a float32x4 vector into signed int32_t representation using truncation if only an inexact conversion can be performed. More...

template<unsigned N>
uint32< N >	to_int32x8 (float32< N > a)
	Converts the values of a float32x4 vector into signed int32_t representation using truncation if only an inexact conversion can be performed. More...


int32x4	to_int32 (float64x4 a)
	Converts the values of a float64x4 vector into int32_t representation using truncation. More...


uint64x4	to_int64 (int32x4 a)
	Extends the values of a signed int32x4 vector to 64-bits. More...

uint64x4	to_int64 (uint32x4 a)
	Extends the values of an unsigned int32x4 vector to 64-bits. More...


template<unsigned N, class E1 , class E2 >
uint8< N, uint8< N > >	unzip16_hi (uint8< N, E1 > a, uint8< N, E2 > b)
	De-interleaves the even(higher) elements of two int8x16 vectors. More...

template<unsigned N, class E1 , class E2 >
uint16< N, uint16< N > >	unzip8_hi (uint16< N, E1 > a, uint16< N, E2 > b)
	De-interleaves the even(higher) elements of two int16x8 vectors. More...

template<unsigned N, class E1 , class E2 >
uint32< N, uint32< N > >	unzip4_hi (uint32< N, E1 > a, uint32< N, E2 > b)
	De-interleaves the even(higher) elements of two int32x4 vectors. More...

template<unsigned N, class E1 , class E2 >
uint64< N, uint64< N > >	unzip2_hi (uint64< N, E1 > a, uint64< N, E2 > b)
	De-interleaves the even(higher) elements of two int64x2 vectors. More...

template<unsigned N, class E1 , class E2 >
float32< N, float32< N > >	unzip4_hi (float32< N, E1 > a, float32< N, E2 > b)
	De-interleaves the even(higher) elements of two float32x4 vectors. More...

template<unsigned N, class E1 , class E2 >
float64< N, float64< N > >	unzip2_hi (float64< N, E1 > a, float64< N, E2 > b)
	De-interleaves the even(higher) elements of two float64x2 vectors. More...


template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, void >::empty	zip16_hi (const any_vec8< N, V1 > &a, const any_vec8< N, V2 > &b)
	Interleaves the higher halves of two vectors. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, void >::empty	zip8_hi (const any_vec16< N, V1 > &a, const any_vec16< N, V2 > &b)
	Interleaves the higher halves of two vectors. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, void >::empty	zip4_hi (const any_vec32< N, V1 > &a, const any_vec32< N, V2 > &b)
	Interleaves the higher halves of two vectors. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, void >::empty	zip2_hi (const any_vec64< N, V1 > &a, const any_vec64< N, V2 > &b)
	Interleaves the higher halves of two vectors. More...


template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, void >::empty	zip16_lo (const any_vec8< N, V1 > &a, const any_vec8< N, V2 > &b)
	Interleaves the lower halves of two vectors. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, void >::empty	zip8_lo (const any_vec16< N, V1 > &a, const any_vec16< N, V2 > &b)
	Interleaves the lower halves of two vectors. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, void >::empty	zip4_lo (const any_vec32< N, V1 > &a, const any_vec32< N, V2 > &b)
	Interleaves the lower halves of two vectors. More...

template<unsigned N, class V1 , class V2 >
detail::get_expr2_nomask< V1, V2, void >::empty	zip2_lo (const any_vec64< N, V1 > &a, const any_vec64< N, V2 > &b)
	Interleaves the lower halves of two vectors. More...


Arch &	operator|= (Arch &x, const Arch &y)
	Bitwise operators for `Arch`. More...

Arch &	operator&= (Arch &x, const Arch &y)
	Bitwise operators for `Arch`. More...

Arch	operator| (const Arch &x, const Arch &y)
	Bitwise operators for `Arch`. More...

Arch	operator& (const Arch &x, const Arch &y)
	Bitwise operators for `Arch`. More...

Arch	operator~ (const Arch &x)
	Bitwise operators for `Arch`. More...

Typedef Documentation

using simdpp::float32v = typedef float32<SIMDPP_FAST_FLOAT32_SIZE>

using simdpp::float32v4 = typedef float32<SIMDPP_FAST_FLOAT32_SIZE*4>

using simdpp::float32x4 = typedef float32<4>

using simdpp::float32x8 = typedef float32<8>

using simdpp::float64v = typedef float64<SIMDPP_FAST_FLOAT64_SIZE>

using simdpp::float64v2 = typedef float64<SIMDPP_FAST_FLOAT64_SIZE*2>

using simdpp::float64v4 = typedef float64<SIMDPP_FAST_FLOAT64_SIZE*4>

using simdpp::float64x2 = typedef float64<2>

using simdpp::float64x4 = typedef float64<4>

using simdpp::int16v = typedef int16<SIMDPP_FAST_INT16_SIZE>

using simdpp::int16v2 = typedef int16<SIMDPP_FAST_INT16_SIZE*2>

using simdpp::int16v4 = typedef int16<SIMDPP_FAST_INT16_SIZE*4>

using simdpp::int16x16 = typedef int16<16>

using simdpp::int16x8 = typedef int16<8>

using simdpp::int32v = typedef int32<SIMDPP_FAST_INT32_SIZE>

using simdpp::int32v2 = typedef int32<SIMDPP_FAST_INT32_SIZE*2>

using simdpp::int32v4 = typedef int32<SIMDPP_FAST_INT32_SIZE*4>

using simdpp::int32x4 = typedef int32<4>

using simdpp::int32x8 = typedef int32<8>

using simdpp::int64v = typedef int64<SIMDPP_FAST_INT64_SIZE>

using simdpp::int64v2 = typedef int64<SIMDPP_FAST_INT64_SIZE*2>

using simdpp::int64v4 = typedef int64<SIMDPP_FAST_INT64_SIZE*4>

using simdpp::int64x2 = typedef int64<2>

using simdpp::int64x4 = typedef int64<4>

using simdpp::int8v = typedef int8<SIMDPP_FAST_INT8_SIZE>

using simdpp::int8v2 = typedef int8<SIMDPP_FAST_INT8_SIZE*2>

using simdpp::int8v4 = typedef int8<SIMDPP_FAST_INT8_SIZE*4>

using simdpp::int8x16 = typedef int8<16>

using simdpp::int8x32 = typedef int8<32>

using simdpp::mask_float32v = typedef mask_float32<SIMDPP_FAST_FLOAT32_SIZE>

using simdpp::mask_float32v2 = typedef mask_float32<SIMDPP_FAST_FLOAT32_SIZE*2>

using simdpp::mask_float32v4 = typedef mask_float32<SIMDPP_FAST_FLOAT32_SIZE*4>

using simdpp::mask_float32x4 = typedef mask_float32<4>

using simdpp::mask_float32x8 = typedef mask_float32<8>

using simdpp::mask_float64v = typedef mask_float64<SIMDPP_FAST_FLOAT64_SIZE>

using simdpp::mask_float64v2 = typedef mask_float64<SIMDPP_FAST_FLOAT64_SIZE*2>

using simdpp::mask_float64v4 = typedef mask_float64<SIMDPP_FAST_FLOAT64_SIZE*4>

using simdpp::mask_float64x2 = typedef mask_float64<2>

using simdpp::mask_float64x4 = typedef mask_float64<4>

using simdpp::mask_int16v = typedef mask_int16<SIMDPP_FAST_INT16_SIZE>

using simdpp::mask_int16v2 = typedef mask_int16<SIMDPP_FAST_INT16_SIZE*2>

using simdpp::mask_int16v4 = typedef mask_int16<SIMDPP_FAST_INT16_SIZE*4>

using simdpp::mask_int16x16 = typedef mask_int16<16>

using simdpp::mask_int16x8 = typedef mask_int16<8>

using simdpp::mask_int32v = typedef mask_int32<SIMDPP_FAST_INT32_SIZE>

using simdpp::mask_int32v2 = typedef mask_int32<SIMDPP_FAST_INT32_SIZE*2>

using simdpp::mask_int32v4 = typedef mask_int32<SIMDPP_FAST_INT32_SIZE*4>

using simdpp::mask_int32x4 = typedef mask_int32<4>

using simdpp::mask_int32x8 = typedef mask_int32<8>

using simdpp::mask_int64v = typedef mask_int64<SIMDPP_FAST_INT64_SIZE>

using simdpp::mask_int64v2 = typedef mask_int64<SIMDPP_FAST_INT64_SIZE*2>

using simdpp::mask_int64v4 = typedef mask_int64<SIMDPP_FAST_INT64_SIZE*4>

using simdpp::mask_int64x2 = typedef mask_int64<2>

using simdpp::mask_int64x4 = typedef mask_int64<4>

using simdpp::mask_int8v = typedef mask_int8<SIMDPP_FAST_INT8_SIZE>

using simdpp::mask_int8v2 = typedef mask_int8<SIMDPP_FAST_INT8_SIZE*2>

using simdpp::mask_int8v4 = typedef mask_int8<SIMDPP_FAST_INT8_SIZE*4>

using simdpp::mask_int8x16 = typedef mask_int8<16>

using simdpp::mask_int8x32 = typedef mask_int8<32>

using simdpp::uint16v = typedef uint16<SIMDPP_FAST_INT16_SIZE>

using simdpp::uint16v2 = typedef uint16<SIMDPP_FAST_INT16_SIZE*2>

using simdpp::uint16v4 = typedef uint16<SIMDPP_FAST_INT16_SIZE*4>

using simdpp::uint16x16 = typedef uint16<16>

using simdpp::uint16x8 = typedef uint16<8>

using simdpp::uint32v = typedef uint32<SIMDPP_FAST_INT32_SIZE>

using simdpp::uint32v2 = typedef uint32<SIMDPP_FAST_INT32_SIZE*2>

using simdpp::uint32v4 = typedef uint32<SIMDPP_FAST_INT32_SIZE*4>

using simdpp::uint32x4 = typedef uint32<4>

using simdpp::uint32x8 = typedef uint32<8>

using simdpp::uint64v = typedef uint64<SIMDPP_FAST_INT64_SIZE>

using simdpp::uint64v2 = typedef uint64<SIMDPP_FAST_INT64_SIZE*2>

using simdpp::uint64v4 = typedef uint64<SIMDPP_FAST_INT64_SIZE*4>

using simdpp::uint64x2 = typedef uint64<2>

using simdpp::uint64x4 = typedef uint64<4>

using simdpp::uint8v = typedef uint8<SIMDPP_FAST_INT8_SIZE>

using simdpp::uint8v2 = typedef uint8<SIMDPP_FAST_INT8_SIZE*2>

using simdpp::uint8v4 = typedef uint8<SIMDPP_FAST_INT8_SIZE*4>

using simdpp::uint8x16 = typedef uint8<16>

using simdpp::uint8x32 = typedef uint8<32>

Function Documentation

template<unsigned N, class E >

uint8<N, expr_abs<int8<N,E> > > simdpp::abs ( int8< N, E > a)

Computes absolute value of 8-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.
In ALTIVEC this intrinsic results in at least 1-3 instructions.

256-bit version:

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

template<unsigned N, class E >

float32<N, expr_abs<float32<N,E> > > simdpp::abs ( float32< N, E > a)

Computes absolute value of floating point values.

r0 = abs(a0)
...
rN = abs(aN)

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2-3 instructions.
In NEON this intrinsic results in at least 2 instructions.
In AVX-AVX2 this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N, class E >

uint16<N, expr_abs<int16<N,E> > > simdpp::abs ( int16< N, E > a)

Computes absolute value of 16-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.
In ALTIVEC this intrinsic results in at least 1-3 instructions.

256-bit version:

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-5 instructions.

template<unsigned N, class E >

float64<N, expr_abs<float64<N,E> > > simdpp::abs ( float64< N, E > a)

Computes absolute value of floating point values.

r0 = abs(a0)
...
rN = abs(aN)

128-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-AVX2 this intrinsic results in at least 1-2 instructions.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2-3 instructions.
In AVX-AVX2 this intrinsic results in at least 1-2 instructions.

template<unsigned N, class E >

uint32<N, expr_abs<int32<N,E> > > simdpp::abs ( int32< N, E > a)

Computes absolute value of 32-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.
In ALTIVEC this intrinsic results in at least 1-3 instructions.

256-bit version:

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

template<unsigned N, class E >

uint64<N, expr_abs<int64<N,E> > > simdpp::abs ( int64< N, E > a)

Computes absolute value of 64-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)

128-bit version:

In SSE2-AVX this intrinsic results in at least 5 instructions.
In NEON this intrinsic results in at least 6 instructions.
Not vectorized in ALTIVEC.

256-bit version:

In SSE2-AVX this intrinsic results in at least 10 instructions.
In NEON this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
Not vectorized in ALTIVEC.

template<unsigned N, class E1 , class E2 >

float32<N, expr_add<float32<N,E1>, float32<N,E2> > > simdpp::add	(	float32< N, E1 >	a,
		float32< N, E2 >	b
	)

Adds the values of two vectors.

r0 = a0 + b0
...
rN = aN + bN

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, expr_add<uint8<N, typename V1::expr_type>, uint8<N, typename V2::expr_type> > >::type simdpp::add	(	const any_int8< N, V1 > &	a,
		const any_int8< N, V2 > &	b
	)

Adds 8-bit integer values.

r0 = a0 + b0
...
rN = aN + bN

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

float64<N, expr_add<float64<N,E1>, float64<N,E2> > > simdpp::add	(	float64< N, E1 >	a,
		float64< N, E2 >	b
	)

Adds the values of two vectors.

r0 = a0 + b0
...
rN = aN + bN

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, expr_add<uint16<N, typename V1::expr_type>, uint16<N, typename V2::expr_type> > >::type simdpp::add	(	const any_int16< N, V1 > &	a,
		const any_int16< N, V2 > &	b
	)

Adds 16-bit integer values.

r0 = a0 + b0
...
rN = aN + bN

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, expr_add<uint32<N, typename V1::expr_type>, uint32<N, typename V2::expr_type> > >::type simdpp::add	(	const any_int32< N, V1 > &	a,
		const any_int32< N, V2 > &	b
	)

Adds 32-bit integer values.

r0 = a0 + b0
...
rN = aN + bN

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, expr_add<uint64<N, typename V1::expr_type>, uint64<N, typename V2::expr_type> > >::type simdpp::add	(	const any_int64< N, V1 > &	a,
		const any_int64< N, V2 > &	b
	)

Adds 64-bit integer values.

r0 = a0 + b0
...
rN = aN + bN

128-bit version:

In ALTIVEC this intrinsic results in at least 5-6 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 10-11 instructions.

template<unsigned N, class E1 , class E2 >

int8<N, expr_add_sat<int8<N,E1>, int8<N,E2> > > simdpp::add_sat	(	int8< N, E1 >	a,
		int8< N, E2 >	b
	)

Adds and saturates signed 8-bit integer values.

r0 = signed_saturate(a0 + b0)
...
rN = signed_saturate(aN + bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

int16<N, expr_add_sat<int16<N,E1>, int16<N,E2> > > simdpp::add_sat	(	int16< N, E1 >	a,
		int16< N, E2 >	b
	)

Adds and saturates signed 16-bit integer values.

r0 = signed_saturate(a0 + b0)
...
rN = signed_saturate(aN + bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint8<N, expr_add_sat<uint8<N,E1>, uint8<N,E2> > > simdpp::add_sat	(	uint8< N, E1 >	a,
		uint8< N, E2 >	b
	)

Adds and saturates unsigned 8-bit integer values.

r0 = unsigned_saturate(a0 + b0)
...
rN = unsigned_saturate(aN + bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint16<N, expr_add_sat<uint16<N,E1>, uint16<N,E2> > > simdpp::add_sat	(	uint16< N, E1 >	a,
		uint16< N, E2 >	b
	)

Adds and saturates unsigned 16-bit integer values.

r0 = unsigned_saturate(a0 + b0)
...
rN = unsigned_saturate(aN + bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift, unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, void>::empty simdpp::align16	(	const any_vec8< N, V1 > &	lower,
		const any_vec8< N, V2 > &	upper
	)

Extracts an int8x16 vector from two concatenated int8x16 vectors.

shift:  pos:| 0   1    .  14  15  |
 0      r = [ l0  l1   .  l14 l15 ]
 1      r = [ l1  l2   .  l15 u0  ]
 2      r = [ l2  l3   .  u0  u1  ]
  ...    ..   .. ..  ... ..  ..
 15     r = [ l15 u0   .  u13 u14 ]
 16     r = [ u0  u1   .  u14 u15 ]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift, unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, void>::empty simdpp::align2	(	const any_vec64< N, V1 > &	lower,
		const any_vec64< N, V2 > &	upper
	)

Extracts an int64x2 vector from two concatenated int64x2 vectors.

shift:  pos:| 0  1  |
 0      r = [ l0 l1 ]
 1      r = [ l1 u0 ]
 2      r = [ u0 u1 ]

int64

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float64

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift, unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, void>::empty simdpp::align4	(	const any_vec32< N, V1 > &	lower,
		const any_vec32< N, V2 > &	upper
	)

Extracts a int32x4 vector from two concatenated int32x4 vectors.

shift:  pos:| 0  1  2  3  |
 0      r = [ l0 l1 l2 l3 ]
 1      r = [ l1 l2 l3 u0 ]
 2      r = [ l2 l3 u0 u1 ]
 3      r = [ l3 u0 u1 u2 ]
 4      r = [ u0 u1 u2 u3 ]

int32

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift, unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, void>::empty simdpp::align8	(	const any_vec16< N, V1 > &	lower,
		const any_vec16< N, V2 > &	upper
	)

Extracts a int16x8 vector from two concatenated int16x8 vectors.

shift:  pos:| 0  1    .  6  7  |
 0      r = [ l0 l1   .  l6 l7 ]
 1      r = [ l1 l2   .  l7 u0 ]
 2      r = [ l2 l3   .  u0 u1 ]
  ...    ..   .. ..  ... .. ..
 7      r = [ l7 u0   .  u5 u6 ]
 8      r = [ u0 u1   .  u6 u7 ]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

All 128-bit sub-vectors are processed as if a 128-bit instruction was applied to each of them separately.

template<unsigned N, class E1 , class E2 >

uint8<N, uint8<N> > simdpp::avg	(	uint8< N, E1 >	a,
		uint8< N, E2 >	b
	)

Computes rounded average of the unsigned 8-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

int8<N, int8<N> > simdpp::avg	(	int8< N, E1 >	a,
		int8< N, E2 >	b
	)

Computes rounded average of signed 8-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8-9 instructions.
In AVX2 this intrinsic results in at least 4-5 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint16<N, uint16<N> > simdpp::avg	(	uint16< N, E1 >	a,
		uint16< N, E2 >	b
	)

Computes rounded average of unsigned 16-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

int16<N, int16<N> > simdpp::avg	(	int16< N, E1 >	a,
		int16< N, E2 >	b
	)

Computes rounded average of signed 16-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8-9 instructions.
In AVX2 this intrinsic results in at least 4-5 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint32<N, uint32<N> > simdpp::avg	(	uint32< N, E1 >	a,
		uint32< N, E2 >	b
	)

Computes rounded average of unsigned 32-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 6-7 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 12-13 instructions.
In AVX2 this intrinsic results in at least 6-7 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

int32<N, int32<N> > simdpp::avg	(	int32< N, E1 >	a,
		int32< N, E2 >	b
	)

Computes rounded average of signed 32-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 9-10 instructions.
In NEON this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 18-19 instructions.
In AVX2 this intrinsic results in at least 9-10 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint8<N, uint8<N> > simdpp::avg_trunc	(	uint8< N, E1 >	a,
		uint8< N, E2 >	b
	)

Computes truncated average of the unsigned 8-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4 instructions.
In NEON this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

int8<N, int8<N> > simdpp::avg_trunc	(	int8< N, E1 >	a,
		int8< N, E2 >	b
	)

Computes truncated average of signed 8-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 14-15 instructions.
In AVX2 this intrinsic results in at least 7-8 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint16<N, uint16<N> > simdpp::avg_trunc	(	uint16< N, E1 >	a,
		uint16< N, E2 >	b
	)

Computes truncated average of unsigned 16-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4 instructions.
In NEON this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

int16<N, int16<N> > simdpp::avg_trunc	(	int16< N, E1 >	a,
		int16< N, E2 >	b
	)

Computes truncated average of signed 16-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.
In NEON this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 14-15 instructions.
In AVX2 this intrinsic results in at least 7-8 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint32<N, uint32<N> > simdpp::avg_trunc	(	uint32< N, E1 >	a,
		uint32< N, E2 >	b
	)

Computes truncated average of unsigned 32-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4 instructions.
In NEON this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

int32<N, int32<N> > simdpp::avg_trunc	(	int32< N, E1 >	a,
		int32< N, E2 >	b
	)

Computes truncated average of signed 32-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.
In ALTIVEC this intrinsic results in at least 4 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 14-15 instructions.
In AVX2 this intrinsic results in at least 7-8 instructions.
In ALTIVEC this intrinsic results in at least 8 instructions.
In NEON this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr_bitwise2_and<expr_bit_and, V1, V2>::type simdpp::bit_and	(	const any_vec< N, V1 > &	a,
		const any_vec< N, V2 > &	b
	)

Computes bitwise AND of integer or floating-point vectors.

r0 = a0 & b0
...
rN = aN & bN

Todo:: icost

template<unsigned N, class V1 , class V2 >

detail::get_expr_bitwise2_and<expr_bit_andnot, V1, V2>::type simdpp::bit_andnot	(	const any_vec< N, V1 > &	a,
		const any_vec< N, V2 > &	b
	)

Computes bitwise AND NOT of two integer or floating-point vectors.

r0 = a0 & ~b0
...
rN = aN & ~bN

Todo:: icost

template<class R , class T >

R simdpp::bit_cast ( T t)

Casts between unrelated types.

No changes to the stored values are performed.

Conversions between vector and non-vector types are not allowed.

Conversion from non-mask type to mask type is not allowed.

Conversion from mask type to a non-mask type is not a costless operation because masks may have different logical and physical layout (e.g., in some implementations one bit represents entire element in a vector).

Conversions between mask types is only allowed if the element size is the same.

template<unsigned N, class V >

detail::get_expr<V, expr_bit_not<V> >::empty simdpp::bit_not ( const any_vec< N, V > & a)

Computes bitwise NOT of an integer or floating-point vector.

r = ~a

Todo:: icost

template<unsigned N, class V1 , class V2 >

detail::get_expr_bit_or<V1, V2>::type simdpp::bit_or	(	const any_vec< N, V1 > &	a,
		const any_vec< N, V2 > &	b
	)

Computes bitwise OR of integer vectors.

r0 = a0 | b0
...
rN = aN | bN

Todo:: icost

template<unsigned N, class V1 , class V2 >

detail::get_expr2<V1, V2, void>::empty simdpp::bit_xor	(	const any_vec< N, V1 > &	a,
		const any_vec< N, V2 > &	b
	)

Computes bitwise XOR of integer or floating-point vectors.

r0 = a0 ^ b0
...
rN = aN ^ bN

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 , class V3 >

detail::get_expr_blend<V1, V2, V3>::type simdpp::blend	(	const any_vec< N, V1 > &	on,
		const any_vec< N, V2 > &	off,
		const any_vec< N, V3 > &	mask
	)

Composes a vector from two sources according to a mask.

Each element within the mask must have either all bits set or all bits unset.

r0 = (mask0 == 0xff ) ? on0 : off0
...
rN = (maskN == 0xff ) ? onN : offN

Todo:: icost

int16

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int32

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int64

128-bit version:

In SSE2-AVX this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32

128-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float64

128-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.
Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 6 instructions.
Not vectorized in NEON and ALTIVEC.

template<unsigned N, class E >

float32<N, float32<N> > simdpp::ceil ( float32< N, E > a)

Rounds the values of a vector towards positive infinity.

r0 = ceil(a0)
...
rN = ceil(aN)

128-bit version:

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 13-15 instructions.
In NEON this intrinsic results in at least 11-13 instructions.

256-bit version:

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 26-28 instructions.
In NEON this intrinsic results in at least 22-24 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

mask_int8<N, mask_int8<N> > simdpp::cmp_eq	(	const any_int8< N, V1 > &	a,
		const any_int8< N, V2 > &	b
	)

Compares 8-bit values for equality.

r0 = (a0 == b0) ? 0xff : 0x0
...
rN = (aN == bN) ? 0xff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

mask_int16<N, mask_int16<N> > simdpp::cmp_eq	(	const any_int16< N, V1 > &	a,
		const any_int16< N, V2 > &	b
	)

Compares 16-bit values for equality.

r0 = (a0 == b0) ? 0xffff : 0x0
...
rN = (aN == bN) ? 0xffff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

mask_int32<N, mask_int32<N> > simdpp::cmp_eq	(	const any_int32< N, V1 > &	a,
		const any_int32< N, V2 > &	b
	)

Compares the values of two int32x4 vectors for equality.

r0 = (a0 == b0) ? 0xffffffff : 0x0
...
rN = (aN == bN) ? 0xffffffff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

mask_int64<N, mask_int64<N> > simdpp::cmp_eq	(	const any_int64< N, V1 > &	a,
		const any_int64< N, V2 > &	b
	)

Compares the values of two int64x2 vectors for equality.

r0 = (a0 == b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN == bN) ? 0xffffffffffffffff : 0x0

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 5 instructions.
In XOP this intrinsic results in at least 1 instructions.
In NEON this intrinsic results in at least 3 instructions.
In ALTIVEC this intrinsic results in at least 3-4 instructions.

256-bit version:

In SSE2-SSSE3 and AVX this intrinsic results in at least 10 instructions.
In XOP and SSE4.1 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 6 instructions.
In ALTIVEC this intrinsic results in at least 6-7 instructions.

template<unsigned N, class V1 , class V2 >

mask_float32<N, mask_float32<N> > simdpp::cmp_eq	(	const any_float32< N, V1 > &	a,
		const any_float32< N, V2 > &	b
	)

Compares the values of two float32x4 vectors for equality.

r0 = (a0 == b0) ? 0xffffffff : 0x0
...
rN = (aN == bN) ? 0xffffffff : 0x0

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

mask_float64<N, mask_float64<N> > simdpp::cmp_eq	(	const any_float64< N, V1 > &	a,
		const any_float64< N, V2 > &	b
	)

Compares the values of two float64x2 vectors for equality.

r0 = (a0 == b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN == bN) ? 0xffffffffffffffff : 0x0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_float32<N, mask_float32<N> > simdpp::cmp_ge	(	float32< N, E1 >	a,
		float32< N, E2 >	b
	)

Compares the values of two float32x4 vectors for greater-than or equal.

r0 = (a0 >= b0) ? 0xffffffff : 0x0
...
rN = (aN >= bN) ? 0xffffffff : 0x0

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_float64<N, mask_float64<N> > simdpp::cmp_ge	(	float64< N, E1 >	a,
		float64< N, E2 >	b
	)

Compares the values of two float64x2 vectors for greater-than or equal.

r0 = (a0 >= b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN >= bN) ? 0xffffffffffffffff : 0x0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_int8<N, mask_int8<N> > simdpp::cmp_gt	(	int8< N, E1 >	a,
		int8< N, E2 >	b
	)

Compares the values of two signed int8x16 vectors for greater-than.

r0 = (a0 > b0) ? 0xff : 0x0
...
rN = (aN > bN) ? 0xff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_int8<N, mask_int8<N> > simdpp::cmp_gt	(	uint8< N, E1 >	a,
		uint8< N, E2 >	b
	)

Compares the values of two unsigned int8x16 vectors for greater-than.

r0 = (a0 > b0) ? 0xff : 0x0
...
rN = (aN > bN) ? 0xff : 0x0

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
In XOP this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6-7 instructions.
In AVX2 this intrinsic results in at least 3-4 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_int16<N, mask_int16<N> > simdpp::cmp_gt	(	int16< N, E1 >	a,
		int16< N, E2 >	b
	)

Compares the values of two signed int16x8 vectors for greater-than.

r0 = (a0 > b0) ? 0xffff : 0x0
...
rN = (aN > bN) ? 0xffff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_int16<N, mask_int16<N> > simdpp::cmp_gt	(	uint16< N, E1 >	a,
		uint16< N, E2 >	b
	)

Compares the values of two unsigned int16x8 vectors for greater-than.

r0 = (a0 > b0) ? 0xffff : 0x0
...
rN = (aN > bN) ? 0xffff : 0x0

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
In XOP this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6-7 instructions.
In AVX2 this intrinsic results in at least 3-4 instructions.
In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_int32<N, mask_int32<N> > simdpp::cmp_gt	(	int32< N, E1 >	a,
		int32< N, E2 >	b
	)

Compares the values of two signed int32x4 vectors for greater-than.

r0 = (a0 > b0) ? 0xffffffff : 0x0
...
rN = (aN > bN) ? 0xffffffff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_int32<N, mask_int32<N> > simdpp::cmp_gt	(	uint32< N, E1 >	a,
		uint32< N, E2 >	b
	)

Compares the values of two unsigned int32x4 vectors for greater-than.

r0 = (a0 > b0) ? 0xffffffff : 0x0
...
rN = (aN > bN) ? 0xffffffff : 0x0

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
In XOP this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6-7 instructions.
In AVX2 this intrinsic results in at least 3-4 instructions.
In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_float32<N, mask_float32<N> > simdpp::cmp_gt	(	float32< N, E1 >	a,
		float32< N, E2 >	b
	)

Compares the values of two float32x4 vectors for greater-than.

r0 = (a0 > b0) ? 0xffffffff : 0x0
...
rN = (aN > bN) ? 0xffffffff : 0x0

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_float64<N, mask_float64<N> > simdpp::cmp_gt	(	float64< N, E1 >	a,
		float64< N, E2 >	b
	)

Compares the values of two float64x2 vectors for greater-than.

r0 = (a0 > b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN > bN) ? 0xffffffffffffffff : 0x0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_float32<N, mask_float32<N> > simdpp::cmp_le	(	float32< N, E1 >	a,
		float32< N, E2 >	b
	)

Compares the values of two float32x4 vectors for less-than or equal.

r0 = (a0 <= b0) ? 0xffffffff : 0x0
...
rN = (aN <= bN) ? 0xffffffff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_float64<N, mask_float64<N> > simdpp::cmp_le	(	float64< N, E1 >	a,
		float64< N, E2 >	b
	)

Compares the values of two float64x2 vectors for less-than or equal.

r0 = (a0 <= b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN <= bN) ? 0xffffffffffffffff : 0x0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_int8<N, mask_int8<N> > simdpp::cmp_lt	(	int8< N, E1 >	a,
		int8< N, E2 >	b
	)

Compares the values of two signed int8x16 vectors for less-than.

r0 = (a0 < b0) ? 0xff : 0x0
...
rN = (aN < bN) ? 0xff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_int8<N, mask_int8<N> > simdpp::cmp_lt	(	uint8< N, E1 >	a,
		uint8< N, E2 >	b
	)

Compares the values of two unsigned int8x16 vectors for less-than.

r0 = (a0 < b0) ? 0xff : 0x0
...
rN = (aN < bN) ? 0xff : 0x0

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
In XOP this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6-7 instructions.
In AVX2 this intrinsic results in at least 3-4 instructions.
In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_int16<N, mask_int16<N> > simdpp::cmp_lt	(	int16< N, E1 >	a,
		int16< N, E2 >	b
	)

Compares the values of two signed int16x8 vectors for less-than.

r0 = (a0 < b0) ? 0xffff : 0x0
...
rN = (aN < bN) ? 0xffff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_int16<N, mask_int16<N> > simdpp::cmp_lt	(	uint16< N, E1 >	a,
		uint16< N, E2 >	b
	)

Compares the values of two unsigned int16x8 vectors for less-than.

r0 = (a0 < b0) ? 0xffff : 0x0
...
rN = (aN < bN) ? 0xffff : 0x0

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
In XOP this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6-7 instructions.
In AVX2 this intrinsic results in at least 3-4 instructions.
In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_int32<N, mask_int32<N> > simdpp::cmp_lt	(	int32< N, E1 >	a,
		int32< N, E2 >	b
	)

Compares the values of two signed int32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_int32<N, mask_int32<N> > simdpp::cmp_lt	(	uint32< N, E1 >	a,
		uint32< N, E2 >	b
	)

Compares the values of two unsigned int32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
In XOP this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6-7 instructions.
In AVX2 this intrinsic results in at least 3-4 instructions.
In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_float32<N, mask_float32<N> > simdpp::cmp_lt	(	float32< N, E1 >	a,
		float32< N, E2 >	b
	)

Compares the values of two float32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_float64<N, mask_float64<N> > simdpp::cmp_lt	(	float64< N, E1 >	a,
		float64< N, E2 >	b
	)

Compares the values of two float64x2 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffffffffffff : 0x0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

mask_int8<N, mask_int8<N> > simdpp::cmp_neq	(	const any_int8< N, V1 > &	a,
		const any_int8< N, V2 > &	b
	)

Compares the values of two int8x16 vectors for inequality.

r0 = (a0 != b0) ? 0xff : 0x0
...
rN = (aN != bN) ? 0xff : 0x0

128-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In XOP this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 4 instructions.
In AVX2 this intrinsic results in at least 2 instructions.
In XOP this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

mask_int16<N, mask_int16<N> > simdpp::cmp_neq	(	const any_int16< N, V1 > &	a,
		const any_int16< N, V2 > &	b
	)

Compares the values of two int16x8 vectors for inequality.

r0 = (a0 != b0) ? 0xffff : 0x0
...
rN = (aN != bN) ? 0xffff : 0x0

128-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In XOP this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 4 instructions.
In AVX2 this intrinsic results in at least 2 instructions.
In XOP this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

mask_int32<N, mask_int32<N> > simdpp::cmp_neq	(	const any_int32< N, V1 > &	a,
		const any_int32< N, V2 > &	b
	)

Compares the values of two int32x4 vectors for inequality.

r0 = (a0 != b0) ? 0xffffffff : 0x0
...
rN = (aN != bN) ? 0xffffffff : 0x0

128-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In XOP this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 4 instructions.
In AVX2 this intrinsic results in at least 2 instructions.
In XOP this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

mask_int64<N, mask_int64<N> > simdpp::cmp_neq	(	const any_int64< N, V1 > &	a,
		const any_int64< N, V2 > &	b
	)

Compares the values of two int64x2 vectors for inequality.

r0 = (a0 != b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN != bN) ? 0xffffffffffffffff : 0x0

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 5 instructions.
In SSE4.1 and AVX this intrinsic results in at least 2 instructions.
In XOP this intrinsic results in at least 1 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 3-5 instructions.

256-bit version:

In SSE2-SSSE3 and AVX this intrinsic results in at least 10 instructions.
In SSE4.1 and NEON this intrinsic results in at least 4 instructions.
In AVX2 and XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 8 instructions.
In ALTIVEC this intrinsic results in at least 6-8 instructions.

template<unsigned N, class V1 , class V2 >

mask_float32<N, mask_float32<N> > simdpp::cmp_neq	(	const any_float32< N, V1 > &	a,
		const any_float32< N, V2 > &	b
	)

Compares the values of two float32x4 vectors for inequality.

r0 = (a0 != b0) ? 0xffffffff : 0x0
...
rN = (aN != bN) ? 0xffffffff : 0x0

128-bit version:

In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON and ALTIVEC this intrinsic results in at least 4 instructions.

template<unsigned N, class V1 , class V2 >

mask_float64<N, mask_float64<N> > simdpp::cmp_neq	(	const any_float64< N, V1 > &	a,
		const any_float64< N, V2 > &	b
	)

Compares the values of two float64x2 vectors for inequality.

r0 = (a0 != b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN != bN) ? 0xffffffffffffffff : 0x0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

float32<N, float32<N> > simdpp::div	(	float32< N, E1 >	a,
		float32< N, E2 >	b
	)

Divides the values of two vectors.

r0 = a0 / b0
...
rN = aN / bN

In NEON this intrinsic results in at least 6 instructions.
In ALTIVEC this intrinsic results in at least 10 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 12 instructions.
In ALTIVEC this intrinsic results in at least 19 instructions.

template<unsigned N, class E1 , class E2 >

float64<N, float64<N> > simdpp::div	(	float64< N, E1 >	a,
		float64< N, E2 >	b
	)

Divides the values of two vectors.

r0 = a0 / b0
...
rN = aN / bN

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

template<unsigned P>

uint8x16 simdpp::div_p	(	uint8x16	num,
		uint8x16	den
	)

Divides one 8-bit unsigned number by another.

The precision of the operation is configurable: only the P least significant bits of both numerator and denominator are considered.

r0 = num0 / den0
...
rN = numN / denN

128-bit version:: The operation costs at least 9 instructions per bit of precision.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 10 instructions.
In AVX2 this intrinsic results in at least 4 instructions.

template<unsigned P>

uint16x8 simdpp::div_p	(	uint16x8	num,
		uint16x8	den
	)

Divides one 16-bit unsigned number by another.

The precision of the operation is configurable: only the P least significant bits of both numerator and denominator are considered.

r0 = num0 / den0
...
rN = numN / denN

128-bit version:: The operation costs at least 9 instructions per bit of precision.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 10 instructions.
In AVX2 this intrinsic results in at least 4 instructions.

template<unsigned id>

uint16_t simdpp::extract ( uint16x8 a)

Extracts the id-th element from int16x8 vector.

r = a[id]

This function may have very high latency.

In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned id>

int16_t simdpp::extract ( int16x8 a)

Extracts the id-th element from int16x8 vector.

r = a[id]

This function may have very high latency.

In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned id>

uint32_t simdpp::extract ( uint32x4 a)

Extracts the id-th element from int32x4 vector.

r = a[id]

This function may have very high latency.

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned id>

int32_t simdpp::extract ( int32x4 a)

Extracts the id-th element from int32x4 vector.

r = a[id]

This function may have very high latency.

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned id>

uint64_t simdpp::extract ( uint64x2 a)

Extracts an element from int64x2 vector.

r = a[id]

This function may have very high latency.

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
In SSE4_1 this intrinsic results in at least 1 instructions.
In SSE2_32bit, SSE3_32bit and SSSE3_32bit this intrinsic results in at least 3-4 instructions.
In SSE4_1_32bit this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned id>

int64_t simdpp::extract ( int64x2 a)

Extracts an element from int64x2 vector.

r = a[id]

This function may have very high latency.

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
In SSE4_1 this intrinsic results in at least 1 instructions.
In SSE2_32bit, SSE3_32bit and SSSE3_32bit this intrinsic results in at least 3-4 instructions.
In SSE4_1_32bit this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned id>

float simdpp::extract ( float32x4 a)

Extracts an element from float32x4 vector.

r = a[id]

This function may have very high latency.

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned id>

double simdpp::extract ( float64x2 a)

Extracts an element from float64x2 vector.

r = a[id]

This function may have very high latency.

In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned id>

uint16_t simdpp::extract_bits ( uint8x16 a)

Extracts a specific bit from each byte of each element of an int8x16 vector.

The default template argument selects the bits from each byte in most efficient way.

r = ((a[0] & 0x80) >> 7) | ((a[1] & 0x80) >> 6) | ... | ((a[15] & 0x80) << 8)

In SSE2-AVX2 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 7-9 instructions.
In ALTIVEC this intrinsic results in at least 9-11 instructions.

uint16_t simdpp::extract_bits_any ( uint8x16 a)

inline

Extracts a bit from each byte of each element of an int8x16 vector.

This operation is only sensible if each byte within the vector is either 0x00 or 0xff.

r = ((a[0] & 0x??) ? 0x0001 : 0) |
    ((a[1] & 0x??) ? 0x0002 : 0) |
    ...
    ((a[15] & 0x??) ? 0x8000 : 0)

In NEON this intrinsic results in at least 6-7 instructions.
In ALTIVEC this intrinsic results in at least 8-9 instructions.

template<unsigned N, class E >

float32<N, float32<N> > simdpp::floor ( float32< N, E > a)

Rounds the values of a vector towards negative infinity.

r0 = floor(a0)
...
rN = floor(aN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 12-14 instructions.
In NEON this intrinsic results in at least 10-11 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 24-26 instructions.
In NEON this intrinsic results in at least 20-21 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 , class E3 >

float32<N, expr_fmadd<float32<N,E1>, float32<N,E2>, float32<N,E3> > > simdpp::fmadd	(	float32< N, E1 >	a,
		float32< N, E2 >	b,
		float32< N, E3 >	c
	)

Performs a fused multiply-add operation.

r0 = a0 * b0 + c0
...
rN = aN * bN + cN

Implemented only on architectures with either X86_FMA3 or X86_FMA4 support.

template<unsigned N, class E1 , class E2 , class E3 >

float64<N, expr_fmadd<float64<N,E1>, float64<N,E2>, float64<N,E3> > > simdpp::fmadd	(	float64< N, E1 >	a,
		float64< N, E2 >	b,
		float64< N, E3 >	c
	)

Performs a fused multiply-add operation.

r0 = a0 * b0 + c0
...
rN = aN * bN + cN

Implemented only on architectures with either X86_FMA3 or X86_FMA4 support.

template<unsigned N, class E1 , class E2 , class E3 >

float32<N, expr_fmsub<float32<N,E1>, float32<N,E2>, float32<N,E3> > > simdpp::fmsub	(	float32< N, E1 >	a,
		float32< N, E2 >	b,
		float32< N, E3 >	c
	)

Performs a fused multiply-subtract operation.

r0 = a0 * b0 - c0
...
rN = aN * bN - cN

Implemented only on architectures with either X86_FMA3 or X86_FMA4 support.

template<unsigned N, class E1 , class E2 , class E3 >

float64<N, expr_fmsub<float64<N,E1>, float64<N,E2>, float64<N,E3> > > simdpp::fmsub	(	float64< N, E1 >	a,
		float64< N, E2 >	b,
		float64< N, E3 >	c
	)

Performs a fused multiply-subtract operation.

r0 = a0 * b0 - c0
...
rN = aN * bN - cN

Implemented only on architectures with either X86_FMA3 or X86_FMA4 support.

Arch simdpp::get_arch_gcc_builtin_cpu_supports ( )

inline

Retrieves supported architecture using GCC __builtin_cpu_supports function.

Works only on x86.

Arch simdpp::get_arch_linux_cpuinfo ( )

inline

Retrieves supported architecture from Linux /proc/cpuinfo file.

Works on X86 and ARM.

template<unsigned N, class E >

mask_float32<N, mask_float32<N> > simdpp::isnan ( float32< N, E > a)

Checks whether elements in a are IEEE754 NaN.

r0 = isnan(a0) ? 0xffffffff : 0
...
rN = isnan(aN) ? 0xffffffff : 0

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E >

mask_float64<N, mask_float64<N> > simdpp::isnan ( float64< N, E > a)

Checks whether elements in a are IEEE754 NaN.

r0 = isnan(a0) ? 0xffffffffffffffff : 0
...
rN = isnan(aN) ? 0xffffffffffffffff : 0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

mask_float32<N, mask_float32<N> > simdpp::isnan2	(	float32< N, E1 >	a,
		float32< N, E2 >	b
	)

Checks whether corresponding elements in either a or b are IEEE754 NaN.

r0 = isnan(a0) || isnan(b0) ? 0xffffffff : 0
...
rN = isnan(aN) || isnan(bN) ? 0xffffffff : 0

128-bit version:

In NEON and ALTIVEC this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON and ALTIVEC this intrinsic results in at least 6 instructions.

template<unsigned N, class E1 , class E2 >

mask_float64<N, mask_float64<N> > simdpp::isnan2	(	float64< N, E1 >	a,
		float64< N, E2 >	b
	)

Checks whether corresponding elements in either a or b are IEEE754 NaN.

r0 = isnan(a0) || isnan(b0) ? 0xffffffffffffffff : 0
...
rN = isnan(aN) || isnan(bN) ? 0xffffffffffffffff : 0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

template<class V = expr_vec_load>

V simdpp::load ( const void * p)

Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an aligned memory location.

128-bit version:

a[0..127] = *(p)

p must be aligned to 16 bytes.

256-bit version:

a[0..255] = *(p)

p must be aligned to 32 bytes.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In AVX (integer vectors) this intrinsic results in at least 2 instructions.

template<unsigned N, class V >

void simdpp::load_packed2	(	any_vec< N, V > &	a,
		any_vec< N, V > &	b,
		const void *	p
	)

Loads values packed in pairs, de-interleaves them and stores the result into two vectors.

a = [ *(p), *(p+2), *(p+4), ... , *(p+M*2-2) ]

b = [ *(p+1), *(p+3), *(p+5), ... , *(p+M*2-1) ]

Here M is the number of elements in the vector

p must be aligned to the vector size in bytes

template<unsigned N, class V >

void simdpp::load_packed3	(	any_vec< N, V > &	a,
		any_vec< N, V > &	b,
		any_vec< N, V > &	c,
		const void *	p
	)

Loads values packed in triplets, de-interleaves them and stores the result into three vectors.

a = [ *(p),   *(p+3), *(p+6), ... , *(p+M*3-3) ]
b = [ *(p+1), *(p+4), *(p+7), ... , *(p+M*3-2) ]
c = [ *(p+2), *(p+5), *(p+8), ... , *(p+M*3-1) ]

Here M is the number of elements in the vector

p must be aligned to the vector size in bytes

template<unsigned N, class V >

void simdpp::load_packed4	(	any_vec< N, V > &	a,
		any_vec< N, V > &	b,
		any_vec< N, V > &	c,
		any_vec< N, V > &	d,
		const void *	p
	)

Loads values packed in quartets, de-interleaves them and stores the result into four vectors.

a = [ *(p),   *(p+4), *(p+8),  ... , *(p+M*4-4) ]
b = [ *(p+1), *(p+5), *(p+9),  ... , *(p+M*4-3) ]
c = [ *(p+2), *(p+6), *(p+10), ... , *(p+M*4-2) ]
d = [ *(p+3), *(p+7), *(p+11), ... , *(p+M*4-1) ]

Here M is the number of elements in the vector

p must be aligned to the vector size in bytes

template<class V = expr_vec_load_splat>

V simdpp::load_splat ( const void * p)

Loads a value from a memory location and broadcasts it to all elements of a vector.

r0 = *p
...
rN = *p

p must have the alignment of the element of the target vector.

template<class V = expr_vec_load_u>

V simdpp::load_u ( const void * p)

Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location.

128-bit version:

a[0..127] = *(p)

p must be aligned to the element size. If p is aligned to 16 bytes only the referenced 16 byte block is accessed. Otherwise, memory within the smallest 16-byte aligned 32-byte block may be accessed.

In ALTIVEC this intrinsic results in at least 4 instructions.

256-bit version:

a[0..255] = *(p)

p is not required to be aligned to 32 bytes.

In SSE2-SSE4.1 and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 6 instructions.

p must be aligned to the element size. If p is aligned to 32 bytes only the referenced 32 byte block is accessed. Otherwise, memory within the smallest 32-byte aligned 64-byte block may be accessed.

template<class V = expr_vec_make_const<double,1>>

V simdpp::make_float ( double v0)

Creates a vector from floating-point values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead is performed at compile-time thus even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version: | 0 1 2 3 ... n |

r = [ v0 v0 v0 v0 ... v0 ]

2 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v0 v1 ... v1 ]

4 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v2 v3 ... v3 ]

8 parameters version: | 0 1 .. 7 8 ... n |

r = [ v0 v1 .. v7 v0 ... v7 ]

template<class V = expr_vec_make_const<double,2>>

V simdpp::make_float	(	double	v0,
		double	v1
	)

Creates a vector from floating-point values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead is performed at compile-time thus even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version: | 0 1 2 3 ... n |

r = [ v0 v0 v0 v0 ... v0 ]

2 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v0 v1 ... v1 ]

4 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v2 v3 ... v3 ]

8 parameters version: | 0 1 .. 7 8 ... n |

r = [ v0 v1 .. v7 v0 ... v7 ]

template<class V = expr_vec_make_const<double,4>>

V simdpp::make_float	(	double	v0,
		double	v1,
		double	v2,
		double	v3
	)

Creates a vector from floating-point values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead is performed at compile-time thus even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version: | 0 1 2 3 ... n |

r = [ v0 v0 v0 v0 ... v0 ]

2 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v0 v1 ... v1 ]

4 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v2 v3 ... v3 ]

8 parameters version: | 0 1 .. 7 8 ... n |

r = [ v0 v1 .. v7 v0 ... v7 ]

template<class V = expr_vec_make_const<double,8>>

V simdpp::make_float	(	double	v0,
		double	v1,
		double	v2,
		double	v3,
		double	v4,
		double	v5,
		double	v6,
		double	v7
	)

Creates a vector from floating-point values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead is performed at compile-time thus even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version: | 0 1 2 3 ... n |

r = [ v0 v0 v0 v0 ... v0 ]

2 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v0 v1 ... v1 ]

4 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v2 v3 ... v3 ]

8 parameters version: | 0 1 .. 7 8 ... n |

r = [ v0 v1 .. v7 v0 ... v7 ]

template<class V = expr_vec_make_const<int64_t,1>>

V simdpp::make_int ( int64_t v0)

Creates a vector from signed integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead is performed at compile-time thus even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version: | 0 1 2 3 ... n |

r = [ v0 v0 v0 v0 ... v0 ]

2 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v0 v1 ... v1 ]

4 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v2 v3 ... v3 ]

8 parameters version: | 0 1 .. 7 8 ... n |

r = [ v0 v1 .. v7 v0 ... v7 ]

template<class V = expr_vec_make_const<int64_t,2>>

V simdpp::make_int	(	int64_t	v0,
		int64_t	v1
	)

Creates a vector from signed integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead is performed at compile-time thus even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version: | 0 1 2 3 ... n |

r = [ v0 v0 v0 v0 ... v0 ]

2 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v0 v1 ... v1 ]

4 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v2 v3 ... v3 ]

8 parameters version: | 0 1 .. 7 8 ... n |

r = [ v0 v1 .. v7 v0 ... v7 ]

template<class V = expr_vec_make_const<int64_t,4>>

V simdpp::make_int	(	int64_t	v0,
		int64_t	v1,
		int64_t	v2,
		int64_t	v3
	)

Creates a vector from signed integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead is performed at compile-time thus even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version: | 0 1 2 3 ... n |

r = [ v0 v0 v0 v0 ... v0 ]

2 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v0 v1 ... v1 ]

4 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v2 v3 ... v3 ]

8 parameters version: | 0 1 .. 7 8 ... n |

r = [ v0 v1 .. v7 v0 ... v7 ]

template<class V = expr_vec_make_const<int64_t,8>>

V simdpp::make_int	(	int64_t	v0,
		int64_t	v1,
		int64_t	v2,
		int64_t	v3,
		int64_t	v4,
		int64_t	v5,
		int64_t	v6,
		int64_t	v7
	)

Creates a vector from signed integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead is performed at compile-time thus even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version: | 0 1 2 3 ... n |

r = [ v0 v0 v0 v0 ... v0 ]

2 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v0 v1 ... v1 ]

4 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v2 v3 ... v3 ]

8 parameters version: | 0 1 .. 7 8 ... n |

r = [ v0 v1 .. v7 v0 ... v7 ]

template<class V = expr_vec_make_const<int64_t,16>>

V simdpp::make_int	(	int64_t	v0,
		int64_t	v1,
		int64_t	v2,
		int64_t	v3,
		int64_t	v4,
		int64_t	v5,
		int64_t	v6,
		int64_t	v7,
		int64_t	v8,
		int64_t	v9,
		int64_t	v10,
		int64_t	v11,
		int64_t	v12,
		int64_t	v13,
		int64_t	v14,
		int64_t	v15
	)

Creates a vector from signed integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead is performed at compile-time thus even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

1 parameter version: | 0 1 2 3 ... n |

r = [ v0 v0 v0 v0 ... v0 ]

2 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v0 v1 ... v1 ]

4 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v2 v3 ... v3 ]

8 parameters version: | 0 1 .. 7 8 ... n |

r = [ v0 v1 .. v7 v0 ... v7 ]

template<int s0, int s1, unsigned N>

uint8<N> simdpp::make_shuffle_bytes16_mask ( uint8< N > & mask)

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of two adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; the mask can then only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; the mask can then only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])
...
r14 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+14] : b[s0+12])
r15 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+14] : b[s1+12])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, unsigned N>

uint8<N> simdpp::make_shuffle_bytes16_mask ( uint8< N > & mask)

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of four adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector; the mask can then only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; the mask can then only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
...
r12 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0+12] : b[s0+8])
r13 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1+12] : b[s1+8])
r14 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2+12] : b[s2+8])
r15 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3+12] : b[s3+8])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, unsigned N>

uint8<N> simdpp::make_shuffle_bytes16_mask ( uint8< N > & mask)

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of eight adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,7] select elements from the first vector. Values [8,15] select elements from the second vector; the mask can then only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; the mask can then only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0] : b[s0-8])
...
r7 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7] : b[s7-8])
r8 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0+8] : b[s0])
...
r15 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7+8] : b[s7])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, int s8, int s9, int s10, int s11, int s12, int s13, int s14, int s15, unsigned N>

uint8<N> simdpp::make_shuffle_bytes16_mask ( uint8< N > & mask)

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,15] select elements from the first vector. Values [16,31] select elements from the second vector; the mask can then only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; the mask can then only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 16 ? a[s0] : b[s0-16])
r1 = (s1 == -1) ? 0 : (s1 < 16 ? a[s1] : b[s1-16])
...
r15 = (s15 == -1) ? 0 : (s15 < 16 ? a[s15] : b[s15-16])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, unsigned N>

uint16<N> simdpp::make_shuffle_bytes16_mask ( uint16< N > & mask)

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of two adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; the mask can then only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; the mask can then only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])
...
r6 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+6] : b[s0+4])
r7 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+6] : b[s1+4])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, unsigned N>

uint16<N> simdpp::make_shuffle_bytes16_mask ( uint16< N > & mask)

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of four adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector; the mask can then only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; the mask can then only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
r4 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0+4] : b[s0])
r5 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1+4] : b[s1])
r6 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2+4] : b[s2])
r7 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3+4] : b[s3])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, unsigned N>

uint16<N> simdpp::make_shuffle_bytes16_mask ( uint16< N > & mask)

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,7] select elements from the first vector. Values [8,15] select elements from the second vector; the mask can then only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; the mask can then only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0] : b[s0-8])
...
r7 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7] : b[s7-8])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, unsigned N>

uint32<N> simdpp::make_shuffle_bytes16_mask ( uint32< N > & mask)

Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of two adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; the mask can then only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; the mask can then only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, unsigned N>

uint32<N> simdpp::make_shuffle_bytes16_mask ( uint32< N > & mask)

Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector; the mask can then only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; the mask can then only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, unsigned N>

uint64<N> simdpp::make_shuffle_bytes16_mask ( uint64< N > & mask)

Makes a mask to shuffle an int64x2 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; the mask can then only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; the mask can then only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])

r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<class V = expr_vec_make_const<uint64_t,1>>

V simdpp::make_uint ( uint64_t v0)

Creates a vector from unsigned integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead is performed at compile-time thus even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

Note that per C++ rules negative values are sign-extended to fill entire element before being converted to unsigned type thus e.g. it's safe to use -1 to fill element with ones.

1 parameter version: | 0 1 2 3 ... n |

r = [ v0 v0 v0 v0 ... v0 ]

2 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v0 v1 ... v1 ]

4 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v2 v3 ... v3 ]

8 parameters version: | 0 1 .. 7 8 ... n |

r = [ v0 v1 .. v7 v0 ... v7 ]

template<class V = expr_vec_make_const<uint64_t,2>>

V simdpp::make_uint	(	uint64_t	v0,
		uint64_t	v1
	)

Creates a vector from unsigned integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time; thus, even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

Note that per C++ rules negative values are sign-extended to fill entire element before being converted to unsigned type thus e.g. it's safe to use -1 to fill element with ones.

1 parameter version: | 0 1 2 3 ... n |

r = [ v0 v0 v0 v0 ... v0 ]

2 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v0 v1 ... v1 ]

4 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v2 v3 ... v3 ]

8 parameters version: | 0 1 .. 7 8 ... n |

r = [ v0 v1 .. v7 v0 ... v7 ]

template<class V = expr_vec_make_const<uint64_t,4>>

V simdpp::make_uint	(	uint64_t	v0,
		uint64_t	v1,
		uint64_t	v2,
		uint64_t	v3
	)

Creates a vector from unsigned integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time; thus, even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

Note that per C++ rules negative values are sign-extended to fill entire element before being converted to unsigned type thus e.g. it's safe to use -1 to fill element with ones.

1 parameter version: | 0 1 2 3 ... n |

r = [ v0 v0 v0 v0 ... v0 ]

2 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v0 v1 ... v1 ]

4 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v2 v3 ... v3 ]

8 parameters version: | 0 1 .. 7 8 ... n |

r = [ v0 v1 .. v7 v0 ... v7 ]

template<class V = expr_vec_make_const<uint64_t,8>>

V simdpp::make_uint	(	uint64_t	v0,
		uint64_t	v1,
		uint64_t	v2,
		uint64_t	v3,
		uint64_t	v4,
		uint64_t	v5,
		uint64_t	v6,
		uint64_t	v7
	)

Creates a vector from unsigned integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time; thus, even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

Note that per C++ rules negative values are sign-extended to fill entire element before being converted to unsigned type thus e.g. it's safe to use -1 to fill element with ones.

1 parameter version: | 0 1 2 3 ... n |

r = [ v0 v0 v0 v0 ... v0 ]

2 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v0 v1 ... v1 ]

4 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v2 v3 ... v3 ]

8 parameters version: | 0 1 .. 7 8 ... n |

r = [ v0 v1 .. v7 v0 ... v7 ]

template<class V = expr_vec_make_const<uint64_t,16>>

V simdpp::make_uint	(	uint64_t	v0,
		uint64_t	v1,
		uint64_t	v2,
		uint64_t	v3,
		uint64_t	v4,
		uint64_t	v5,
		uint64_t	v6,
		uint64_t	v7,
		uint64_t	v8,
		uint64_t	v9,
		uint64_t	v10,
		uint64_t	v11,
		uint64_t	v12,
		uint64_t	v13,
		uint64_t	v14,
		uint64_t	v15
	)

Creates a vector from unsigned integer values known at compile-time.

The result of this function may be assigned or converted to a vector of any type: standard conversions are used to convert the arguments. All conversions and other overhead are performed at compile-time; thus, even if the minimal optimization level is selected, the function results in a simple load from memory.

The function is not guaranteed to have adequate performance if the arguments are not known at compile-time.

If the vector has fewer elements than the number of the parameters this function accepts then the extra values are discarded.

Note that per C++ rules negative values are sign-extended to fill entire element before being converted to unsigned type thus e.g. it's safe to use -1 to fill element with ones.

1 parameter version: | 0 1 2 3 ... n |

r = [ v0 v0 v0 v0 ... v0 ]

2 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v0 v1 ... v1 ]

4 parameters version: | 0 1 2 3 ... n |

r = [ v0 v1 v2 v3 ... v3 ]

8 parameters version: | 0 1 .. 7 8 ... n |

r = [ v0 v1 .. v7 v0 ... v7 ]

template<unsigned N, class E1 , class E2 >

float32<N, float32<N> > simdpp::max	(	float32< N, E1 >	a,
		float32< N, E2 >	b
	)

Computes maxima of the values of two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = max(a0, b0)
...
rN = max(aN, bN)

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

int8<N, int8<N> > simdpp::max	(	int8< N, E1 >	a,
		int8< N, E2 >	b
	)

Computes maximum of the signed 8-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 4 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint8<N, uint8<N> > simdpp::max	(	uint8< N, E1 >	a,
		uint8< N, E2 >	b
	)

Computes maximum of the unsigned 8-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

float64<N, float64<N> > simdpp::max	(	float64< N, E1 >	a,
		float64< N, E2 >	b
	)

Computes maxima of the values of two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = max(a0, b0)
...
rN = max(aN, bN)

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

template<unsigned N, class E1 , class E2 >

int16<N, int16<N> > simdpp::max	(	int16< N, E1 >	a,
		int16< N, E2 >	b
	)

Computes maximum of the signed 16-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint16<N, uint16<N> > simdpp::max	(	uint16< N, E1 >	a,
		uint16< N, E2 >	b
	)

Computes maximum of the unsigned 16-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

int32<N, int32<N> > simdpp::max	(	int32< N, E1 >	a,
		int32< N, E2 >	b
	)

Computes maximum of the signed 32-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 4 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint32<N, uint32<N> > simdpp::max	(	uint32< N, E1 >	a,
		uint32< N, E2 >	b
	)

Computes maximum of the unsigned 32-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

float32<N, float32<N> > simdpp::min	(	float32< N, E1 >	a,
		float32< N, E2 >	b
	)

Computes minimum of the values in two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = min(a0, b0)
...
rN = min(aN, bN)

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

int8<N, int8<N> > simdpp::min	(	int8< N, E1 >	a,
		int8< N, E2 >	b
	)

Computes minimum of signed 8-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 4 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint8<N, uint8<N> > simdpp::min	(	uint8< N, E1 >	a,
		uint8< N, E2 >	b
	)

Computes minimum of the unsigned 8-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

float64<N, float64<N> > simdpp::min	(	float64< N, E1 >	a,
		float64< N, E2 >	b
	)

Computes minima of the values in two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = min(a0, b0)
...
rN = min(aN, bN)

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

int16<N, int16<N> > simdpp::min	(	int16< N, E1 >	a,
		int16< N, E2 >	b
	)

Computes minimum of the signed 16-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint16<N, uint16<N> > simdpp::min	(	uint16< N, E1 >	a,
		uint16< N, E2 >	b
	)

Computes minimum of the unsigned 16-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

int32<N, int32<N> > simdpp::min	(	int32< N, E1 >	a,
		int32< N, E2 >	b
	)

Computes minimum of the signed 32-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 4 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint32<N, uint32<N> > simdpp::min	(	uint32< N, E1 >	a,
		uint32< N, E2 >	b
	)

Computes minimum of the unsigned 32-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift, unsigned N, class V >

detail::get_expr_nomask<V, void>::empty simdpp::move16_l ( const any_vec8< N, V > & a)

Moves the elements in an int8x16 vector to the left by shift positions.

shift:  pos:| 0   1    .  14  15  |
 0      r = [ a0  a1   .  a14 a15 ]
 1      r = [ a1  a2   .  a15  0  ]
 2      r = [ a2  a3   .   0   0  ]
  ...    ..   .. ..   ...  ..  .. ..
 14     r = [ a14 a15  .   0   0  ]
 15     r = [ a15  0   .   0   0  ]
 16     r = [  0   0   .   0   0  ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift, unsigned N, class V >

detail::get_expr_nomask<V, void>::empty simdpp::move16_r ( const any_vec8< N, V > & a)

Moves the 8-bit elements in a vector to the right by shift positions.

shift:  pos:| 0   1    .  14  15  |
 0      r = [ a0  a1   .  a14 a15 ]
 1      r = [  0  a0   .  a13 a14 ]
 2      r = [  0   0   .  a12 a13 ]
  ...    ..   .. ..   ...  ..  .. ..
 14     r = [  0   0   .  a0  a1  ]
 15     r = [  0   0   .   0  a0  ]
 16     r = [  0   0   .   0   0  ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift, unsigned N, class V >

detail::get_expr_nomask<V, void>::empty simdpp::move2_l ( const any_vec64< N, V > & a)

Moves the 64-bit elements in a vector to the left by shift positions.

shift:  pos:| 0  1  |
 0      r = [ a0 a1 ]
 1      r = [ a1  0 ]
 2      r = [  0  0 ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift, unsigned N, class V >

detail::get_expr_nomask<V, void>::empty simdpp::move2_r ( const any_vec64< N, V > & a)

Moves the 64-bit elements in a vector to the right by shift positions.

shift:  pos:| 0  1  |
 0      r = [ a0 a1 ]
 1      r = [  0 a0 ]
 2      r = [  0  0 ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift, unsigned N, class V >

detail::get_expr_nomask<V, void>::empty simdpp::move4_l ( const any_vec32< N, V > & a)

Moves the 32-bit elements in a vector to the left by shift positions.

shift:  pos:| 0  1  2  3  |
 0      r = [ a0 a1 a2 a3 ]
 1      r = [ a1 a2 a3  0 ]
 2      r = [ a2 a3  0  0 ]
 3      r = [ a3  0  0  0 ]
 4      r = [  0  0  0  0 ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift, unsigned N, class V >

detail::get_expr_nomask<V, void>::empty simdpp::move4_r ( const any_vec32< N, V > & a)

Moves the 32-bit elements in a vector to the right by shift positions.

shift:  pos:| 0  1  2  3  |
 0      r = [ a0 a1 a2 a3 ]
 1      r = [  0 a0 a1 a2 ]
 2      r = [  0  0 a0 a1 ]
 3      r = [  0  0  0 a0 ]
 4      r = [  0  0  0  0 ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift, unsigned N, class V >

detail::get_expr_nomask<V, void>::empty simdpp::move8_l ( const any_vec16< N, V > & a)

Moves the 16-bit elements in a vector to the left by shift positions.

shift:  pos:| 0  1   . 6  7  |
 0      r = [ a0 a1  . a6 a7 ]
 1      r = [ a1 a2  . a7  0 ]
 2      r = [ a2 a3  .  0  0 ]
  ...    ..   .. .. ... .. ..
 6      r = [ a6 a7  .  0  0 ]
 7      r = [ a7  0  .  0  0 ]
 8      r = [  0  0  .  0  0 ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned shift, unsigned N, class V >

detail::get_expr_nomask<V, void>::empty simdpp::move8_r ( const any_vec16< N, V > & a)

Moves the 16-bit elements in a vector to the right by shift positions.

shift:  pos:| 0  1   . 6  7  |
 0      r = [ a0 a1  . a6 a7 ]
 1      r = [  0 a0  . a5 a6 ]
 2      r = [  0  0  . a4 a5 ]
  ...    ..   .. .. ... .. ..
 6      r = [  0  0  . a0 a1 ]
 7      r = [  0  0  .  0 a0 ]
 8      r = [  0  0  .  0  0 ]

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

float32<N, expr_mul<float32<N,E1>, float32<N,E2> > > simdpp::mul	(	float32< N, E1 >	a,
		float32< N, E2 >	b
	)

Multiplies the values of two vectors.

r0 = a0 * b0
...
rN = aN * bN

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

float64<N, expr_mul<float64<N,E1>, float64<N,E2> > > simdpp::mul	(	float64< N, E1 >	a,
		float64< N, E2 >	b
	)

Multiplies the values of two vectors.

r0 = a0 * b0
...
rN = aN * bN

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

int16<N, expr_mul_hi<int16<N,E1>, int16<N,E2> > > simdpp::mul_hi	(	int16< N, E1 >	a,
		int16< N, E2 >	b
	)

Multiplies signed 16-bit values and returns the higher half of the result.

r0 = high(a0 * b0)
...
rN = high(aN * bN)

128-bit version:

In NEON and ALTIVEC this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON and ALTIVEC this intrinsic results in at least 6 instructions.

template<unsigned N, class E1 , class E2 >

uint16<N, expr_mul_hi<uint16<N,E1>, uint16<N,E2> > > simdpp::mul_hi	(	uint16< N, E1 >	a,
		uint16< N, E2 >	b
	)

Multiplies unsigned 16-bit values and returns the higher half of the result.

r0 = high(a0 * b0)
...
rN = high(aN * bN)

128-bit version:

In NEON and ALTIVEC this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON and ALTIVEC this intrinsic results in at least 6 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, expr_mul_lo<uint16<N, typename V1::expr_type>, uint16<N, typename V2::expr_type> > >::type simdpp::mul_lo	(	const any_int16< N, V1 > &	a,
		const any_int16< N, V2 > &	b
	)

Multiplies 16-bit values and returns the lower part of the multiplication.

r0 = low(a0 * b0)
...
rN = low(aN * bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, expr_mul_lo<uint32<N, typename V1::expr_type>, uint32<N, typename V2::expr_type> > >::type simdpp::mul_lo	(	const any_int32< N, V1 > &	a,
		const any_int32< N, V2 > &	b
	)

Multiplies 32-bit values and returns the lower half of the result.

r0 = low(a0 * b0)
...
rN = low(aN * bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 6 instructions.
In ALTIVEC this intrinsic results in at least 8 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 12 instructions.
In SSE4.1, AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 16 instructions.

template<unsigned N, class E1 , class E2 >

int32<N, expr_mull<int16<N,E1>, int16<N,E2> > > simdpp::mull	(	int16< N, E1 >	a,
		int16< N, E2 >	b
	)

Multiplies signed 16-bit values and expands the results to 32 bits.

128-bit version:

r0 = a0 * b0
...
rN = aN * bN

In SSE2-AVX and ALTIVEC this intrinsic results in at least 2-3 instructions.

256-bit version:

The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX and ALTIVEC this intrinsic results in at least 4-6 instructions.
In AVX2 and NEON this intrinsic results in at least 2-3 instructions.

template<unsigned N, class E1 , class E2 >

uint32<N, expr_mull<uint16<N,E1>, uint16<N,E2> > > simdpp::mull	(	uint16< N, E1 >	a,
		uint16< N, E2 >	b
	)

Multiplies unsigned 16-bit values and expands the results to 32 bits.

128-bit version:

r0 = a0 * b0
...
rN = aN * bN

In SSE2-AVX2 and ALTIVEC this intrinsic results in at least 2-3 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-AVX and ALTIVEC this intrinsic results in at least 4-6 instructions.
In AVX2 this intrinsic results in at least 2-3 instructions.
In NEON this intrinsic results in at least 2 instructions.
Note
Use with mull_hi on the same arguments to save instructions.

template<unsigned N, class E1 , class E2 >

int64<N, expr_mull<int32<N,E1>, int32<N,E2> > > simdpp::mull	(	int32< N, E1 >	a,
		int32< N, E2 >	b
	)

Multiplies signed 32-bit values and expands the results to 64 bits.

r0 = a0 * b0
...
rN = aN * bN

128-bit version:

In SSE4.1-AVX this intrinsic results in at least 3 instructions.
Not implemented for SSE2-SSSE3 and ALTIVEC.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE4.1-AVX this intrinsic results in at least 6 instructions.
In AVX2 this intrinsic results in at least 3 instructions.
In NEON this intrinsic results in at least 2 instructions.
Not implemented for SSE2-SSSE3 and ALTIVEC.

template<unsigned N, class E1 , class E2 >

uint64<N, expr_mull<uint32<N,E1>, uint32<N,E2> > > simdpp::mull	(	uint32< N, E1 >	a,
		uint32< N, E2 >	b
	)

Multiplies unsigned 32-bit values in the lower halves of the vectors and expands the results to 64 bits.

128-bit version:

r0 = a0 * b0

r1 = a1 * b1

In SSE2-AVX this intrinsic results in at least 3 instructions.
Not implemented for ALTIVEC.

256-bit version: The lower and higher 128-bit halves are processed as if a 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 6 instructions.
In AVX2 this intrinsic results in at least 3 instructions.
In NEON this intrinsic results in at least 2 instructions.
Not implemented for ALTIVEC.

template<unsigned N, class E >

int8<N, expr_neg<int8<N,E> > > simdpp::neg ( int8< N, E > a)

Negates signed 8-bit values.

r0 = -a0
...
rN = -aN

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E >

float32<N, expr_neg<float32<N,E> > > simdpp::neg ( float32< N, E > a)

Negates the values of a float32x4 vector.

r0 = -a0
...
rN = -aN

128-bit version:

In SSE2-AVX2 and ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-SSE4.1 and ALTIVEC this intrinsic results in at least 2-3 instructions.
In AVX-AVX2 and NEON this intrinsic results in at least 2 instructions.

template<unsigned N, class E >

int16<N, expr_neg<int16<N,E> > > simdpp::neg ( int16< N, E > a)

Negates signed 16-bit values.

r0 = -a0
...
rN = -aN

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E >

float64<N, expr_neg<float64<N,E> > > simdpp::neg ( float64< N, E > a)

Negates the values of a vector.

r0 = -a0
...
rN = -aN

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 1-2 instructions.
Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2-3 instructions.
In AVX-AVX2 this intrinsic results in at least 1-2 instructions.
Not vectorized in NEON and ALTIVEC.

template<unsigned N, class E >

int32<N, expr_neg<int32<N,E> > > simdpp::neg ( int32< N, E > a)

Negates signed 32-bit values.

r0 = -a0
...
rN = -aN

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E >

int64<N, expr_neg<int64<N,E> > > simdpp::neg ( int64< N, E > a)

Negates signed 64-bit values.

r0 = -a0
...
rN = -aN

128-bit version:

In ALTIVEC this intrinsic results in at least 4-5 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 8-9 instructions.

template<unsigned s0, unsigned s1, unsigned N, class V >

detail::get_expr_nomask<V, void>::empty simdpp::permute2 ( const any_vec16< N, V > & a)

Permutes the 16-bit values within sets of two consecutive elements of the vector.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
r2 = a[s0+2]
r3 = a[s1+2]
r4 = a[s0+4]
r5 = a[s1+4]
...

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 2 instructions.
In NEON and ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 4 instructions.
In AVX2 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1, unsigned N, class V >

detail::get_expr_nomask<V, void>::empty simdpp::permute2 ( const any_vec32< N, V > & a)

Permutes the values of each set of two consecutive 32-bit values.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
r2 = a[s0+2]
r3 = a[s1+2]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = a[s0+6]
r7 = a[s1+6]

integer

128-bit version:

In NEON this intrinsic results in at least 2-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

floating-point

128-bit version:

In NEON this intrinsic results in at least 2-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1, unsigned N, class V >

detail::get_expr_nomask<V, void>::empty simdpp::permute2 ( const any_vec64< N, V > & a)

Permutes the values of each set of two consecutive 64-bit values.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
256-bit version:
r2 = a[s0+2]
r3 = a[s1+2]

128-bit version:

In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-4 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N, class V >

detail::get_expr_nomask<V, void>::empty simdpp::permute4 ( const any_vec16< N, V > & a)

Permutes the 16-bit values within each 4 consecutive values of the vector.

The selector values must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
r4 = a[s0+4]
...
r7 = a[s3+4]
256-bit version:
r8 = a[s0+8]
...
r11 = a[s3+8]
r12 = a[s0+12]
...
r15 = a[s3+12]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 1-5 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 4 instructions.
In AVX2 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-10 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N, class V >

detail::get_expr_nomask<V, void>::empty simdpp::permute4 ( const any_vec32< N, V > & a)

Permutes the values of each set of four consecutive 32-bit values.

The selector values must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
256-bit version:
r4 = a[s0+4]
...
r7 = a[s3+4]

integer

128-bit version:

In NEON this intrinsic results in at least 1-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

floating-point

128-bit version:

In NEON this intrinsic results in at least 1-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N, class V >

detail::get_expr_nomask<V, void>::empty simdpp::permute4 ( const any_vec64< N, V > & a)

Permutes the values of each set of four consecutive 64-bit values.

The selector values must be in range [0; 3].

r0 = a[s0]
r1 = a[s1]
r2 = a[s2]
r3 = a[s3]

integer

In SSE2-AVX this intrinsic results in at least 2 instructions.

floating-point

In SSE2-AVX this intrinsic results in at least 1-2 instructions.
In NEON this intrinsic results in at least 1-4 instructions.
In ALTIVEC this intrinsic results in at least 1-4 instructions.

uint8x16 simdpp::permute_bytes16	(	uint8x16	a,
		uint8x16	mask
	)

inline

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: bits 7-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.

template<unsigned N>

uint8<N> simdpp::permute_bytes16	(	uint8< N >	a,
		uint8< N >	mask
	)

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: bits 7-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.

template<unsigned N>

uint16<N> simdpp::permute_bytes16	(	uint16< N >	a,
		uint16< N >	mask
	)

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: bits 7-4 must be zero or the behavior is undefined; bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.

template<unsigned N>

uint32<N> simdpp::permute_bytes16	(	uint32< N >	a,
		uint32< N >	mask
	)

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: Bits 7-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.

template<unsigned N>

uint64<N> simdpp::permute_bytes16	(	uint64< N >	a,
		uint64< N >	mask
	)

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: Bits 7-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.

template<unsigned N>

float32<N> simdpp::permute_bytes16	(	float32< N >	a,
		uint32< N >	mask
	)

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: Bits 7-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.

template<unsigned N>

float64<N> simdpp::permute_bytes16	(	float64< N >	a,
		uint64< N >	mask
	)

Selects bytes from a vector according to a mask.

Each byte within the mask defines which element to select: Bits 7-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX and ALTIVEC this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.

uint8x16 simdpp::permute_zbytes16	(	uint8x16	a,
		uint8x16	mask
	)

inline

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N>

uint8<N> simdpp::permute_zbytes16	(	uint8< N >	a,
		uint8< N >	mask
	)

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N>

uint16<N> simdpp::permute_zbytes16	(	uint16< N >	a,
		uint16< N >	mask
	)

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N>

uint32<N> simdpp::permute_zbytes16	(	uint32< N >	a,
		uint32< N >	mask
	)

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N>

uint64<N> simdpp::permute_zbytes16	(	uint64< N >	a,
		uint64< N >	mask
	)

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N>

float32<N> simdpp::permute_zbytes16	(	float32< N >	a,
		uint32< N >	mask
	)

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N>

float64<N> simdpp::permute_zbytes16	(	float64< N >	a,
		uint64< N >	mask
	)

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N, class E >

float32<N, float32<N> > simdpp::rcp_e ( float32< N, E > a)

Computes approximate reciprocal.

Relative error is as follows:

1/2 ULP for NULL and NEON
~1/2730 for SSE2
1/4096 for ALTIVEC
1/256 for NEON_FLT_SP

r0 = approx(1.0f / a0)
...
rN = approx(1.0f / aN)

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E >

float32<N, float32<N> > simdpp::rcp_rh ( float32< N, E > a)

Computes one Newton-Raphson iteration for reciprocal.

x is the current estimate, a are the values to estimate reciprocal for.

r0 = x0 * (2 - x0*a0)
...
rN = xN * (2 - xN*aN)

Using this function, division can be implemented as follows:

// a/b
float32x4 x;
x = rcp_e(b);
x = rcp_rh(x, b);
x = rcp_rh(x, b);
return mul(a, x);

Precision can be controlled by selecting the number of rcp_rh steps.

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

256-bit version:

In AVX-AVX2 this intrinsic results in at least 3-4 instructions.
In SSE2-SSE4.1 this intrinsic results in at least 6-7 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 4-5 instructions.

template<unsigned N, class E >

float32<N, float32<N> > simdpp::rsqrt_e ( float32< N, E > a)

Computes approximate reciprocal square root.

Relative error is as follows:

1/2 ULP for NULL and NEON
~1/2730 for SSE2
1/4096 for ALTIVEC
1/256 for NEON_FLT_SP

r0 = approx(1 / sqrt(a0))
...
rN = approx(1 / sqrt(aN))

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E >

float32<N, float32<N> > simdpp::rsqrt_rh ( float32< N, E > a)

Computes one Newton-Raphson iteration for inverse of square root.

x is the current estimate, a are the values to estimate the inverse square root for.

r0 = x0 * (3 - a0*x0*x0) * 0.5
...
rN = xN * (3 - aN*xN*xN) * 0.5

128-bit version:

In SSE2, SSE3, SSSE3 and SSE4.1 this intrinsic results in at least 5-7 instructions.
In NEON this intrinsic results in at least 3 instructions.
In ALTIVEC this intrinsic results in at least 4-6 instructions.

256-bit version:

In AVX-AVX2 this intrinsic results in at least 7 instructions.
In SSE2, SSE3, SSSE3 and SSE4.1 this intrinsic results in at least 10-12 instructions.
In NEON this intrinsic results in at least 6 instructions.
In ALTIVEC this intrinsic results in at least 8-10 instructions.

template<unsigned N, class E >

int8<N, int8<N> > simdpp::shift_l	(	int8< N, E >	a,
		unsigned	count
	)

Shifts 8-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count

128-bit version:

In SSE2-AVX this intrinsic results in at least 4-5 instructions.
In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 1-4 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8-9 instructions.
In AVX2 this intrinsic results in at least 4-5 instructions.
In NEON this intrinsic results in at least 2-3 instructions.
In ALTIVEC this intrinsic results in at least 2-5 instructions.

template<unsigned N, class E >

uint8< N, uint8< N > > simdpp::shift_l	(	uint8< N, E >	a,
		unsigned	count
	)

Shifts 8-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count

128-bit version:

In SSE2-AVX this intrinsic results in at least 4-5 instructions.
In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 1-4 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8-9 instructions.
In AVX2 this intrinsic results in at least 4-5 instructions.
In NEON this intrinsic results in at least 2-3 instructions.
In ALTIVEC this intrinsic results in at least 2-5 instructions.

template<unsigned N, class E >

int16<N, int16<N> > simdpp::shift_l	(	int16< N, E >	a,
		unsigned	count
	)

Shifts 16-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count

128-bit version:

In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 1-4 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-3 instructions.
In ALTIVEC this intrinsic results in at least 2-5 instructions.

template<unsigned N, class E >

uint16< N, uint16< N > > simdpp::shift_l	(	uint16< N, E >	a,
		unsigned	count
	)

Shifts 16-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count

128-bit version:

In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 1-4 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-3 instructions.
In ALTIVEC this intrinsic results in at least 2-5 instructions.

template<unsigned N, class E >

int32<N, int32<N> > simdpp::shift_l	(	int32< N, E >	a,
		unsigned	count
	)

Shifts 32-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count

128-bit version:

In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 1-4 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-3 instructions.
In ALTIVEC this intrinsic results in at least 2-5 instructions.

template<unsigned N, class E >

uint32< N, uint32< N > > simdpp::shift_l	(	uint32< N, E >	a,
		unsigned	count
	)

Shifts 32-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count

128-bit version:

In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 1-4 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-3 instructions.
In ALTIVEC this intrinsic results in at least 2-5 instructions.

template<unsigned N, class E >

int64<N, int64<N> > simdpp::shift_l	(	int64< N, E >	a,
		unsigned	count
	)

Shifts 64-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count

128-bit version:

In NEON this intrinsic results in at least 1-2 instructions.
Not implemented for ALTIVEC.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-3 instructions.
Not implemented for ALTIVEC.

template<unsigned N, class E >

uint64< N, uint64< N > > simdpp::shift_l	(	uint64< N, E >	a,
		unsigned	count
	)

Shifts 64-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count

128-bit version:

In NEON this intrinsic results in at least 1-2 instructions.
Not implemented for ALTIVEC.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-3 instructions.
Not implemented for ALTIVEC.

template<unsigned count, unsigned N, class E >

int8<N, int8<N> > simdpp::shift_l ( int8< N, E > a)

Shifts 8-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 2-3 instructions.
In NEON this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 4-5 instructions.
In AVX2 and NEON this intrinsic results in at least 2-3 instructions.

template<unsigned count, unsigned N, class E >

uint8< N, uint8< N > > simdpp::shift_l ( uint8< N, E > a)

Shifts 8-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 2-3 instructions.
In NEON this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 4-5 instructions.
In AVX2 and NEON this intrinsic results in at least 2-3 instructions.

template<unsigned count, unsigned N, class E >

int16<N, int16<N> > simdpp::shift_l ( int16< N, E > a)

Shifts 16-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned count, unsigned N, class E >

uint16< N, uint16< N > > simdpp::shift_l ( uint16< N, E > a)

Shifts 16-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned count, unsigned N, class E >

int32<N, int32<N> > simdpp::shift_l ( int32< N, E > a)

Shifts 32-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned count, unsigned N, class E >

uint32< N, uint32< N > > simdpp::shift_l ( uint32< N, E > a)

Shifts 32-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned count, unsigned N, class E >

int64<N, int64<N> > simdpp::shift_l ( int64< N, E > a)

Shifts 64-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count

Not implemented for ALTIVEC.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
Not implemented for ALTIVEC.

template<unsigned count, unsigned N, class E >

uint64< N, uint64< N > > simdpp::shift_l ( uint64< N, E > a)

Shifts 64-bit values left by count bits while shifting in zeros.

r0 = a0 << count
...
rN = aN << count

Not implemented for ALTIVEC.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
Not implemented for ALTIVEC.

template<unsigned N, class E >

int8< N, int8< N > > simdpp::shift_r	(	int8< N, E >	a,
		unsigned	count
	)

Shifts signed 8-bit values right by count bits while shifting in the sign bit.

r0 = a0 >> count
...
rN = aN >> count

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 6 instructions.
In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 1-4 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 6 instructions.
In NEON this intrinsic results in at least 2-3 instructions.
In ALTIVEC this intrinsic results in at least 2-5 instructions.

template<unsigned N, class E >

uint8< N, uint8< N > > simdpp::shift_r	(	uint8< N, E >	a,
		unsigned	count
	)

Shifts unsigned 8-bit values right by count bits while shifting in zeros.

r0 = a0 >> count
...
rN = aN >> count

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.
In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 1-4 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8-9 instructions.
In AVX2 this intrinsic results in at least 4-5 instructions.
In NEON this intrinsic results in at least 2-3 instructions.
In ALTIVEC this intrinsic results in at least 2-5 instructions.

template<unsigned N, class E >

int16< N, int16< N > > simdpp::shift_r	(	int16< N, E >	a,
		unsigned	count
	)

Shifts signed 16-bit values right by count bits while shifting in the sign bit.

r0 = a0 >> count
...
rN = aN >> count

128-bit version:

In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 1-4 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-3 instructions.
In ALTIVEC this intrinsic results in at least 2-5 instructions.

template<unsigned N, class E >

uint16< N, uint16< N > > simdpp::shift_r	(	uint16< N, E >	a,
		unsigned	count
	)

Shifts unsigned 16-bit values right by count bits while shifting in zeros.

r0 = a0 >> count
...
rN = aN >> count

128-bit version:

In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 1-4 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-3 instructions.
In ALTIVEC this intrinsic results in at least 2-5 instructions.

template<unsigned N, class E >

int32< N, int32< N > > simdpp::shift_r	(	int32< N, E >	a,
		unsigned	count
	)

Shifts signed 32-bit values right by count bits while shifting in the sign bit.

r0 = a0 >> count
...
rN = aN >> count

r0 = a0 >> count
...
rN = aN >> count

In NEON this intrinsic results in at least 2 instructions.

128-bit version:

In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 1-4 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-3 instructions.
In ALTIVEC this intrinsic results in at least 2-5 instructions.

template<unsigned N, class E >

uint32< N, uint32< N > > simdpp::shift_r	(	uint32< N, E >	a,
		unsigned	count
	)

Shifts unsigned 32-bit values right by count bits while shifting in zeros.

r0 = a0 >> count
...
rN = aN >> count

128-bit version:

In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 1-4 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-3 instructions.
In ALTIVEC this intrinsic results in at least 2-5 instructions.

template<unsigned N, class E >

int64< N, int64< N > > simdpp::shift_r	(	int64< N, E >	a,
		unsigned	count
	)

Shifts signed 64-bit values right by count bits while shifting in the sign bit.

r0 = a0 >> count
...
rN = aN >> count

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4-6 instructions.
In NEON this intrinsic results in at least 2 instructions.
Not implemented for ALTIVEC.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8-10 instructions.
In AVX2 this intrinsic results in at least 4-6 instructions.
In NEON this intrinsic results in at least 3 instructions.
Not implemented for ALTIVEC.

template<unsigned N, class E >

uint64< N, uint64< N > > simdpp::shift_r	(	uint64< N, E >	a,
		unsigned	count
	)

Shifts unsigned 64-bit values right by count bits while shifting in zeros.

r0 = a0 >> count
...
rN = aN >> count

128-bit version:

In NEON this intrinsic results in at least 1-2 instructions.
Not implemented for ALTIVEC.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-3 instructions.
Not implemented for ALTIVEC.

template<unsigned count, unsigned N, class E >

int8< N, int8< N > > simdpp::shift_r ( int8< N, E > a)

Shifts signed 8-bit values right by count bits while shifting in the sign bit.

r0 = a0 >> count
...
rN = aN >> count

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 6 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 6 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned count, unsigned N, class E >

uint8< N, uint8< N > > simdpp::shift_r ( uint8< N, E > a)

Shifts unsigned 8-bit values right by count bits while shifting in zeros.

r0 = a0 >> count
...
rN = aN >> count

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.
In SSE2-AVX2 this intrinsic results in at least 2-3 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 4-5 instructions.
In AVX2 this intrinsic results in at least 2-3 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned count, unsigned N, class E >

int16< N, int16< N > > simdpp::shift_r ( int16< N, E > a)

Shifts signed 16-bit values right by count bits while shifting in the sign bit.

r0 = a0 >> count
...
rN = aN >> count

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned count, unsigned N, class E >

uint16< N, uint16< N > > simdpp::shift_r ( uint16< N, E > a)

Shifts unsigned 16-bit values right by count bits while shifting in zeros.

r0 = a0 >> count
...
rN = aN >> count

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned count, unsigned N, class E >

int32< N, int32< N > > simdpp::shift_r ( int32< N, E > a)

Shifts signed 32-bit values right by count bits while shifting in the sign bit.

r0 = a0 >> count
...
rN = aN >> count

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned count, unsigned N, class E >

uint32< N, uint32< N > > simdpp::shift_r ( uint32< N, E > a)

Shifts unsigned 32-bit values right by count bits while shifting in zeros.

r0 = a0 >> count
...
rN = aN >> count

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned count, unsigned N, class E >

int64< N, int64< N > > simdpp::shift_r ( int64< N, E > a)

Shifts signed 64-bit values right by count bits while shifting in the sign bit.

r0 = a0 >> count
...
rN = aN >> count

128-bit version:

Not vectorized in SSE2-AVX2.
In SSE2-AVX2 this intrinsic results in at least 4-6 instructions.
Not implemented for ALTIVEC.

256-bit version:

Not vectorized in SSE2-AVX.
In SSE2-AVX this intrinsic results in at least 8-10 instructions.
In AVX2 this intrinsic results in at least 4-6 instructions.
In NEON this intrinsic results in at least 2 instructions.
Not implemented for ALTIVEC.

template<unsigned count, unsigned N, class E >

uint64< N, uint64< N > > simdpp::shift_r ( uint64< N, E > a)

Shifts unsigned 64-bit values right by count bits while shifting in zeros.

r0 = a0 >> count
...
rN = aN >> count

Not implemented for ALTIVEC.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
Not implemented for ALTIVEC.

template<unsigned s0, unsigned s1, unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, void>::empty simdpp::shuffle1	(	const any_vec64< N, V1 > &	a,
		const any_vec64< N, V2 > &	b
	)

Selects 64-bit values from two vectors.

The first value in each pair of values must come from a, the second - from b. The selector values must be in range [0; 1].

r0 = a[s0]
r1 = b[s1]
256-bit version:
r2 = a[s0+2]
r3 = b[s1+2]

floating-point

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

integer

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned sa0, unsigned sa1, unsigned sb0, unsigned sb1, unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, void>::empty simdpp::shuffle2	(	const any_vec32< N, V1 > &	a,
		const any_vec32< N, V2 > &	b
	)

Selects 32-bit values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[sa0]
r1 = a[sa1]
r2 = b[sb0]
r3 = b[sb1]
256-bit version:
r4 = a[sa0+4]
r5 = a[sa1+4]
r6 = b[sb0+4]
r7 = b[sb1+4]

floating-point

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.
In NEON this intrinsic results in at least 1-4 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

integer

128-bit version:

In NEON this intrinsic results in at least 1-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1, unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, void>::empty simdpp::shuffle2	(	const any_vec32< N, V1 > &	a,
		const any_vec32< N, V2 > &	b
	)

Selects 32-bit values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[s0]
r1 = a[s1]
r2 = b[s0]
r3 = b[s1]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = b[s0+4]
r7 = b[s1+4]

floating-point

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.
In NEON this intrinsic results in at least 2-4 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

integer

128-bit version:

In NEON this intrinsic results in at least 2-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

uint8x16 simdpp::shuffle_bytes16	(	uint8x16	a,
		uint8x16	b,
		uint8x16	mask
	)

inline

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select: bits 7-5 must be zero or the behavior is undefined; bit 4 defines which vector to select (0 corresponds to a, 1 to b); bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 6 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
In XOP this intrinsic results in at least 1 instructions.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 12 instructions.
In SSE4.1-AVX this intrinsic results in at least 8 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N>

uint8<N> simdpp::shuffle_bytes16	(	uint8< N >	a,
		uint8< N >	b,
		uint8< N >	mask
	)

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select: bits 7-5 must be zero or the behavior is undefined; bit 4 defines which vector to select (0 corresponds to a, 1 to b); bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 6 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
In XOP this intrinsic results in at least 1 instructions.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 12 instructions.
In SSE4.1-AVX this intrinsic results in at least 8 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N>

uint16<N> simdpp::shuffle_bytes16	(	uint16< N >	a,
		uint16< N >	b,
		uint16< N >	mask
	)

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select: bits 7-5 must be zero or the behavior is undefined; bit 4 defines which vector to select (0 corresponds to a, 1 to b); bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 6 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
In XOP this intrinsic results in at least 1 instructions.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 12 instructions.
In SSE4.1-AVX this intrinsic results in at least 8 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N>

uint32<N> simdpp::shuffle_bytes16	(	uint32< N >	a,
		uint32< N >	b,
		uint32< N >	mask
	)

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select: bits 7-5 must be zero or the behavior is undefined; bit 4 defines which vector to select (0 corresponds to a, 1 to b); bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 6 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
In XOP this intrinsic results in at least 1 instructions.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 12 instructions.
In SSE4.1-AVX this intrinsic results in at least 8 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N>

uint64<N> simdpp::shuffle_bytes16	(	uint64< N >	a,
		uint64< N >	b,
		uint64< N >	mask
	)

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select: bits 7-5 must be zero or the behavior is undefined; bit 4 defines which vector to select (0 corresponds to a, 1 to b); bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 6 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
In XOP this intrinsic results in at least 1 instructions.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 12 instructions.
In SSE4.1-AVX this intrinsic results in at least 8 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N>

float32<N> simdpp::shuffle_bytes16	(	float32< N >	a,
		float32< N >	b,
		uint32< N >	mask
	)

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select: bits 7-5 must be zero or the behavior is undefined; bit 4 defines which vector to select (0 corresponds to a, 1 to b); bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 6 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
In XOP this intrinsic results in at least 1 instructions.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 12 instructions.
In SSE4.1-AVX this intrinsic results in at least 8 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N>

float64<N> simdpp::shuffle_bytes16	(	float64< N >	a,
		float64< N >	b,
		uint64< N >	mask
	)

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select: bits 7-5 must be zero or the behavior is undefined; bit 4 defines which vector to select (0 corresponds to a, 1 to b); bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 6 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
In XOP this intrinsic results in at least 1 instructions.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 12 instructions.
In SSE4.1-AVX this intrinsic results in at least 8 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

uint8x16 simdpp::shuffle_zbytes16	(	uint8x16	a,
		uint8x16	b,
		uint8x16	mask
	)

inline

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: bit 7, if set, results in the result byte being zeroed; bits 6-5 must be zero or the behavior is undefined; bit 4 defines which vector to select (0 corresponds to a, 1 to b); bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 9 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 1 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 18 instructions.
In SSE4.1-AVX this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N>

uint8<N> simdpp::shuffle_zbytes16	(	uint8< N >	a,
		uint8< N >	b,
		uint8< N >	mask
	)

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: bit 7, if set, results in the result byte being zeroed; bits 6-5 must be zero or the behavior is undefined; bit 4 defines which vector to select (0 corresponds to a, 1 to b); bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 9 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 1 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 18 instructions.
In SSE4.1-AVX this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N>

uint16<N> simdpp::shuffle_zbytes16	(	uint16< N >	a,
		uint16< N >	b,
		uint16< N >	mask
	)

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: bit 7, if set, results in the result byte being zeroed; bits 6-5 must be zero or the behavior is undefined; bit 4 defines which vector to select (0 corresponds to a, 1 to b); bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 9 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 1 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 18 instructions.
In SSE4.1-AVX this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N>

uint32<N> simdpp::shuffle_zbytes16	(	uint32< N >	a,
		uint32< N >	b,
		uint32< N >	mask
	)

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: bit 7, if set, results in the result byte being zeroed; bits 6-5 must be zero or the behavior is undefined; bit 4 defines which vector to select (0 corresponds to a, 1 to b); bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 9 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 1 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 18 instructions.
In SSE4.1-AVX this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N>

uint64<N> simdpp::shuffle_zbytes16	(	uint64< N >	a,
		uint64< N >	b,
		uint64< N >	mask
	)

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: bit 7, if set, results in the result byte being zeroed; bits 6-5 must be zero or the behavior is undefined; bit 4 defines which vector to select (0 corresponds to a, 1 to b); bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 9 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 1 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 18 instructions.
In SSE4.1-AVX this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N>

float32<N> simdpp::shuffle_zbytes16	(	float32< N >	a,
		float32< N >	b,
		uint32< N >	mask
	)

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: bit 7, if set, results in the result byte being zeroed; bits 6-5 must be zero or the behavior is undefined; bit 4 defines which vector to select (0 corresponds to a, 1 to b); bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 9 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 1 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 18 instructions.
In SSE4.1-AVX this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N>

float64<N> simdpp::shuffle_zbytes16	(	float64< N >	a,
		float64< N >	b,
		uint64< N >	mask
	)

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: bit 7, if set, results in the result byte being zeroed; bits 6-5 must be zero or the behavior is undefined; bit 4 defines which vector to select (0 corresponds to a, 1 to b); bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 9 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 1 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 18 instructions.
In SSE4.1-AVX this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N, class E >

float32<N, float32<N> > simdpp::sign ( float32< N, E > a)

Extracts sign bits from the values in float32x4 vector.

r0 = a0 & 0x80000000
...
rN = aN & 0x80000000

128-bit version:

In SSE2-SSE4.1, ALTIVEC and NEON this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-SSE4.1, ALTIVEC and NEON this intrinsic results in at least 2-3 instructions.
In AVX-AVX2 this intrinsic results in at least 1-2 instructions.

template<unsigned N, class E >

float64<N, float64<N> > simdpp::sign ( float64< N, E > a)

Extracts sign bits from the values in float64x2 vector.

r0 = a0 & 0x8000000000000000
...
rN = aN & 0x8000000000000000

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 1-2 instructions.
Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2-3 instructions.
In AVX-AVX2 this intrinsic results in at least 1-2 instructions.
Not vectorized in NEON and ALTIVEC.

template<class V = expr_vec_set_splat<int>>

V simdpp::splat ( int x)

Loads a value from a register and broadcasts it to all elements of a vector.

The argument value is converted to the element of the resulting vector using standard conversions.

r0 = x
...
rN = x

template<class V = expr_vec_set_splat<unsigned>>

V simdpp::splat ( unsigned x)

Loads a value from a register and broadcasts it to all elements of a vector.

The argument value is converted to the element of the resulting vector using standard conversions.

r0 = x
...
rN = x

template<class V = expr_vec_set_splat<int64_t>>

V simdpp::splat ( int64_t x)

Loads a value from a register and broadcasts it to all elements of a vector.

The argument value is converted to the element of the resulting vector using standard conversions.

r0 = x
...
rN = x

template<class V = expr_vec_set_splat<uint64_t>>

V simdpp::splat ( uint64_t x)

Loads a value from a register and broadcasts it to all elements of a vector.

The argument value is converted to the element of the resulting vector using standard conversions.

r0 = x
...
rN = x

template<class V = expr_vec_set_splat<float>>

V simdpp::splat ( float x)

Loads a value from a register and broadcasts it to all elements of a vector.

The argument value is converted to the element of the resulting vector using standard conversions.

r0 = x
...
rN = x

template<class V = expr_vec_set_splat<double>>

V simdpp::splat ( double x)

Loads a value from a register and broadcasts it to all elements of a vector.

The argument value is converted to the element of the resulting vector using standard conversions.

r0 = x
...
rN = x

template<unsigned s, unsigned N, class V >

detail::get_expr_nomask<V, void>::empty simdpp::splat ( const any_vec< N, V > & a)

Broadcasts the specified element to all elements.

r0 = a[s]
r1 = a[s]
...
rN = a[s]

int8

128-bit version:

In SSE2-AVX this intrinsic results in at least 5 instructions.
In AVX2 this intrinsic results in at least 2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int16

128-bit version:

In SSE2-AVX this intrinsic results in at least 5 instructions.
In AVX2 this intrinsic results in at least 2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int32

256-bit version:

In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int64

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

float32

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float64

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

template<unsigned s, unsigned N, class E >

int8<N, expr_splat16<s,int8<N,E> > > simdpp::splat16 ( int8< N, E > a)

Broadcasts the specified 8-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
...
rN = a[s]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 7 instructions.
In SSSE3-AVX this intrinsic results in at least 1-2 instructions.
In AVX2 this intrinsic results in at least 2 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 14 instructions.
In SSSE3-AVX this intrinsic results in at least 2-3 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s, unsigned N, class E >

uint8<N, expr_splat16<s,uint8<N,E> > > simdpp::splat16 ( uint8< N, E > a)

Broadcasts the specified 8-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
...
rN = a[s]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 7 instructions.
In SSSE3-AVX this intrinsic results in at least 1-2 instructions.
In AVX2 this intrinsic results in at least 2 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSE3 this intrinsic results in at least 14 instructions.
In SSSE3-AVX this intrinsic results in at least 2-3 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s, unsigned N, class E >

int64<N, expr_splat2<s,int64<N,E> > > simdpp::splat2 ( int64< N, E > a)

Broadcasts the specified 64-bit value to all elements within 128-bit lanes.

r0 = a[s]

r1 = a[s]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s, unsigned N, class E >

uint64<N, expr_splat2<s,uint64<N,E> > > simdpp::splat2 ( uint64< N, E > a)

Broadcasts the specified 64-bit value to all elements within 128-bit lanes.

r0 = a[s]

r1 = a[s]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s, unsigned N, class E >

float64<N, expr_splat2<s,float64<N,E> > > simdpp::splat2 ( float64< N, E > a)

Broadcasts the specified 64-bit value to all elements within 128-bit lanes.

r0 = a[s]

r1 = a[s]

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

template<unsigned s, unsigned N, class E >

int32<N, expr_splat4<s,int32<N,E> > > simdpp::splat4 ( int32< N, E > a)

Broadcasts the specified 32-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
r2 = a[s]
r3 = a[s]

256-bit version:

In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s, unsigned N, class E >

uint32<N, expr_splat4<s,uint32<N,E> > > simdpp::splat4 ( uint32< N, E > a)

Broadcasts the specified 32-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
r2 = a[s]
r3 = a[s]

256-bit version:

In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s, unsigned N, class E >

float32<N, expr_splat4<s,float32<N,E> > > simdpp::splat4 ( float32< N, E > a)

Broadcasts the specified 32-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
r2 = a[s]
r3 = a[s]

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s, unsigned N, class E >

int16<N, expr_splat8<s,int16<N,E> > > simdpp::splat8 ( int16< N, E > a)

Broadcasts the specified 16-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
...
r7 = a[s]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.
In SSSE3-AVX this intrinsic results in at least 1-2 instructions.
In AVX2 this intrinsic results in at least 2 instructions.

256-bit version:

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX this intrinsic results in at least 2-3 instructions.
In AVX2, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned s, unsigned N, class E >

uint16<N, expr_splat8<s,uint16<N,E> > > simdpp::splat8 ( uint16< N, E > a)

Broadcasts the specified 16-bit value to all elements within 128-bit lanes.

r0 = a[s]
r1 = a[s]
...
r7 = a[s]

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.
In SSSE3-AVX this intrinsic results in at least 1-2 instructions.
In AVX2 this intrinsic results in at least 2 instructions.

256-bit version:

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX this intrinsic results in at least 2-3 instructions.
In AVX2, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

void simdpp::split	(	uint8x32	a,
		uint8x16 &	r1,
		uint8x16 &	r2
	)

inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instructions.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

void simdpp::split	(	uint16x16	a,
		uint16x8 &	r1,
		uint16x8 &	r2
	)

inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instructions.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

void simdpp::split	(	uint32x8	a,
		uint32x4 &	r1,
		uint32x4 &	r2
	)

inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instructions.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

void simdpp::split	(	uint64x4	a,
		uint64x2 &	r1,
		uint64x2 &	r2
	)

inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instructions.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

void simdpp::split	(	int8x32	a,
		int8x16 &	r1,
		int8x16 &	r2
	)

inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instructions.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

void simdpp::split	(	int16x16	a,
		int16x8 &	r1,
		int16x8 &	r2
	)

inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instructions.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

void simdpp::split	(	int32x8	a,
		int32x4 &	r1,
		int32x4 &	r2
	)

inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instructions.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

void simdpp::split	(	int64x4	a,
		int64x2 &	r1,
		int64x2 &	r2
	)

inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instructions.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

void simdpp::split	(	float32x8	a,
		float32x4 &	r1,
		float32x4 &	r2
	)

inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instructions.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

void simdpp::split	(	float64x4	a,
		float64x2 &	r1,
		float64x2 &	r2
	)

inline

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instructions.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

template<unsigned N>

void simdpp::split	(	uint8< N >	a,
		uint8< N/2 > &	r1,
		uint8< N/2 > &	r2
	)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instructions.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

template<unsigned N>

void simdpp::split	(	uint16< N >	a,
		uint16< N/2 > &	r1,
		uint16< N/2 > &	r2
	)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instructions.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

template<unsigned N>

void simdpp::split	(	uint32< N >	a,
		uint32< N/2 > &	r1,
		uint32< N/2 > &	r2
	)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instructions.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

template<unsigned N>

void simdpp::split	(	uint64< N >	a,
		uint64< N/2 > &	r1,
		uint64< N/2 > &	r2
	)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instruction.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

template<unsigned N>

void simdpp::split	(	int8< N >	a,
		int8< N/2 > &	r1,
		int8< N/2 > &	r2
	)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instruction.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

template<unsigned N>

void simdpp::split	(	int16< N >	a,
		int16< N/2 > &	r1,
		int16< N/2 > &	r2
	)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instruction.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

template<unsigned N>

void simdpp::split	(	int32< N >	a,
		int32< N/2 > &	r1,
		int32< N/2 > &	r2
	)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instruction.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

template<unsigned N>

void simdpp::split	(	int64< N >	a,
		int64< N/2 > &	r1,
		int64< N/2 > &	r2
	)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instruction.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

template<unsigned N>

void simdpp::split	(	float32< N >	a,
		float32< N/2 > &	r1,
		float32< N/2 > &	r2
	)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instruction.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

template<unsigned N>

void simdpp::split	(	float64< N >	a,
		float64< N/2 > &	r1,
		float64< N/2 > &	r2
	)

Splits a 256-bit vector into two 128-bit vectors.

[ r1, r2 ] = a

In AVX2 this intrinsic results in at least 1 instruction.
In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.

template<unsigned N, class E1 >

float32<N, float32<N> > simdpp::sqrt ( float32< N, E1 > a)

Computes square root.

r0 = sqrt(a0)
...
rN = sqrt(aN)

128-bit version:

In NEON this intrinsic results in at least 5 instructions.
In ALTIVEC this intrinsic results in at least 5-7 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 10 instructions.
In ALTIVEC this intrinsic results in at least 10-12 instructions.

template<unsigned N, class E1 >

float64<N, float64<N> > simdpp::sqrt ( float64< N, E1 > a)

Computes square root.

r0 = sqrt(a0)
...
rN = sqrt(aN)

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

template<unsigned N, class V >

void simdpp::store	(	void *	p,
		const any_vec< N, V > &	a
	)

Stores a 128-bit or 256-bit integer vector to an aligned memory location.

128-bit version:

(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:

(p) = a[0..255]

p must be aligned to 32 bytes.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In AVX (integer vectors) this intrinsic results in at least 2 instructions.

template<unsigned N, class V >

void simdpp::store_first	(	void *	p,
		const any_vec< N, V > &	a,
		unsigned	n
	)

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

The function may write the entire block of 128 or 256 bits.

 (p) = a0
 (p+1) = a1
...
 (p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

template<unsigned N, class V >

void simdpp::store_last	(	void *	p,
		const any_vec< N, V > &	a,
		unsigned	n
	)

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

The function may write the entire block of 128 or 256 bits.

 (p+N-n) = a{N-n}
...
 (p+N-2) = a{N-2}
 (p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

template<unsigned N, class V1 , class V2 >

void simdpp::store_packed2	(	void *	p,
		const any_vec< N, V1 > &	a,
		const any_vec< N, V2 > &	b
	)

Interleaves values from two vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+2), *(p+4), ... , *(p+M*2-2) ] = a

[ *(p+1), *(p+3), *(p+5), ... , *(p+M*2-1) ] = b

Here M is the number of elements in the vector

p must be aligned to the vector size in bytes

template<unsigned N, class V1 , class V2 , class V3 >

void simdpp::store_packed3	(	void *	p,
		const any_vec< N, V1 > &	a,
		const any_vec< N, V2 > &	b,
		const any_vec< N, V3 > &	c
	)

Interleaves values from three vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+3), *(p+6), ... , *(p+M*3-3) ] = a

[ *(p+1), *(p+4), *(p+7), ... , *(p+M*3-2) ] = b

[ *(p+2), *(p+5), *(p+8), ... , *(p+M*3-1) ] = c

Here M is the number of elements in the vector

p must be aligned to the vector size in bytes

template<unsigned N, class V1 , class V2 , class V3 , class V4 >

void simdpp::store_packed4	(	void *	p,
		const any_vec< N, V1 > &	a,
		const any_vec< N, V2 > &	b,
		const any_vec< N, V3 > &	c,
		const any_vec< N, V4 > &	d
	)

Interleaves values from four vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+4), *(p+8), ... , *(p+M*4-4) ] = a

[ *(p+1), *(p+5), *(p+9), ... , *(p+M*4-3) ] = b

[ *(p+2), *(p+6), *(p+10), ... , *(p+M*4-2) ] = c

[ *(p+3), *(p+7), *(p+11), ... , *(p+M*4-1) ] = d

Here M is the number of elements in the vector

p must be aligned to the vector size in bytes

template<unsigned N, class V >

void simdpp::stream	(	void *	p,
		const any_vec< N, V > &	a
	)

Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible.

128-bit version:

(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:

(p) = a[0..255]

p must be aligned to 32 bytes.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In AVX (integer vectors) this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

float32<N, expr_sub<float32<N,E1>, float32<N,E2> > > simdpp::sub	(	float32< N, E1 >	a,
		float32< N, E2 >	b
	)

Subtracts the values of two vectors.

r0 = a0 - b0
...
rN = aN - bN

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, expr_sub<uint8<N, typename V1::expr_type>, uint8<N, typename V2::expr_type> > >::type simdpp::sub	(	const any_int8< N, V1 > &	a,
		const any_int8< N, V2 > &	b
	)

Subtracts 8-bit integer values.

r0 = a0 - b0
...
rN = aN - bN

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

float64<N, expr_sub<float64<N,E1>, float64<N,E2> > > simdpp::sub	(	float64< N, E1 >	a,
		float64< N, E2 >	b
	)

Subtracts the values of two vectors.

r0 = a0 - b0
...
rN = aN - bN

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, expr_sub<uint16<N, typename V1::expr_type>, uint16<N, typename V2::expr_type> > >::type simdpp::sub	(	const any_int16< N, V1 > &	a,
		const any_int16< N, V2 > &	b
	)

Subtracts 16-bit integer values.

r0 = a0 - b0
...
rN = aN - bN

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, expr_sub<uint32<N, typename V1::expr_type>, uint32<N, typename V2::expr_type> > >::type simdpp::sub	(	const any_int32< N, V1 > &	a,
		const any_int32< N, V2 > &	b
	)

Subtracts 32-bit integer values.

r0 = a0 - b0
...
rN = aN - bN

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, expr_sub<uint64<N, typename V1::expr_type>, uint64<N, typename V2::expr_type> > >::type simdpp::sub	(	const any_int64< N, V1 > &	a,
		const any_int64< N, V2 > &	b
	)

Subtracts 64-bit integer values.

r0 = a0 - b0
...
rN = aN - bN

128-bit version:

In ALTIVEC this intrinsic results in at least 5-6 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 10-11 instructions.

template<unsigned N, class E1 , class E2 >

int8<N, expr_sub_sat<int8<N,E1>, int8<N,E2> > > simdpp::sub_sat	(	int8< N, E1 >	a,
		int8< N, E2 >	b
	)

Subtracts and saturates signed 8-bit integer values.

r0 = saturated(a0 - b0)
...
rN = saturated(aN - bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

int16<N, expr_sub_sat<int16<N,E1>, int16<N,E2> > > simdpp::sub_sat	(	int16< N, E1 >	a,
		int16< N, E2 >	b
	)

Subtracts and saturates signed 16-bit integer values.

r0 = saturated(a0 - b0)
...
rN = saturated(aN - bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint8<N, expr_sub_sat<uint8<N,E1>, uint8<N,E2> > > simdpp::sub_sat	(	uint8< N, E1 >	a,
		uint8< N, E2 >	b
	)

Subtracts and saturates unsigned 8-bit integer values.

r0 = saturated(a0 - b0)
...
rN = saturated(aN - bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint16<N, expr_sub_sat<uint16<N,E1>, uint16<N,E2> > > simdpp::sub_sat	(	uint16< N, E1 >	a,
		uint16< N, E2 >	b
	)

Subtracts and saturates unsigned 16-bit integer values.

r0 = saturated(a0 - b0)
...
rN = saturated(aN - bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x4 simdpp::to_float32 ( int32x4 a)

inline

Converts 32-bit integer values to 32-bit float values.

SSE specific:

If only inexact conversion can be performed, the current rounding mode is used.

NEON, ALTIVEC specific:

If only inexact conversion can be performed, round to nearest mode is used.

r0 = (float) a0
...
rN = (float) aN

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N>

float32<N> simdpp::to_float32 ( int32< N > a)

Converts 32-bit integer values to 32-bit float values.

SSE specific:

If only inexact conversion can be performed, the current rounding mode is used.

NEON, ALTIVEC specific:

If only inexact conversion can be performed, round to nearest mode is used.

r0 = (float) a0
...
rN = (float) aN

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x4 simdpp::to_float32 ( float64x4 a)

inline

Converts 64-bit float values to 32-bit float values.

SSE specific:

If only inexact conversion can be performed, the value is rounded according to the current rounding mode.

NEON specific:

If only inexact conversion can be performed, the value is truncated.

128-bit version:: r0 = (float) a0

r1 = (float) a1

r2 = 0.0f

r3 = 0.0f

Not vectorized in NEON and ALTIVEC.

256-bit version:: r0 = (float) a0

...

r3 = (float) a3

r4 = 0.0f

...

r7 = 0.0f

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.
Not vectorized in NEON and ALTIVEC.

float64x4 simdpp::to_float64 ( int32x4 a)

inline

Converts the 32-bit integer values to 64-bit float values.

SSE specific:

If only inexact conversion can be performed, the value is rounded according to the current rounding mode.

NEON specific:

If only inexact conversion can be performed, the value is rounded to the nearest representable value.

256-bit version:: r0 = (double) a0

...

r3 = (double) a3

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.
Not vectorized in NEON and ALTIVEC.

float64x4 simdpp::to_float64 ( float32x4 a)

inline

Converts the 32-bit float values to 64-bit float values.

SSE specific:

If only inexact conversion can be performed, the value is rounded according to the current rounding mode.

NEON specific:

If only inexact conversion can be performed, the value is rounded to the nearest representable value.

256-bit version:: r0 = (double) a0

...

r3 = (double) a3

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.
Not vectorized in NEON and ALTIVEC.

uint16x16 simdpp::to_int16 ( int8x16 a)

inline

Sign extends the 16 values of a signed int8x16 vector to 16-bits.

r0 = (int16_t) a0
...
r15 = (int16_t) a15

In SSE4.1-AVX this intrinsic results in at least 3 instructions.
In SSE2-SSSE3 this intrinsic results in at least 4 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint16x16 simdpp::to_int16 ( uint8x16 a)

inline

Extends the 16 values of an unsigned int8x16 vector to 16-bits.

r0 = (uint16_t) a0
...
r15 = (uint16_t) a15

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int32x8 simdpp::to_int32 ( int16x8 a)

inline

Sign extends the 8 values of a signed int16x8 vector to 32-bits.

r0 = (int32_t) a0
...
r7 = (int32_t) a7

In SSE4.1-AVX this intrinsic results in at least 3 instructions.
In SSE2-SSSE3 this intrinsic results in at least 4 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int32x4 simdpp::to_int32 ( float32x4 a)

inline

Converts the values of a float32x4 vector into signed int32_t representation using truncation if only an inexact conversion can be performed.

The behavior is undefined if the value can not be represented in the result type.

SSE specific: If the value can not be represented by int32_t, 0x80000000 is returned. TODO: NaN handling.

NEON, ALTIVEC specific: If the value can not be represented by int32_t, either 0x80000000 or 0x7fffffff is returned depending on the sign of the operand (saturation occurs). Conversion of NaNs results in 0.

r0 = (int32_t) a0
...
rN = (int32_t) aN

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int32x4 simdpp::to_int32 ( float64x4 a)

inline

Converts the values of a float64x4 vector into int32_t representation using truncation.

The behavior is undefined if the value can not be represented in the result type.

SSE specific: If the value can not be represented by int32_t, 0x80000000 is returned

Todo:: NaN handling

NEON VFP specific: If the value can not be represented by int32_t, either 0x80000000 or 0x7fffffff is returned depending on the sign of the operand. Conversion of NaNs results in 0.

128-bit version:

Not vectorized in NEON and ALTIVEC.

r0 = (int32_t) a0
r1 = (int32_t) a1
r2 = (int32_t) a2
r3 = (int32_t) a3

In SSE2-SSE4.1 this intrinsic results in at least 3 instructions.

template<unsigned N>

uint32<N> simdpp::to_int32x8 ( float32< N > a)

Converts the values of a float32x4 vector into signed int32_t representation using truncation if only an inexact conversion can be performed.

The behavior is undefined if the value can not be represented in the result type.

SSE specific: If the value can not be represented by int32_t, 0x80000000 is returned. TODO: NaN handling.

NEON, ALTIVEC specific: If the value can not be represented by int32_t, either 0x80000000 or 0x7fffffff is returned depending on the sign of the operand (saturation occurs). Conversion of NaNs results in 0.

r0 = (int32_t) a0
...
rN = (int32_t) aN

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint64x4 simdpp::to_int64 ( int32x4 a)

inline

Extends the values of a signed int32x4 vector to 64-bits.

r0 = (int64_t) a0
...
r3 = (int64_t) a3

In SSE2-SSSE3 this intrinsic results in at least 5 instructions.
In SSE4.1-AVX this intrinsic results in at least 3 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 3-4 instructions.

uint64x4 simdpp::to_int64 ( uint32x4 a)

inline

Extends the values of an unsigned int32x4 vector to 64-bits.

r0 = (uint64_t) a0
...
r3 = (uint64_t) a3

In SSE2-AVX this intrinsic results in at least 3 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

void simdpp::transpose8	(	uint16x8 &	a0,
		uint16x8 &	a1,
		uint16x8 &	a2,
		uint16x8 &	a3,
		uint16x8 &	a4,
		uint16x8 &	a5,
		uint16x8 &	a6,
		uint16x8 &	a7
	)

inline

Transposes an 8x8 16-bit matrix within eight int16x8 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ...; a7_0 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ...; a7_1 ]
...
r7 = [ a0_7; a1_7; a2_7; a3_7 ...; a7_7 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 24 instructions.
In NEON this intrinsic results in at least 12 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 48 instructions.
In AVX2 this intrinsic results in at least 24 instructions.
In NEON this intrinsic results in at least 24 instructions.

The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

void simdpp::transpose8	(	int16x8 &	a0,
		int16x8 &	a1,
		int16x8 &	a2,
		int16x8 &	a3,
		int16x8 &	a4,
		int16x8 &	a5,
		int16x8 &	a6,
		int16x8 &	a7
	)

inline

void simdpp::transpose8	(	uint16x16 &	a0,
		uint16x16 &	a1,
		uint16x16 &	a2,
		uint16x16 &	a3,
		uint16x16 &	a4,
		uint16x16 &	a5,
		uint16x16 &	a6,
		uint16x16 &	a7
	)

inline

void simdpp::transpose8	(	int16x16 &	a0,
		int16x16 &	a1,
		int16x16 &	a2,
		int16x16 &	a3,
		int16x16 &	a4,
		int16x16 &	a5,
		int16x16 &	a6,
		int16x16 &	a7
	)

inline

template<unsigned N, class E >

float32<N, float32<N> > simdpp::trunc ( float32< N, E > a)

Rounds the values of a vector towards zero.

r0 = trunc(a0)
...
rN = trunc(aN)

128-bit version:

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 7-9 instructions.
In NEON this intrinsic results in at least 5-6 instructions.

256-bit version:

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 14-16 instructions.
In NEON this intrinsic results in at least 10-11 instructions.
In SSE4.1 and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint8<N, uint8<N> > simdpp::unzip16_hi	(	uint8< N, E1 >	a,
		uint8< N, E2 >	b
	)

De-interleaves the even(higher) elements of two int8x16 vectors.

| 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |

r = [ a1 a3 a5 a7 a9 a11 a13 a15 b1 b3 b5 b7 b9 b11 b13 b15 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 3 instructions.

template<unsigned N, class E1 , class E2 >

uint8<N, uint8<N> > simdpp::unzip16_lo	(	uint8< N, E1 >	a,
		uint8< N, E2 >	b
	)

De-interleaves the odd(lower) elements of two int8x16 vectors.

| 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |

r = [ a0 a2 a4 a6 a8 a10 a12 a14 b0 b2 b4 b6 b8 b10 b12 b14 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 8-9 instructions.
In NEON this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 4-5 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N, class E1 , class E2 >

uint64<N, uint64<N> > simdpp::unzip2_hi	(	uint64< N, E1 >	a,
		uint64< N, E2 >	b
	)

De-interleaves the even(higher) elements of two int64x2 vectors.

| 0 1 |

r = [ a1 b1 ]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In ALTIVEC this intrinsic results in at least 2-3 instructions.
In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

float64<N, float64<N> > simdpp::unzip2_hi	(	float64< N, E1 >	a,
		float64< N, E2 >	b
	)

De-interleaves the even(higher) elements of two float64x2 vectors.

| 0 1 |

r = [ a1 b1 ]

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

Not vectorized in NEON and ALTIVEC.
In SSE2-AVX this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint64<N, uint64<N> > simdpp::unzip2_lo	(	uint64< N, E1 >	a,
		uint64< N, E2 >	b
	)

De-interleaves the odd(lower) elements of two int64x2 vectors.

| 0 1 |

r = [ a0 b0 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

float64<N, float64<N> > simdpp::unzip2_lo	(	float64< N, E1 >	a,
		float64< N, E2 >	b
	)

De-interleaves the odd(lower) elements of two float64x2 vectors.

| 0 1 |

r = [ a0 b0 ]

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

template<unsigned N, class E1 , class E2 >

uint32<N, uint32<N> > simdpp::unzip4_hi	(	uint32< N, E1 >	a,
		uint32< N, E2 >	b
	)

De-interleaves the even(higher) elements of two int32x4 vectors.

| 0 1 2 3 |

r = [ a1 a3 b1 b3 ]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In ALTIVEC this intrinsic results in at least 2-3 instructions.
In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

float32<N, float32<N> > simdpp::unzip4_hi	(	float32< N, E1 >	a,
		float32< N, E2 >	b
	)

De-interleaves the even(higher) elements of two float32x4 vectors.

| 0 1 2 3 |

r = [ a1 a3 b1 b3 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint32<N, uint32<N> > simdpp::unzip4_lo	(	uint32< N, E1 >	a,
		uint32< N, E2 >	b
	)

De-interleaves the odd(lower) elements of two int32x4 vectors.

| 0 1 2 3 |

r = [ a0 a2 b0 b2 ]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

template<unsigned N, class E1 , class E2 >

float32<N, float32<N> > simdpp::unzip4_lo	(	float32< N, E1 >	a,
		float32< N, E2 >	b
	)

De-interleaves the odd(lower) elements of two float32x4 vectors.

| 0 1 2 3 |

r = [ a0 a2 b0 b2 ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class E1 , class E2 >

uint16<N, uint16<N> > simdpp::unzip8_hi	(	uint16< N, E1 >	a,
		uint16< N, E2 >	b
	)

De-interleaves the even(higher) elements of two int16x8 vectors.

| 0 1 2 3 4 5 6 7 |

r = [ a1 a3 a5 a7 b1 b3 b5 b7 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 6 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 3 instructions.

template<unsigned N, class E1 , class E2 >

uint16<N, uint16<N> > simdpp::unzip8_lo	(	uint16< N, E1 >	a,
		uint16< N, E2 >	b
	)

De-interleaves the odd(lower) elements of two int16x8 vectors.

| 0 1 2 3 4 5 6 7 |

r = [ a0 a2 a4 a6 b0 b2 b4 b6 ]

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 5 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 4-5 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSSE3 this intrinsic results in at least 5 instructions.
In SSE4.1-AVX this intrinsic results in at least 8-9 instructions.
In AVX2 this intrinsic results in at least 4-5 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, void>::empty simdpp::zip16_hi	(	const any_vec8< N, V1 > &	a,
		const any_vec8< N, V2 > &	b
	)

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |

r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, void>::empty simdpp::zip16_lo	(	const any_vec8< N, V1 > &	a,
		const any_vec8< N, V2 > &	b
	)

Interleaves the lower halves of two vectors.

| 0 1 2 3 4 5 ... N-2 N-1 |

r = [ a0 b0 a1 b1 a2 b2 ... a(N/2-1) b(N/2-1) ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, void>::empty simdpp::zip2_hi	(	const any_vec64< N, V1 > &	a,
		const any_vec64< N, V2 > &	b
	)

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |

r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, void>::empty simdpp::zip2_lo	(	const any_vec64< N, V1 > &	a,
		const any_vec64< N, V2 > &	b
	)

Interleaves the lower halves of two vectors.

| 0 1 2 3 4 5 ... N-2 N-1 |

r = [ a0 b0 a1 b1 a2 b2 ... a(N/2-1) b(N/2-1) ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, void>::empty simdpp::zip4_hi	(	const any_vec32< N, V1 > &	a,
		const any_vec32< N, V2 > &	b
	)

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |

r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, void>::empty simdpp::zip4_lo	(	const any_vec32< N, V1 > &	a,
		const any_vec32< N, V2 > &	b
	)

Interleaves the lower halves of two vectors.

| 0 1 2 3 4 5 ... N-2 N-1 |

r = [ a0 b0 a1 b1 a2 b2 ... a(N/2-1) b(N/2-1) ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, void>::empty simdpp::zip8_hi	(	const any_vec16< N, V1 > &	a,
		const any_vec16< N, V2 > &	b
	)

Interleaves the higher halves of two vectors.

| 0 1 2 3 ... N-2 N-1 |

r = [ a(N/2) b(N/2) a(N/2+1) b(N/2+1) ... a(N-1) b(N-1) ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned N, class V1 , class V2 >

detail::get_expr2_nomask<V1, V2, void>::empty simdpp::zip8_lo	(	const any_vec16< N, V1 > &	a,
		const any_vec16< N, V2 > &	b
	)

Interleaves the lower halves of two vectors.

| 0 1 2 3 4 5 ... N-2 N-1 |

r = [ a0 b0 a1 b1 a2 b2 ... a(N/2-1) b(N/2-1) ]

256-bit version:: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

Namespaces

Classes

Typedefs

Enumerations

Functions

Typedef Documentation

Function Documentation