Namespaces
namespace	altivec

namespace	neon

namespace	null

namespace	sse

Classes
class	Arch
	Identifies supported instruction set. More...

class	float32x4
	Class representing float32x4 vector. More...

class	mask_float32x4
	Class representing a mask for 4x 32-bit floating-point vector. More...

class	float32x8

class	mask_float32x8
	Class representing a mask for 8x 32-bit floating-point vector. More...

class	float64x2

class	mask_float64x2
	Class representing a mask for 2x 64-bit floating-point vector. More...

class	float64x4

class	mask_float64x4
	Class representing a mask for 4x 64-bit floating-point vector. More...

class	int128
	Base class for all 128-bit integer objects. More...

class	basic_int16x16
	Generic class representing 16x 16-bit integer vector. More...

class	int16x16
	Class representing 16x 16-bit signed integer vector. More...

class	uint16x16
	Class representing 16x 16-bit unsigned integer vector. More...

class	mask_int16x16
	Class representing a mask for 16x 16-bit integer vector. More...

class	basic_int16x8
	Generic class representing 8x 16-bit integer vector. More...

class	int16x8
	Class representing 8x 16-bit signed integer vector. More...

class	uint16x8
	Class representing 8x 16-bit unsigned integer vector. More...

class	mask_int16x8
	Class representing a mask for 8x 16-bit integer vector. More...

class	int256
	Base class for all 256-bit integer objects. More...

class	basic_int32x4
	Generic class representing 4x 32-bit integer vector. More...

class	int32x4
	Class representing 4x 32-bit signed integer vector. More...

class	uint32x4
	Class representing 4x 32-bit unsigned integer vector. More...

class	mask_int32x4
	Class representing a mask for 4x 32-bit integer vector. More...

class	basic_int32x8
	Generic class representing 8x 32-bit integer vector. More...

class	int32x8
	Class representing 8x 32-bit signed integer vector. More...

class	uint32x8
	Class representing 8x 32-bit unsigned integer vector. More...

class	mask_int32x8
	Class representing a mask for 8x 32-bit integer vector. More...

class	basic_int64x2
	Generic class representing 2x 64-bit integer vector. More...

class	int64x2
	Class representing 2x 64-bit signed integer vector. More...

class	uint64x2
	Class representing 2x 64-bit unsigned integer vector. More...

class	mask_int64x2
	Class representing a mask for 2x 64-bit integer vector. More...

class	basic_int64x4
	Generic class representing 4x 64-bit integer vector. More...

class	int64x4
	Class representing 4x 64-bit signed integer vector. More...

class	uint64x4
	Class representing 4x 64-bit unsigned integer vector. More...

class	mask_int64x4
	Class representing a mask for 4x 64-bit integer vector. More...

class	basic_int8x16
	Generic class representing 16x 8-bit integer vector. More...

class	int8x16
	Class representing 16x 8-bit signed integer vector. More...

class	uint8x16
	Class representing 16x 8-bit unsigned integer vector. More...

class	mask_int8x16
	Class representing a mask for 16x 8-bit integer vector. More...

class	basic_int8x32
	Generic class representing 32x 8-bit integer vector. More...

class	int8x32
	Class representing 32x 8-bit signed integer vector. More...

class	uint8x32
	Class representing 32x 8-bit unsigned integer vector. More...

class	mask_int8x32
	Class representing a mask for 32x 8-bit integer vector. More...

struct	is_vector
	Allows detecting whether a specific type is a simdpp vector. More...

class	is_vector< float32x4 >

class	is_vector< float64x2 >

class	is_vector< float32x8 >

class	is_vector< float64x4 >

class	is_vector< int128 >

class	is_vector< int256 >

class	is_vector< int8x16 >

class	is_vector< int16x8 >

class	is_vector< int32x4 >

class	is_vector< int64x2 >

class	is_vector< int8x32 >

class	is_vector< int16x16 >

class	is_vector< int32x8 >

class	is_vector< int64x4 >

class	is_vector< uint8x16 >

class	is_vector< uint16x8 >

class	is_vector< uint32x4 >

class	is_vector< uint64x2 >

class	is_vector< uint8x32 >

class	is_vector< uint16x16 >

class	is_vector< uint32x8 >

class	is_vector< uint64x4 >

class	is_vector< basic_int8x16 >

class	is_vector< basic_int16x8 >

class	is_vector< basic_int32x4 >

class	is_vector< basic_int64x2 >

class	is_vector< basic_int8x32 >

class	is_vector< basic_int16x16 >

class	is_vector< basic_int32x8 >

class	is_vector< basic_int64x4 >

struct	is_mask
	Allows detecting whether a specific type is a simdpp mask. More...

class	is_mask< mask_int8x16 >

class	is_mask< mask_int8x32 >

class	is_mask< mask_int16x8 >

class	is_mask< mask_int16x16 >

class	is_mask< mask_int32x4 >

class	is_mask< mask_int32x8 >

class	is_mask< mask_int64x2 >

class	is_mask< mask_int64x4 >

class	is_mask< mask_float32x4 >

class	is_mask< mask_float32x8 >

class	is_mask< mask_float64x2 >

class	is_mask< mask_float64x4 >

Typedefs
typedef boost::function< Arch()>	GetArchCb

Functions
Arch	get_arch_gcc_builtin_cpu_supports ()
	Retrieves supported architecture using GCC __builtin_cpu_supports function. More...

Arch	get_arch_linux_cpuinfo ()
	Retrieves supported architecture from Linux /proc/cpuinfo file. More...

basic_int8x16	bit_and (basic_int8x16 a, int128 b)
	Computes bitwise AND of integer vectors. More...

basic_int16x8	bit_and (basic_int16x8 a, int128 b)

basic_int32x4	bit_and (basic_int32x4 a, int128 b)

basic_int64x2	bit_and (basic_int64x2 a, int128 b)

basic_int8x32	bit_and (basic_int8x32 a, int256 b)

basic_int16x16	bit_and (basic_int16x16 a, int256 b)

basic_int32x8	bit_and (basic_int32x8 a, int256 b)

basic_int64x4	bit_and (basic_int64x4 a, int256 b)

basic_int8x16	bit_and (basic_int8x16 a, mask_int8x16 b)

basic_int16x8	bit_and (basic_int16x8 a, mask_int16x8 b)

basic_int32x4	bit_and (basic_int32x4 a, mask_int32x4 b)

basic_int64x2	bit_and (basic_int64x2 a, mask_int64x2 b)

basic_int8x32	bit_and (basic_int8x32 a, mask_int8x32 b)

basic_int16x16	bit_and (basic_int16x16 a, mask_int16x16 b)

basic_int32x8	bit_and (basic_int32x8 a, mask_int32x8 b)

basic_int64x4	bit_and (basic_int64x4 a, mask_int64x4 b)

mask_int8x16	bit_and (mask_int8x16 a, mask_int8x16 b)

mask_int16x8	bit_and (mask_int16x8 a, mask_int16x8 b)

mask_int32x4	bit_and (mask_int32x4 a, mask_int32x4 b)

mask_int64x2	bit_and (mask_int64x2 a, mask_int64x2 b)

mask_int8x32	bit_and (mask_int8x32 a, mask_int8x32 b)

mask_int16x16	bit_and (mask_int16x16 a, mask_int16x16 b)

mask_int32x8	bit_and (mask_int32x8 a, mask_int32x8 b)

mask_int64x4	bit_and (mask_int64x4 a, mask_int64x4 b)

void	prefetch_read (const void *ptr)
	Prefetches data to the lowest level cache for reading. More...

void	prefetch_write (const void *ptr)
	Prefetches data to the lowest level cache for writing. More...

template<class R , class T >
R	bit_cast (T t)
	Casts between unrelated types. More...

mask_int8x16	cmp_eq (basic_int8x16 a, basic_int8x16 b)
	Compares 8-bit values for equality. More...

mask_int8x32	cmp_eq (basic_int8x32 a, basic_int8x32 b)

mask_float64x2	cmp_gt (float64x2 a, float64x2 b)
	Compares the values of two float64x2 vectors for greater-than. More...

mask_float64x4	cmp_gt (float64x4 a, float64x4 b)

mask_float64x2	cmp_ge (float64x2 a, float64x2 b)
	Compares the values of two float64x2 vectors for greater-than or equal. More...

mask_float64x4	cmp_ge (float64x4 a, float64x4 b)

basic_int16x8	to_int16x8 (int8x16 a)
	Sign extends the first 8 values of a signed int8x16 vector to 16-bits. More...

basic_int16x16	to_int16x16 (int8x32 a)
	Sign extends the first 16 values of a signed int8x32 vector to 16-bits. More...

basic_int16x8	to_int16x8 (uint8x16 a)
	Extends the first 8 values of an unsigned int8x16 vector to 16-bits. More...

basic_int16x16	to_int16x16 (uint8x32 a)
	Extends the first 16 values of an unsigned int8x32 vector to 16-bits. More...

basic_int32x4	to_int32x4 (int16x8 a)
	Sign extends the first 4 values of a signed int16x8 vector to 32-bits. More...

basic_int32x8	to_int32x8 (int16x16 a)
	Sign extends the first 8 values of a signed int16x16 vector to 32-bits. More...

basic_int32x4	to_int32x4 (uint16x8 a)
	Zero-extends the first 4 values of an unsigned int16x8 vector to 32-bits. More...

basic_int32x8	to_int32x8 (uint16x16 a)
	Zero-extends the first 8 values of an unsigned int16x16 vector to 32-bits. More...

template<unsigned id>
float	extract (float32x4 a)
	Extracts an element from float32x4 vector. More...

template<unsigned id>
double	extract (float64x2 a)
	Extracts an element from float64x2 vector. More...

uint16_t	extract_bits_any (uint8x16 a)
	Extracts a bit from each byte of each element of an int8x16 vector. More...

template<unsigned id>
uint16_t	extract_bits (uint8x16 a)
	Extracts a specific bit from each byte of each element of an int8x16 vector. More...

template<unsigned id>
basic_int8x16	insert (basic_int8x16 a, uint8_t x)
	Inserts an element into int8x16 vector at the position identified by id. More...

template<unsigned id>
basic_int16x8	insert (basic_int16x8 a, uint16_t x)
	Inserts an element into int16x8 vector at the position identified by id. More...

template<unsigned id>
basic_int32x4	insert (basic_int32x4 a, uint32_t x)
	Inserts an element into int32x4 vector at the position identified by id. More...

template<unsigned id>
basic_int64x2	insert (basic_int64x2 a, uint64_t x)
	Inserts an element into int64x2 vector at the position identified by id. More...

template<unsigned id>
float32x4	insert (float32x4 a, float x)
	Inserts an element into float32x4 vector at the position identified by id. More...

template<unsigned id>
float64x2	insert (float64x2 a, double x)
	Inserts an element into float64x2 vector at the position identified by id. More...

float32x4	abs (float32x4 a)
	Computes absolute value of floating point values. More...

float32x8	abs (float32x8 a)

basic_int8x16	add (basic_int8x16 a, basic_int8x16 b)
	Adds 8-bit integer values. More...

basic_int8x32	add (basic_int8x32 a, basic_int8x32 b)

int8x16	shift_r (int8x16 a, unsigned count)
	Shifts signed 8-bit values right by count bits while shifting in the sign bit. More...

int8x32	shift_r (int8x32 a, unsigned count)

int128	load (int128 &a, const void *p)
	Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an aligned memory location. More...

int256	load (int256 &a, const void *p)

float32x4	load (float32x4 &a, const float *p)

float32x8	load (float32x8 &a, const float *p)

float64x2	load (float64x2 &a, const double *p)

float64x4	load (float64x4 &a, const double *p)

void	load_packed2 (float32x4 &a, float32x4 &b, const float *p)
	Loads 32-bit float values packed in pairs, de-interleaves them and stores the result into two vectors. More...

void	load_packed2 (float32x8 &a, float32x8 &b, const float *p)

void	store (void *p, int128 a)
	Stores a 128-bit or 256-bit integer vector to an aligned memory location. More...

void	store (void *p, int256 a)

void	store (float *p, float32x4 a)

void	store (float *p, float32x8 a)

void	store (double *p, float64x2 a)

void	store (double *p, float64x4 a)

basic_int8x16	zip_lo (basic_int8x16 a, basic_int8x16 b)
	Interleaves the lower halves of two vectors. More...

basic_int8x32	zip_lo (basic_int8x32 a, basic_int8x32 b)

basic_int16x8	zip_lo (basic_int16x8 a, basic_int16x8 b)

basic_int16x16	zip_lo (basic_int16x16 a, basic_int16x16 b)

basic_int32x4	zip_lo (basic_int32x4 a, basic_int32x4 b)

basic_int32x8	zip_lo (basic_int32x8 a, basic_int32x8 b)

basic_int64x2	zip_lo (basic_int64x2 a, basic_int64x2 b)

basic_int64x4	zip_lo (basic_int64x4 a, basic_int64x4 b)

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
basic_int64x4	permute (basic_int64x4 a)
	Permutes the values of each set of four consecutive 64-bit values. More...

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
float64x4	permute (float64x4 a)
	Permutes the values of each set of four consecutive 64-bit floating-point values. More...

Arch	this_compile_arch ()
	Returns the instruction set flags that will be required by the currently compiled code. More...

void	transpose2 (basic_int16x8 &a0, basic_int16x8 &a1)
	Transposes four 2x2 16-bit matrices within two int16x8 vectors. More...

void	transpose2 (basic_int16x16 &a0, basic_int16x16 &a1)

void	transpose8 (basic_int8x16 &a0, basic_int8x16 &a1, basic_int8x16 &a2, basic_int8x16 &a3, basic_int8x16 &a4, basic_int8x16 &a5, basic_int8x16 &a6, basic_int8x16 &a7)
	Transposes two 8x8 8-bit matrices within eight int8x16 vectors. More...

void	transpose8 (basic_int8x32 &a0, basic_int8x32 &a1, basic_int8x32 &a2, basic_int8x32 &a3, basic_int8x32 &a4, basic_int8x32 &a5, basic_int8x32 &a6, basic_int8x32 &a7)

void	transpose8 (basic_int16x8 &a0, basic_int16x8 &a1, basic_int16x8 &a2, basic_int16x8 &a3, basic_int16x8 &a4, basic_int16x8 &a5, basic_int16x8 &a6, basic_int16x8 &a7)
	Transposes an 8x8 16-bit matrix within eight int16x8 vectors. More...

void	transpose8 (basic_int16x16 &a0, basic_int16x16 &a1, basic_int16x16 &a2, basic_int16x16 &a3, basic_int16x16 &a4, basic_int16x16 &a5, basic_int16x16 &a6, basic_int16x16 &a7)

void	transpose16 (basic_int8x16 &a0, basic_int8x16 &a1, basic_int8x16 &a2, basic_int8x16 &a3, basic_int8x16 &a4, basic_int8x16 &a5, basic_int8x16 &a6, basic_int8x16 &a7, basic_int8x16 &a8, basic_int8x16 &a9, basic_int8x16 &a10, basic_int8x16 &a11, basic_int8x16 &a12, basic_int8x16 &a13, basic_int8x16 &a14, basic_int8x16 &a15)
	Transposes a 16x16 8-bit matrix within sixteen int8x16 vectors. More...

void	transpose16 (basic_int8x32 &a0, basic_int8x32 &a1, basic_int8x32 &a2, basic_int8x32 &a3, basic_int8x32 &a4, basic_int8x32 &a5, basic_int8x32 &a6, basic_int8x32 &a7, basic_int8x32 &a8, basic_int8x32 &a9, basic_int8x32 &a10, basic_int8x32 &a11, basic_int8x32 &a12, basic_int8x32 &a13, basic_int8x32 &a14, basic_int8x32 &a15)


Arch &	operator|= (Arch &x, const Arch &y)
	Bitwise operators for `Arch`. More...

Arch &	operator&= (Arch &x, const Arch &y)
	Bitwise operators for `Arch`. More...

Arch	operator| (const Arch &x, const Arch &y)
	Bitwise operators for `Arch`. More...

Arch	operator& (const Arch &x, const Arch &y)
	Bitwise operators for `Arch`. More...

Arch	operator~ (const Arch &x)
	Bitwise operators for `Arch`. More...


float32x4	bit_and (float32x4 a, float32x4 b)
	Computes bitwise AND of floating-point vectors. More...

float32x8	bit_and (float32x8 a, float32x8 b)
	Computes bitwise AND of floating-point vectors. More...

float32x4	bit_and (float32x4 a, int128 b)
	Computes bitwise AND of floating-point vectors. More...

float32x8	bit_and (float32x8 a, int256 b)
	Computes bitwise AND of floating-point vectors. More...

float32x4	bit_and (float32x4 a, mask_float32x4 b)
	Computes bitwise AND of floating-point vectors. More...

float32x8	bit_and (float32x8 a, mask_float32x8 b)
	Computes bitwise AND of floating-point vectors. More...

mask_float32x4	bit_and (mask_float32x4 a, mask_float32x4 b)
	Computes bitwise AND of floating-point vectors. More...

mask_float32x8	bit_and (mask_float32x8 a, mask_float32x8 b)
	Computes bitwise AND of floating-point vectors. More...

float64x2	bit_and (float64x2 a, float64x2 b)
	Computes bitwise AND of floating-point vectors. More...

float64x4	bit_and (float64x4 a, float64x4 b)
	Computes bitwise AND of floating-point vectors. More...

float64x2	bit_and (float64x2 a, int128 b)
	Computes bitwise AND of floating-point vectors. More...

float64x4	bit_and (float64x4 a, int256 b)
	Computes bitwise AND of floating-point vectors. More...

float64x2	bit_and (float64x2 a, mask_float64x2 b)
	Computes bitwise AND of floating-point vectors. More...

float64x4	bit_and (float64x4 a, mask_float64x4 b)
	Computes bitwise AND of floating-point vectors. More...

mask_float64x2	bit_and (mask_float64x2 a, mask_float64x2 b)
	Computes bitwise AND of floating-point vectors. More...

mask_float64x4	bit_and (mask_float64x4 a, mask_float64x4 b)
	Computes bitwise AND of floating-point vectors. More...


basic_int8x16	bit_andnot (basic_int8x16 a, int128 b)
	Computes bitwise AND NOT of integer vectors. More...

basic_int16x8	bit_andnot (basic_int16x8 a, int128 b)
	Computes bitwise AND NOT of integer vectors. More...

basic_int32x4	bit_andnot (basic_int32x4 a, int128 b)
	Computes bitwise AND NOT of integer vectors. More...

basic_int64x2	bit_andnot (basic_int64x2 a, int128 b)
	Computes bitwise AND NOT of integer vectors. More...

basic_int8x32	bit_andnot (basic_int8x32 a, int256 b)
	Computes bitwise AND NOT of integer vectors. More...

basic_int16x16	bit_andnot (basic_int16x16 a, int256 b)
	Computes bitwise AND NOT of integer vectors. More...

basic_int32x8	bit_andnot (basic_int32x8 a, int256 b)
	Computes bitwise AND NOT of integer vectors. More...

basic_int64x4	bit_andnot (basic_int64x4 a, int256 b)
	Computes bitwise AND NOT of integer vectors. More...

basic_int8x16	bit_andnot (basic_int8x16 a, mask_int8x16 b)
	Computes bitwise AND NOT of integer vectors. More...

basic_int16x8	bit_andnot (basic_int16x8 a, mask_int16x8 b)
	Computes bitwise AND NOT of integer vectors. More...

basic_int32x4	bit_andnot (basic_int32x4 a, mask_int32x4 b)
	Computes bitwise AND NOT of integer vectors. More...

basic_int64x2	bit_andnot (basic_int64x2 a, mask_int64x2 b)
	Computes bitwise AND NOT of integer vectors. More...

basic_int8x32	bit_andnot (basic_int8x32 a, mask_int8x32 b)
	Computes bitwise AND NOT of integer vectors. More...

basic_int16x16	bit_andnot (basic_int16x16 a, mask_int16x16 b)
	Computes bitwise AND NOT of integer vectors. More...

basic_int32x8	bit_andnot (basic_int32x8 a, mask_int32x8 b)
	Computes bitwise AND NOT of integer vectors. More...

basic_int64x4	bit_andnot (basic_int64x4 a, mask_int64x4 b)
	Computes bitwise AND NOT of integer vectors. More...

mask_int8x16	bit_andnot (mask_int8x16 a, mask_int8x16 b)
	Computes bitwise AND NOT of integer vectors. More...

mask_int16x8	bit_andnot (mask_int16x8 a, mask_int16x8 b)
	Computes bitwise AND NOT of integer vectors. More...

mask_int32x4	bit_andnot (mask_int32x4 a, mask_int32x4 b)
	Computes bitwise AND NOT of integer vectors. More...

mask_int64x2	bit_andnot (mask_int64x2 a, mask_int64x2 b)
	Computes bitwise AND NOT of integer vectors. More...

mask_int8x32	bit_andnot (mask_int8x32 a, mask_int8x32 b)
	Computes bitwise AND NOT of integer vectors. More...

mask_int16x16	bit_andnot (mask_int16x16 a, mask_int16x16 b)
	Computes bitwise AND NOT of integer vectors. More...

mask_int32x8	bit_andnot (mask_int32x8 a, mask_int32x8 b)
	Computes bitwise AND NOT of integer vectors. More...

mask_int64x4	bit_andnot (mask_int64x4 a, mask_int64x4 b)
	Computes bitwise AND NOT of integer vectors. More...


float32x4	bit_andnot (float32x4 a, float32x4 b)
	Computes bitwise AND NOT of floating-point vectors. More...

float32x8	bit_andnot (float32x8 a, float32x8 b)
	Computes bitwise AND NOT of floating-point vectors. More...

float32x4	bit_andnot (float32x4 a, int128 b)
	Computes bitwise AND NOT of floating-point vectors. More...

float32x8	bit_andnot (float32x8 a, int256 b)
	Computes bitwise AND NOT of floating-point vectors. More...

float32x4	bit_andnot (float32x4 a, mask_float32x4 b)
	Computes bitwise AND NOT of floating-point vectors. More...

float32x8	bit_andnot (float32x8 a, mask_float32x8 b)
	Computes bitwise AND NOT of floating-point vectors. More...

mask_float32x4	bit_andnot (mask_float32x4 a, mask_float32x4 b)
	Computes bitwise AND NOT of floating-point vectors. More...

mask_float32x8	bit_andnot (mask_float32x8 a, mask_float32x8 b)
	Computes bitwise AND NOT of floating-point vectors. More...

float64x2	bit_andnot (float64x2 a, float64x2 b)
	Computes bitwise AND NOT of floating-point vectors. More...

float64x4	bit_andnot (float64x4 a, float64x4 b)
	Computes bitwise AND NOT of floating-point vectors. More...

float64x2	bit_andnot (float64x2 a, int128 b)
	Computes bitwise AND NOT of floating-point vectors. More...

float64x4	bit_andnot (float64x4 a, int256 b)
	Computes bitwise AND NOT of floating-point vectors. More...

float64x2	bit_andnot (float64x2 a, mask_float64x2 b)
	Computes bitwise AND NOT of floating-point vectors. More...

float64x4	bit_andnot (float64x4 a, mask_float64x4 b)
	Computes bitwise AND NOT of floating-point vectors. More...

mask_float64x2	bit_andnot (mask_float64x2 a, mask_float64x2 b)
	Computes bitwise AND NOT of floating-point vectors. More...

mask_float64x4	bit_andnot (mask_float64x4 a, mask_float64x4 b)
	Computes bitwise AND NOT of floating-point vectors. More...


basic_int8x16	bit_or (basic_int8x16 a, int128 b)
	Computes bitwise OR of integer vectors. More...

basic_int16x8	bit_or (basic_int16x8 a, int128 b)
	Computes bitwise OR of integer vectors. More...

basic_int32x4	bit_or (basic_int32x4 a, int128 b)
	Computes bitwise OR of integer vectors. More...

basic_int64x2	bit_or (basic_int64x2 a, int128 b)
	Computes bitwise OR of integer vectors. More...

basic_int8x32	bit_or (basic_int8x32 a, int256 b)
	Computes bitwise OR of integer vectors. More...

basic_int16x16	bit_or (basic_int16x16 a, int256 b)
	Computes bitwise OR of integer vectors. More...

basic_int32x8	bit_or (basic_int32x8 a, int256 b)
	Computes bitwise OR of integer vectors. More...

basic_int64x4	bit_or (basic_int64x4 a, int256 b)
	Computes bitwise OR of integer vectors. More...

mask_int8x16	bit_or (mask_int8x16 a, mask_int8x16 b)
	Computes bitwise OR of integer vectors. More...

mask_int16x8	bit_or (mask_int16x8 a, mask_int16x8 b)
	Computes bitwise OR of integer vectors. More...

mask_int32x4	bit_or (mask_int32x4 a, mask_int32x4 b)
	Computes bitwise OR of integer vectors. More...

mask_int64x2	bit_or (mask_int64x2 a, mask_int64x2 b)
	Computes bitwise OR of integer vectors. More...

mask_int8x32	bit_or (mask_int8x32 a, mask_int8x32 b)
	Computes bitwise OR of integer vectors. More...

mask_int16x16	bit_or (mask_int16x16 a, mask_int16x16 b)
	Computes bitwise OR of integer vectors. More...

mask_int32x8	bit_or (mask_int32x8 a, mask_int32x8 b)
	Computes bitwise OR of integer vectors. More...

mask_int64x4	bit_or (mask_int64x4 a, mask_int64x4 b)
	Computes bitwise OR of integer vectors. More...


float32x4	bit_or (float32x4 a, float32x4 b)
	Computes bitwise OR of floating-point vectors. More...

float32x8	bit_or (float32x8 a, float32x8 b)
	Computes bitwise OR of floating-point vectors. More...

float32x4	bit_or (float32x4 a, int128 b)
	Computes bitwise OR of floating-point vectors. More...

float32x8	bit_or (float32x8 a, int256 b)
	Computes bitwise OR of floating-point vectors. More...

float64x2	bit_or (float64x2 a, float64x2 b)
	Computes bitwise OR of floating-point vectors. More...

float64x4	bit_or (float64x4 a, float64x4 b)
	Computes bitwise OR of floating-point vectors. More...

float64x2	bit_or (float64x2 a, int128 b)
	Computes bitwise OR of floating-point vectors. More...

float64x4	bit_or (float64x4 a, int256 b)
	Computes bitwise OR of floating-point vectors. More...

mask_float32x4	bit_or (mask_float32x4 a, mask_float32x4 b)
	Computes bitwise OR of floating-point vectors. More...

mask_float64x2	bit_or (mask_float64x2 a, mask_float64x2 b)
	Computes bitwise OR of floating-point vectors. More...

mask_float32x8	bit_or (mask_float32x8 a, mask_float32x8 b)
	Computes bitwise OR of floating-point vectors. More...

mask_float64x4	bit_or (mask_float64x4 a, mask_float64x4 b)
	Computes bitwise OR of floating-point vectors. More...


basic_int8x16	bit_xor (basic_int8x16 a, int128 b)
	Computes bitwise XOR of integer vectors. More...

basic_int16x8	bit_xor (basic_int16x8 a, int128 b)
	Computes bitwise XOR of integer vectors. More...

basic_int32x4	bit_xor (basic_int32x4 a, int128 b)
	Computes bitwise XOR of integer vectors. More...

basic_int64x2	bit_xor (basic_int64x2 a, int128 b)
	Computes bitwise XOR of integer vectors. More...

basic_int8x32	bit_xor (basic_int8x32 a, int256 b)
	Computes bitwise XOR of integer vectors. More...

basic_int16x16	bit_xor (basic_int16x16 a, int256 b)
	Computes bitwise XOR of integer vectors. More...

basic_int32x8	bit_xor (basic_int32x8 a, int256 b)
	Computes bitwise XOR of integer vectors. More...

basic_int64x4	bit_xor (basic_int64x4 a, int256 b)
	Computes bitwise XOR of integer vectors. More...

mask_int8x16	bit_xor (mask_int8x16 a, mask_int8x16 b)
	Computes bitwise XOR of integer vectors. More...

mask_int16x8	bit_xor (mask_int16x8 a, mask_int16x8 b)
	Computes bitwise XOR of integer vectors. More...

mask_int32x4	bit_xor (mask_int32x4 a, mask_int32x4 b)
	Computes bitwise XOR of integer vectors. More...

mask_int64x2	bit_xor (mask_int64x2 a, mask_int64x2 b)
	Computes bitwise XOR of integer vectors. More...

mask_int8x32	bit_xor (mask_int8x32 a, mask_int8x32 b)
	Computes bitwise XOR of integer vectors. More...

mask_int16x16	bit_xor (mask_int16x16 a, mask_int16x16 b)
	Computes bitwise XOR of integer vectors. More...

mask_int32x8	bit_xor (mask_int32x8 a, mask_int32x8 b)
	Computes bitwise XOR of integer vectors. More...

mask_int64x4	bit_xor (mask_int64x4 a, mask_int64x4 b)
	Computes bitwise XOR of integer vectors. More...


float32x4	bit_xor (float32x4 a, float32x4 b)
	Computes bitwise XOR of floating-point vectors. More...

float32x8	bit_xor (float32x8 a, float32x8 b)
	Computes bitwise XOR of floating-point vectors. More...

float32x4	bit_xor (float32x4 a, int128 b)
	Computes bitwise XOR of floating-point vectors. More...

float32x8	bit_xor (float32x8 a, int256 b)
	Computes bitwise XOR of floating-point vectors. More...

float64x2	bit_xor (float64x2 a, float64x2 b)
	Computes bitwise XOR of floating-point vectors. More...

float64x4	bit_xor (float64x4 a, float64x4 b)
	Computes bitwise XOR of floating-point vectors. More...

float64x2	bit_xor (float64x2 a, int128 b)
	Computes bitwise XOR of floating-point vectors. More...

float64x4	bit_xor (float64x4 a, int256 b)
	Computes bitwise XOR of floating-point vectors. More...

mask_float32x4	bit_xor (mask_float32x4 a, mask_float32x4 b)
	Computes bitwise XOR of floating-point vectors. More...

mask_float64x2	bit_xor (mask_float64x2 a, mask_float64x2 b)
	Computes bitwise XOR of floating-point vectors. More...

mask_float32x8	bit_xor (mask_float32x8 a, mask_float32x8 b)
	Computes bitwise XOR of floating-point vectors. More...

mask_float64x4	bit_xor (mask_float64x4 a, mask_float64x4 b)
	Computes bitwise XOR of floating-point vectors. More...


basic_int8x16	bit_not (basic_int8x16 a)
	Computes bitwise NOT of an integer vector. More...

basic_int16x8	bit_not (basic_int16x8 a)
	Computes bitwise NOT of an integer vector. More...

basic_int32x4	bit_not (basic_int32x4 a)
	Computes bitwise NOT of an integer vector. More...

basic_int64x2	bit_not (basic_int64x2 a)
	Computes bitwise NOT of an integer vector. More...

basic_int8x32	bit_not (basic_int8x32 a)
	Computes bitwise NOT of an integer vector. More...

basic_int16x16	bit_not (basic_int16x16 a)
	Computes bitwise NOT of an integer vector. More...

basic_int32x8	bit_not (basic_int32x8 a)
	Computes bitwise NOT of an integer vector. More...

basic_int64x4	bit_not (basic_int64x4 a)
	Computes bitwise NOT of an integer vector. More...

mask_int8x16	bit_not (mask_int8x16 a)
	Computes bitwise NOT of an integer vector. More...

mask_int16x8	bit_not (mask_int16x8 a)
	Computes bitwise NOT of an integer vector. More...

mask_int32x4	bit_not (mask_int32x4 a)
	Computes bitwise NOT of an integer vector. More...

mask_int64x2	bit_not (mask_int64x2 a)
	Computes bitwise NOT of an integer vector. More...

mask_int8x32	bit_not (mask_int8x32 a)
	Computes bitwise NOT of an integer vector. More...

mask_int16x16	bit_not (mask_int16x16 a)
	Computes bitwise NOT of an integer vector. More...

mask_int32x8	bit_not (mask_int32x8 a)
	Computes bitwise NOT of an integer vector. More...

mask_int64x4	bit_not (mask_int64x4 a)
	Computes bitwise NOT of an integer vector. More...


float32x4	bit_not (float32x4 a)
	Computes bitwise NOT of a floating-point vector. More...

float64x2	bit_not (float64x2 a)
	Computes bitwise NOT of a floating-point vector. More...

float32x8	bit_not (float32x8 a)
	Computes bitwise NOT of a floating-point vector. More...

float64x4	bit_not (float64x4 a)
	Computes bitwise NOT of a floating-point vector. More...

mask_float32x4	bit_not (mask_float32x4 a)
	Computes bitwise NOT of a floating-point vector. More...

mask_float64x2	bit_not (mask_float64x2 a)
	Computes bitwise NOT of a floating-point vector. More...

mask_float32x8	bit_not (mask_float32x8 a)
	Computes bitwise NOT of a floating-point vector. More...

mask_float64x4	bit_not (mask_float64x4 a)
	Computes bitwise NOT of a floating-point vector. More...


mask_int16x8	cmp_eq (basic_int16x8 a, basic_int16x8 b)
	Compares 16-bit values for equality. More...

mask_int16x16	cmp_eq (basic_int16x16 a, basic_int16x16 b)
	Compares 16-bit values for equality. More...


mask_int32x4	cmp_eq (basic_int32x4 a, basic_int32x4 b)
	Compares the values of two int32x4 vectors for equality. More...

mask_int32x8	cmp_eq (basic_int32x8 a, basic_int32x8 b)
	Compares the values of two int32x8 vectors for equality. More...


mask_int64x2	cmp_eq (basic_int64x2 a, basic_int64x2 b)
	Compares the values of two int64x2 vectors for equality. More...

mask_int64x4	cmp_eq (basic_int64x4 a, basic_int64x4 b)
	Compares the values of two int64x4 vectors for equality. More...


mask_float32x4	cmp_eq (float32x4 a, float32x4 b)
	Compares the values of two float32x4 vectors for equality. More...

mask_float32x8	cmp_eq (float32x8 a, float32x8 b)
	Compares the values of two float32x8 vectors for equality. More...


mask_float64x2	cmp_eq (float64x2 a, float64x2 b)
	Compares the values of two float64x2 vectors for equality. More...

mask_float64x4	cmp_eq (float64x4 a, float64x4 b)
	Compares the values of two float64x4 vectors for equality. More...


mask_int8x16	cmp_neq (basic_int8x16 a, basic_int8x16 b)
	Compares the values of two int8x16 vectors for inequality. More...

mask_int8x32	cmp_neq (basic_int8x32 a, basic_int8x32 b)
	Compares the values of two int8x32 vectors for inequality. More...


mask_int16x8	cmp_neq (basic_int16x8 a, basic_int16x8 b)
	Compares the values of two int16x8 vectors for inequality. More...

mask_int16x16	cmp_neq (basic_int16x16 a, basic_int16x16 b)
	Compares the values of two int16x16 vectors for inequality. More...


mask_int32x4	cmp_neq (basic_int32x4 a, basic_int32x4 b)
	Compares the values of two int32x4 vectors for inequality. More...

mask_int32x8	cmp_neq (basic_int32x8 a, basic_int32x8 b)
	Compares the values of two int32x8 vectors for inequality. More...


mask_int64x2	cmp_neq (basic_int64x2 a, basic_int64x2 b)
	Compares the values of two int64x2 vectors for inequality. More...

mask_int64x4	cmp_neq (basic_int64x4 a, basic_int64x4 b)
	Compares the values of two int64x4 vectors for inequality. More...


mask_float32x4	cmp_neq (float32x4 a, float32x4 b)
	Compares the values of two float32x4 vectors for inequality. More...

mask_float32x8	cmp_neq (float32x8 a, float32x8 b)
	Compares the values of two float32x8 vectors for inequality. More...


mask_float64x2	cmp_neq (float64x2 a, float64x2 b)
	Compares the values of two float64x2 vectors for inequality. More...

mask_float64x4	cmp_neq (float64x4 a, float64x4 b)
	Compares the values of two float64x4 vectors for inequality. More...


mask_int8x16	cmp_gt (int8x16 a, int8x16 b)
	Compares the values of two signed int8x16 vectors for greater-than. More...

mask_int8x32	cmp_gt (int8x32 a, int8x32 b)
	Compares the values of two signed int8x32 vectors for greater-than. More...


mask_int8x16	cmp_gt (uint8x16 a, uint8x16 b)
	Compares the values of two unsigned int8x16 vectors for greater-than. More...

mask_int8x32	cmp_gt (uint8x32 a, uint8x32 b)
	Compares the values of two unsigned int8x32 vectors for greater-than. More...


mask_int16x8	cmp_gt (int16x8 a, int16x8 b)
	Compares the values of two signed int16x8 vectors for greater-than. More...

mask_int16x16	cmp_gt (int16x16 a, int16x16 b)
	Compares the values of two signed int16x16 vectors for greater-than. More...


mask_int16x8	cmp_gt (uint16x8 a, uint16x8 b)
	Compares the values of two unsigned int16x8 vectors for greater-than. More...

mask_int16x16	cmp_gt (uint16x16 a, uint16x16 b)
	Compares the values of two unsigned int16x16 vectors for greater-than. More...


mask_int32x4	cmp_gt (int32x4 a, int32x4 b)
	Compares the values of two signed int32x4 vectors for greater-than. More...

mask_int32x8	cmp_gt (int32x8 a, int32x8 b)
	Compares the values of two signed int32x8 vectors for greater-than. More...


mask_int32x4	cmp_gt (uint32x4 a, uint32x4 b)
	Compares the values of two unsigned int32x4 vectors for greater-than. More...

mask_int32x8	cmp_gt (uint32x8 a, uint32x8 b)
	Compares the values of two unsigned int32x8 vectors for greater-than. More...


mask_float32x4	cmp_gt (float32x4 a, float32x4 b)
	Compares the values of two float32x4 vectors for greater-than. More...

mask_float32x8	cmp_gt (float32x8 a, float32x8 b)
	Compares the values of two float32x8 vectors for greater-than. More...


mask_float32x4	cmp_ge (float32x4 a, float32x4 b)
	Compares the values of two float32x4 vectors for greater-than or equal. More...

mask_float32x8	cmp_ge (float32x8 a, float32x8 b)
	Compares the values of two float32x8 vectors for greater-than or equal. More...


mask_int8x16	cmp_lt (int8x16 a, int8x16 b)
	Compares the values of two signed int8x16 vectors for less-than. More...

mask_int8x32	cmp_lt (int8x32 a, int8x32 b)
	Compares the values of two signed int8x32 vectors for less-than. More...


mask_int8x16	cmp_lt (uint8x16 a, uint8x16 b)
	Compares the values of two unsigned int8x16 vectors for less-than. More...

mask_int8x32	cmp_lt (uint8x32 a, uint8x32 b)
	Compares the values of two unsigned int8x32 vectors for less-than. More...


mask_int16x8	cmp_lt (int16x8 a, int16x8 b)
	Compares the values of two signed int16x8 vectors for less-than. More...

mask_int16x16	cmp_lt (int16x16 a, int16x16 b)
	Compares the values of two signed int16x16 vectors for less-than. More...


mask_int16x8	cmp_lt (uint16x8 a, uint16x8 b)
	Compares the values of two unsigned int16x8 vectors for less-than. More...

mask_int16x16	cmp_lt (uint16x16 a, uint16x16 b)
	Compares the values of two unsigned int16x16 vectors for less-than. More...


mask_int32x4	cmp_lt (int32x4 a, int32x4 b)
	Compares the values of two signed int32x4 vectors for less-than. More...

mask_int32x8	cmp_lt (int32x8 a, int32x8 b)
	Compares the values of two signed int32x8 vectors for less-than. More...


mask_int32x4	cmp_lt (uint32x4 a, uint32x4 b)
	Compares the values of two unsigned int32x4 vectors for less-than. More...

mask_int32x8	cmp_lt (uint32x8 a, uint32x8 b)
	Compares the values of two unsigned int32x8 vectors for less-than. More...


mask_float32x4	cmp_lt (float32x4 a, float32x4 b)
	Compares the values of two float32x4 vectors for less-than. More...

mask_float32x8	cmp_lt (float32x8 a, float32x8 b)
	Compares the values of two float32x8 vectors for less-than. More...


mask_float64x2	cmp_lt (float64x2 a, float64x2 b)
	Compares the values of two float64x2 vectors for less-than. More...

mask_float64x4	cmp_lt (float64x4 a, float64x4 b)
	Compares the values of two float64x4 vectors for less-than. More...


mask_float32x4	cmp_le (float32x4 a, float32x4 b)
	Compares the values of two float32x4 vectors for less-than or equal. More...

mask_float32x8	cmp_le (float32x8 a, float32x8 b)
	Compares the values of two float32x8 vectors for less-than or equal. More...


mask_float64x2	cmp_le (float64x2 a, float64x2 b)
	Compares the values of two float64x2 vectors for less-than or equal. More...

mask_float64x4	cmp_le (float64x4 a, float64x4 b)
	Compares the values of two float64x4 vectors for less-than or equal. More...


basic_int32x4	to_int32x4 (float32x4 a)
	Converts the values of a float32x4 vector into signed int32_t representation using truncation if only an inexact conversion can be performed. More...

basic_int32x8	to_int32x8 (float32x8 a)
	Converts the values of a float32x8 vector into signed int32_t representation using truncation if only an inexact conversion can be performed. More...


basic_int32x4	to_int32x4 (float64x2 a)
	Converts the values of a float64x2 vector into int32_t representation using truncation. More...

basic_int32x8	to_int32x8 (float64x4 a)
	Converts the values of a float64x4 vector into int32_t representation using truncation. More...


basic_int64x2	to_int64x2 (int32x4 a)
	Extends the values of a signed int32x4 vector to 64-bits. More...

basic_int64x4	to_int64x4 (int32x8 a)
	Extends the values of a signed int32x8 vector to 64-bits. More...

basic_int64x2	to_int64x2 (uint32x4 a)
	Extends the values of an unsigned int32x4 vector to 64-bits. More...

basic_int64x4	to_int64x4 (uint32x8 a)
	Extends the values of an unsigned int32x8 vector to 64-bits. More...


float32x4	to_float32x4 (int32x4 a)
	Converts 32-bit integer values to 32-bit float values. More...

float32x8	to_float32x8 (int32x8 a)
	Converts 32-bit integer values to 32-bit float values. More...


float32x4	to_float32x4 (float64x2 a)
	Converts 64-bit float values to 32-bit float values. More...

float32x8	to_float32x8 (float64x4 a)
	Converts 64-bit float values to 32-bit float values. More...


float64x2	to_float64x2 (int32x4 a)
	Converts the 32-bit integer values to 64-bit float values. More...

float64x4	to_float64x4 (int32x8 a)
	Converts the 32-bit integer values to 64-bit float values. More...


float64x2	to_float64x2 (float32x4 a)
	Converts the 32-bit float values to 64-bit float values. More...

float64x4	to_float64x4 (float32x8 a)
	Converts the 32-bit float values to 64-bit float values. More...


template<unsigned id>
uint8_t	extract (basic_int8x16 a)
	Extracts the id-th element from int8x16 vector. More...

template<unsigned id>
int8_t	extract (int8x16 a)
	Extracts the id-th element from int8x16 vector. More...


template<unsigned id>
uint16_t	extract (basic_int16x8 a)
	Extracts the id-th element from int16x8 vector. More...

template<unsigned id>
int16_t	extract (int16x8 a)
	Extracts the id-th element from int16x8 vector. More...


template<unsigned id>
uint32_t	extract (basic_int32x4 a)
	Extracts the id-th element from int32x4 vector. More...

template<unsigned id>
int32_t	extract (int32x4 a)
	Extracts the id-th element from int32x4 vector. More...


template<unsigned id>
uint64_t	extract (basic_int64x2 a)
	Extracts an element from int64x2 vector. More...

template<unsigned id>
int64_t	extract (int64x2 a)
	Extracts an element from int64x2 vector. More...


int256	combine (int128 a, int128 b)
	Combines two 128-bit vectors into a 256-bit vector. More...

float32x8	combine (float32x4 a, float32x4 b)
	Combines two 128-bit vectors into a 256-bit vector. More...

float64x4	combine (float64x2 a, float64x2 b)
	Combines two 128-bit vectors into a 256-bit vector. More...


template<int s0, int s1>
basic_int8x16	make_shuffle_bytes16_mask (basic_int8x16 &mask)
	Makes a mask to shuffle an int8x16 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1>
basic_int8x32	make_shuffle_bytes16_mask (basic_int8x32 &mask)
	Makes a mask to shuffle an int8x16 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...


template<int s0, int s1, int s2, int s3>
basic_int8x16	make_shuffle_bytes16_mask (basic_int8x16 &mask)
	Makes a mask to shuffle an int8x16 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1, int s2, int s3>
basic_int8x32	make_shuffle_bytes16_mask (basic_int8x32 &mask)
	Makes a mask to shuffle an int8x16 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...


template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7>
basic_int8x16	make_shuffle_bytes16_mask (basic_int8x16 &mask)
	Makes a mask to shuffle an int8x16 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7>
basic_int8x32	make_shuffle_bytes16_mask (basic_int8x32 &mask)
	Makes a mask to shuffle an int8x16 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...


template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, int s8, int s9, int s10, int s11, int s12, int s13, int s14, int s15>
basic_int8x16	make_shuffle_bytes16_mask (basic_int8x16 &mask)
	Makes a mask to shuffle an int8x16 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, int s8, int s9, int s10, int s11, int s12, int s13, int s14, int s15>
basic_int8x32	make_shuffle_bytes16_mask (basic_int8x32 &mask)
	Makes a mask to shuffle an int8x16 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...


template<int s0, int s1>
basic_int16x8	make_shuffle_bytes16_mask (basic_int16x8 &mask)
	Makes a mask to shuffle an int16x8 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1>
basic_int16x16	make_shuffle_bytes16_mask (basic_int16x16 &mask)
	Makes a mask to shuffle an int16x8 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...


template<int s0, int s1, int s2, int s3>
basic_int16x8	make_shuffle_bytes16_mask (basic_int16x8 &mask)
	Makes a mask to shuffle an int16x8 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1, int s2, int s3>
basic_int16x16	make_shuffle_bytes16_mask (basic_int16x16 &mask)
	Makes a mask to shuffle an int16x8 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...


template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7>
basic_int16x8	make_shuffle_bytes16_mask (basic_int16x8 &mask)
	Makes a mask to shuffle an int16x8 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7>
basic_int16x16	make_shuffle_bytes16_mask (basic_int16x16 &mask)
	Makes a mask to shuffle an int16x8 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...


template<int s0, int s1>
basic_int32x4	make_shuffle_bytes16_mask (basic_int32x4 &mask)
	Makes a mask to shuffle an int32x4 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1>
basic_int32x8	make_shuffle_bytes16_mask (basic_int32x8 &mask)
	Makes a mask to shuffle an int32x4 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...


template<int s0, int s1, int s2, int s3>
basic_int32x4	make_shuffle_bytes16_mask (basic_int32x4 &mask)
	Makes a mask to shuffle an int32x4 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1, int s2, int s3>
basic_int32x8	make_shuffle_bytes16_mask (basic_int32x8 &mask)
	Makes a mask to shuffle an int32x4 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...


template<int s0, int s1>
basic_int64x2	make_shuffle_bytes16_mask (basic_int64x2 &mask)
	Makes a mask to shuffle an int64x2 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...

template<int s0, int s1>
basic_int64x4	make_shuffle_bytes16_mask (basic_int64x4 &mask)
	Makes a mask to shuffle an int64x2 vector using `permute_bytes16`, `shuffle_bytes16`, `permute_zbytes16` or `shuffle_zbytes16` functions. More...


mask_float32x4	isnan (float32x4 a)
	Checks whether elements in a are IEEE754 NaN. More...

mask_float32x8	isnan (float32x8 a)
	Checks whether elements in a are IEEE754 NaN. More...


mask_float64x2	isnan (float64x2 a)
	Checks whether elements in a are IEEE754 NaN. More...

mask_float64x4	isnan (float64x4 a)
	Checks whether elements in a are IEEE754 NaN. More...


mask_float32x4	isnan2 (float32x4 a, float32x4 b)
	Checks whether corresponding elements in either a or b are IEEE754 NaN. More...

mask_float32x8	isnan2 (float32x8 a, float32x8 b)
	Checks whether corresponding elements in either a or b are IEEE754 NaN. More...


mask_float64x2	isnan2 (float64x2 a, float64x2 b)
	Checks whether corresponding elements in either a or b are IEEE754 NaN. More...

mask_float64x4	isnan2 (float64x4 a, float64x4 b)
	Checks whether corresponding elements in either a or b are IEEE754 NaN. More...


float32x4	rcp_e (float32x4 a)
	Computes approximate reciprocal. More...

float32x8	rcp_e (float32x8 a)
	Computes approximate reciprocal. More...


float32x4	rcp_rh (float32x4 x, float32x4 a)
	Computes one Newton-Raphson iteration for reciprocal. More...

float32x8	rcp_rh (float32x8 x, float32x8 a)
	Computes one Newton-Raphson iteration for reciprocal. More...


float32x4	div (float32x4 a, float32x4 b)
	Divides the values of two vectors. More...

float32x8	div (float32x8 a, float32x8 b)
	Divides the values of two vectors. More...


float64x2	div (float64x2 a, float64x2 b)
	Divides the values of two vectors. More...

float64x4	div (float64x4 a, float64x4 b)
	Divides the values of two vectors. More...


float32x4	rsqrt_e (float32x4 a)
	Computes approximate reciprocal square root. More...

float32x8	rsqrt_e (float32x8 a)
	Computes approximate reciprocal square root. More...


float32x4	rsqrt_rh (float32x4 x, float32x4 a)
	Computes one Newton-Raphson iteration for inverse of square root. More...

float32x8	rsqrt_rh (float32x8 x, float32x8 a)
	Computes one Newton-Raphson iteration for inverse of square root. More...


float32x4	sqrt (float32x4 a)
	Computes square root. More...

float32x8	sqrt (float32x8 a)
	Computes square root. More...


float64x2	sqrt (float64x2 a)
	Computes square root. More...

float64x4	sqrt (float64x4 a)
	Computes square root. More...


float32x4	min (float32x4 a, float32x4 b)
	Computes minimum of the values in two vectors. More...

float32x8	min (float32x8 a, float32x8 b)
	Computes minimum of the values in two vectors. More...


float64x2	min (float64x2 a, float64x2 b)
	Computes minima of the values in two vectors. More...

float64x4	min (float64x4 a, float64x4 b)
	Computes minima of the values in two vectors. More...


float32x4	max (float32x4 a, float32x4 b)
	Computes maxima of the values of two vectors. More...

float32x8	max (float32x8 a, float32x8 b)
	Computes maxima of the values of two vectors. More...


float64x2	max (float64x2 a, float64x2 b)
	Computes maxima of the values of two vectors. More...

float64x4	max (float64x4 a, float64x4 b)
	Computes maxima of the values of two vectors. More...


float32x4	floor (float32x4 a)
	Rounds the values of a vector towards negative infinity. More...

float32x8	floor (float32x8 a)
	Rounds the values of a vector towards negative infinity. More...

float32x4	ceil (float32x4 a)
	Rounds the values of a vector towards positive infinity. More...

float32x8	ceil (float32x8 a)
	Rounds the values of a vector towards positive infinity. More...


float32x4	trunc (float32x4 a)
	Rounds the values of a vector towards zero. More...

float32x8	trunc (float32x8 a)
	Rounds the values of a vector towards zero. More...


float64x2	abs (float64x2 a)
	Computes absolute value of floating point values. More...

float64x4	abs (float64x4 a)
	Computes absolute value of floating point values. More...


float32x4	sign (float32x4 a)
	Extracts sign bits from the values in float32x4 vector. More...

float32x8	sign (float32x8 a)
	Extracts sign bits from the values in float32x8 vector. More...


float64x2	sign (float64x2 a)
	Extracts sign bits from the values in float64x2 vector. More...

float64x4	sign (float64x4 a)
	Extracts sign bits from the values in float64x4 vector. More...


float32x4	add (float32x4 a, float32x4 b)
	Adds the values of two vectors. More...

float32x8	add (float32x8 a, float32x8 b)
	Adds the values of two vectors. More...


float64x2	add (float64x2 a, float64x2 b)
	Adds the values of two vectors. More...

float64x4	add (float64x4 a, float64x4 b)
	Adds the values of two vectors. More...


float32x4	sub (float32x4 a, float32x4 b)
	Subtracts the values of two vectors. More...

float32x8	sub (float32x8 a, float32x8 b)
	Subtracts the values of two vectors. More...


float64x2	sub (float64x2 a, float64x2 b)
	Subtracts the values of two vectors. More...

float64x4	sub (float64x4 a, float64x4 b)
	Subtracts the values of two vectors. More...


float32x4	neg (float32x4 a)
	Negates the values of a float32x4 vector. More...

float32x8	neg (float32x8 a)
	Negates the values of a float32x8 vector. More...


float64x2	neg (float64x2 a)
	Negates the values of a vector. More...

float64x4	neg (float64x4 a)
	Negates the values of a vector. More...


float32x4	mul (float32x4 a, float32x4 b)
	Multiplies the values of two vectors. More...

float32x8	mul (float32x8 a, float32x8 b)
	Multiplies the values of two vectors. More...


float64x2	mul (float64x2 a, float64x2 b)
	Multiplies the values of two vectors. More...

float64x4	mul (float64x4 a, float64x4 b)
	Multiplies the values of two vectors. More...


float32x4	fmadd (float32x4 a, float32x4 b, float32x4 c)
	Performs a fused multiply-add operation. More...

float32x8	fmadd (float32x8 a, float32x8 b, float32x8 c)
	Performs a fused multiply-add operation. More...

float64x2	fmadd (float64x2 a, float64x2 b, float64x2 c)
	Performs a fused multiply-add operation. More...

float64x4	fmadd (float64x4 a, float64x4 b, float64x4 c)
	Performs a fused multiply-add operation. More...


float32x4	fmsub (float32x4 a, float32x4 b, float32x4 c)
	Performs a fused multiply-subtract operation. More...

float32x8	fmsub (float32x8 a, float32x8 b, float32x8 c)
	Performs a fused multiply-subtract operation. More...

float64x2	fmsub (float64x2 a, float64x2 b, float64x2 c)
	Performs a fused multiply-subtract operation. More...

float64x4	fmsub (float64x4 a, float64x4 b, float64x4 c)
	Performs a fused multiply-subtract operation. More...


int8x16	min (int8x16 a, int8x16 b)
	Computes minimum of signed 8-bit values. More...

int8x32	min (int8x32 a, int8x32 b)
	Computes minimum of signed 8-bit values. More...


uint8x16	min (uint8x16 a, uint8x16 b)
	Computes minimum of the unsigned 8-bit values. More...

uint8x32	min (uint8x32 a, uint8x32 b)
	Computes minimum of the unsigned 8-bit values. More...


int16x8	min (int16x8 a, int16x8 b)
	Computes minimum of the signed 16-bit values. More...

int16x16	min (int16x16 a, int16x16 b)
	Computes minimum of the signed 16-bit values. More...


uint16x8	min (uint16x8 a, uint16x8 b)
	Computes minimum of the unsigned 16-bit values. More...

uint16x16	min (uint16x16 a, uint16x16 b)
	Computes minimum of the unsigned 16-bit values. More...


int32x4	min (int32x4 a, int32x4 b)
	Computes minimum of the signed 32-bit values. More...

int32x8	min (int32x8 a, int32x8 b)
	Computes minimum of the signed 32-bit values. More...


uint32x4	min (uint32x4 a, uint32x4 b)
	Computes minimum of the unsigned 32-bit values. More...

uint32x8	min (uint32x8 a, uint32x8 b)
	Computes minimum of the unsigned 32-bit values. More...


int8x16	max (int8x16 a, int8x16 b)
	Computes maximum of the signed 8-bit values. More...

int8x32	max (int8x32 a, int8x32 b)
	Computes maximum of the signed 8-bit values. More...


uint8x16	max (uint8x16 a, uint8x16 b)
	Computes maximum of the unsigned 8-bit values. More...

uint8x32	max (uint8x32 a, uint8x32 b)
	Computes maximum of the unsigned 8-bit values. More...


int16x8	max (int16x8 a, int16x8 b)
	Computes maximum of the signed 16-bit values. More...

int16x16	max (int16x16 a, int16x16 b)
	Computes maximum of the signed 16-bit values. More...


uint16x8	max (uint16x8 a, uint16x8 b)
	Computes maximum of the unsigned 16-bit values. More...

uint16x16	max (uint16x16 a, uint16x16 b)
	Computes maximum of the unsigned 16-bit values. More...


int32x4	max (int32x4 a, int32x4 b)
	Computes maximum of the signed 32-bit values. More...

int32x8	max (int32x8 a, int32x8 b)
	Computes maximum of the signed 32-bit values. More...


uint32x4	max (uint32x4 a, uint32x4 b)
	Computes maximum of the unsigned 32-bit values. More...

uint32x8	max (uint32x8 a, uint32x8 b)
	Computes maximum of the unsigned 32-bit values. More...


uint8x16	avg (uint8x16 a, uint8x16 b)
	Computes rounded average of the unsigned 8-bit values. More...

uint8x32	avg (uint8x32 a, uint8x32 b)
	Computes rounded average of the unsigned 8-bit values. More...


int8x16	avg (int8x16 a, int8x16 b)
	Computes rounded average of signed 8-bit values. More...

int8x32	avg (int8x32 a, int8x32 b)
	Computes rounded average of signed 8-bit values. More...


uint16x8	avg (uint16x8 a, uint16x8 b)
	Computes rounded average of unsigned 16-bit values. More...

uint16x16	avg (uint16x16 a, uint16x16 b)
	Computes rounded average of unsigned 16-bit values. More...


int16x8	avg (int16x8 a, int16x8 b)
	Computes rounded average of signed 16-bit values. More...

int16x16	avg (int16x16 a, int16x16 b)
	Computes rounded average of signed 16-bit values. More...


uint32x4	avg (uint32x4 a, uint32x4 b)
	Computes rounded average of unsigned 32-bit values. More...

uint32x8	avg (uint32x8 a, uint32x8 b)
	Computes rounded average of unsigned 32-bit values. More...


int32x4	avg (int32x4 a, int32x4 b)
	Computes rounded average of signed 32-bit values. More...

int32x8	avg (int32x8 a, int32x8 b)
	Computes rounded average of signed 32-bit values. More...


uint8x16	avg_trunc (uint8x16 a, uint8x16 b)
	Computes truncated average of the unsigned 8-bit values. More...

uint8x32	avg_trunc (uint8x32 a, uint8x32 b)
	Computes truncated average of the unsigned 8-bit values. More...


int8x16	avg_trunc (int8x16 a, int8x16 b)
	Computes truncated average of signed 8-bit values. More...

int8x32	avg_trunc (int8x32 a, int8x32 b)
	Computes truncated average of signed 8-bit values. More...


uint16x8	avg_trunc (uint16x8 a, uint16x8 b)
	Computes truncated average of unsigned 16-bit values. More...

uint16x16	avg_trunc (uint16x16 a, uint16x16 b)
	Computes truncated average of unsigned 16-bit values. More...


int16x8	avg_trunc (int16x8 a, int16x8 b)
	Computes truncated average of signed 16-bit values. More...

int16x16	avg_trunc (int16x16 a, int16x16 b)
	Computes truncated average of signed 16-bit values. More...


uint32x4	avg_trunc (uint32x4 a, uint32x4 b)
	Computes truncated average of unsigned 32-bit values. More...

uint32x8	avg_trunc (uint32x8 a, uint32x8 b)
	Computes truncated average of unsigned 32-bit values. More...


int32x4	avg_trunc (int32x4 a, int32x4 b)
	Computes truncated average of signed 32-bit values. More...

int32x8	avg_trunc (int32x8 a, int32x8 b)
	Computes truncated average of signed 32-bit values. More...


uint8x16	abs (int8x16 a)
	Computes absolute value of 8-bit integer values. More...

uint8x32	abs (int8x32 a)
	Computes absolute value of 8-bit integer values. More...


uint16x8	abs (int16x8 a)
	Computes absolute value of 16-bit integer values. More...

uint16x16	abs (int16x16 a)
	Computes absolute value of 16-bit integer values. More...


uint32x4	abs (int32x4 a)
	Computes absolute value of 32-bit integer values. More...

uint32x8	abs (int32x8 a)
	Computes absolute value of 32-bit integer values. More...


uint64x2	abs (int64x2 a)
	Computes absolute value of 64-bit integer values. More...

uint64x4	abs (int64x4 a)
	Computes absolute value of 64-bit integer values. More...


template<unsigned P>
uint8x16	div_p (uint8x16 num, uint8x16 den)
	Divides one 8-bit unsigned number by another. More...

template<unsigned P>
uint16x8	div_p (uint16x8 num, uint16x8 den)
	Divides one 16-bit unsigned number by another. More...


basic_int16x8	add (basic_int16x8 a, basic_int16x8 b)
	Adds 16-bit integer values. More...

basic_int16x16	add (basic_int16x16 a, basic_int16x16 b)
	Adds 16-bit integer values. More...


basic_int32x4	add (basic_int32x4 a, basic_int32x4 b)
	Adds 32-bit integer values. More...

basic_int32x8	add (basic_int32x8 a, basic_int32x8 b)
	Adds 32-bit integer values. More...


basic_int64x2	add (basic_int64x2 a, basic_int64x2 b)
	Adds 64-bit integer values. More...

basic_int64x4	add (basic_int64x4 a, basic_int64x4 b)
	Adds 64-bit integer values. More...


int8x16	adds (int8x16 a, int8x16 b)
	Adds and saturates signed 8-bit integer values. More...

int8x32	adds (int8x32 a, int8x32 b)
	Adds and saturates signed 8-bit integer values. More...


int16x8	adds (int16x8 a, int16x8 b)
	Adds and saturates signed 16-bit integer values. More...

int16x16	adds (int16x16 a, int16x16 b)
	Adds and saturates signed 16-bit integer values. More...


uint8x16	adds (uint8x16 a, uint8x16 b)
	Adds and saturates unsigned 8-bit integer values. More...

uint8x32	adds (uint8x32 a, uint8x32 b)
	Adds and saturates unsigned 8-bit integer values. More...


uint16x8	adds (uint16x8 a, uint16x8 b)
	Adds and saturates unsigned 16-bit integer values. More...

uint16x16	adds (uint16x16 a, uint16x16 b)
	Adds and saturates unsigned 16-bit integer values. More...


basic_int8x16	sub (basic_int8x16 a, basic_int8x16 b)
	Subtracts 8-bit integer values. More...

basic_int8x32	sub (basic_int8x32 a, basic_int8x32 b)
	Subtracts 8-bit integer values. More...


basic_int16x8	sub (basic_int16x8 a, basic_int16x8 b)
	Subtracts 16-bit integer values. More...

basic_int16x16	sub (basic_int16x16 a, basic_int16x16 b)
	Subtracts 16-bit integer values. More...


basic_int32x4	sub (basic_int32x4 a, basic_int32x4 b)
	Subtracts 32-bit integer values. More...

basic_int32x8	sub (basic_int32x8 a, basic_int32x8 b)
	Subtracts 32-bit integer values. More...


basic_int64x2	sub (basic_int64x2 a, basic_int64x2 b)
	Subtracts 64-bit integer values. More...

basic_int64x4	sub (basic_int64x4 a, basic_int64x4 b)
	Subtracts 64-bit integer values. More...


int8x16	subs (int8x16 a, int8x16 b)
	Subtracts and saturates signed 8-bit integer values. More...

int8x32	subs (int8x32 a, int8x32 b)
	Subtracts and saturates signed 8-bit integer values. More...


int16x8	subs (int16x8 a, int16x8 b)
	Subtracts and saturates signed 16-bit integer values. More...

int16x16	subs (int16x16 a, int16x16 b)
	Subtracts and saturates signed 16-bit integer values. More...


uint8x16	subs (uint8x16 a, uint8x16 b)
	Subtracts and saturates unsigned 8-bit integer values. More...

uint8x32	subs (uint8x32 a, uint8x32 b)
	Subtracts and saturates unsigned 8-bit integer values. More...


uint16x8	subs (uint16x8 a, uint16x8 b)
	Subtracts and saturates unsigned 16-bit integer values. More...

uint16x16	subs (uint16x16 a, uint16x16 b)
	Subtracts and saturates unsigned 16-bit integer values. More...


int8x16	neg (int8x16 a)
	Negates signed 8-bit values. More...

int8x32	neg (int8x32 a)
	Negates signed 8-bit values. More...


int16x8	neg (int16x8 a)
	Negates signed 16-bit values. More...

int16x16	neg (int16x16 a)
	Negates signed 16-bit values. More...


int32x4	neg (int32x4 a)
	Negates signed 32-bit values. More...

int32x8	neg (int32x8 a)
	Negates signed 32-bit values. More...


int64x2	neg (int64x2 a)
	Negates signed 64-bit values. More...

int64x4	neg (int64x4 a)
	Negates signed 64-bit values. More...


basic_int16x8	mul_lo (basic_int16x8 a, basic_int16x8 b)
	Multiplies 16-bit values and returns the lower part of the multiplication. More...

basic_int16x16	mul_lo (basic_int16x16 a, basic_int16x16 b)
	Multiplies 16-bit values and returns the lower part of the multiplication. More...


int16x8	mul_hi (int16x8 a, int16x8 b)
	Multiplies signed 16-bit values and returns the higher half of the result. More...

int16x16	mul_hi (int16x16 a, int16x16 b)
	Multiplies signed 16-bit values and returns the higher half of the result. More...


uint16x8	mul_hi (uint16x8 a, uint16x8 b)
	Multiplies unsigned 16-bit values and returns the higher half of the result. More...

uint16x16	mul_hi (uint16x16 a, uint16x16 b)
	Multiplies unsigned 16-bit values and returns the higher half of the result. More...


int128	mul_lo (basic_int32x4 a, basic_int32x4 b)
	Multiplies 32-bit values and returns the lower half of the result. More...

basic_int32x8	mul_lo (basic_int32x8 a, basic_int32x8 b)
	Multiplies 32-bit values and returns the lower half of the result. More...


int32x4	mull_lo (int16x8 a, int16x8 b)
	Multiplies signed 16-bit values in the lower halves of the vectors and expands the results to 32 bits. More...

int32x8	mull_lo (int16x16 a, int16x16 b)
	Multiplies signed 16-bit values in the lower halves of the vectors and expands the results to 32 bits. More...


uint32x4	mull_lo (uint16x8 a, uint16x8 b)
	Multiplies unsigned 16-bit values in the lower halves of the vectors and expands the results to 32 bits. More...

uint32x8	mull_lo (uint16x16 a, uint16x16 b)
	Multiplies unsigned 16-bit values in the lower halves of the vectors and expands the results to 32 bits. More...


int32x4	mull_hi (int16x8 a, int16x8 b)
	Multiplies signed 16-bit values in the higher halves of the vectors and expands the results to 32 bits. More...

int32x8	mull_hi (int16x16 a, int16x16 b)
	Multiplies signed 16-bit values in the higher halves of the vectors and expands the results to 32 bits. More...


uint32x4	mull_hi (uint16x8 a, uint16x8 b)
	Multiplies unsigned 16-bit values in the higher halves of the vectors and expands the results to 32 bits. More...

uint32x8	mull_hi (uint16x16 a, uint16x16 b)
	Multiplies unsigned 16-bit values in the higher halves of the vectors and expands the results to 32 bits. More...


int64x2	mull_lo (int32x4 a, int32x4 b)
	Multiplies signed 32-bit values in the lower halves of the vectors and expands the results to 64 bits. More...

int64x4	mull_lo (int32x8 a, int32x8 b)
	Multiplies signed 32-bit values in the lower halves of the vectors and expands the results to 64 bits. More...


uint64x2	mull_lo (uint32x4 a, uint32x4 b)
	Multiplies unsigned 32-bit values in the lower halves of the vectors and expands the results to 64 bits. More...

uint64x4	mull_lo (uint32x8 a, uint32x8 b)
	Multiplies unsigned 32-bit values in the lower halves of the vectors and expands the results to 64 bits. More...


int64x2	mull_hi (int32x4 a, int32x4 b)
	Multiplies signed 32-bit values in the higher halves of the vectors and expands the results to 64 bits. More...

int64x4	mull_hi (int32x8 a, int32x8 b)
	Multiplies signed 32-bit values in the higher halves of the vectors and expands the results to 64 bits. More...


uint64x2	mull_hi (uint32x4 a, uint32x4 b)
	Multiplies unsigned 32-bit values in the higher halves of the vectors and expands the results to 64 bits. More...

uint64x4	mull_hi (uint32x8 a, uint32x8 b)
	Multiplies unsigned 32-bit values in the higher halves of the vectors and expands the results to 64 bits. More...


uint8x16	shift_r (uint8x16 a, unsigned count)
	Shifts unsigned 8-bit values right by count bits while shifting in zeros. More...

uint8x32	shift_r (uint8x32 a, unsigned count)
	Shifts unsigned 8-bit values right by count bits while shifting in zeros. More...


int16x8	shift_r (int16x8 a, unsigned count)
	Shifts signed 16-bit values right by count bits while shifting in the sign bit. More...

int16x16	shift_r (int16x16 a, unsigned count)
	Shifts signed 16-bit values right by count bits while shifting in the sign bit. More...


uint16x8	shift_r (uint16x8 a, unsigned count)
	Shifts unsigned 16-bit values right by count bits while shifting in zeros. More...

uint16x16	shift_r (uint16x16 a, unsigned count)
	Shifts unsigned 16-bit values right by count bits while shifting in zeros. More...


int32x4	shift_r (int32x4 a, unsigned count)
	Shifts signed 32-bit values right by count bits while shifting in the sign bit. More...

int32x8	shift_r (int32x8 a, unsigned count)
	Shifts signed 32-bit values right by count bits while shifting in the sign bit. More...


uint32x4	shift_r (uint32x4 a, unsigned count)
	Shifts unsigned 32-bit values right by count bits while shifting in zeros. More...

uint32x8	shift_r (uint32x8 a, unsigned count)
	Shifts unsigned 32-bit values right by count bits while shifting in zeros. More...


int64x2	shift_r (int64x2 a, unsigned count)
	Shifts signed 64-bit values right by count bits while shifting in the sign bit. More...

int64x4	shift_r (int64x4 a, unsigned count)
	Shifts signed 64-bit values right by count bits while shifting in the sign bit. More...


uint64x2	shift_r (uint64x2 a, unsigned count)
	Shifts unsigned 64-bit values right by count bits while shifting in zeros. More...

uint64x4	shift_r (uint64x4 a, unsigned count)
	Shifts unsigned 64-bit values right by count bits while shifting in zeros. More...


basic_int8x16	shift_l (basic_int8x16 a, unsigned count)
	Shifts 8-bit values left by count bits while shifting in zeros. More...

basic_int8x32	shift_l (basic_int8x32 a, unsigned count)
	Shifts 8-bit values left by count bits while shifting in zeros. More...


basic_int16x8	shift_l (basic_int16x8 a, unsigned count)
	Shifts 16-bit values left by count bits while shifting in zeros. More...

basic_int16x16	shift_l (basic_int16x16 a, unsigned count)
	Shifts 16-bit values left by count bits while shifting in zeros. More...


basic_int32x4	shift_l (basic_int32x4 a, unsigned count)
	Shifts 32-bit values left by count bits while shifting in zeros. More...

basic_int32x8	shift_l (basic_int32x8 a, unsigned count)
	Shifts 32-bit values left by count bits while shifting in zeros. More...


basic_int64x2	shift_l (basic_int64x2 a, unsigned count)
	Shifts 64-bit values left by count bits while shifting in zeros. More...

basic_int64x4	shift_l (basic_int64x4 a, unsigned count)
	Shifts 64-bit values left by count bits while shifting in zeros. More...


template<unsigned count>
int8x16	shift_r (int8x16 a)
	Shifts signed 8-bit values right by count bits while shifting in the sign bit. More...

template<unsigned count>
int8x32	shift_r (int8x32 a)
	Shifts signed 8-bit values right by count bits while shifting in the sign bit. More...


template<unsigned count>
uint8x16	shift_r (uint8x16 a)
	Shifts unsigned 8-bit values right by count bits while shifting in zeros. More...

template<unsigned count>
uint8x32	shift_r (uint8x32 a)
	Shifts unsigned 8-bit values right by count bits while shifting in zeros. More...


template<unsigned count>
int16x8	shift_r (int16x8 a)
	Shifts signed 16-bit values right by count bits while shifting in the sign bit. More...

template<unsigned count>
int16x16	shift_r (int16x16 a)
	Shifts signed 16-bit values right by count bits while shifting in the sign bit. More...


template<unsigned count>
uint16x8	shift_r (uint16x8 a)
	Shifts unsigned 16-bit values right by count bits while shifting in zeros. More...

template<unsigned count>
uint16x16	shift_r (uint16x16 a)
	Shifts unsigned 16-bit values right by count bits while shifting in zeros. More...


template<unsigned count>
int32x4	shift_r (int32x4 a)
	Shifts signed 32-bit values right by count bits while shifting in the sign bit. More...

template<unsigned count>
int32x8	shift_r (int32x8 a)
	Shifts signed 32-bit values right by count bits while shifting in the sign bit. More...


template<unsigned count>
uint32x4	shift_r (uint32x4 a)
	Shifts unsigned 32-bit values right by count bits while shifting in zeros. More...

template<unsigned count>
uint32x8	shift_r (uint32x8 a)
	Shifts unsigned 32-bit values right by count bits while shifting in zeros. More...


template<unsigned count>
int64x2	shift_r (int64x2 a)
	Shifts signed 64-bit values right by count bits while shifting in the sign bit. More...

template<unsigned count>
int64x4	shift_r (int64x4 a)
	Shifts signed 64-bit values right by count bits while shifting in the sign bit. More...


template<unsigned count>
uint64x2	shift_r (uint64x2 a)
	Shifts unsigned 64-bit values right by count bits while shifting in zeros. More...

template<unsigned count>
uint64x4	shift_r (uint64x4 a)
	Shifts unsigned 64-bit values right by count bits while shifting in zeros. More...


template<unsigned count>
basic_int8x16	shift_l (basic_int8x16 a)
	Shifts 8-bit values left by count bits while shifting in zeros. More...

template<unsigned count>
basic_int8x32	shift_l (basic_int8x32 a)
	Shifts 8-bit values left by count bits while shifting in zeros. More...


template<unsigned count>
basic_int16x8	shift_l (basic_int16x8 a)
	Shifts 16-bit values left by count bits while shifting in zeros. More...

template<unsigned count>
basic_int16x16	shift_l (basic_int16x16 a)
	Shifts 16-bit values left by count bits while shifting in zeros. More...


template<unsigned count>
basic_int32x4	shift_l (basic_int32x4 a)
	Shifts 32-bit values left by count bits while shifting in zeros. More...

template<unsigned count>
basic_int32x8	shift_l (basic_int32x8 a)
	Shifts 32-bit values left by count bits while shifting in zeros. More...


template<unsigned count>
basic_int64x2	shift_l (basic_int64x2 a)
	Shifts 64-bit values left by count bits while shifting in zeros. More...

template<unsigned count>
basic_int64x4	shift_l (basic_int64x4 a)
	Shifts 64-bit values left by count bits while shifting in zeros. More...


basic_int8x16	load_u (basic_int8x16 &a, const void *p)
	Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...

basic_int16x8	load_u (basic_int16x8 &a, const void *p)
	Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...

basic_int32x4	load_u (basic_int32x4 &a, const void *p)
	Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...

basic_int64x2	load_u (basic_int64x2 &a, const void *p)
	Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...

float32x4	load_u (float32x4 &a, const float *p)
	Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...

float64x2	load_u (float64x2 &a, const double *p)
	Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...

basic_int8x32	load_u (basic_int8x32 &a, const void *p)
	Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...

basic_int16x16	load_u (basic_int16x16 &a, const void *p)
	Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...

basic_int32x8	load_u (basic_int32x8 &a, const void *p)
	Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...

basic_int64x4	load_u (basic_int64x4 &a, const void *p)
	Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...

float32x8	load_u (float32x8 &a, const float *p)
	Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...

float64x4	load_u (float64x4 &a, const double *p)
	Loads a 128-bit or 256-bit integer, 32-bit or 64-bit float vector from an unaligned memory location. More...


void	load_packed2 (basic_int8x16 &a, basic_int8x16 &b, const void *p)
	Loads 8-bit values packed in pairs, de-interleaves them and stores the result into two vectors. More...

void	load_packed2 (basic_int8x32 &a, basic_int8x32 &b, const void *p)
	Loads 8-bit values packed in pairs, de-interleaves them and stores the result into two vectors. More...


void	load_packed2 (basic_int16x8 &a, basic_int16x8 &b, const void *p)
	Loads 16-bit values packed in pairs, de-interleaves them and stores the result into two vectors. More...

void	load_packed2 (basic_int16x16 &a, basic_int16x16 &b, const void *p)
	Loads 16-bit values packed in pairs, de-interleaves them and stores the result into two vectors. More...


void	load_packed2 (basic_int32x4 &a, basic_int32x4 &b, const void *p)
	Loads 32-bit values packed in pairs, de-interleaves them and stores the result into two vectors. More...

void	load_packed2 (basic_int32x8 &a, basic_int32x8 &b, const void *p)
	Loads 32-bit values packed in pairs, de-interleaves them and stores the result into two vectors. More...


void	load_packed2 (basic_int64x2 &a, basic_int64x2 &b, const void *p)
	Loads 64-bit values packed in pairs, de-interleaves them and stores the result into two vectors. More...

void	load_packed2 (basic_int64x4 &a, basic_int64x4 &b, const void *p)
	Loads 64-bit values packed in pairs, de-interleaves them and stores the result into two vectors. More...


void	load_packed2 (float64x2 &a, float64x2 &b, const double *p)
	Loads 64-bit floating-point values packed in pairs, de-interleaves them and stores the result into two vectors. More...

void	load_packed2 (float64x4 &a, float64x4 &b, const double *p)
	Loads 64-bit floating-point values packed in pairs, de-interleaves them and stores the result into two vectors. More...


void	load_packed3 (basic_int8x16 &a, basic_int8x16 &b, basic_int8x16 &c, const void *p)
	Loads 8-bit values packed in triplets, de-interleaves them and stores the result into three vectors. More...

void	load_packed3 (basic_int8x32 &a, basic_int8x32 &b, basic_int8x32 &c, const void *p)
	Loads 8-bit values packed in triplets, de-interleaves them and stores the result into three vectors. More...


void	load_packed3 (basic_int16x8 &a, basic_int16x8 &b, basic_int16x8 &c, const void *p)
	Loads 16-bit values packed in triplets, de-interleaves them and stores the result into three vectors. More...

void	load_packed3 (basic_int16x16 &a, basic_int16x16 &b, basic_int16x16 &c, const void *p)
	Loads 16-bit values packed in triplets, de-interleaves them and stores the result into three vectors. More...


void	load_packed3 (basic_int32x4 &a, basic_int32x4 &b, basic_int32x4 &c, const void *p)
	Loads 32-bit values packed in triplets, de-interleaves them and stores the result into three vectors. More...

void	load_packed3 (basic_int32x8 &a, basic_int32x8 &b, basic_int32x8 &c, const void *p)
	Loads 32-bit values packed in triplets, de-interleaves them and stores the result into three vectors. More...


void	load_packed3 (basic_int64x2 &a, basic_int64x2 &b, basic_int64x2 &c, const void *p)
	Loads 64-bit values packed in triplets, de-interleaves them and stores the result into three vectors. More...

void	load_packed3 (basic_int64x4 &a, basic_int64x4 &b, basic_int64x4 &c, const void *p)
	Loads 64-bit values packed in triplets, de-interleaves them and stores the result into three vectors. More...


void	load_packed3 (float32x4 &a, float32x4 &b, float32x4 &c, const float *p)
	Loads 32-bit floating-point values packed in triplets, de-interleaves them and stores the result into three vectors. More...

void	load_packed3 (float32x8 &a, float32x8 &b, float32x8 &c, const float *p)
	Loads 32-bit floating-point values packed in triplets, de-interleaves them and stores the result into three vectors. More...


void	load_packed3 (float64x2 &a, float64x2 &b, float64x2 &c, const double *p)
	Loads 64-bit floating-point values packed in triplets, de-interleaves them and stores the result into three vectors. More...

void	load_packed3 (float64x4 &a, float64x4 &b, float64x4 &c, const double *p)
	Loads 64-bit floating-point values packed in triplets, de-interleaves them and stores the result into three vectors. More...


void	load_packed4 (basic_int8x16 &a, basic_int8x16 &b, basic_int8x16 &c, basic_int8x16 &d, const void *p)
	Loads 8-bit values packed in quartets, de-interleaves them and stores the result into four vectors. More...

void	load_packed4 (basic_int8x32 &a, basic_int8x32 &b, basic_int8x32 &c, basic_int8x32 &d, const void *p)
	Loads 8-bit values packed in quartets, de-interleaves them and stores the result into four vectors. More...


void	load_packed4 (basic_int16x8 &a, basic_int16x8 &b, basic_int16x8 &c, basic_int16x8 &d, const void *p)
	Loads 16-bit values packed in quartets, de-interleaves them and stores the result into four vectors. More...

void	load_packed4 (basic_int16x16 &a, basic_int16x16 &b, basic_int16x16 &c, basic_int16x16 &d, const void *p)
	Loads 16-bit values packed in quartets, de-interleaves them and stores the result into four vectors. More...


void	load_packed4 (basic_int32x4 &a, basic_int32x4 &b, basic_int32x4 &c, basic_int32x4 &d, const void *p)
	Loads 32-bit values packed in quartets, de-interleaves them and stores the result into four vectors. More...

void	load_packed4 (basic_int32x8 &a, basic_int32x8 &b, basic_int32x8 &c, basic_int32x8 &d, const void *p)
	Loads 32-bit values packed in quartets, de-interleaves them and stores the result into four vectors. More...


void	load_packed4 (basic_int64x2 &a, basic_int64x2 &b, basic_int64x2 &c, basic_int64x2 &d, const void *p)
	Loads 64-bit values packed in quartets, de-interleaves them and stores the result into four vectors. More...

void	load_packed4 (basic_int64x4 &a, basic_int64x4 &b, basic_int64x4 &c, basic_int64x4 &d, const void *p)
	Loads 64-bit values packed in quartets, de-interleaves them and stores the result into four vectors. More...


void	load_packed4 (float32x4 &a, float32x4 &b, float32x4 &c, float32x4 &d, const float *p)
	Loads 32-bit floating-point values packed in quartets, de-interleaves them and stores the result into four vectors. More...

void	load_packed4 (float32x8 &a, float32x8 &b, float32x8 &c, float32x8 &d, const float *p)
	Loads 32-bit floating-point values packed in quartets, de-interleaves them and stores the result into four vectors. More...


void	load_packed4 (float64x2 &a, float64x2 &b, float64x2 &c, float64x2 &d, const double *p)
	Loads 64-bit floating-point values packed in quartets, de-interleaves them and stores the result into four vectors. More...

void	load_packed4 (float64x4 &a, float64x4 &b, float64x4 &c, float64x4 &d, const double *p)
	Loads 64-bit floating-point values packed in quartets, de-interleaves them and stores the result into four vectors. More...


void	stream (void *p, int128 a)
	Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory without polluting the caches, if possible. More...

void	stream (void *p, int256 a)
	Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory without polluting the caches, if possible. More...

void	stream (float *p, float32x4 a)
	Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory without polluting the caches, if possible. More...

void	stream (float *p, float32x8 a)
	Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory without polluting the caches, if possible. More...

void	stream (double *p, float64x2 a)
	Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory without polluting the caches, if possible. More...

void	stream (double *p, float64x4 a)
	Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory without polluting the caches, if possible. More...


void	store_first (void *p, basic_int8x16 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_first (void *p, basic_int8x32 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_first (void *p, basic_int16x8 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_first (void *p, basic_int16x16 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_first (void *p, basic_int32x4 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_first (void *p, basic_int32x8 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_first (void *p, basic_int64x2 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_first (void *p, basic_int64x4 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_first (float *p, float32x4 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_first (float *p, float32x8 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_first (double *p, float64x2 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_first (double *p, float64x4 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...


void	store_last (void *p, basic_int8x16 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_last (void *p, basic_int8x32 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_last (void *p, basic_int16x8 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_last (void *p, basic_int16x16 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_last (void *p, basic_int32x4 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_last (void *p, basic_int32x8 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_last (void *p, basic_int64x2 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_last (void *p, basic_int64x4 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_last (float *p, float32x4 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_last (float *p, float32x8 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_last (double *p, float64x2 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...

void	store_last (double *p, float64x4 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating-point vector to memory. More...


void	store_packed2 (void *p, basic_int8x16 a, basic_int8x16 b)
	Interleaves 8-bit values from two vectors and stores the result into successive locations starting from p. More...

void	store_packed2 (void *p, basic_int8x32 a, basic_int8x32 b)
	Interleaves 8-bit values from two vectors and stores the result into successive locations starting from p. More...


void	store_packed2 (void *p, basic_int16x8 a, basic_int16x8 b)
	Interleaves 16-bit values from two vectors and stores the result into successive locations starting from p. More...

void	store_packed2 (void *p, basic_int16x16 a, basic_int16x16 b)
	Interleaves 16-bit values from two vectors and stores the result into successive locations starting from p. More...


void	store_packed2 (void *p, basic_int32x4 a, basic_int32x4 b)
	Interleaves 32-bit values from two vectors and stores the result into successive locations starting from p. More...

void	store_packed2 (void *p, basic_int32x8 a, basic_int32x8 b)
	Interleaves 32-bit values from two vectors and stores the result into successive locations starting from p. More...


void	store_packed2 (void *p, basic_int64x2 a, basic_int64x2 b)
	Interleaves 64-bit values from two vectors and stores the result into successive locations starting from p. More...

void	store_packed2 (void *p, basic_int64x4 a, basic_int64x4 b)
	Interleaves 64-bit values from two vectors and stores the result into successive locations starting from p. More...


void	store_packed2 (float *p, float32x4 a, float32x4 b)
	Interleaves 32-bit floating-point values from two vectors and stores the result into successive locations starting from p. More...

void	store_packed2 (float *p, float32x8 a, float32x8 b)
	Interleaves 32-bit floating-point values from two vectors and stores the result into successive locations starting from p. More...


void	store_packed2 (double *p, float64x2 a, float64x2 b)
	Interleaves 64-bit floating-point values from two vectors and stores the result into successive locations starting from p. More...

void	store_packed2 (double *p, float64x4 a, float64x4 b)
	Interleaves 64-bit floating-point values from two vectors and stores the result into successive locations starting from p. More...


void	store_packed3 (void *p, basic_int8x16 a, basic_int8x16 b, basic_int8x16 c)
	Interleaves 8-bit values from three vectors and stores the result into successive locations starting from p. More...

void	store_packed3 (void *p, basic_int8x32 a, basic_int8x32 b, basic_int8x32 c)
	Interleaves 8-bit values from three vectors and stores the result into successive locations starting from p. More...


void	store_packed3 (void *p, basic_int16x8 a, basic_int16x8 b, basic_int16x8 c)
	Interleaves 16-bit values from three vectors and stores the result into successive locations starting from p. More...

void	store_packed3 (void *p, basic_int16x16 a, basic_int16x16 b, basic_int16x16 c)
	Interleaves 16-bit values from three vectors and stores the result into successive locations starting from p. More...


void	store_packed3 (void *p, basic_int32x4 a, basic_int32x4 b, basic_int32x4 c)
	Interleaves 32-bit values from three vectors and stores the result into successive locations starting from p. More...

void	store_packed3 (void *p, basic_int32x8 a, basic_int32x8 b, basic_int32x8 c)
	Interleaves 32-bit values from three vectors and stores the result into successive locations starting from p. More...


void	store_packed3 (void *p, basic_int64x2 a, basic_int64x2 b, basic_int64x2 c)
	Interleaves 64-bit values from three vectors and stores the result into successive locations starting from p. More...

void	store_packed3 (void *p, basic_int64x4 a, basic_int64x4 b, basic_int64x4 c)
	Interleaves 64-bit values from three vectors and stores the result into successive locations starting from p. More...


void	store_packed3 (float *p, float32x4 a, float32x4 b, float32x4 c)
	Interleaves 32-bit floating-point values from three vectors and stores the result into successive locations starting from p. More...

void	store_packed3 (float *p, float32x8 a, float32x8 b, float32x8 c)
	Interleaves 32-bit floating-point values from three vectors and stores the result into successive locations starting from p. More...


void	store_packed3 (double *p, float64x2 a, float64x2 b, float64x2 c)
	Interleaves 64-bit floating-point values from three vectors and stores the result into successive locations starting from p. More...

void	store_packed3 (double *p, float64x4 a, float64x4 b, float64x4 c)
	Interleaves 64-bit floating-point values from three vectors and stores the result into successive locations starting from p. More...


void	store_packed4 (void *p, basic_int8x16 a, basic_int8x16 b, basic_int8x16 c, basic_int8x16 d)
	Interleaves 8-bit values from four vectors and stores the result into successive locations starting from p. More...

void	store_packed4 (void *p, basic_int8x32 a, basic_int8x32 b, basic_int8x32 c, basic_int8x32 d)
	Interleaves 8-bit values from four vectors and stores the result into successive locations starting from p. More...


void	store_packed4 (void *p, basic_int16x8 a, basic_int16x8 b, basic_int16x8 c, basic_int16x8 d)
	Interleaves 16-bit values from four vectors and stores the result into successive locations starting from p. More...

void	store_packed4 (void *p, basic_int16x16 a, basic_int16x16 b, basic_int16x16 c, basic_int16x16 d)
	Interleaves 16-bit values from four vectors and stores the result into successive locations starting from p. More...


void	store_packed4 (void *p, basic_int32x4 a, basic_int32x4 b, basic_int32x4 c, basic_int32x4 d)
	Interleaves 32-bit values from four vectors and stores the result into successive locations starting from p. More...

void	store_packed4 (void *p, basic_int32x8 a, basic_int32x8 b, basic_int32x8 c, basic_int32x8 d)
	Interleaves 32-bit values from four vectors and stores the result into successive locations starting from p. More...


void	store_packed4 (void *p, basic_int64x2 a, basic_int64x2 b, basic_int64x2 c, basic_int64x2 d)
	Interleaves 64-bit values from four vectors and stores the result into successive locations starting from p. More...

void	store_packed4 (void *p, basic_int64x4 a, basic_int64x4 b, basic_int64x4 c, basic_int64x4 d)
	Interleaves 64-bit values from four vectors and stores the result into successive locations starting from p. More...


void	store_packed4 (float *p, float32x4 a, float32x4 b, float32x4 c, float32x4 d)
	Interleaves 32-bit floating-point values from four vectors and stores the result into successive locations starting from p. More...

void	store_packed4 (float *p, float32x8 a, float32x8 b, float32x8 c, float32x8 d)
	Interleaves 32-bit floating-point values from four vectors and stores the result into successive locations starting from p. More...


void	store_packed4 (double *p, float64x2 a, float64x2 b, float64x2 c, float64x2 d)
	Interleaves 64-bit floating-point values from four vectors and stores the result into successive locations starting from p. More...

void	store_packed4 (double *p, float64x4 a, float64x4 b, float64x4 c, float64x4 d)
	Interleaves 64-bit floating-point values from four vectors and stores the result into successive locations starting from p. More...


float32x4	zip_lo (float32x4 a, float32x4 b)
	Interleaves the lower halves of two vectors. More...

float32x8	zip_lo (float32x8 a, float32x8 b)
	Interleaves the lower halves of two vectors. More...

float64x2	zip_lo (float64x2 a, float64x2 b)
	Interleaves the lower halves of two vectors. More...

float64x4	zip_lo (float64x4 a, float64x4 b)
	Interleaves the lower halves of two vectors. More...


basic_int8x16	zip_hi (basic_int8x16 a, basic_int8x16 b)
	Interleaves the higher halves of two vectors. More...

basic_int8x32	zip_hi (basic_int8x32 a, basic_int8x32 b)
	Interleaves the higher halves of two vectors. More...

basic_int16x8	zip_hi (basic_int16x8 a, basic_int16x8 b)
	Interleaves the higher halves of two vectors. More...

basic_int16x16	zip_hi (basic_int16x16 a, basic_int16x16 b)
	Interleaves the higher halves of two vectors. More...

basic_int32x4	zip_hi (basic_int32x4 a, basic_int32x4 b)
	Interleaves the higher halves of two vectors. More...

basic_int32x8	zip_hi (basic_int32x8 a, basic_int32x8 b)
	Interleaves the higher halves of two vectors. More...

basic_int64x2	zip_hi (basic_int64x2 a, basic_int64x2 b)
	Interleaves the higher halves of two vectors. More...

basic_int64x4	zip_hi (basic_int64x4 a, basic_int64x4 b)
	Interleaves the higher halves of two vectors. More...


float32x4	zip_hi (float32x4 a, float32x4 b)
	Interleaves the higher halves of two vectors. More...

float32x8	zip_hi (float32x8 a, float32x8 b)
	Interleaves the higher halves of two vectors. More...

float64x2	zip_hi (float64x2 a, float64x2 b)
	Interleaves the higher halves of two vectors. More...

float64x4	zip_hi (float64x4 a, float64x4 b)
	Interleaves the higher halves of two vectors. More...


template<unsigned shift>
basic_int8x16	move_l (basic_int8x16 a)
	Moves the 8-bit elements in a vector to the left by shift positions. More...

template<unsigned shift>
basic_int8x32	move_l (basic_int8x32 a)
	Moves the 8-bit elements in a vector to the left by shift positions. More...


template<unsigned shift>
basic_int16x8	move_l (basic_int16x8 a)
	Moves the 16-bit elements in a vector to the left by shift positions. More...

template<unsigned shift>
basic_int16x16	move_l (basic_int16x16 a)
	Moves the 16-bit elements in a vector to the left by shift positions. More...


template<unsigned shift>
basic_int32x4	move_l (basic_int32x4 a)
	Moves the 32-bit elements in a vector to the left by shift positions. More...

template<unsigned shift>
basic_int32x8	move_l (basic_int32x8 a)
	Moves the 32-bit elements in a vector to the left by shift positions. More...


template<unsigned shift>
basic_int64x2	move_l (basic_int64x2 a)
	Moves the 64-bit elements in a vector to the left by shift positions. More...

template<unsigned shift>
basic_int64x4	move_l (basic_int64x4 a)
	Moves the 64-bit elements in a vector to the left by shift positions. More...


template<unsigned shift>
float32x4	move_l (float32x4 a)
	Moves the 32-bit elements in a vector to the left by shift positions. More...

template<unsigned shift>
float32x8	move_l (float32x8 a)
	Moves the 32-bit elements in a vector to the left by shift positions. More...


template<unsigned shift>
float64x2	move_l (float64x2 a)
	Moves the 64-bit elements in a vector to the left by shift positions. More...

template<unsigned shift>
float64x4	move_l (float64x4 a)
	Moves the 64-bit elements in a vector to the left by shift positions. More...


template<unsigned shift>
basic_int8x16	move_r (basic_int8x16 a)
	Moves the 8-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
basic_int8x32	move_r (basic_int8x32 a)
	Moves the 8-bit elements in a vector to the right by shift positions. More...


template<unsigned shift>
basic_int16x8	move_r (basic_int16x8 a)
	Moves the 16-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
basic_int16x16	move_r (basic_int16x16 a)
	Moves the 16-bit elements in a vector to the right by shift positions. More...


template<unsigned shift>
basic_int32x4	move_r (basic_int32x4 a)
	Moves the 32-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
basic_int32x8	move_r (basic_int32x8 a)
	Moves the 32-bit elements in a vector to the right by shift positions. More...


template<unsigned shift>
basic_int64x2	move_r (basic_int64x2 a)
	Moves the 64-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
basic_int64x4	move_r (basic_int64x4 a)
	Moves the 64-bit elements in a vector to the right by shift positions. More...


template<unsigned shift>
float32x4	move_r (float32x4 a)
	Moves the 32-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
float32x8	move_r (float32x8 a)
	Moves the 32-bit elements in a vector to the right by shift positions. More...


template<unsigned shift>
float64x2	move_r (float64x2 a)
	Moves the 64-bit elements in a vector to the right by shift positions. More...

template<unsigned shift>
float64x4	move_r (float64x4 a)
	Moves the 64-bit elements in a vector to the right by shift positions. More...


template<unsigned s>
basic_int8x16	broadcast (basic_int8x16 a)
	Broadcasts the specified 8-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
basic_int8x32	broadcast (basic_int8x32 a)
	Broadcasts the specified 8-bit value to all elements within 128-bit lanes. More...


template<unsigned s>
basic_int16x8	broadcast (basic_int16x8 a)
	Broadcasts the specified 16-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
basic_int16x16	broadcast (basic_int16x16 a)
	Broadcasts the specified 16-bit value to all elements within 128-bit lanes. More...


template<unsigned s>
basic_int32x4	broadcast (basic_int32x4 a)
	Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
basic_int32x8	broadcast (basic_int32x8 a)
	Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...


template<unsigned s>
basic_int64x2	broadcast (basic_int64x2 a)
	Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
basic_int64x4	broadcast (basic_int64x4 a)
	Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...


template<unsigned s>
float32x4	broadcast (float32x4 a)
	Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
float32x8	broadcast (float32x8 a)
	Broadcasts the specified 32-bit value to all elements within 128-bit lanes. More...


template<unsigned s>
float64x2	broadcast (float64x2 a)
	Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...

template<unsigned s>
float64x4	broadcast (float64x4 a)
	Broadcasts the specified 64-bit value to all elements within 128-bit lanes. More...


template<unsigned s>
basic_int8x16	broadcast_w (basic_int8x16 a)
	Broadcasts the specified 8-bit value to all elements within a 128-bit lane. More...

template<unsigned s>
basic_int8x32	broadcast_w (basic_int8x32 a)
	Broadcasts the specified 8-bit value to all elements within a 128-bit lane. More...


template<unsigned s>
basic_int16x8	broadcast_w (basic_int16x8 a)
	Broadcasts the specified 16-bit value to all elements within an int16x8 vector. More...

template<unsigned s>
basic_int16x16	broadcast_w (basic_int16x16 a)
	Broadcasts the specified 16-bit value to all elements within an int16x8 vector. More...


template<unsigned s>
basic_int32x4	broadcast_w (basic_int32x4 a)
	Broadcasts the specified 32-bit value to all elements within an int32x4 vector. More...

template<unsigned s>
basic_int32x8	broadcast_w (basic_int32x8 a)
	Broadcasts the specified 32-bit value to all elements within an int32x4 vector. More...


template<unsigned s>
basic_int64x2	broadcast_w (basic_int64x2 a)
	Broadcasts the specified 64-bit value to all elements within an int64x2 vector. More...

template<unsigned s>
basic_int64x4	broadcast_w (basic_int64x4 a)
	Broadcasts the specified 64-bit value to all elements within an int64x2 vector. More...


template<unsigned s>
float32x4	broadcast_w (float32x4 a)
	Broadcasts the specified 32-bit value to all elements within a float32x4 vector. More...

template<unsigned s>
float32x8	broadcast_w (float32x8 a)
	Broadcasts the specified 32-bit value to all elements within a float32x4 vector. More...


template<unsigned s>
float64x2	broadcast_w (float64x2 a)
	Broadcasts the specified 64-bit value to all elements within a float64x2 vector. More...

template<unsigned s>
float64x4	broadcast_w (float64x4 a)
	Broadcasts the specified 64-bit value to all elements within a float64x2 vector. More...


template<unsigned shift>
basic_int8x16	align (basic_int8x16 lower, basic_int8x16 upper)
	Extracts an int8x16 vector from two concatenated int8x16 vectors. More...

template<unsigned shift>
basic_int8x32	align (basic_int8x32 lower, basic_int8x32 upper)
	Extracts an int8x16 vector from two concatenated int8x16 vectors. More...


template<unsigned shift>
basic_int16x8	align (basic_int16x8 lower, basic_int16x8 upper)
	Extracts an int16x8 vector from two concatenated int16x8 vectors. More...

template<unsigned shift>
basic_int16x16	align (basic_int16x16 lower, basic_int16x16 upper)
	Extracts an int16x8 vector from two concatenated int16x8 vectors. More...


template<unsigned shift>
basic_int32x4	align (basic_int32x4 lower, basic_int32x4 upper)
	Extracts an int32x4 vector from two concatenated int32x4 vectors. More...

template<unsigned shift>
basic_int32x8	align (basic_int32x8 lower, basic_int32x8 upper)
	Extracts an int32x4 vector from two concatenated int32x4 vectors. More...


template<unsigned shift>
basic_int64x2	align (basic_int64x2 lower, basic_int64x2 upper)
	Extracts an int64x2 vector from two concatenated int64x2 vectors. More...

template<unsigned shift>
basic_int64x4	align (basic_int64x4 lower, basic_int64x4 upper)
	Extracts an int64x2 vector from two concatenated int64x2 vectors. More...


template<unsigned shift>
float32x4	align (float32x4 lower, float32x4 upper)
	Extracts a float32x4 vector from two concatenated float32x4 vectors. More...

template<unsigned shift>
float32x8	align (float32x8 lower, float32x8 upper)
	Extracts a float32x4 vector from two concatenated float32x4 vectors. More...


template<unsigned shift>
float64x2	align (float64x2 lower, float64x2 upper)
	Extracts a float64x2 vector from two concatenated float64x2 vectors. More...

template<unsigned shift>
float64x4	align (float64x4 lower, float64x4 upper)
	Extracts a float64x2 vector from two concatenated float64x2 vectors. More...


basic_int8x16	blend (basic_int8x16 on, basic_int8x16 off, basic_int8x16 mask)
	Composes a vector from two sources according to a mask. More...

basic_int8x16	blend (basic_int8x16 on, basic_int8x16 off, mask_int8x16 mask)
	Composes a vector from two sources according to a mask. More...

basic_int8x32	blend (basic_int8x32 on, basic_int8x32 off, basic_int8x32 mask)
	Composes a vector from two sources according to a mask. More...

basic_int8x32	blend (basic_int8x32 on, basic_int8x32 off, mask_int8x32 mask)
	Composes a vector from two sources according to a mask. More...


basic_int16x8	blend (basic_int16x8 on, basic_int16x8 off, basic_int16x8 mask)
	Composes vector from two sources according to a mask. More...

basic_int16x16	blend (basic_int16x16 on, basic_int16x16 off, basic_int16x16 mask)
	Composes vector from two sources according to a mask. More...

basic_int16x8	blend (basic_int16x8 on, basic_int16x8 off, mask_int16x8 mask)
	Composes vector from two sources according to a mask. More...

basic_int16x16	blend (basic_int16x16 on, basic_int16x16 off, mask_int16x16 mask)
	Composes vector from two sources according to a mask. More...


basic_int32x4	blend (basic_int32x4 on, basic_int32x4 off, basic_int32x4 mask)
	Composes a vector from two sources according to a mask. More...

basic_int32x8	blend (basic_int32x8 on, basic_int32x8 off, basic_int32x8 mask)
	Composes a vector from two sources according to a mask. More...

basic_int32x4	blend (basic_int32x4 on, basic_int32x4 off, mask_int32x4 mask)
	Composes a vector from two sources according to a mask. More...

basic_int32x8	blend (basic_int32x8 on, basic_int32x8 off, mask_int32x8 mask)
	Composes a vector from two sources according to a mask. More...


basic_int64x2	blend (basic_int64x2 on, basic_int64x2 off, basic_int64x2 mask)
	Composes a vector from two sources according to a mask. More...

basic_int64x4	blend (basic_int64x4 on, basic_int64x4 off, basic_int64x4 mask)
	Composes a vector from two sources according to a mask. More...

basic_int64x2	blend (basic_int64x2 on, basic_int64x2 off, mask_int64x2 mask)
	Composes a vector from two sources according to a mask. More...

basic_int64x4	blend (basic_int64x4 on, basic_int64x4 off, mask_int64x4 mask)
	Composes a vector from two sources according to a mask. More...


float32x4	blend (float32x4 on, float32x4 off, float32x4 mask)
	Composes a vector from two sources according to a mask. More...

float32x4	blend (float32x4 on, float32x4 off, int128 mask)
	Composes a vector from two sources according to a mask. More...

float32x8	blend (float32x8 on, float32x8 off, float32x8 mask)
	Composes a vector from two sources according to a mask. More...

float32x8	blend (float32x8 on, float32x8 off, int256 mask)
	Composes a vector from two sources according to a mask. More...

float32x4	blend (float32x4 on, float32x4 off, mask_float32x4 mask)
	Composes a vector from two sources according to a mask. More...

float32x8	blend (float32x8 on, float32x8 off, mask_float32x8 mask)
	Composes a vector from two sources according to a mask. More...


float64x2	blend (float64x2 on, float64x2 off, float64x2 mask)
	Composes a vector from two sources according to a mask. More...

float64x2	blend (float64x2 on, float64x2 off, int128 mask)
	Composes a vector from two sources according to a mask. More...

float64x4	blend (float64x4 on, float64x4 off, float64x4 mask)
	Composes a vector from two sources according to a mask. More...

float64x4	blend (float64x4 on, float64x4 off, int256 mask)
	Composes a vector from two sources according to a mask. More...

float64x2	blend (float64x2 on, float64x2 off, mask_float64x2 mask)
	Composes a vector from two sources according to a mask. More...

float64x4	blend (float64x4 on, float64x4 off, mask_float64x4 mask)
	Composes a vector from two sources according to a mask. More...


basic_int8x16	unzip_lo (basic_int8x16 a, basic_int8x16 b)
	De-interleaves the odd(lower) elements of two int8x16 vectors. More...

basic_int8x32	unzip_lo (basic_int8x32 a, basic_int8x32 b)
	De-interleaves the odd(lower) elements of two int8x16 vectors. More...


basic_int16x8	unzip_lo (basic_int16x8 a, basic_int16x8 b)
	De-interleaves the odd(lower) elements of two int16x8 vectors. More...

basic_int16x16	unzip_lo (basic_int16x16 a, basic_int16x16 b)
	De-interleaves the odd(lower) elements of two int16x8 vectors. More...


basic_int32x4	unzip_lo (basic_int32x4 a, basic_int32x4 b)
	De-interleaves the odd(lower) elements of two int32x4 vectors. More...

basic_int32x8	unzip_lo (basic_int32x8 a, basic_int32x8 b)
	De-interleaves the odd(lower) elements of two int32x4 vectors. More...


basic_int64x2	unzip_lo (basic_int64x2 a, basic_int64x2 b)
	De-interleaves the odd(lower) elements of two int64x2 vectors. More...

basic_int64x4	unzip_lo (basic_int64x4 a, basic_int64x4 b)
	De-interleaves the odd(lower) elements of two int64x2 vectors. More...


float32x4	unzip_lo (float32x4 a, float32x4 b)
	De-interleaves the odd(lower) elements of two float32x4 vectors. More...

float32x8	unzip_lo (float32x8 a, float32x8 b)
	De-interleaves the odd(lower) elements of two float32x4 vectors. More...


float64x2	unzip_lo (float64x2 a, float64x2 b)
	De-interleaves the odd(lower) elements of two float64x2 vectors. More...

float64x4	unzip_lo (float64x4 a, float64x4 b)
	De-interleaves the odd(lower) elements of two float64x2 vectors. More...


basic_int8x16	unzip_hi (basic_int8x16 a, basic_int8x16 b)
	De-interleaves the even(higher) elements of two int8x16 vectors. More...

basic_int8x32	unzip_hi (basic_int8x32 a, basic_int8x32 b)
	De-interleaves the even(higher) elements of two int8x16 vectors. More...


basic_int16x8	unzip_hi (basic_int16x8 a, basic_int16x8 b)
	De-interleaves the even(higher) elements of two int16x8 vectors. More...

basic_int16x16	unzip_hi (basic_int16x16 a, basic_int16x16 b)
	De-interleaves the even(higher) elements of two int16x8 vectors. More...


basic_int32x4	unzip_hi (basic_int32x4 a, basic_int32x4 b)
	De-interleaves the even(higher) elements of two int32x4 vectors. More...

basic_int32x8	unzip_hi (basic_int32x8 a, basic_int32x8 b)
	De-interleaves the even(higher) elements of two int32x4 vectors. More...


basic_int64x2	unzip_hi (basic_int64x2 a, basic_int64x2 b)
	De-interleaves the even(higher) elements of two int64x2 vectors. More...

basic_int64x4	unzip_hi (basic_int64x4 a, basic_int64x4 b)
	De-interleaves the even(higher) elements of two int64x2 vectors. More...


float32x4	unzip_hi (float32x4 a, float32x4 b)
	De-interleaves the even(higher) elements of two float32x4 vectors. More...

float32x8	unzip_hi (float32x8 a, float32x8 b)
	De-interleaves the even(higher) elements of two float32x4 vectors. More...


float64x2	unzip_hi (float64x2 a, float64x2 b)
	De-interleaves the even(higher) elements of two float64x2 vectors. More...

float64x4	unzip_hi (float64x4 a, float64x4 b)
	De-interleaves the even(higher) elements of two float64x2 vectors. More...


int128	permute_bytes16 (int128 a, int128 mask)
	Selects bytes from a vector according to a mask. More...

float32x4	permute_bytes16 (float32x4 a, int128 mask)
	Selects bytes from a vector according to a mask. More...

float64x2	permute_bytes16 (float64x2 a, int128 mask)
	Selects bytes from a vector according to a mask. More...

int256	permute_bytes16 (int256 a, int256 mask)
	Selects bytes from a vector according to a mask. More...

float32x8	permute_bytes16 (float32x8 a, int256 mask)
	Selects bytes from a vector according to a mask. More...

float64x4	permute_bytes16 (float64x4 a, int256 mask)
	Selects bytes from a vector according to a mask. More...


int128	shuffle_bytes16 (int128 a, int128 b, int128 mask)
	Selects bytes from two vectors according to a mask. More...

float32x4	shuffle_bytes16 (float32x4 a, float32x4 b, int128 mask)
	Selects bytes from two vectors according to a mask. More...

float64x2	shuffle_bytes16 (float64x2 a, float64x2 b, int128 mask)
	Selects bytes from two vectors according to a mask. More...

int256	shuffle_bytes16 (int256 a, int256 b, int256 mask)
	Selects bytes from two vectors according to a mask. More...

float32x8	shuffle_bytes16 (float32x8 a, float32x8 b, int256 mask)
	Selects bytes from two vectors according to a mask. More...

float64x4	shuffle_bytes16 (float64x4 a, float64x4 b, int256 mask)
	Selects bytes from two vectors according to a mask. More...


int128	permute_zbytes16 (int128 a, int128 mask)
	Selects bytes from a vector according to a mask, optionally selecting zero. More...

float32x4	permute_zbytes16 (float32x4 a, int128 mask)
	Selects bytes from a vector according to a mask, optionally selecting zero. More...

float64x2	permute_zbytes16 (float64x2 a, int128 mask)
	Selects bytes from a vector according to a mask, optionally selecting zero. More...

int256	permute_zbytes16 (int256 a, int256 mask)
	Selects bytes from a vector according to a mask, optionally selecting zero. More...

float32x8	permute_zbytes16 (float32x8 a, int256 mask)
	Selects bytes from a vector according to a mask, optionally selecting zero. More...

float64x4	permute_zbytes16 (float64x4 a, int256 mask)
	Selects bytes from a vector according to a mask, optionally selecting zero. More...


int128	shuffle_zbytes16 (int128 a, int128 b, int128 mask)
	Selects bytes from two vectors according to a mask, optionally selecting zero. More...

float32x4	shuffle_zbytes16 (float32x4 a, float32x4 b, int128 mask)
	Selects bytes from two vectors according to a mask, optionally selecting zero. More...

float64x2	shuffle_zbytes16 (float64x2 a, float64x2 b, int128 mask)
	Selects bytes from two vectors according to a mask, optionally selecting zero. More...

int256	shuffle_zbytes16 (int256 a, int256 b, int256 mask)
	Selects bytes from two vectors according to a mask, optionally selecting zero. More...

float32x8	shuffle_zbytes16 (float32x8 a, float32x8 b, int256 mask)
	Selects bytes from two vectors according to a mask, optionally selecting zero. More...

float64x4	shuffle_zbytes16 (float64x4 a, float64x4 b, int256 mask)
	Selects bytes from two vectors according to a mask, optionally selecting zero. More...


template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
int128	permute (basic_int16x8 a)
	Permutes the 16-bit values within each 4 consecutive values of the vector. More...

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
basic_int16x16	permute (basic_int16x16 a)
	Permutes the 16-bit values within each 4 consecutive values of the vector. More...


template<unsigned s0, unsigned s1>
basic_int16x8	permute (basic_int16x8 a)
	Permutes the 16-bit values within sets of two consecutive elements of the vector. More...

template<unsigned s0, unsigned s1>
basic_int16x16	permute (basic_int16x16 a)
	Permutes the 16-bit values within sets of two consecutive elements of the vector. More...


template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
basic_int32x4	permute (basic_int32x4 a)
	Permutes the values of each set of four consecutive 32-bit values. More...

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
basic_int32x8	permute (basic_int32x8 a)
	Permutes the values of each set of four consecutive 32-bit values. More...


template<unsigned s0, unsigned s1>
basic_int32x4	permute (basic_int32x4 a)
	Permutes the values of each set of two consecutive 32-bit values. More...

template<unsigned s0, unsigned s1>
basic_int32x8	permute (basic_int32x8 a)
	Permutes the values of each set of two consecutive 32-bit values. More...


template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
float32x4	permute (float32x4 a)
	Permutes the values of each set of four consecutive 32-bit floating point values. More...

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
float32x8	permute (float32x8 a)
	Permutes the values of each set of four consecutive 32-bit floating point values. More...


template<unsigned s0, unsigned s1>
float32x4	permute (float32x4 a)
	Permutes the values of each set of two consecutive 32-bit floating-point values. More...

template<unsigned s0, unsigned s1>
float32x8	permute (float32x8 a)
	Permutes the values of each set of two consecutive 32-bit floating-point values. More...


template<unsigned s0, unsigned s1>
basic_int64x2	permute (basic_int64x2 a)
	Permutes the values of each set of two consecutive 64-bit values. More...

template<unsigned s0, unsigned s1>
basic_int64x4	permute (basic_int64x4 a)
	Permutes the values of each set of two consecutive 64-bit values. More...


template<unsigned s0, unsigned s1>
float64x2	permute (float64x2 a)
	Permutes the values of each set of two consecutive 64-bit floating-point values. More...

template<unsigned s0, unsigned s1>
float64x4	permute (float64x4 a)
	Permutes the values of each set of two consecutive 64-bit floating-point values. More...


template<unsigned s0, unsigned s1>
float64x2	shuffle1 (float64x2 a, float64x2 b)
	Selects 64-bit floating-point values from two vectors. More...

template<unsigned s0, unsigned s1>
float64x4	shuffle1 (float64x4 a, float64x4 b)
	Selects 64-bit floating-point values from two vectors. More...


template<unsigned s0, unsigned s1>
basic_int64x2	shuffle1 (basic_int64x2 a, basic_int64x2 b)
	Selects 64-bit values from two vectors. More...

template<unsigned s0, unsigned s1>
basic_int64x4	shuffle1 (basic_int64x4 a, basic_int64x4 b)
	Selects 64-bit values from two vectors. More...


template<unsigned a0, unsigned a1, unsigned b0, unsigned b1>
float32x4	shuffle2 (float32x4 a, float32x4 b)
	Selects 32-bit floating-point values from two vectors. More...

template<unsigned a0, unsigned a1, unsigned b0, unsigned b1>
float32x8	shuffle2 (float32x8 a, float32x8 b)
	Selects 32-bit floating-point values from two vectors. More...


template<unsigned s0, unsigned s1>
float32x4	shuffle2 (float32x4 a, float32x4 b)
	Selects 32-bit floating-point values from two vectors. More...

template<unsigned s0, unsigned s1>
float32x8	shuffle2 (float32x8 a, float32x8 b)
	Selects 32-bit floating-point values from two vectors. More...


template<unsigned a0, unsigned a1, unsigned b0, unsigned b1>
basic_int32x4	shuffle2 (basic_int32x4 a, basic_int32x4 b)
	Selects 32-bit values from two vectors. More...

template<unsigned a0, unsigned a1, unsigned b0, unsigned b1>
basic_int32x8	shuffle2 (basic_int32x8 a, basic_int32x8 b)
	Selects 32-bit values from two vectors. More...


template<unsigned s0, unsigned s1>
basic_int32x4	shuffle2 (basic_int32x4 a, basic_int32x4 b)
	Selects 32-bit values from two vectors. More...

template<unsigned s0, unsigned s1>
basic_int32x8	shuffle2 (basic_int32x8 a, basic_int32x8 b)
	Selects 32-bit values from two vectors. More...


void	transpose2 (basic_int32x4 &a0, basic_int32x4 &a1)
	Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...

void	transpose2 (basic_int32x8 &a0, basic_int32x8 &a1)
	Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...


void	transpose2 (basic_int64x2 &a0, basic_int64x2 &a1)
	Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...

void	transpose2 (basic_int64x4 &a0, basic_int64x4 &a1)
	Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...


void	transpose2 (float32x4 &a0, float32x4 &a1)
	Transposes two 2x2 32-bit matrices within two float32x4 vectors. More...

void	transpose2 (float32x8 &a0, float32x8 &a1)
	Transposes two 2x2 32-bit matrices within two float32x4 vectors. More...


void	transpose2 (float64x2 &a0, float64x2 &a1)
	Transposes a 2x2 64-bit matrix within two float64x2 vectors. More...

void	transpose2 (float64x4 &a0, float64x4 &a1)
	Transposes a 2x2 64-bit matrix within two float64x2 vectors. More...


void	transpose4 (basic_int32x4 &a0, basic_int32x4 &a1, basic_int32x4 &a2, basic_int32x4 &a3)
	Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...


void	transpose4 (basic_int8x16 &a0, basic_int8x16 &a1, basic_int8x16 &a2, basic_int8x16 &a3)
	Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...

void	transpose4 (basic_int32x8 &a0, basic_int32x8 &a1, basic_int32x8 &a2, basic_int32x8 &a3)
	Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...

void	transpose4 (basic_int8x32 &a0, basic_int8x32 &a1, basic_int8x32 &a2, basic_int8x32 &a3)
	Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...


void	transpose4 (basic_int16x8 &a0, basic_int16x8 &a1, basic_int16x8 &a2, basic_int16x8 &a3)
	Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...

void	transpose4 (basic_int16x16 &a0, basic_int16x16 &a1, basic_int16x16 &a2, basic_int16x16 &a3)
	Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...


void	transpose4 (float32x4 &a0, float32x4 &a1, float32x4 &a2, float32x4 &a3)
	Transposes a 4x4 32-bit matrix within four float32x4 vectors. More...

void	transpose4 (float32x8 &a0, float32x8 &a1, float32x8 &a2, float32x8 &a3)
	Transposes a 4x4 32-bit matrix within four float32x4 vectors. More...

Function Documentation

uint8x16 simdpp::abs ( int8x16 a )

inline

Computes absolute value of 8-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.
In ALTIVEC this intrinsic results in at least 1-3 instructions.

256-bit version:

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

uint8x32 simdpp::abs ( int8x32 a )

inline

Computes absolute value of 8-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.
In ALTIVEC this intrinsic results in at least 1-3 instructions.

256-bit version:

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

uint16x8 simdpp::abs ( int16x8 a )

inline

Computes absolute value of 16-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.
In ALTIVEC this intrinsic results in at least 1-3 instructions.

256-bit version:

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-5 instructions.

uint16x16 simdpp::abs ( int16x16 a )

inline

Computes absolute value of 16-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.
In ALTIVEC this intrinsic results in at least 1-3 instructions.

256-bit version:

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-5 instructions.

uint32x4 simdpp::abs ( int32x4 a )

inline

Computes absolute value of 32-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.
In ALTIVEC this intrinsic results in at least 1-3 instructions.

256-bit version:

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

uint32x8 simdpp::abs ( int32x8 a )

inline

Computes absolute value of 32-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)

128-bit version:

In SSE2-SSE3 this intrinsic results in at least 3 instructions.
In ALTIVEC this intrinsic results in at least 1-3 instructions.

256-bit version:

In SSE2-SSE3 this intrinsic results in at least 6 instructions.
In SSSE3-AVX and NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

uint64x2 simdpp::abs ( int64x2 a )

inline

Computes absolute value of 64-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)

128-bit version:

In SSE2-AVX this intrinsic results in at least 5 instructions.
In NEON this intrinsic results in at least 6 instructions.
Not vectorized in ALTIVEC.

256-bit version:

In SSE2-AVX this intrinsic results in at least 10 instructions.
In NEON this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
Not vectorized in ALTIVEC.

uint64x4 simdpp::abs ( int64x4 a )

inline

Computes absolute value of 64-bit integer values.

r0 = abs(a0)
...
rN = abs(aN)

128-bit version:

In SSE2-AVX this intrinsic results in at least 5 instructions.
In NEON this intrinsic results in at least 6 instructions.
Not vectorized in ALTIVEC.

256-bit version:

In SSE2-AVX this intrinsic results in at least 10 instructions.
In NEON this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
Not vectorized in ALTIVEC.

uint8x16 simdpp::avg	(	uint8x16	a,
		uint8x16	b
	)

inline

Computes rounded average of the unsigned 8-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint8x32 simdpp::avg	(	uint8x32	a,
		uint8x32	b
	)

inline

Computes rounded average of the unsigned 8-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int8x16 simdpp::avg	(	int8x16	a,
		int8x16	b
	)

inline

Computes rounded average of signed 8-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8-9 instructions.
In AVX2 this intrinsic results in at least 4-5 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int8x32 simdpp::avg	(	int8x32	a,
		int8x32	b
	)

inline

Computes rounded average of signed 8-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8-9 instructions.
In AVX2 this intrinsic results in at least 4-5 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint16x8 simdpp::avg	(	uint16x8	a,
		uint16x8	b
	)

inline

Computes rounded average of unsigned 16-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint16x16 simdpp::avg	(	uint16x16	a,
		uint16x16	b
	)

inline

Computes rounded average of unsigned 16-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int16x8 simdpp::avg	(	int16x8	a,
		int16x8	b
	)

inline

Computes rounded average of signed 16-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8-9 instructions.
In AVX2 this intrinsic results in at least 4-5 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int16x16 simdpp::avg	(	int16x16	a,
		int16x16	b
	)

inline

Computes rounded average of signed 16-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4-5 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8-9 instructions.
In AVX2 this intrinsic results in at least 4-5 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint32x4 simdpp::avg	(	uint32x4	a,
		uint32x4	b
	)

inline

Computes rounded average of unsigned 32-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 6-7 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 12-13 instructions.
In AVX2 this intrinsic results in at least 6-7 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint32x8 simdpp::avg	(	uint32x8	a,
		uint32x8	b
	)

inline

Computes rounded average of unsigned 32-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 6-7 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 12-13 instructions.
In AVX2 this intrinsic results in at least 6-7 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int32x4 simdpp::avg	(	int32x4	a,
		int32x4	b
	)

inline

Computes rounded average of signed 32-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 9-10 instructions.
In NEON this intrinsic results in at least 1 instruction.

256-bit version:

In SSE2-AVX this intrinsic results in at least 18-19 instructions.
In AVX2 this intrinsic results in at least 9-10 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int32x8 simdpp::avg	(	int32x8	a,
		int32x8	b
	)

inline

Computes rounded average of signed 32-bit values.

r0 = (a0 + b0 + 1) / 2
...
rN = (aN + bN + 1) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 9-10 instructions.
In NEON this intrinsic results in at least 1 instruction.

256-bit version:

In SSE2-AVX this intrinsic results in at least 18-19 instructions.
In AVX2 this intrinsic results in at least 9-10 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint8x16 simdpp::avg_trunc	(	uint8x16	a,
		uint8x16	b
	)

inline

Computes truncated average of the unsigned 8-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4 instructions.
In NEON this intrinsic results in at least 1 instruction.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint8x32 simdpp::avg_trunc	(	uint8x32	a,
		uint8x32	b
	)

inline

Computes truncated average of the unsigned 8-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4 instructions.
In NEON this intrinsic results in at least 1 instruction.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int8x16 simdpp::avg_trunc	(	int8x16	a,
		int8x16	b
	)

inline

Computes truncated average of signed 8-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 14-15 instructions.
In AVX2 this intrinsic results in at least 7-8 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int8x32 simdpp::avg_trunc	(	int8x32	a,
		int8x32	b
	)

inline

Computes truncated average of signed 8-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 14-15 instructions.
In AVX2 this intrinsic results in at least 7-8 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint16x8 simdpp::avg_trunc	(	uint16x8	a,
		uint16x8	b
	)

inline

Computes truncated average of unsigned 16-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4 instructions.
In NEON this intrinsic results in at least 1 instruction.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint16x16 simdpp::avg_trunc	(	uint16x16	a,
		uint16x16	b
	)

inline

Computes truncated average of unsigned 16-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4 instructions.
In NEON this intrinsic results in at least 1 instruction.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int16x8 simdpp::avg_trunc	(	int16x8	a,
		int16x8	b
	)

inline

Computes truncated average of signed 16-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.
In NEON this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 14-15 instructions.
In AVX2 this intrinsic results in at least 7-8 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int16x16 simdpp::avg_trunc	(	int16x16	a,
		int16x16	b
	)

inline

Computes truncated average of signed 16-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.
In NEON this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 14-15 instructions.
In AVX2 this intrinsic results in at least 7-8 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint32x4 simdpp::avg_trunc	(	uint32x4	a,
		uint32x4	b
	)

inline

Computes truncated average of unsigned 32-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4 instructions.
In NEON this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint32x8 simdpp::avg_trunc	(	uint32x8	a,
		uint32x8	b
	)

inline

Computes truncated average of unsigned 32-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4 instructions.
In NEON this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 8 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int32x4 simdpp::avg_trunc	(	int32x4	a,
		int32x4	b
	)

inline

Computes truncated average of signed 32-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.
In ALTIVEC this intrinsic results in at least 4 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 14-15 instructions.
In AVX2 this intrinsic results in at least 7-8 instructions.
In ALTIVEC this intrinsic results in at least 8 instructions.
In NEON this intrinsic results in at least 2 instructions.

int32x8 simdpp::avg_trunc	(	int32x8	a,
		int32x8	b
	)

inline

Computes truncated average of signed 32-bit values.

r0 = (a0 + b0) / 2
...
rN = (aN + bN) / 2

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 7-8 instructions.
In ALTIVEC this intrinsic results in at least 4 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 14-15 instructions.
In AVX2 this intrinsic results in at least 7-8 instructions.
In ALTIVEC this intrinsic results in at least 8 instructions.
In NEON this intrinsic results in at least 2 instructions.

float32x4 simdpp::ceil ( float32x4 a )

inline

Rounds the values of a vector towards positive infinity.

r0 = ceil(a0)
...
rN = ceil(aN)

128-bit version:

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 13-15 instructions.
In NEON this intrinsic results in at least 11-13 instructions.

256-bit version:

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 26-28 instructions.
In NEON this intrinsic results in at least 22-24 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

float32x8 simdpp::ceil ( float32x8 a )

inline

Rounds the values of a vector towards positive infinity.

r0 = ceil(a0)
...
rN = ceil(aN)

128-bit version:

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 13-15 instructions.
In NEON this intrinsic results in at least 11-13 instructions.

256-bit version:

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 26-28 instructions.
In NEON this intrinsic results in at least 22-24 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

mask_float32x4 simdpp::cmp_ge	(	float32x4	a,
		float32x4	b
	)

inline

Compares the values of two float32x4 vectors for greater-than or equal.

r0 = (a0 >= b0) ? 0xffffffff : 0x0
...
rN = (aN >= bN) ? 0xffffffff : 0x0

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_float32x8 simdpp::cmp_ge	(	float32x8	a,
		float32x8	b
	)

inline

Compares the values of two float32x4 vectors for greater-than or equal.

r0 = (a0 >= b0) ? 0xffffffff : 0x0
...
rN = (aN >= bN) ? 0xffffffff : 0x0

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_float64x2 simdpp::cmp_ge	(	float64x2	a,
		float64x2	b
	)

inline

Compares the values of two float64x2 vectors for greater-than or equal.

r0 = (a0 >= b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN >= bN) ? 0xffffffffffffffff : 0x0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

mask_float64x4 simdpp::cmp_ge	(	float64x4	a,
		float64x4	b
	)

inline

mask_float64x2 simdpp::cmp_gt	(	float64x2	a,
		float64x2	b
	)

inline

Compares the values of two float64x2 vectors for greater-than.

r0 = (a0 > b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN > bN) ? 0xffffffffffffffff : 0x0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

mask_float64x4 simdpp::cmp_gt	(	float64x4	a,
		float64x4	b
	)

inline

mask_float32x4 simdpp::cmp_le	(	float32x4	a,
		float32x4	b
	)

inline

Compares the values of two float32x4 vectors for less-than or equal.

r0 = (a0 <= b0) ? 0xffffffff : 0x0
...
rN = (aN <= bN) ? 0xffffffff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_float32x8 simdpp::cmp_le	(	float32x8	a,
		float32x8	b
	)

inline

Compares the values of two float32x4 vectors for less-than or equal.

r0 = (a0 <= b0) ? 0xffffffff : 0x0
...
rN = (aN <= bN) ? 0xffffffff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_float64x2 simdpp::cmp_le	(	float64x2	a,
		float64x2	b
	)

inline

Compares the values of two float64x2 vectors for less-than or equal.

r0 = (a0 <= b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN <= bN) ? 0xffffffffffffffff : 0x0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

mask_float64x4 simdpp::cmp_le	(	float64x4	a,
		float64x4	b
	)

inline

Compares the values of two float64x2 vectors for less-than or equal.

r0 = (a0 <= b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN <= bN) ? 0xffffffffffffffff : 0x0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

mask_int8x16 simdpp::cmp_lt	(	int8x16	a,
		int8x16	b
	)

inline

Compares the values of two signed int8x16 vectors for less-than.

r0 = (a0 < b0) ? 0xff : 0x0
...
rN = (aN < bN) ? 0xff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_int8x32 simdpp::cmp_lt	(	int8x32	a,
		int8x32	b
	)

inline

Compares the values of two signed int8x16 vectors for less-than.

r0 = (a0 < b0) ? 0xff : 0x0
...
rN = (aN < bN) ? 0xff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_int8x16 simdpp::cmp_lt	(	uint8x16	a,
		uint8x16	b
	)

inline

Compares the values of two unsigned int8x16 vectors for less-than.

r0 = (a0 < b0) ? 0xff : 0x0
...
rN = (aN < bN) ? 0xff : 0x0

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
In XOP this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6-7 instructions.
In AVX2 this intrinsic results in at least 3-4 instructions.
In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_int8x32 simdpp::cmp_lt	(	uint8x32	a,
		uint8x32	b
	)

inline

Compares the values of two unsigned int8x16 vectors for less-than.

r0 = (a0 < b0) ? 0xff : 0x0
...
rN = (aN < bN) ? 0xff : 0x0

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
In XOP this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6-7 instructions.
In AVX2 this intrinsic results in at least 3-4 instructions.
In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_int16x8 simdpp::cmp_lt	(	int16x8	a,
		int16x8	b
	)

inline

Compares the values of two signed int16x8 vectors for less-than.

r0 = (a0 < b0) ? 0xffff : 0x0
...
rN = (aN < bN) ? 0xffff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_int16x16 simdpp::cmp_lt	(	int16x16	a,
		int16x16	b
	)

inline

Compares the values of two signed int16x8 vectors for less-than.

r0 = (a0 < b0) ? 0xffff : 0x0
...
rN = (aN < bN) ? 0xffff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_int16x8 simdpp::cmp_lt	(	uint16x8	a,
		uint16x8	b
	)

inline

Compares the values of two unsigned int16x8 vectors for less-than.

r0 = (a0 < b0) ? 0xffff : 0x0
...
rN = (aN < bN) ? 0xffff : 0x0

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
In XOP this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6-7 instructions.
In AVX2 this intrinsic results in at least 3-4 instructions.
In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_int16x16 simdpp::cmp_lt	(	uint16x16	a,
		uint16x16	b
	)

inline

Compares the values of two unsigned int16x8 vectors for less-than.

r0 = (a0 < b0) ? 0xffff : 0x0
...
rN = (aN < bN) ? 0xffff : 0x0

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
In XOP this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6-7 instructions.
In AVX2 this intrinsic results in at least 3-4 instructions.
In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_int32x4 simdpp::cmp_lt	(	int32x4	a,
		int32x4	b
	)

inline

Compares the values of two signed int32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_int32x8 simdpp::cmp_lt	(	int32x8	a,
		int32x8	b
	)

inline

Compares the values of two signed int32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_int32x4 simdpp::cmp_lt	(	uint32x4	a,
		uint32x4	b
	)

inline

Compares the values of two unsigned int32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
In XOP this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6-7 instructions.
In AVX2 this intrinsic results in at least 3-4 instructions.
In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_int32x8 simdpp::cmp_lt	(	uint32x8	a,
		uint32x8	b
	)

inline

Compares the values of two unsigned int32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
In XOP this intrinsic results in at least 1 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 6-7 instructions.
In AVX2 this intrinsic results in at least 3-4 instructions.
In XOP, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_float32x4 simdpp::cmp_lt	(	float32x4	a,
		float32x4	b
	)

inline

Compares the values of two float32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_float32x8 simdpp::cmp_lt	(	float32x8	a,
		float32x8	b
	)

inline

Compares the values of two float32x4 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffff : 0x0

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

mask_float64x2 simdpp::cmp_lt	(	float64x2	a,
		float64x2	b
	)

inline

Compares the values of two float64x2 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffffffffffff : 0x0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

mask_float64x4 simdpp::cmp_lt	(	float64x4	a,
		float64x4	b
	)

inline

Compares the values of two float64x2 vectors for less-than.

r0 = (a0 < b0) ? 0xffffffffffffffff : 0x0
...
rN = (aN < bN) ? 0xffffffffffffffff : 0x0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

float32x4 simdpp::div	(	float32x4	a,
		float32x4	b
	)

inline

Divides the values of two vectors.

r0 = a0 / b0
...
rN = aN / bN

128-bit version:

In NEON this intrinsic results in at least 6 instructions.
In ALTIVEC this intrinsic results in at least 10 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 12 instructions.
In ALTIVEC this intrinsic results in at least 19 instructions.

float32x8 simdpp::div	(	float32x8	a,
		float32x8	b
	)

inline

Divides the values of two vectors.

r0 = a0 / b0
...
rN = aN / bN

128-bit version:

In NEON this intrinsic results in at least 6 instructions.
In ALTIVEC this intrinsic results in at least 10 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 12 instructions.
In ALTIVEC this intrinsic results in at least 19 instructions.

float64x2 simdpp::div	(	float64x2	a,
		float64x2	b
	)

inline

Divides the values of two vectors.

r0 = a0 / b0
...
rN = aN / bN

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

float64x4 simdpp::div	(	float64x4	a,
		float64x4	b
	)

inline

Divides the values of two vectors.

r0 = a0 / b0
...
rN = aN / bN

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

template<unsigned P>

uint8x16 simdpp::div_p	(	uint8x16	num,
		uint8x16	den
	)

Divides one 8-bit unsigned number by another.

The precision of the operation is configurable: only P least significant bits of both numerator and denominator are considered.

r0 = num0 / den0
...
rN = numN / denN

128-bit version:: The operation costs at least 9 instructions per bit of precision.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 10 instructions.
In AVX2 this intrinsic results in at least 4 instructions.

template<unsigned P>

uint16x8 simdpp::div_p	(	uint16x8	num,
		uint16x8	den
	)

Divides one 16-bit unsigned number by another.

The precision of the operation is configurable: only P least significant bits of both numerator and denominator are considered.

r0 = num0 / den0
...
rN = numN / denN

128-bit version:: The operation costs at least 9 instructions per bit of precision.

256-bit version:

In SSE2-AVX and NEON this intrinsic results in at least 10 instructions.
In AVX2 this intrinsic results in at least 4 instructions.

template<unsigned id>

uint16_t simdpp::extract ( basic_int16x8 a )

Extracts the id-th element from int16x8 vector.

r = a[id]

This function may have very high latency.

In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned id>

int16_t simdpp::extract ( int16x8 a )

Extracts the id-th element from int16x8 vector.

r = a[id]

This function may have very high latency.

In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned id>

uint32_t simdpp::extract ( basic_int32x4 a )

Extracts the id-th element from int32x4 vector.

r = a[id]

This function may have very high latency.

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned id>

int32_t simdpp::extract ( int32x4 a )

Extracts the id-th element from int32x4 vector.

r = a[id]

This function may have very high latency.

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned id>

uint64_t simdpp::extract ( basic_int64x2 a )

Extracts an element from int64x2 vector.

r = a[id]

This function may have very high latency.

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
In SSE4_1 this intrinsic results in at least 1 instructions.
In SSE2_32bit, SSE3_32bit and SSSE3_32bit this intrinsic results in at least 3-4 instructions.
In SSE4_1_32bit this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned id>

int64_t simdpp::extract ( int64x2 a )

Extracts an element from int64x2 vector.

r = a[id]

This function may have very high latency.

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
In SSE4_1 this intrinsic results in at least 1 instructions.
In SSE2_32bit, SSE3_32bit and SSSE3_32bit this intrinsic results in at least 3-4 instructions.
In SSE4_1_32bit this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned id>

float simdpp::extract ( float32x4 a )

Extracts an element from float32x4 vector.

r = a[id]

This function may have very high latency.

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned id>

double simdpp::extract ( float64x2 a )

Extracts an element from float64x2 vector.

r = a[id]

This function may have very high latency.

In NEON and ALTIVEC this intrinsic results in at least 2 instructions.

template<unsigned id>

uint16_t simdpp::extract_bits ( uint8x16 a )

Extracts a specific bit from each byte of each element of an int8x16 vector.

The default template argument selects the bits from each byte in the most efficient way.

r = ((a[0] & 0x80) >> 7) | ((a[1] & 0x80) >> 6) | ... | ((a[15] & 0x80) << 8)

In SSE2-AVX2 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 7-9 instructions.
In ALTIVEC this intrinsic results in at least 9-11 instructions.

uint16_t simdpp::extract_bits_any ( uint8x16 a )

inline

Extracts a bit from each byte of each element of an int8x16 vector.

This operation is only sensible if each byte within the vector is either 0x00 or 0xff.

r = ((a[0] & 0x??) ? 0x01 : 0) |
    ((a[1] & 0x??) ? 0x02 : 0) |
    ...
    ((a[15] & 0x??) ? 0x8000 : 0)

In NEON this intrinsic results in at least 6-7 instructions.
In ALTIVEC this intrinsic results in at least 8-9 instructions.

float32x4 simdpp::floor ( float32x4 a )

inline

Rounds the values of a vector towards negative infinity.

r0 = floor(a0)
...
rN = floor(aN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 12-14 instructions.
In NEON this intrinsic results in at least 10-11 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 24-26 instructions.
In NEON this intrinsic results in at least 20-21 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

float32x8 simdpp::floor ( float32x8 a )

inline

Rounds the values of a vector towards negative infinity.

r0 = floor(a0)
...
rN = floor(aN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 12-14 instructions.
In NEON this intrinsic results in at least 10-11 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 24-26 instructions.
In NEON this intrinsic results in at least 20-21 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

Arch simdpp::get_arch_gcc_builtin_cpu_supports ( )

inline

Retrieves supported architecture using GCC __builtin_cpu_supports function.

Works only on x86.

Arch simdpp::get_arch_linux_cpuinfo ( )

inline

Retrieves supported architecture from Linux /proc/cpuinfo file.

Works on X86 and ARM.

mask_float64x2 simdpp::isnan ( float64x2 a )

inline

Checks whether elements in a are IEEE754 NaN.

r0 = isnan(a0) ? 0xffffffffffffffff : 0
...
rN = isnan(aN) ? 0xffffffffffffffff : 0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

mask_float64x4 simdpp::isnan ( float64x4 a )

inline

Checks whether elements in a are IEEE754 NaN.

r0 = isnan(a0) ? 0xffffffffffffffff : 0
...
rN = isnan(aN) ? 0xffffffffffffffff : 0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

mask_float32x4 simdpp::isnan2	(	float32x4	a,
		float32x4	b
	)

inline

Checks whether corresponding elements in either a or b are IEEE754 NaN.

r0 = isnan(a0) || isnan(b0) ? 0xffffffff : 0
...
rN = isnan(aN) || isnan(bN) ? 0xffffffff : 0

128-bit version:

In NEON and ALTIVEC this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON and ALTIVEC this intrinsic results in at least 6 instructions.

mask_float32x8 simdpp::isnan2	(	float32x8	a,
		float32x8	b
	)

inline

Checks whether corresponding elements in either a or b are IEEE754 NaN.

r0 = isnan(a0) || isnan(b0) ? 0xffffffff : 0
...
rN = isnan(aN) || isnan(bN) ? 0xffffffff : 0

128-bit version:

In NEON and ALTIVEC this intrinsic results in at least 3 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON and ALTIVEC this intrinsic results in at least 6 instructions.

mask_float64x2 simdpp::isnan2	(	float64x2	a,
		float64x2	b
	)

inline

Checks whether corresponding elements in either a or b are IEEE754 NaN.

r0 = isnan(a0) || isnan(b0) ? 0xffffffffffffffff : 0
...
rN = isnan(aN) || isnan(bN) ? 0xffffffffffffffff : 0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

mask_float64x4 simdpp::isnan2	(	float64x4	a,
		float64x4	b
	)

inline

Checks whether corresponding elements in either a or b are IEEE754 NaN.

r0 = isnan(a0) || isnan(b0) ? 0xffffffffffffffff : 0
...
rN = isnan(aN) || isnan(bN) ? 0xffffffffffffffff : 0

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

void simdpp::load_packed2	(	float32x4 &	a,
		float32x4 &	b,
		const float *	p
	)

inline

Loads 32-bit float values packed in pairs, de-interleaves them and stores the result into two vectors.

128-bit version:: a = [ *(p), *(p+2), *(p+4), *(p+6) ]

b = [ *(p+1), *(p+3), *(p+5), *(p+7) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+2), *(p+4), ... , *(p+14) ]

b = [ *(p+1), *(p+3), *(p+5), ... , *(p+15) ]

p must be aligned to 32 bytes.

void simdpp::load_packed2	(	float32x8 &	a,
		float32x8 &	b,
		const float *	p
	)

inline

void simdpp::load_packed2	(	float64x2 &	a,
		float64x2 &	b,
		const double *	p
	)

inline

Loads 64-bit float values packed in pairs, de-interleaves them and stores the result into two vectors.

128-bit version:: a = [ *(p), *(p+2) ]

b = [ *(p+1), *(p+3) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+2), *(p+4), *(p+6) ]

b = [ *(p+1), *(p+3), *(p+5), *(p+7) ]

p must be aligned to 32 bytes.

void simdpp::load_packed2	(	float64x4 &	a,
		float64x4 &	b,
		const double *	p
	)

inline

Loads 64-bit float values packed in pairs, de-interleaves them and stores the result into two vectors.

128-bit version:: a = [ *(p), *(p+2) ]

b = [ *(p+1), *(p+3) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+2), *(p+4), *(p+6) ]

b = [ *(p+1), *(p+3), *(p+5), *(p+7) ]

p must be aligned to 32 bytes.

void simdpp::load_packed3	(	basic_int8x16 &	a,
		basic_int8x16 &	b,
		basic_int8x16 &	c,
		const void *	p
	)

inline

Loads 8-bit values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:: a = [ *(p), *(p+3), *(p+6), ... , *(p+45) ]

b = [ *(p+1), *(p+4), *(p+7), ... , *(p+46) ]

c = [ *(p+2), *(p+5), *(p+8), ... , *(p+47) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+3), *(p+6), ... , *(p+93) ]

b = [ *(p+1), *(p+4), *(p+7), ... , *(p+94) ]

c = [ *(p+2), *(p+5), *(p+8), ... , *(p+95) ]

p must be aligned to 32 bytes.

void simdpp::load_packed3	(	basic_int8x32 &	a,
		basic_int8x32 &	b,
		basic_int8x32 &	c,
		const void *	p
	)

inline

Loads 8-bit values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:: a = [ *(p), *(p+3), *(p+6), ... , *(p+45) ]

b = [ *(p+1), *(p+4), *(p+7), ... , *(p+46) ]

c = [ *(p+2), *(p+5), *(p+8), ... , *(p+47) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+3), *(p+6), ... , *(p+93) ]

b = [ *(p+1), *(p+4), *(p+7), ... , *(p+94) ]

c = [ *(p+2), *(p+5), *(p+8), ... , *(p+95) ]

p must be aligned to 32 bytes.

void simdpp::load_packed3	(	basic_int16x8 &	a,
		basic_int16x8 &	b,
		basic_int16x8 &	c,
		const void *	p
	)

inline

Loads 16-bit values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:: a = [ *(p), *(p+3), *(p+6), ... , *(p+21) ]

b = [ *(p+1), *(p+4), *(p+7), ... , *(p+22) ]

c = [ *(p+2), *(p+5), *(p+8), ... , *(p+23) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+3), *(p+6), ... , *(p+45) ]

b = [ *(p+1), *(p+4), *(p+7), ... , *(p+46) ]

c = [ *(p+2), *(p+5), *(p+8), ... , *(p+47) ]

p must be aligned to 32 bytes.

void simdpp::load_packed3	(	basic_int16x16 &	a,
		basic_int16x16 &	b,
		basic_int16x16 &	c,
		const void *	p
	)

inline

Loads 16-bit values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:: a = [ *(p), *(p+3), *(p+6), ... , *(p+21) ]

b = [ *(p+1), *(p+4), *(p+7), ... , *(p+22) ]

c = [ *(p+2), *(p+5), *(p+8), ... , *(p+23) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+3), *(p+6), ... , *(p+45) ]

b = [ *(p+1), *(p+4), *(p+7), ... , *(p+46) ]

c = [ *(p+2), *(p+5), *(p+8), ... , *(p+47) ]

p must be aligned to 32 bytes.

void simdpp::load_packed3	(	basic_int32x4 &	a,
		basic_int32x4 &	b,
		basic_int32x4 &	c,
		const void *	p
	)

inline

Loads 32-bit values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:: a = [ *(p), *(p+3), *(p+6), *(p+9) ]

b = [ *(p+1), *(p+4), *(p+7), *(p+10) ]

c = [ *(p+2), *(p+5), *(p+8), *(p+11) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+3), *(p+6), ... , *(p+21) ]

b = [ *(p+1), *(p+4), *(p+7), ... , *(p+22) ]

c = [ *(p+2), *(p+5), *(p+8), ... , *(p+23) ]

p must be aligned to 32 bytes.

void simdpp::load_packed3	(	basic_int32x8 &	a,
		basic_int32x8 &	b,
		basic_int32x8 &	c,
		const void *	p
	)

inline

Loads 32-bit values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:: a = [ *(p), *(p+3), *(p+6), *(p+9) ]

b = [ *(p+1), *(p+4), *(p+7), *(p+10) ]

c = [ *(p+2), *(p+5), *(p+8), *(p+11) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+3), *(p+6), ... , *(p+21) ]

b = [ *(p+1), *(p+4), *(p+7), ... , *(p+22) ]

c = [ *(p+2), *(p+5), *(p+8), ... , *(p+23) ]

p must be aligned to 32 bytes.

void simdpp::load_packed3	(	basic_int64x2 &	a,
		basic_int64x2 &	b,
		basic_int64x2 &	c,
		const void *	p
	)

inline

Loads 64-bit values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:: a = [ *(p), *(p+3) ]

b = [ *(p+1), *(p+4) ]

c = [ *(p+2), *(p+5) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+3), *(p+6), *(p+9) ]

b = [ *(p+1), *(p+4), *(p+7), *(p+10) ]

c = [ *(p+2), *(p+5), *(p+8), *(p+11) ]

p must be aligned to 32 bytes.

void simdpp::load_packed3	(	basic_int64x4 &	a,
		basic_int64x4 &	b,
		basic_int64x4 &	c,
		const void *	p
	)

inline

Loads 64-bit values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:: a = [ *(p), *(p+3) ]

b = [ *(p+1), *(p+4) ]

c = [ *(p+2), *(p+5) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+3), *(p+6), *(p+9) ]

b = [ *(p+1), *(p+4), *(p+7), *(p+10) ]

c = [ *(p+2), *(p+5), *(p+8), *(p+11) ]

p must be aligned to 32 bytes.

void simdpp::load_packed3	(	float32x4 &	a,
		float32x4 &	b,
		float32x4 &	c,
		const float *	p
	)

inline

Loads 32-bit floating point values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:: a = [ *(p), *(p+3), *(p+6), *(p+9) ]

b = [ *(p+1), *(p+4), *(p+7), *(p+10) ]

c = [ *(p+2), *(p+5), *(p+8), *(p+11) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+3), *(p+6), ... , *(p+21) ]

b = [ *(p+1), *(p+4), *(p+7), ... , *(p+22) ]

c = [ *(p+2), *(p+5), *(p+8), ... , *(p+23) ]

p must be aligned to 32 bytes.

void simdpp::load_packed3	(	float32x8 &	a,
		float32x8 &	b,
		float32x8 &	c,
		const float *	p
	)

inline

Loads 32-bit floating point values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:: a = [ *(p), *(p+3), *(p+6), *(p+9) ]

b = [ *(p+1), *(p+4), *(p+7), *(p+10) ]

c = [ *(p+2), *(p+5), *(p+8), *(p+11) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+3), *(p+6), ... , *(p+21) ]

b = [ *(p+1), *(p+4), *(p+7), ... , *(p+22) ]

c = [ *(p+2), *(p+5), *(p+8), ... , *(p+23) ]

p must be aligned to 32 bytes.

void simdpp::load_packed3	(	float64x2 &	a,
		float64x2 &	b,
		float64x2 &	c,
		const double *	p
	)

inline

Loads 64-bit floating point values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:: a = [ *(p), *(p+3) ]

b = [ *(p+1), *(p+4) ]

c = [ *(p+2), *(p+5) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+3), *(p+6), *(p+9) ]

b = [ *(p+1), *(p+4), *(p+7), *(p+10) ]

c = [ *(p+2), *(p+5), *(p+8), *(p+11) ]

p must be aligned to 32 bytes.

void simdpp::load_packed3	(	float64x4 &	a,
		float64x4 &	b,
		float64x4 &	c,
		const double *	p
	)

inline

Loads 64-bit floating point values packed in triplets, de-interleaves them and stores the result into three vectors.

128-bit version:: a = [ *(p), *(p+3) ]

b = [ *(p+1), *(p+4) ]

c = [ *(p+2), *(p+5) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+3), *(p+6), *(p+9) ]

b = [ *(p+1), *(p+4), *(p+7), *(p+10) ]

c = [ *(p+2), *(p+5), *(p+8), *(p+11) ]

p must be aligned to 32 bytes.

void simdpp::load_packed4	(	basic_int8x16 &	a,
		basic_int8x16 &	b,
		basic_int8x16 &	c,
		basic_int8x16 &	d,
		const void *	p
	)

inline

Loads 8-bit values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:: a = [ *(p), *(p+4), *(p+8), ... , *(p+60) ]

b = [ *(p+1), *(p+5), *(p+9), ... , *(p+61) ]

c = [ *(p+2), *(p+6), *(p+10), ... , *(p+62) ]

d = [ *(p+3), *(p+7), *(p+11), ... , *(p+63) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+4), *(p+8), ... , *(p+124) ]

b = [ *(p+1), *(p+5), *(p+9), ... , *(p+125) ]

c = [ *(p+2), *(p+6), *(p+10), ... , *(p+126) ]

d = [ *(p+3), *(p+7), *(p+11), ... , *(p+127) ]

p must be aligned to 32 bytes.

void simdpp::load_packed4	(	basic_int8x32 &	a,
		basic_int8x32 &	b,
		basic_int8x32 &	c,
		basic_int8x32 &	d,
		const void *	p
	)

inline

Loads 8-bit values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:: a = [ *(p), *(p+4), *(p+8), ... , *(p+60) ]

b = [ *(p+1), *(p+5), *(p+9), ... , *(p+61) ]

c = [ *(p+2), *(p+6), *(p+10), ... , *(p+62) ]

d = [ *(p+3), *(p+7), *(p+11), ... , *(p+63) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+4), *(p+8), ... , *(p+124) ]

b = [ *(p+1), *(p+5), *(p+9), ... , *(p+125) ]

c = [ *(p+2), *(p+6), *(p+10), ... , *(p+126) ]

d = [ *(p+3), *(p+7), *(p+11), ... , *(p+127) ]

p must be aligned to 32 bytes.

void simdpp::load_packed4	(	basic_int16x8 &	a,
		basic_int16x8 &	b,
		basic_int16x8 &	c,
		basic_int16x8 &	d,
		const void *	p
	)

inline

Loads 16-bit values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:: a = [ *(p), *(p+4), *(p+8), ... , *(p+28) ]

b = [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ]

c = [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ]

d = [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+4), *(p+8), ... , *(p+60) ]

b = [ *(p+1), *(p+5), *(p+9), ... , *(p+61) ]

c = [ *(p+2), *(p+6), *(p+10), ... , *(p+62) ]

d = [ *(p+3), *(p+7), *(p+11), ... , *(p+63) ]

p must be aligned to 32 bytes.

void simdpp::load_packed4	(	basic_int16x16 &	a,
		basic_int16x16 &	b,
		basic_int16x16 &	c,
		basic_int16x16 &	d,
		const void *	p
	)

inline

Loads 16-bit values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:: a = [ *(p), *(p+4), *(p+8), ... , *(p+28) ]

b = [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ]

c = [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ]

d = [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+4), *(p+8), ... , *(p+60) ]

b = [ *(p+1), *(p+5), *(p+9), ... , *(p+61) ]

c = [ *(p+2), *(p+6), *(p+10), ... , *(p+62) ]

d = [ *(p+3), *(p+7), *(p+11), ... , *(p+63) ]

p must be aligned to 32 bytes.

void simdpp::load_packed4	(	basic_int32x4 &	a,
		basic_int32x4 &	b,
		basic_int32x4 &	c,
		basic_int32x4 &	d,
		const void *	p
	)

inline

Loads 32-bit values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:: a = [ *(p), *(p+4), *(p+8), *(p+12) ]

b = [ *(p+1), *(p+5), *(p+9), *(p+13) ]

c = [ *(p+2), *(p+6), *(p+10), *(p+14) ]

d = [ *(p+3), *(p+7), *(p+11), *(p+15) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+4), *(p+8), ... , *(p+28) ]

b = [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ]

c = [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ]

d = [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ]

p must be aligned to 32 bytes.

void simdpp::load_packed4	(	basic_int32x8 &	a,
		basic_int32x8 &	b,
		basic_int32x8 &	c,
		basic_int32x8 &	d,
		const void *	p
	)

inline

Loads 32-bit values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:: a = [ *(p), *(p+4), *(p+8), *(p+12) ]

b = [ *(p+1), *(p+5), *(p+9), *(p+13) ]

c = [ *(p+2), *(p+6), *(p+10), *(p+14) ]

d = [ *(p+3), *(p+7), *(p+11), *(p+15) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+4), *(p+8), ... , *(p+28) ]

b = [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ]

c = [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ]

d = [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ]

p must be aligned to 32 bytes.

void simdpp::load_packed4	(	basic_int64x2 &	a,
		basic_int64x2 &	b,
		basic_int64x2 &	c,
		basic_int64x2 &	d,
		const void *	p
	)

inline

Loads 64-bit values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:: a = [ *(p), *(p+4) ]

b = [ *(p+1), *(p+5) ]

c = [ *(p+2), *(p+6) ]

d = [ *(p+3), *(p+7) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+4), *(p+8), *(p+12) ]

b = [ *(p+1), *(p+5), *(p+9), *(p+13) ]

c = [ *(p+2), *(p+6), *(p+10), *(p+14) ]

d = [ *(p+3), *(p+7), *(p+11), *(p+15) ]

p must be aligned to 32 bytes.

void simdpp::load_packed4	(	basic_int64x4 &	a,
		basic_int64x4 &	b,
		basic_int64x4 &	c,
		basic_int64x4 &	d,
		const void *	p
	)

inline

Loads 64-bit values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:: a = [ *(p), *(p+4) ]

b = [ *(p+1), *(p+5) ]

c = [ *(p+2), *(p+6) ]

d = [ *(p+3), *(p+7) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+4), *(p+8), *(p+12) ]

b = [ *(p+1), *(p+5), *(p+9), *(p+13) ]

c = [ *(p+2), *(p+6), *(p+10), *(p+14) ]

d = [ *(p+3), *(p+7), *(p+11), *(p+15) ]

p must be aligned to 32 bytes.

void simdpp::load_packed4	(	float32x4 &	a,
		float32x4 &	b,
		float32x4 &	c,
		float32x4 &	d,
		const float *	p
	)

inline

Loads 32-bit floating-point values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:: a = [ *(p), *(p+4), *(p+8), *(p+12) ]

b = [ *(p+1), *(p+5), *(p+9), *(p+13) ]

c = [ *(p+2), *(p+6), *(p+10), *(p+14) ]

d = [ *(p+3), *(p+7), *(p+11), *(p+15) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+4), *(p+8), ... , *(p+28) ]

b = [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ]

c = [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ]

d = [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ]

p must be aligned to 32 bytes.

void simdpp::load_packed4	(	float32x8 &	a,
		float32x8 &	b,
		float32x8 &	c,
		float32x8 &	d,
		const float *	p
	)

inline

Loads 32-bit floating-point values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:: a = [ *(p), *(p+4), *(p+8), *(p+12) ]

b = [ *(p+1), *(p+5), *(p+9), *(p+13) ]

c = [ *(p+2), *(p+6), *(p+10), *(p+14) ]

d = [ *(p+3), *(p+7), *(p+11), *(p+15) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+4), *(p+8), ... , *(p+28) ]

b = [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ]

c = [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ]

d = [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ]

p must be aligned to 32 bytes.

void simdpp::load_packed4	(	float64x2 &	a,
		float64x2 &	b,
		float64x2 &	c,
		float64x2 &	d,
		const double *	p
	)

inline

Loads 64-bit floating-point values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:: a = [ *(p), *(p+4) ]

b = [ *(p+1), *(p+5) ]

c = [ *(p+2), *(p+6) ]

d = [ *(p+3), *(p+7) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+4), *(p+8), *(p+12) ]

b = [ *(p+1), *(p+5), *(p+9), *(p+13) ]

c = [ *(p+2), *(p+6), *(p+10), *(p+14) ]

d = [ *(p+3), *(p+7), *(p+11), *(p+15) ]

p must be aligned to 32 bytes.

void simdpp::load_packed4	(	float64x4 &	a,
		float64x4 &	b,
		float64x4 &	c,
		float64x4 &	d,
		const double *	p
	)

inline

Loads 64-bit floating-point values packed in quartets, de-interleaves them and stores the result into four vectors.

128-bit version:: a = [ *(p), *(p+4) ]

b = [ *(p+1), *(p+5) ]

c = [ *(p+2), *(p+6) ]

d = [ *(p+3), *(p+7) ]

p must be aligned to 16 bytes.

256-bit version:: a = [ *(p), *(p+4), *(p+8), *(p+12) ]

b = [ *(p+1), *(p+5), *(p+9), *(p+13) ]

c = [ *(p+2), *(p+6), *(p+10), *(p+14) ]

d = [ *(p+3), *(p+7), *(p+11), *(p+15) ]

p must be aligned to 32 bytes.

template<int s0, int s1, int s2, int s3>

basic_int8x16 simdpp::make_shuffle_bytes16_mask ( basic_int8x16 & mask )

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of four adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
...
r12 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0+12] : b[s0+8])
r13 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1+12] : b[s1+8])
r14 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2+12] : b[s2+8])
r15 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3+12] : b[s3+8])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3>

basic_int8x32 simdpp::make_shuffle_bytes16_mask ( basic_int8x32 & mask )

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of four adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
...
r12 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0+12] : b[s0+8])
r13 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1+12] : b[s1+8])
r14 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2+12] : b[s2+8])
r15 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3+12] : b[s3+8])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7>

basic_int8x16 simdpp::make_shuffle_bytes16_mask ( basic_int8x16 & mask )

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of eight adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,7] select elements from the first vector. Values [8,15] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0] : b[s0-8])
...
r7 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7] : b[s7-8])
r8 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0+8] : b[s0])
...
r15 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7+8] : b[s7])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7>

basic_int8x32 simdpp::make_shuffle_bytes16_mask ( basic_int8x32 & mask )

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of eight adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,7] select elements from the first vector. Values [8,15] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0] : b[s0-8])
...
r7 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7] : b[s7-8])
r8 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0+8] : b[s0])
...
r15 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7+8] : b[s7])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, int s8, int s9, int s10, int s11, int s12, int s13, int s14, int s15>

basic_int8x16 simdpp::make_shuffle_bytes16_mask ( basic_int8x16 & mask )

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,15] select elements from the first vector. Values [16,31] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 16 ? a[s0] : b[s0-16])
r1 = (s1 == -1) ? 0 : (s1 < 16 ? a[s1] : b[s1-16])
...
r15 = (s15 == -1) ? 0 : (s15 < 16 ? a[s15] : b[s15-16])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7, int s8, int s9, int s10, int s11, int s12, int s13, int s14, int s15>

basic_int8x32 simdpp::make_shuffle_bytes16_mask ( basic_int8x32 & mask )

Makes a mask to shuffle an int8x16 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,15] select elements from the first vector. Values [16,31] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 16 ? a[s0] : b[s0-16])
r1 = (s1 == -1) ? 0 : (s1 < 16 ? a[s1] : b[s1-16])
...
r15 = (s15 == -1) ? 0 : (s15 < 16 ? a[s15] : b[s15-16])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1>

basic_int16x8 simdpp::make_shuffle_bytes16_mask ( basic_int16x8 & mask )

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of two adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])
...
r6 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+6] : b[s0+4])
r7 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+6] : b[s1+4])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1>

basic_int16x16 simdpp::make_shuffle_bytes16_mask ( basic_int16x16 & mask )

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of two adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])
...
r6 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+6] : b[s0+4])
r7 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+6] : b[s1+4])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3>

basic_int16x8 simdpp::make_shuffle_bytes16_mask ( basic_int16x8 & mask )

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of four adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
r4 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0+4] : b[s0])
r5 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1+4] : b[s1])
r6 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2+4] : b[s2])
r7 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3+4] : b[s3])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3>

basic_int16x16 simdpp::make_shuffle_bytes16_mask ( basic_int16x16 & mask )

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of four adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])
r4 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0+4] : b[s0])
r5 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1+4] : b[s1])
r6 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2+4] : b[s2])
r7 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3+4] : b[s3])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7>

basic_int16x8 simdpp::make_shuffle_bytes16_mask ( basic_int16x8 & mask )

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,7] select elements from the first vector. Values [8,15] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0] : b[s0-8])
...
r7 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7] : b[s7-8])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7>

basic_int16x16 simdpp::make_shuffle_bytes16_mask ( basic_int16x16 & mask )

Makes a mask to shuffle an int16x8 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,7] select elements from the first vector. Values [8,15] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 8 ? a[s0] : b[s0-8])
...
r7 = (s7 == -1) ? 0 : (s7 < 8 ? a[s7] : b[s7-8])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1>

basic_int32x4 simdpp::make_shuffle_bytes16_mask ( basic_int32x4 & mask )

Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of two adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1>

basic_int32x8 simdpp::make_shuffle_bytes16_mask ( basic_int32x8 & mask )

Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

All elements within vectors are grouped into sets of two adjacent elements. Elements within each set of the resulting vector can be selected only from corresponding sets of the source vectors.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])
r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])
r2 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0+2] : b[s0])
r3 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1+2] : b[s1])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3>

basic_int32x4 simdpp::make_shuffle_bytes16_mask ( basic_int32x4 & mask )

Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1, int s2, int s3>

basic_int32x8 simdpp::make_shuffle_bytes16_mask ( basic_int32x8 & mask )

Makes a mask to shuffle an int32x4 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,3] select elements from the first vector. Values [4,7] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 4 ? a[s0] : b[s0-4])
r1 = (s1 == -1) ? 0 : (s1 < 4 ? a[s1] : b[s1-4])
r2 = (s2 == -1) ? 0 : (s2 < 4 ? a[s2] : b[s2-4])
r3 = (s3 == -1) ? 0 : (s3 < 4 ? a[s3] : b[s3-4])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1>

basic_int64x2 simdpp::make_shuffle_bytes16_mask ( basic_int64x2 & mask )

Makes a mask to shuffle an int64x2 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; such a mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; such a mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])

r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

template<int s0, int s1>

basic_int64x4 simdpp::make_shuffle_bytes16_mask ( basic_int64x4 & mask )

Makes a mask to shuffle an int64x2 vector using permute_bytes16, shuffle_bytes16, permute_zbytes16 or shuffle_zbytes16 functions.

The template arguments define which elements to select from each element group: Values [0,1] select elements from the first vector. Values [2,3] select elements from the second vector; in that case the mask can only be used in shuffle_bytes16 or shuffle_zbytes16. Value [-1] sets the corresponding element to zero; in that case the mask can only be used in permute_zbytes16 or shuffle_zbytes16.

128-bit version:

The created mask will cause shuffle_bytes16 to perform as follows:

r0 = (s0 == -1) ? 0 : (s0 < 2 ? a[s0] : b[s0-2])

r1 = (s1 == -1) ? 0 : (s1 < 2 ? a[s1] : b[s1-2])

256-bit version:

The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

int8x16 simdpp::max	(	int8x16	a,
		int8x16	b
	)

inline

Computes maximum of the signed 8-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 4 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int8x32 simdpp::max	(	int8x32	a,
		int8x32	b
	)

inline

Computes maximum of the signed 8-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 4 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint8x16 simdpp::max	(	uint8x16	a,
		uint8x16	b
	)

inline

Computes maximum of the unsigned 8-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint8x32 simdpp::max	(	uint8x32	a,
		uint8x32	b
	)

inline

Computes maximum of the unsigned 8-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int16x8 simdpp::max	(	int16x8	a,
		int16x8	b
	)

inline

Computes maximum of the signed 16-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int16x16 simdpp::max	(	int16x16	a,
		int16x16	b
	)

inline

Computes maximum of the signed 16-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint16x8 simdpp::max	(	uint16x8	a,
		uint16x8	b
	)

inline

Computes maximum of the unsigned 16-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint16x16 simdpp::max	(	uint16x16	a,
		uint16x16	b
	)

inline

Computes maximum of the unsigned 16-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int32x4 simdpp::max	(	int32x4	a,
		int32x4	b
	)

inline

Computes maximum of the signed 32-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 4 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int32x8 simdpp::max	(	int32x8	a,
		int32x8	b
	)

inline

Computes maximum of the signed 32-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 4 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint32x4 simdpp::max	(	uint32x4	a,
		uint32x4	b
	)

inline

Computes maximum of the unsigned 32-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint32x8 simdpp::max	(	uint32x8	a,
		uint32x8	b
	)

inline

Computes maximum of the unsigned 32-bit values.

r0 = max(a0, b0)
...
rN = max(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x4 simdpp::max	(	float32x4	a,
		float32x4	b
	)

inline

Computes maxima of the values of two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = max(a0, b0)
...
rN = max(aN, bN)

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x8 simdpp::max	(	float32x8	a,
		float32x8	b
	)

inline

Computes maxima of the values of two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = max(a0, b0)
...
rN = max(aN, bN)

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float64x2 simdpp::max	(	float64x2	a,
		float64x2	b
	)

inline

Computes maxima of the values of two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = max(a0, b0)
...
rN = max(aN, bN)

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

float64x4 simdpp::max	(	float64x4	a,
		float64x4	b
	)

inline

Computes maxima of the values of two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = max(a0, b0)
...
rN = max(aN, bN)

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

uint8x16 simdpp::min	(	uint8x16	a,
		uint8x16	b
	)

inline

Computes minimum of the unsigned 8-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint8x32 simdpp::min	(	uint8x32	a,
		uint8x32	b
	)

inline

Computes minimum of the unsigned 8-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int16x8 simdpp::min	(	int16x8	a,
		int16x8	b
	)

inline

Computes minimum of the signed 16-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int16x16 simdpp::min	(	int16x16	a,
		int16x16	b
	)

inline

Computes minimum of the signed 16-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)

256-bit version:

In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint16x8 simdpp::min	(	uint16x8	a,
		uint16x8	b
	)

inline

Computes minimum of the unsigned 16-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint16x16 simdpp::min	(	uint16x16	a,
		uint16x16	b
	)

inline

Computes minimum of the unsigned 16-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int32x4 simdpp::min	(	int32x4	a,
		int32x4	b
	)

inline

Computes minimum of the signed 32-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 4 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

int32x8 simdpp::min	(	int32x8	a,
		int32x8	b
	)

inline

Computes minimum of the signed 32-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 4 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 8 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint32x4 simdpp::min	(	uint32x4	a,
		uint32x4	b
	)

inline

Computes minimum of the unsigned 32-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

uint32x8 simdpp::min	(	uint32x8	a,
		uint32x8	b
	)

inline

Computes minimum of the unsigned 32-bit values.

r0 = min(a0, b0)
...
rN = min(aN, bN)

128-bit version:

In SSE2-SSSE3 this intrinsic results in at least 6-7 instructions.

256-bit version:

In SSE2-SSSE3 this intrinsic results in at least 12-13 instructions.
In SSE4.1-AVX, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x4 simdpp::min	(	float32x4	a,
		float32x4	b
	)

inline

Computes minimum of the values in two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = min(a0, b0)
...
rN = min(aN, bN)

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x8 simdpp::min	(	float32x8	a,
		float32x8	b
	)

inline

Computes minimum of the values in two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = min(a0, b0)
...
rN = min(aN, bN)

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float64x2 simdpp::min	(	float64x2	a,
		float64x2	b
	)

inline

Computes minima of the values in two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = min(a0, b0)
...
rN = min(aN, bN)

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

float64x4 simdpp::min	(	float64x4	a,
		float64x4	b
	)

inline

Computes minima of the values in two vectors.

If at least one of the values is NaN, or both values are zeroes, it is unspecified which value will be returned.

r0 = min(a0, b0)
...
rN = min(aN, bN)

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

Not vectorized in NEON and ALTIVEC.
In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.

template<unsigned s0, unsigned s1>

basic_int16x8 simdpp::permute ( basic_int16x8 a )

Permutes the 16-bit values within sets of two consecutive elements of the vector.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
r2 = a[s0+2]
r3 = a[s1+2]
r4 = a[s0+4]
r5 = a[s1+4]
...

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 2 instructions.
In NEON and ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 4 instructions.
In AVX2 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1>

basic_int16x16 simdpp::permute ( basic_int16x16 a )

Permutes the 16-bit values within sets of two consecutive elements of the vector.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
r2 = a[s0+2]
r3 = a[s1+2]
r4 = a[s0+4]
r5 = a[s1+4]
...

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 2 instructions.
In NEON and ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 4 instructions.
In AVX2 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>

basic_int32x4 simdpp::permute ( basic_int32x4 a )

Permutes the values of each set of four consecutive 32-bit values.

The selector values must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
256-bit version:
r4 = a[s0+4]
...
r7 = a[s3+4]

128-bit version:

In NEON this intrinsic results in at least 1-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>

basic_int32x8 simdpp::permute ( basic_int32x8 a )

Permutes the values of each set of four consecutive 32-bit values.

The selector values must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
256-bit version:
r4 = a[s0+4]
...
r7 = a[s3+4]

128-bit version:

In NEON this intrinsic results in at least 1-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1>

basic_int32x4 simdpp::permute ( basic_int32x4 a )

Permutes the 32-bit values within sets of two consecutive elements of the vector.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
r2 = a[s0+2]
r3 = a[s1+2]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = a[s0+6]
r7 = a[s1+6]

128-bit version:

In NEON this intrinsic results in at least 2-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1>

basic_int32x8 simdpp::permute ( basic_int32x8 a )

Permutes the 32-bit values within sets of two consecutive elements of the vector.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
r2 = a[s0+2]
r3 = a[s1+2]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = a[s0+6]
r7 = a[s1+6]

128-bit version:

In NEON this intrinsic results in at least 2-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>

float32x4 simdpp::permute ( float32x4 a )

Permutes the values of each set of four consecutive 32-bit floating point values.

The selector values must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
256-bit version:
r4 = a[s0+4]
...
r7 = a[s3+4]

128-bit version:

In NEON this intrinsic results in at least 1-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>

float32x8 simdpp::permute ( float32x8 a )

Permutes the values of each set of four consecutive 32-bit floating point values.

The selector values must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
256-bit version:
r4 = a[s0+4]
...
r7 = a[s3+4]

128-bit version:

In NEON this intrinsic results in at least 1-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1>

float32x4 simdpp::permute ( float32x4 a )

Permutes the 32-bit floating-point values within sets of two consecutive elements of the vector.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
r2 = a[s0+2]
r3 = a[s1+2]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = a[s0+6]
r7 = a[s1+6]

128-bit version:

In NEON this intrinsic results in at least 2-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1>

float32x8 simdpp::permute ( float32x8 a )

Permutes the 32-bit floating-point values within sets of two consecutive elements of the vector.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
r2 = a[s0+2]
r3 = a[s1+2]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = a[s0+6]
r7 = a[s1+6]

128-bit version:

In NEON this intrinsic results in at least 2-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>

basic_int64x4 simdpp::permute ( basic_int64x4 a )

Permutes the values of each set of four consecutive 64-bit values.

The selector values must be in range [0; 3].

r0 = a[s0]
r1 = a[s1]
r2 = a[s2]
r3 = a[s3]

In SSE2-AVX this intrinsic results in at least 2 instructions.

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>

float64x4 simdpp::permute ( float64x4 a )

Permutes the values of each set of four consecutive 64-bit floating-point values.

The selector values must be in range [0; 3].

r0 = a[s0]
r1 = a[s1]
r2 = a[s2]
r3 = a[s3]

In SSE2-AVX this intrinsic results in at least 1-2 instructions.
In NEON this intrinsic results in at least 1-4 instructions.
In ALTIVEC this intrinsic results in at least 1-4 instructions.

template<unsigned s0, unsigned s1>

basic_int64x2 simdpp::permute ( basic_int64x2 a )

Permutes the values of each set of two consecutive 64-bit values.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
256-bit version:
r2 = a[s0+2]
r3 = a[s1+2]

128-bit version:

In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-4 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

template<unsigned s0, unsigned s1>

basic_int64x4 simdpp::permute ( basic_int64x4 a )

Permutes the values of each set of two consecutive 64-bit values.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
256-bit version:
r2 = a[s0+2]
r3 = a[s1+2]

128-bit version:

In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-4 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

template<unsigned s0, unsigned s1>

float64x2 simdpp::permute ( float64x2 a )

Permutes the values of each set of two consecutive 64-bit floating-point values.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
256-bit version:
r2 = a[s0+2]
r3 = a[s1+2]

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

template<unsigned s0, unsigned s1>

float64x4 simdpp::permute ( float64x4 a )

Permutes the values of each set of two consecutive 64-bit floating-point values.

The selector values must be in range [0; 1].

r0 = a[s0]
r1 = a[s1]
256-bit version:
r2 = a[s0+2]
r3 = a[s1+2]

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

int128 simdpp::permute_zbytes16	(	int128	a,
		int128	mask
	)

inline

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

float32x4 simdpp::permute_zbytes16	(	float32x4	a,
		int128	mask
	)

inline

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

float64x2 simdpp::permute_zbytes16	(	float64x2	a,
		int128	mask
	)

inline

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

int256 simdpp::permute_zbytes16	(	int256	a,
		int256	mask
	)

inline

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

float32x8 simdpp::permute_zbytes16	(	float32x8	a,
		int256	mask
	)

inline

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

float64x4 simdpp::permute_zbytes16	(	float64x4	a,
		int256	mask
	)

inline

Selects bytes from a vector according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7 results in the result byte being zeroed, if set. Bits 6-4 must be zero or the behavior is undefined. Bits 3-0 define the element within the given vector.

128-bit version:

Not implemented for SSE2-SSE3.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3-AVX this intrinsic results in at least 2 instructions.
In AVX2 this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

float32x4 simdpp::rcp_e ( float32x4 a )

inline

Computes approximate reciprocal.

Relative error is as follows:

1/2 ULP for NULL and NEON
~1/2730 for SSE2
1/4096 for ALTIVEC
1/256 for NEON_FLT_SP

r0 = approx(1.0f / a0)
...
rN = approx(1.0f / aN)

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x8 simdpp::rcp_e ( float32x8 a )

inline

Computes approximate reciprocal.

Relative error is as follows:

1/2 ULP for NULL and NEON
~1/2730 for SSE2
1/4096 for ALTIVEC
1/256 for NEON_FLT_SP

r0 = approx(1.0f / a0)
...
rN = approx(1.0f / aN)

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x4 simdpp::rcp_rh	(	float32x4	x,
		float32x4	a
	)

inline

Computes one Newton-Raphson iteration for the reciprocal.

x is the current estimate, a are the values to estimate reciprocal for.

r0 = x0 * (2 - x0*a0)
...
rN = xN * (2 - xN*aN)

Using this function, division can be implemented as follows:

// a/b
float32x4 x;
x = rcp_e(b);
x = rcp_rh(x, b);
x = rcp_rh(x, b);
return mul(a, x);

Precision can be controlled by selecting the number of rcp_rh steps.

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

256-bit version:

In AVX-AVX2 this intrinsic results in at least 3-4 instructions.
In SSE2-SSE4.1 this intrinsic results in at least 6-7 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 4-5 instructions.

float32x8 simdpp::rcp_rh	(	float32x8	x,
		float32x8	a
	)

inline

Computes one Newton-Raphson iteration for the reciprocal.

x is the current estimate, a are the values to estimate reciprocal for.

r0 = x0 * (2 - x0*a0)
...
rN = xN * (2 - xN*aN)

Using this function, division can be implemented as follows:

// a/b
float32x4 x;
x = rcp_e(b);
x = rcp_rh(x, b);
x = rcp_rh(x, b);
return mul(a, x);

Precision can be controlled by selecting the number of rcp_rh steps.

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 3-4 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

256-bit version:

In AVX-AVX2 this intrinsic results in at least 3-4 instructions.
In SSE2-SSE4.1 this intrinsic results in at least 6-7 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 4-5 instructions.

float32x4 simdpp::rsqrt_e ( float32x4 a )

inline

Computes approximate reciprocal square root.

Relative error is as follows:

1/2 ULP for NULL and NEON
~1/2730 for SSE2
1/4096 for ALTIVEC
1/256 for NEON_FLT_SP

r0 = approx(1 / sqrt(a0))
...
rN = approx(1 / sqrt(aN))

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x8 simdpp::rsqrt_e ( float32x8 a )

inline

Computes approximate reciprocal square root.

Relative error is as follows:

1/2 ULP for NULL and NEON
~1/2730 for SSE2
1/4096 for ALTIVEC
1/256 for NEON_FLT_SP

r0 = approx(1 / sqrt(a0))
...
rN = approx(1 / sqrt(aN))

256-bit version:

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.

float32x4 simdpp::rsqrt_rh	(	float32x4	x,
		float32x4	a
	)

inline

Computes one Newton-Raphson iteration for inverse of square root.

x is the current estimate, a are the values to estimate the inverse square root for.

r0 = x0 * (3 - a0*x0*x0) * 0.5
...
rN = xN * (3 - aN*xN*xN) * 0.5

128-bit version:

In SSE2, SSE3, SSSE3 and SSE4.1 this intrinsic results in at least 5-7 instructions.
In NEON this intrinsic results in at least 3 instructions.
In ALTIVEC this intrinsic results in at least 4-6 instructions.

256-bit version:

In AVX-AVX2 this intrinsic results in at least 7 instructions.
In SSE2, SSE3, SSSE3 and SSE4.1 this intrinsic results in at least 10-12 instructions.
In NEON this intrinsic results in at least 6 instructions.
In ALTIVEC this intrinsic results in at least 8-10 instructions.

float32x8 simdpp::rsqrt_rh	(	float32x8	x,
		float32x8	a
	)

inline

Computes one Newton-Raphson iteration for inverse of square root.

x is the current estimate, a are the values to estimate the inverse square root for.

r0 = x0 * (3 - a0*x0*x0) * 0.5
...
rN = xN * (3 - aN*xN*xN) * 0.5

128-bit version:

In SSE2, SSE3, SSSE3 and SSE4.1 this intrinsic results in at least 5-7 instructions.
In NEON this intrinsic results in at least 3 instructions.
In ALTIVEC this intrinsic results in at least 4-6 instructions.

256-bit version:

In AVX-AVX2 this intrinsic results in at least 7 instructions.
In SSE2, SSE3, SSSE3 and SSE4.1 this intrinsic results in at least 10-12 instructions.
In NEON this intrinsic results in at least 6 instructions.
In ALTIVEC this intrinsic results in at least 8-10 instructions.

template<unsigned s0, unsigned s1>

float64x2 simdpp::shuffle1	(	float64x2	a,
		float64x2	b
	)

Selects 64-bit floating-point values from two vectors.

The first value in each pair of values must come from a, the second - from b. The selector values must be in range [0; 1].

r0 = a[s0]
r1 = b[s1]
256-bit version:
r2 = a[s0+2]
r3 = b[s1+2]

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

template<unsigned s0, unsigned s1>

float64x4 simdpp::shuffle1	(	float64x4	a,
		float64x4	b
	)

Selects 64-bit floating-point values from two vectors.

The first value in each pair of values must come from a, the second - from b. The selector values must be in range [0; 1].

r0 = a[s0]
r1 = b[s1]
256-bit version:
r2 = a[s0+2]
r3 = b[s1+2]

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

template<unsigned s0, unsigned s1>

basic_int64x2 simdpp::shuffle1	(	basic_int64x2	a,
		basic_int64x2	b
	)

Selects 64-bit values from two vectors.

The first value in each pair of values must come from a, the second - from b. The selector values must be in range [0; 1].

r0 = a[s0]
r1 = b[s1]
256-bit version:
r2 = a[s0+2]
r3 = b[s1+2]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1>

basic_int64x4 simdpp::shuffle1	(	basic_int64x4	a,
		basic_int64x4	b
	)

Selects 64-bit values from two vectors.

The first value in each pair of values must come from a, the second - from b. The selector values must be in range [0; 1].

r0 = a[s0]
r1 = b[s1]
256-bit version:
r2 = a[s0+2]
r3 = b[s1+2]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 1-2 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned a0, unsigned a1, unsigned b0, unsigned b1>

float32x4 simdpp::shuffle2	(	float32x4	a,
		float32x4	b
	)

Selects 32-bit floating-point values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[a0]
r1 = a[a1]
r2 = b[b0]
r3 = b[b1]
256-bit version:
r4 = a[a0+4]
r5 = a[a1+4]
r6 = b[b0+4]
r7 = b[b1+4]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.
In NEON this intrinsic results in at least 1-4 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned a0, unsigned a1, unsigned b0, unsigned b1>

float32x8 simdpp::shuffle2	(	float32x8	a,
		float32x8	b
	)

Selects 32-bit floating-point values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[a0]
r1 = a[a1]
r2 = b[b0]
r3 = b[b1]
256-bit version:
r4 = a[a0+4]
r5 = a[a1+4]
r6 = b[b0+4]
r7 = b[b1+4]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.
In NEON this intrinsic results in at least 1-4 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1>

float32x4 simdpp::shuffle2	(	float32x4	a,
		float32x4	b
	)

Selects 32-bit floating-point values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[s0]
r1 = a[s1]
r2 = b[s0]
r3 = b[s1]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = b[s0+4]
r7 = b[s1+4]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.
In NEON this intrinsic results in at least 2-4 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1>

float32x8 simdpp::shuffle2	(	float32x8	a,
		float32x8	b
	)

Selects 32-bit floating-point values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[s0]
r1 = a[s1]
r2 = b[s0]
r3 = b[s1]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = b[s0+4]
r7 = b[s1+4]

128-bit version:

In ALTIVEC this intrinsic results in at least 1-2 instructions.
In NEON this intrinsic results in at least 2-4 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned a0, unsigned a1, unsigned b0, unsigned b1>

basic_int32x4 simdpp::shuffle2	(	basic_int32x4	a,
		basic_int32x4	b
	)

Selects 32-bit values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[a0]
r1 = a[a1]
r2 = b[b0]
r3 = b[b1]
256-bit version:
r4 = a[a0+4]
r5 = a[a1+4]
r6 = b[b0+4]
r7 = b[b1+4]

128-bit version:

In NEON this intrinsic results in at least 1-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned a0, unsigned a1, unsigned b0, unsigned b1>

basic_int32x8 simdpp::shuffle2	(	basic_int32x8	a,
		basic_int32x8	b
	)

Selects 32-bit values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[a0]
r1 = a[a1]
r2 = b[b0]
r3 = b[b1]
256-bit version:
r4 = a[a0+4]
r5 = a[a1+4]
r6 = b[b0+4]
r7 = b[b1+4]

128-bit version:

In NEON this intrinsic results in at least 1-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1>

basic_int32x4 simdpp::shuffle2	(	basic_int32x4	a,
		basic_int32x4	b
	)

Selects 32-bit values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[s0]
r1 = a[s1]
r2 = b[s0]
r3 = b[s1]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = b[s0+4]
r7 = b[s1+4]

128-bit version:

In NEON this intrinsic results in at least 2-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

template<unsigned s0, unsigned s1>

basic_int32x8 simdpp::shuffle2	(	basic_int32x8	a,
		basic_int32x8	b
	)

Selects 32-bit values from two vectors.

The first two values in each four consecutive values must come from a, the last two - from b. The selector values must be in range [0; 3].

r0 = a[s0]
r1 = a[s1]
r2 = b[s0]
r3 = b[s1]
256-bit version:
r4 = a[s0+4]
r5 = a[s1+4]
r6 = b[s0+4]
r7 = b[s1+4]

128-bit version:

In NEON this intrinsic results in at least 2-4 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4-8 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

int128 simdpp::shuffle_bytes16	(	int128	a,
		int128	b,
		int128	mask
	)

inline

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select: Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 6 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
In XOP this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 12 instructions.
In SSE4.1-AVX this intrinsic results in at least 8 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

float32x4 simdpp::shuffle_bytes16	(	float32x4	a,
		float32x4	b,
		int128	mask
	)

inline

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select: Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 6 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
In XOP this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 12 instructions.
In SSE4.1-AVX this intrinsic results in at least 8 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

float64x2 simdpp::shuffle_bytes16	(	float64x2	a,
		float64x2	b,
		int128	mask
	)

inline

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select: Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 6 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
In XOP this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 12 instructions.
In SSE4.1-AVX this intrinsic results in at least 8 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

int256 simdpp::shuffle_bytes16	(	int256	a,
		int256	b,
		int256	mask
	)

inline

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select: Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 6 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
In XOP this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 12 instructions.
In SSE4.1-AVX this intrinsic results in at least 8 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

float32x8 simdpp::shuffle_bytes16	(	float32x8	a,
		float32x8	b,
		int256	mask
	)

inline

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select: Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 6 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
In XOP this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 12 instructions.
In SSE4.1-AVX this intrinsic results in at least 8 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

float64x4 simdpp::shuffle_bytes16	(	float64x4	a,
		float64x4	b,
		int256	mask
	)

inline

Selects bytes from two vectors according to a mask.

Each byte within the mask defines which element to select: Bits 7-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 6 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 4 instructions.
In XOP this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 12 instructions.
In SSE4.1-AVX this intrinsic results in at least 8 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2 instructions.

int128 simdpp::shuffle_zbytes16	(	int128	a,
		int128	b,
		int128	mask
	)

inline

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7, if set, results in the result byte being zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 9 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 18 instructions.
In SSE4.1-AVX this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

float32x4 simdpp::shuffle_zbytes16	(	float32x4	a,
		float32x4	b,
		int128	mask
	)

inline

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7, if set, results in the result byte being zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 9 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 18 instructions.
In SSE4.1-AVX this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

float64x2 simdpp::shuffle_zbytes16	(	float64x2	a,
		float64x2	b,
		int128	mask
	)

inline

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7, if set, results in the result byte being zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 9 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 18 instructions.
In SSE4.1-AVX this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

int256 simdpp::shuffle_zbytes16	(	int256	a,
		int256	b,
		int256	mask
	)

inline

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7, if set, results in the result byte being zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 9 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 18 instructions.
In SSE4.1-AVX this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

float32x8 simdpp::shuffle_zbytes16	(	float32x8	a,
		float32x8	b,
		int256	mask
	)

inline

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7, if set, results in the result byte being zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 9 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 18 instructions.
In SSE4.1-AVX this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

float64x4 simdpp::shuffle_zbytes16	(	float64x4	a,
		float64x4	b,
		int256	mask
	)

inline

Selects bytes from two vectors according to a mask, optionally selecting zero.

Each byte within the mask defines which element to select: Bit 7, if set, results in the result byte being zeroed. Bits 6-5 must be zero or the behavior is undefined. Bit 4 defines which vector to select: 0 corresponds to a, 1 to b. Bits 3-0 define the element within the selected vector.

128-bit version:

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 9 instructions.
In SSE4.1-AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 1 instruction.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 1-2 instructions.

256-bit version: The vectors will be shuffled as if the 128-bit version was applied to the lower and higher halves of the vectors separately.

Not implemented for SSE2-SSE3.
In SSSE3 this intrinsic results in at least 18 instructions.
In SSE4.1-AVX this intrinsic results in at least 12 instructions.
In AVX2 this intrinsic results in at least 6 instructions.
In XOP this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-3 instructions.

float32x4 simdpp::sqrt ( float32x4 a )

inline

Computes square root.

r0 = sqrt(a0)
...
rN = sqrt(aN)

128-bit version:

In NEON this intrinsic results in at least 5 instructions.
In ALTIVEC this intrinsic results in at least 5-7 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 10 instructions.
In ALTIVEC this intrinsic results in at least 10-12 instructions.

float32x8 simdpp::sqrt ( float32x8 a )

inline

Computes square root.

r0 = sqrt(a0)
...
rN = sqrt(aN)

128-bit version:

In NEON this intrinsic results in at least 5 instructions.
In ALTIVEC this intrinsic results in at least 5-7 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 10 instructions.
In ALTIVEC this intrinsic results in at least 10-12 instructions.

float64x2 simdpp::sqrt ( float64x2 a )

inline

Computes square root.

r0 = sqrt(a0)
...
rN = sqrt(aN)

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

float64x4 simdpp::sqrt ( float64x4 a )

inline

Computes square root.

r0 = sqrt(a0)
...
rN = sqrt(aN)

128-bit version:

Not vectorized in NEON and ALTIVEC.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

void simdpp::transpose16	(	basic_int8x16 &	a0,
		basic_int8x16 &	a1,
		basic_int8x16 &	a2,
		basic_int8x16 &	a3,
		basic_int8x16 &	a4,
		basic_int8x16 &	a5,
		basic_int8x16 &	a6,
		basic_int8x16 &	a7,
		basic_int8x16 &	a8,
		basic_int8x16 &	a9,
		basic_int8x16 &	a10,
		basic_int8x16 &	a11,
		basic_int8x16 &	a12,
		basic_int8x16 &	a13,
		basic_int8x16 &	a14,
		basic_int8x16 &	a15
	)

inline

Transposes a 16x16 8-bit matrix within sixteen int8x16 vectors.

r0 =  [ a0_0; ...; a15_0 ]
r1 =  [ a0_1; ...; a15_1 ]
...
r15 = [ a0_15; ...; a15_15 ]

128-bit version:

In SSE2-AVX2 and NEON this intrinsic results in at least 32 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 64 instructions.
In AVX2 this intrinsic results in at least 32 instructions.
In NEON this intrinsic results in at least 64 instructions.

The lower and higher 128-bit halves are processed as if the 128-bit instruction was applied to each of them separately.

void simdpp::transpose16	(	basic_int8x32 &	a0,
		basic_int8x32 &	a1,
		basic_int8x32 &	a2,
		basic_int8x32 &	a3,
		basic_int8x32 &	a4,
		basic_int8x32 &	a5,
		basic_int8x32 &	a6,
		basic_int8x32 &	a7,
		basic_int8x32 &	a8,
		basic_int8x32 &	a9,
		basic_int8x32 &	a10,
		basic_int8x32 &	a11,
		basic_int8x32 &	a12,
		basic_int8x32 &	a13,
		basic_int8x32 &	a14,
		basic_int8x32 &	a15
	)

inline

void simdpp::transpose8	(	basic_int16x8 &	a0,
		basic_int16x8 &	a1,
		basic_int16x8 &	a2,
		basic_int16x8 &	a3,
		basic_int16x8 &	a4,
		basic_int16x8 &	a5,
		basic_int16x8 &	a6,
		basic_int16x8 &	a7
	)

inline

Transposes an 8x8 16-bit matrix within eight int16x8 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ...; a7_0 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ...; a7_1 ]
...
r7 = [ a0_7; a1_7; a2_7; a3_7 ...; a7_7 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 24 instructions.
In NEON this intrinsic results in at least 12 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 48 instructions.
In AVX2 this intrinsic results in at least 24 instructions.
In NEON this intrinsic results in at least 24 instructions.

The lower and higher 128-bit halves are processed as if the 128-bit instruction was applied to each of them separately.

void simdpp::transpose8	(	basic_int16x16 &	a0,
		basic_int16x16 &	a1,
		basic_int16x16 &	a2,
		basic_int16x16 &	a3,
		basic_int16x16 &	a4,
		basic_int16x16 &	a5,
		basic_int16x16 &	a6,
		basic_int16x16 &	a7
	)

inline

float32x4 simdpp::trunc ( float32x4 a )

inline

Rounds the values of a vector towards zero.

r0 = trunc(a0)
...
rN = trunc(aN)

128-bit version:

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 7-9 instructions.
In NEON this intrinsic results in at least 5-6 instructions.

256-bit version:

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 14-16 instructions.
In NEON this intrinsic results in at least 10-11 instructions.
In SSE4.1 and ALTIVEC this intrinsic results in at least 2 instructions.

float32x8 simdpp::trunc ( float32x8 a )

inline

Rounds the values of a vector towards zero.

r0 = trunc(a0)
...
rN = trunc(aN)

128-bit version:

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 7-9 instructions.
In NEON this intrinsic results in at least 5-6 instructions.

256-bit version:

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 14-16 instructions.
In NEON this intrinsic results in at least 10-11 instructions.
In SSE4.1 and ALTIVEC this intrinsic results in at least 2 instructions.

Namespaces

Classes

Typedefs

Functions

Function Documentation