Functions
void	prefetch_l1 (const void *ptr)
	Prefetches data to level 1 cache. More...

void	prefetch_l2 (const void *ptr)
	Prefetches data to level 2 cache. More...

void	prefetch_l3 (const void *ptr)
	Prefetches data to level 3 cache. More...

void	prefetch_nt (const void *ptr)
	Prefetches data to a non-temporal buffer to be read once. More...

template<class = void>
bool	test_zero (uint8x16 a)
	Tests that no bits are set in a 128-bit integer vector. More...

template<class = void>
bool	test_zero (uint16x8 a)

template<class = void>
bool	test_zero (uint32x4 a)

template<class = void>
bool	test_zero (uint64x2 a)

template<class = void>
bool	test_zero (uint8x16 a, uint8x16 mask)
	Tests if the 128-bit integer a consists only of zeros when the mask mask is applied. More...

template<class = void>
bool	test_zero (uint16x8 a, uint16x8 mask)

template<class = void>
bool	test_zero (uint32x4 a, uint32x4 mask)

template<class = void>
bool	test_zero (uint64x2 a, uint64x2 mask)

template<class = void>
bool	test_ones (uint8x16 a)
	Tests if all bits are set in a 128-bit integer. More...

template<class = void>
bool	test_ones (uint16x8 a)

template<class = void>
bool	test_ones (uint32x4 a)

template<class = void>
bool	test_ones (uint64x2 a)

template<class = void>
bool	test_ones (uint8x16 a, uint8x16 mask)
	Tests if a 128-bit integer consists only of ones when a mask is applied. More...

template<class = void>
bool	test_ones (uint16x8 a, uint16x8 mask)

template<class = void>
bool	test_ones (uint32x4 a, uint32x4 mask)

template<class = void>
bool	test_ones (uint64x2 a, uint64x2 mask)

uint32x4	to_int32x4 (int8x16 a)
	Sign extends the values of a signed int8x16 vector to 32-bits. More...

uint32x4	to_int32x4 (uint8x16 a)
	Extends the values of an unsigned int8x16 vector to 32-bits. More...

uint64x2	to_int64x2 (int8x16 a)
	Sign extends the values of a signed int8x16 vector to 64-bits. More...

uint64x2	to_int64x2 (int16x8 a)
	Sign extends the values of a signed int16x8 vector to 64-bits. More...

uint64x2	to_int64x2 (uint8x16 a)
	Extends the values of an unsigned int8x16 vector to 64-bits. More...

uint32x4	to_int32x4_r (float32x4 a)
	Converts the values of a float32x4 vector into signed int32_t representation. More...

uint32x4	to_int32x4 (float64x2 a)
	Converts the values of a float64x2 vector into int32_t representation using truncation. More...

uint32x4	to_int32x4_r (float64x2 a)
	Converts the values of a float64x2 vector into int32_t representation. More...

float32x4	hadd2 (float32x4 a, float32x4 b)
	Adds the values in adjacent pairs of two float32x4 vectors. More...

float32x8	hadd2 (float32x8 a, float32x8 b)
	Adds the values in adjacent pairs of two float32x8 vectors. More...

float64x2	hadd2 (float64x2 a, float64x2 b)
	Adds the values in adjacent pairs of two float64x2 vectors. More...

float32x4	hadd4 (float32x4 a)
	Sums the values of a float32x4 vector. More...

float32x4	hadd4 (float32x4 a, float32x4 b, float32x4 c, float32x4 d)
	Sums the values within each of four float32x4 vectors. More...

float32x4	hsub2 (float32x4 a, float32x4 b)
	Subtracts the values in adjacent pairs of two float32x4 vectors. More...

float32x8	hsub2 (float32x8 a, float32x8 b)
	Subtracts the values in adjacent pairs of two float32x8 vectors. More...

float64x2	hsub2 (float64x2 a, float64x2 b)
	Subtracts the values in adjacent pairs of two float64x2 vectors. More...

float32x4	sub_add (float32x4 a, float32x4 b)
	Adds or subtracts the values of two float32x4 vectors. More...

float64x2	sub_add (float64x2 a, float64x2 b)
	Adds or subtracts the values of two float64x2 vectors. More...

int8x16	copysign (int8x16 a, int8x16 b)
	Copies sign from the values of one int8x16 vector to another. More...

int16x8	copysign (int16x8 a, int16x8 b)
	Copies sign from the values of one int16x8 vector to another. More...

int32x4	copysign (int32x4 a, int32x4 b)
	Copies sign from the values of one int32x4 vector to another. More...

uint16x8	hadd2 (uint16x8 a, uint16x8 b)
	Adds values in adjacent pairs of two int16x8 vectors. More...

uint32x4	hadd2 (uint32x4 a, uint32x4 b)
	Adds values in adjacent pairs of two int32x4 vectors. More...

uint64x2	hadd2 (uint64x2 a, uint64x2 b)
	Adds values in adjacent pairs of two int64x2 vectors. More...

int16x8	hadds2 (int16x8 a, int16x8 b)
	Adds and saturates values in adjacent pairs of two signed int16x8 vectors. More...

uint32x4	hadd4 (uint32x4 a, uint32x4 b, uint32x4 c, uint32x4 d)
	Sums the values within each of four int32x4 vectors. More...

uint16x8	hsub2 (uint16x8 a, uint16x8 b)
	Subtracts values in adjacent pairs of two int16x8 vectors. More...

uint32x4	hsub2 (uint32x4 a, uint32x4 b)
	Subtracts values in adjacent pairs of two int32x4 vectors. More...

uint64x2	hsub2 (uint64x2 a, uint64x2 b)
	Subtracts values in adjacent pairs of two int64x2 vectors. More...

int16x8	hsubs2 (int16x8 a, int16x8 b)
	Subtracts and saturates values in adjacent pairs of two signed int16x8 vectors. More...

void	store_masked (void *p, uint8x16 a, uint8x16 mask)
	Stores bytes in a 128-bit integer vector according to a mask. More...

void	store_masked (void *p, uint16x8 a, uint16x8 mask)

void	store_masked (void *p, uint32x4 a, uint32x4 mask)

void	store_masked (void *p, uint64x2 a, uint64x2 mask)


uint8x16	extract_lo (uint8x32 a)
	Extracts the lower half of a 256-bit vector. More...

uint16x8	extract_lo (uint16x16 a)
	Extracts the lower half of a 256-bit vector. More...

uint32x4	extract_lo (uint32x8 a)
	Extracts the lower half of a 256-bit vector. More...

uint64x2	extract_lo (uint64x4 a)
	Extracts the lower half of a 256-bit vector. More...

float32x4	extract_lo (float32x8 a)
	Extracts the lower half of a 256-bit vector. More...

float64x2	extract_lo (float64x4 a)
	Extracts the lower half of a 256-bit vector. More...


uint8x16	extract_hi (uint8x32 a)
	Extracts the higher half of a 256-bit vector. More...

uint16x8	extract_hi (uint16x16 a)
	Extracts the higher half of a 256-bit vector. More...

uint32x4	extract_hi (uint32x8 a)
	Extracts the higher half of a 256-bit vector. More...

uint64x2	extract_hi (uint64x4 a)
	Extracts the higher half of a 256-bit vector. More...

float32x4	extract_hi (float32x8 a)
	Extracts the higher half of a 256-bit vector. More...

float64x2	extract_hi (float64x4 a)
	Extracts the higher half of a 256-bit vector. More...


template<unsigned P, unsigned N>
void	load_lane (uint8x16 &a, const void *p)
	Loads the first N elements of a 128-bit vector from memory. More...

template<unsigned P, unsigned N>
void	load_lane (uint16x8 &a, const void *p)
	Loads the first N elements of a 128-bit vector from memory. More...

template<unsigned P, unsigned N>
void	load_lane (uint32x4 &a, const void *p)
	Loads the first N elements of a 128-bit vector from memory. More...

template<unsigned P, unsigned N>
void	load_lane (uint64x2 &a, const void *p)
	Loads the first N elements of a 128-bit vector from memory. More...

template<unsigned P, unsigned N>
void	load_lane (float32x4 &a, const void *p)
	Loads the first N elements of a 128-bit vector from memory. More...

template<unsigned P, unsigned N>
void	load_lane (float64x2 &a, const void *p)
	Loads the first N elements of a 128-bit vector from memory. More...


template<unsigned P, unsigned N>
void	store_lane (void *p, uint8x16 a)
	Stores the first N elements of a 128-bit vector to memory. More...

template<unsigned P, unsigned N>
void	store_lane (void *p, uint16x8 a)
	Stores the first N elements of a 128-bit vector to memory. More...

template<unsigned P, unsigned N>
void	store_lane (void *p, uint32x4 a)
	Stores the first N elements of a 128-bit vector to memory. More...

template<unsigned P, unsigned N>
void	store_lane (void *p, uint64x2 a)
	Stores the first N elements of a 128-bit vector to memory. More...

template<unsigned P, unsigned N>
void	store_lane (void *p, float32x4 a)
	Stores the first N elements of a 128-bit vector to memory. More...

template<unsigned P, unsigned N>
void	store_lane (void *p, float64x2 a)
	Stores the first N elements of a 128-bit vector to memory. More...


template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
uint16x8	permute_lo (uint16x8 a)
	Permutes the first 4 16-bit values in each set of 8 consecutive values. More...

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
uint16x16	permute_lo (uint16x16 a)
	Permutes the first 4 16-bit values in each set of 8 consecutive values. More...


template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
uint16x8	permute_hi (uint16x8 a)
	Permutes the last 4 16-bit values in each set of 8 consecutive values. More...

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
uint16x16	permute_hi (uint16x16 a)
	Permutes the last 4 16-bit values in each set of 8 consecutive values. More...

Function Documentation

int8x16 simdpp::sse::copysign	(	int8x16	a,
		int8x16	b
	)

inline

Copies sign from the values of one int8x16 vector to another.

r0 = (b0 > 0) ? a0 : ((b0 == 0) ? 0 : -a0)
...
r15 = (b15 > 0) ? a15 : ((b15 == 0) ? 0 : -a15)

Not implemented for SSE2 and SSE3.

int16x8 simdpp::sse::copysign	(	int16x8	a,
		int16x8	b
	)

inline

Copies sign from the values of one int16x8 vector to another.

r0 = (b0 > 0) ? a0 : ((b0 == 0) ? 0 : -a0)
...
r7 = (b7 > 0) ? a7 : ((b7 == 0) ? 0 : -a7)

Not implemented for SSE2 and SSE3.

int32x4 simdpp::sse::copysign	(	int32x4	a,
		int32x4	b
	)

inline

Copies sign from the values of one int32x4 vector to another.

r0 = (b0 > 0) ? a0 : ((b0 == 0) ? 0 : -a0)
r1 = (b1 > 0) ? a1 : ((b1 == 0) ? 0 : -a1)
r2 = (b2 > 0) ? a2 : ((b2 == 0) ? 0 : -a2)
r3 = (b3 > 0) ? a3 : ((b3 == 0) ? 0 : -a3)

Not implemented for SSE2 and SSE3.

uint8x16 simdpp::sse::extract_hi ( uint8x32 a)

inline

Extracts the higher half of a 256-bit vector.

This intrinsic results in at least 0 instructions.

uint16x8 simdpp::sse::extract_hi ( uint16x16 a)

inline

Extracts the higher half of a 256-bit vector.

This intrinsic results in at least 0 instructions.

uint32x4 simdpp::sse::extract_hi ( uint32x8 a)

inline

Extracts the higher half of a 256-bit vector.

This intrinsic results in at least 0 instructions.

uint64x2 simdpp::sse::extract_hi ( uint64x4 a)

inline

Extracts the higher half of a 256-bit vector.

This intrinsic results in at least 0 instructions.

float32x4 simdpp::sse::extract_hi ( float32x8 a)

inline

Extracts the higher half of a 256-bit vector.

This intrinsic results in at least 0 instructions.

float64x2 simdpp::sse::extract_hi ( float64x4 a)

inline

Extracts the higher half of a 256-bit vector.

This intrinsic results in at least 0 instructions.

uint8x16 simdpp::sse::extract_lo ( uint8x32 a)

inline

Extracts the lower half of a 256-bit vector.

This intrinsic results in at least 0 instructions.

uint16x8 simdpp::sse::extract_lo ( uint16x16 a)

inline

Extracts the lower half of a 256-bit vector.

This intrinsic results in at least 0 instructions.

uint32x4 simdpp::sse::extract_lo ( uint32x8 a)

inline

Extracts the lower half of a 256-bit vector.

This intrinsic results in at least 0 instructions.

uint64x2 simdpp::sse::extract_lo ( uint64x4 a)

inline

Extracts the lower half of a 256-bit vector.

This intrinsic results in at least 0 instructions.

float32x4 simdpp::sse::extract_lo ( float32x8 a)

inline

Extracts the lower half of a 256-bit vector.

This intrinsic results in at least 0 instructions.

float64x2 simdpp::sse::extract_lo ( float64x4 a)

inline

Extracts the lower half of a 256-bit vector.

This intrinsic results in at least 0 instructions.

float32x4 simdpp::sse::hadd2	(	float32x4	a,
		float32x4	b
	)

inline

Adds the values in adjacent pairs of two float32x4 vectors.

r0 = a0 + a1
r1 = a2 + a3
r2 = b0 + b1
r3 = b2 + b3

Not implemented for SSE2.

float32x8 simdpp::sse::hadd2	(	float32x8	a,
		float32x8	b
	)

inline

Adds the values in adjacent pairs of two float32x8 vectors.

r0 = a0 + a1
r1 = a2 + a3
r2 = b0 + b1
r3 = b2 + b3
r4 = a4 + a5
r5 = a6 + a7
r6 = b4 + b5
r7 = b6 + b7    

Not implemented for SSE2 and SSE3.

float64x2 simdpp::sse::hadd2	(	float64x2	a,
		float64x2	b
	)

inline

Adds the values in adjacent pairs of two float64x2 vectors.

r0 = a0 + a1

r1 = b0 + b1

Not implemented for SSE2.

uint16x8 simdpp::sse::hadd2	(	uint16x8	a,
		uint16x8	b
	)

inline

Adds values in adjacent pairs of two int16x8 vectors.

r0 = a0 + a1
...
r3 = a6 + a7
r4 = b0 + b1
...
r7 = b6 + b7

Not implemented for SSE2 and SSE3.

uint32x4 simdpp::sse::hadd2	(	uint32x4	a,
		uint32x4	b
	)

inline

Adds values in adjacent pairs of two int32x4 vectors.

r0 = a0 + a1
r1 = a2 + a3
r2 = b0 + b1
r3 = b2 + b3

Not implemented for SSE2 and SSE3.

uint64x2 simdpp::sse::hadd2	(	uint64x2	a,
		uint64x2	b
	)

inline

Adds values in adjacent pairs of two int64x2 vectors.

r0 = a0 + a1

r1 = b0 + b1

This intrinsic results in at least 3 instructions.

float32x4 simdpp::sse::hadd4 ( float32x4 a)

inline

Sums the values of a float32x4 vector.

r0 = a0 + a1 + a2 + a3
r1 = 0.0f
r2 = 0.0f
r3 = 0.0f

Not implemented for SSE2.

float32x4 simdpp::sse::hadd4	(	float32x4	a,
		float32x4	b,
		float32x4	c,
		float32x4	d
	)

inline

Sums the values within each of four float32x4 vectors.

r0 = a0 + a1 + a2 + a3
r1 = b0 + b1 + b2 + b3
r2 = c0 + c1 + c2 + c3
r3 = d0 + d1 + d2 + d3

In SSE3, SSSE3 and SSE4.1 this intrinsic results in at least 3 instructions.
Not implemented for SSE2.

uint32x4 simdpp::sse::hadd4	(	uint32x4	a,
		uint32x4	b,
		uint32x4	c,
		uint32x4	d
	)

inline

Sums the values within each of four int32x4 vectors.

r0 = a0 + a1 + a2 + a3
r1 = b0 + b1 + b2 + b3
r2 = c0 + c1 + c2 + c3
r3 = d0 + d1 + d2 + d3

Not implemented for SSE2 and SSE3.
This intrinsic results in at least 3 instructions.

int16x8 simdpp::sse::hadds2	(	int16x8	a,
		int16x8	b
	)

inline

Adds and saturates values in adjacent pairs of two signed int16x8 vectors.

r0 = signed_saturate(a0 + a1)
...
r3 = signed_saturate(a6 + a7)
r4 = signed_saturate(b0 + b1)
...
r7 = signed_saturate(b6 + b7)

Not implemented for SSE2 and SSE3.

float32x4 simdpp::sse::hsub2	(	float32x4	a,
		float32x4	b
	)

inline

Subtracts the values in adjacent pairs of two float32x4 vectors.

r0 = a0 - a1
r1 = a2 - a3
r2 = b0 - b1
r3 = b2 - b3

Not implemented for SSE2.

float32x8 simdpp::sse::hsub2	(	float32x8	a,
		float32x8	b
	)

inline

Subtracts the values in adjacent pairs of two float32x8 vectors.

r0 = a0 - a1
r1 = a2 - a3
r2 = b0 - b1
r3 = b2 - b3
r4 = a4 - a5
r5 = a6 - a7
r6 = b4 - b5
r7 = b6 - b7

Not implemented for SSE2 and SSE3.

float64x2 simdpp::sse::hsub2	(	float64x2	a,
		float64x2	b
	)

inline

Subtracts the values in adjacent pairs of two float64x2 vectors.

r0 = a0 - a1

r1 = b0 - b1

Not implemented for SSE2.

uint16x8 simdpp::sse::hsub2	(	uint16x8	a,
		uint16x8	b
	)

inline

Subtracts values in adjacent pairs of two int16x8 vectors.

r0 = a0 - a1
...
r3 = a6 - a7
r4 = b0 - b1
...
r7 = b6 - b7

Not implemented for SSE2 and SSE3.

uint32x4 simdpp::sse::hsub2	(	uint32x4	a,
		uint32x4	b
	)

inline

Subtracts values in adjacent pairs of two int32x4 vectors.

r0 = a0 - a1
r1 = a2 - a3
r2 = b0 - b1
r3 = b2 - b3

Not implemented for SSE2 and SSE3.

uint64x2 simdpp::sse::hsub2	(	uint64x2	a,
		uint64x2	b
	)

inline

Subtracts values in adjacent pairs of two int64x2 vectors.

r0 = a0 - a1

r1 = b0 - b1

This intrinsic results in at least 3 instructions.

int16x8 simdpp::sse::hsubs2	(	int16x8	a,
		int16x8	b
	)

inline

Subtracts and saturates values in adjacent pairs of two signed int16x8 vectors.

r0 = signed_saturate(a0 - a1)
...
r3 = signed_saturate(a6 - a7)
r4 = signed_saturate(b0 - b1)
...
r7 = signed_saturate(b6 - b7)

Not implemented for SSE2 and SSE3.

template<unsigned P, unsigned N>

void simdpp::sse::load_lane	(	uint8x16 &	a,
		const void *	p
	)

Loads the first N elements of a 128-bit vector from memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.

template<unsigned P, unsigned N>

void simdpp::sse::load_lane	(	uint16x8 &	a,
		const void *	p
	)

Loads the first N elements of a 128-bit vector from memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.

template<unsigned P, unsigned N>

void simdpp::sse::load_lane	(	uint32x4 &	a,
		const void *	p
	)

Loads the first N elements of a 128-bit vector from memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.

template<unsigned P, unsigned N>

void simdpp::sse::load_lane	(	uint64x2 &	a,
		const void *	p
	)

Loads the first N elements of a 128-bit vector from memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.

template<unsigned P, unsigned N>

void simdpp::sse::load_lane	(	float32x4 &	a,
		const void *	p
	)

Loads the first N elements of a 128-bit vector from memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.

template<unsigned P, unsigned N>

void simdpp::sse::load_lane	(	float64x2 &	a,
		const void *	p
	)

Loads the first N elements of a 128-bit vector from memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>

uint16x8 simdpp::sse::permute_hi ( uint16x8 a)

Permutes the last 4 16-bit values in each set of 8 consecutive values.

The selector values s0, s1, s2 and s3 must be in range [0; 3].

r0 = a0
...
r3 = a3
r4 = a[s0+4]
...
r7 = a[s3+4]
256-bit version:
r8 = a8
...
r11 = a11
r12 = a[s0+12]
...
r15 = a[s3+12]

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>

uint16x16 simdpp::sse::permute_hi ( uint16x16 a)

Permutes the last 4 16-bit values in each set of 8 consecutive values.

The selector values s0, s1, s2 and s3 must be in range [0; 3].

r0 = a0
...
r3 = a3
r4 = a[s0+4]
...
r7 = a[s3+4]
256-bit version:
r8 = a8
...
r11 = a11
r12 = a[s0+12]
...
r15 = a[s3+12]

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>

uint16x8 simdpp::sse::permute_lo ( uint16x8 a)

Permutes the first 4 16-bit values in each set of 8 consecutive values.

The selector values s0, s1, s2 and s3 must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
r4 = a4
...
r7 = a7
256-bit version:
r8 = a[s0+8]
...
r11 = a[s3+8]
r12 = a12
...
r15 = a15

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>

uint16x16 simdpp::sse::permute_lo ( uint16x16 a)

Permutes the first 4 16-bit values in each set of 8 consecutive values.

The selector values s0, s1, s2 and s3 must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
r4 = a4
...
r7 = a7
256-bit version:
r8 = a[s0+8]
...
r11 = a[s3+8]
r12 = a12
...
r15 = a15

256-bit version:

In SSE2-AVX this intrinsic results in at least 2 instructions.

template<unsigned P, unsigned N>

void simdpp::sse::store_lane	(	void *	p,
		uint8x16	a
	)

Stores the first N elements of a 128-bit vector to memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

template<unsigned P, unsigned N>

void simdpp::sse::store_lane	(	void *	p,
		uint16x8	a
	)

Stores the first N elements of a 128-bit vector to memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

template<unsigned P, unsigned N>

void simdpp::sse::store_lane	(	void *	p,
		uint32x4	a
	)

Stores the first N elements of a 128-bit vector to memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

template<unsigned P, unsigned N>

void simdpp::sse::store_lane	(	void *	p,
		uint64x2	a
	)

Stores the first N elements of a 128-bit vector to memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

template<unsigned P, unsigned N>

void simdpp::sse::store_lane	(	void *	p,
		float32x4	a
	)

Stores the first N elements of a 128-bit vector to memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

template<unsigned P, unsigned N>

void simdpp::sse::store_lane	(	void *	p,
		float64x2	a
	)

Stores the first N elements of a 128-bit vector to memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

void simdpp::sse::store_masked	(	void *	p,
		uint8x16	a,
		uint8x16	mask
	)

inline

Stores bytes in a 128-bit integer vector according to a mask.

The highest bit in the corresponding byte in the mask defines whether the byte will be saved. p does not need to be aligned to 16 bytes.

void simdpp::sse::store_masked	(	void *	p,
		uint16x8	a,
		uint16x8	mask
	)

inline

void simdpp::sse::store_masked	(	void *	p,
		uint32x4	a,
		uint32x4	mask
	)

inline

void simdpp::sse::store_masked	(	void *	p,
		uint64x2	a,
		uint64x2	mask
	)

inline

float32x4 simdpp::sse::sub_add	(	float32x4	a,
		float32x4	b
	)

inline

Adds or subtracts the values of two float32x4 vectors.

r0 = a0 - b0
r1 = a1 + b1
r2 = a2 - b2
r3 = a3 + b3

Not implemented for SSE2.

float64x2 simdpp::sse::sub_add	(	float64x2	a,
		float64x2	b
	)

inline

Adds or subtracts the values of two float64x2 vectors.

r0 = a0 - b0

r1 = a1 + b1

Not implemented for SSE2.

template<class = void>

bool simdpp::sse::test_ones ( uint8x16 a)

Tests if all bits are set in a 128-bit integer.

Returns true if a has all bits set, false otherwise.

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 3 instructions.
In SSE4.1 this intrinsic results in at least 2 instructions.

template<class = void>

bool simdpp::sse::test_ones ( uint16x8 a)

template<class = void>

bool simdpp::sse::test_ones ( uint32x4 a)

template<class = void>

bool simdpp::sse::test_ones ( uint64x2 a)

template<class = void>

bool simdpp::sse::test_ones	(	uint8x16	a,
		uint8x16	mask
	)

Tests if a 128-bit integer consists only of ones when a mask is applied.

Returns true if a & mask has all mask bits set, false otherwise.

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 4 instructions.
In SSE4.1 this intrinsic results in at least 1 instruction.

template<class = void>

bool simdpp::sse::test_ones	(	uint16x8	a,
		uint16x8	mask
	)

template<class = void>

bool simdpp::sse::test_ones	(	uint32x4	a,
		uint32x4	mask
	)

template<class = void>

bool simdpp::sse::test_ones	(	uint64x2	a,
		uint64x2	mask
	)

template<class = void>

bool simdpp::sse::test_zero ( uint8x16 a)

Tests that no bits are set in a 128-bit integer vector.

Returns true if a has all bits unset, false otherwise.

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 3 instructions.
In SSE4.1 this intrinsic results in at least 2 instructions.

template<class = void>

bool simdpp::sse::test_zero ( uint16x8 a)

template<class = void>

bool simdpp::sse::test_zero ( uint32x4 a)

template<class = void>

bool simdpp::sse::test_zero ( uint64x2 a)

template<class = void>

bool simdpp::sse::test_zero	(	uint8x16	a,
		uint8x16	mask
	)

Tests if the 128-bit integer a consists only of zeros when the mask mask is applied.

Returns true if a & mask has all bits unset, false otherwise.

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 4 instructions.
In SSE4.1 this intrinsic results in at least 1 instruction.

template<class = void>

bool simdpp::sse::test_zero	(	uint16x8	a,
		uint16x8	mask
	)

template<class = void>

bool simdpp::sse::test_zero	(	uint32x4	a,
		uint32x4	mask
	)

template<class = void>

bool simdpp::sse::test_zero	(	uint64x2	a,
		uint64x2	mask
	)

uint32x4 simdpp::sse::to_int32x4 ( int8x16 a)

inline

Sign extends the values of a signed int8x16 vector to 32-bits.

r0 = (int32_t) a0
...
r3 = (int32_t) a3

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 4 instructions.

uint32x4 simdpp::sse::to_int32x4 ( uint8x16 a)

inline

Extends the values of an unsigned int8x16 vector to 32-bits.

r0 = (uint32_t) a0
...
r3 = (uint32_t) a3

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 3 instructions.

uint32x4 simdpp::sse::to_int32x4 ( float64x2 a)

inline

Converts the values of a float64x2 vector into int32_t representation using truncation.

If the value can not be represented by int32_t, 0x80000000 is returned.

r0 = (int32_t) a0
r1 = (int32_t) a1
r2 = 0
r3 = 0

uint32x4 simdpp::sse::to_int32x4_r ( float32x4 a)

inline

Converts the values of a float32x4 vector into signed int32_t representation.

If the value can not be represented by int32_t, 0x80000000 is returned. If only inexact conversion can be performed, the current rounding mode is used.

r0 = (int32_t) a0
r1 = (int32_t) a1
r2 = (int32_t) a2
r3 = (int32_t) a3

uint32x4 simdpp::sse::to_int32x4_r ( float64x2 a)

inline

Converts the values of a float64x2 vector into int32_t representation.

If the value can not be represented by int32_t, 0x80000000 is returned. If only inexact conversion can be performed, it is rounded according to the current rounding mode.

r0 = (int32_t) a0
r1 = (int32_t) a1
r2 = 0
r3 = 0

uint64x2 simdpp::sse::to_int64x2 ( int8x16 a)

inline

Sign extends the values of a signed int8x16 vector to 64-bits.

r0 = (int64_t) a0

r1 = (int64_t) a1

Not implemented for SSE2, SSE3 and SSSE3.

uint64x2 simdpp::sse::to_int64x2 ( int16x8 a)

inline

Sign extends the values of a signed int16x8 vector to 64-bits.

r0 = (int64_t) a0

r1 = (int64_t) a1

Not implemented for SSE2, SSE3 and SSSE3.

uint64x2 simdpp::sse::to_int64x2 ( uint8x16 a)

inline

Extends the values of an unsigned int8x16 vector to 64-bits.

r0 = (uint64_t) a0

r1 = (uint64_t) a1

In SSE2, SSE3 and SSSE3 this intrinsic results in at least 4 instructions.

Functions

Function Documentation