libsimdpp  1.0
simdpp::sse Namespace Reference

Functions

void prefetch_l1 (const void *ptr)
 Prefetches data to level 1 cache. More...
 
void prefetch_l2 (const void *ptr)
 Prefetches data to level 2 cache. More...
 
void prefetch_l3 (const void *ptr)
 Prefetches data to level 3 cache. More...
 
void prefetch_nt (const void *ptr)
 Prefetches data to a non-temporal buffer to be read once. More...
 
template<class = void>
bool test_zero (uint8x16 a)
 Tests if no bits are set in a 128-bit integer vector. More...
 
template<class = void>
bool test_zero (uint16x8 a)
 
template<class = void>
bool test_zero (uint32x4 a)
 
template<class = void>
bool test_zero (uint64x2 a)
 
template<class = void>
bool test_zero (uint8x16 a, uint8x16 mask)
 Tests if 128-bit integer a consists only of zeros when the mask mask is applied. More...
 
template<class = void>
bool test_zero (uint16x8 a, uint16x8 mask)
 
template<class = void>
bool test_zero (uint32x4 a, uint32x4 mask)
 
template<class = void>
bool test_zero (uint64x2 a, uint64x2 mask)
 
template<class = void>
bool test_ones (uint8x16 a)
 Tests if all bits are set in a 128-bit integer. More...
 
template<class = void>
bool test_ones (uint16x8 a)
 
template<class = void>
bool test_ones (uint32x4 a)
 
template<class = void>
bool test_ones (uint64x2 a)
 
template<class = void>
bool test_ones (uint8x16 a, uint8x16 mask)
 Tests if 128-bit integer consists only of ones when a mask is applied. More...
 
template<class = void>
bool test_ones (uint16x8 a, uint16x8 mask)
 
template<class = void>
bool test_ones (uint32x4 a, uint32x4 mask)
 
template<class = void>
bool test_ones (uint64x2 a, uint64x2 mask)
 
uint32x4 to_int32x4 (int8x16 a)
 Sign extends the values of a signed int8x16 vector to 32-bits. More...
 
uint32x4 to_int32x4 (uint8x16 a)
 Extends the values of an unsigned int8x16 vector to 32-bits. More...
 
uint64x2 to_int64x2 (int8x16 a)
 Sign extends the values of a signed int8x16 vector to 64-bits. More...
 
uint64x2 to_int64x2 (int16x8 a)
 Sign extends the values of a signed int16x8 vector to 64-bits. More...
 
uint64x2 to_int64x2 (uint8x16 a)
 Extends the values of an unsigned int8x16 vector to 64-bits. More...
 
uint32x4 to_int32x4_r (float32x4 a)
 Converts the values of a float32x4 vector into signed int32_t representation. More...
 
uint32x4 to_int32x4 (float64x2 a)
 Converts the values of a float64x2 vector into int32_t representation using truncation. More...
 
uint32x4 to_int32x4_r (float64x2 a)
 Converts the values of a float64x2 vector into int32_t representation. More...
 
float32x4 hadd2 (float32x4 a, float32x4 b)
 Adds the values in adjacent pairs of two float32x4 vectors. More...
 
float32x8 hadd2 (float32x8 a, float32x8 b)
 Adds the values in adjacent pairs of two float32x8 vectors. More...
 
float64x2 hadd2 (float64x2 a, float64x2 b)
 Adds the values in adjacent pairs of two float64x2 vectors. More...
 
float32x4 hadd4 (float32x4 a)
 Sums the values of a float32x4 vector. More...
 
float32x4 hadd4 (float32x4 a, float32x4 b, float32x4 c, float32x4 d)
 Sums the values within each of four float32x4 vectors. More...
 
float32x4 hsub2 (float32x4 a, float32x4 b)
 Subtracts the values in adjacent pairs of two float32x4 vectors. More...
 
float32x8 hsub2 (float32x8 a, float32x8 b)
 Subtracts the values in adjacent pairs of two float32x8 vectors. More...
 
float64x2 hsub2 (float64x2 a, float64x2 b)
 Subtracts the values in adjacent pairs of two float64x2 vectors. More...
 
float32x4 sub_add (float32x4 a, float32x4 b)
 Adds or subtracts the values of two float32x4 vectors. More...
 
float64x2 sub_add (float64x2 a, float64x2 b)
 Adds or subtracts the values of two float64x2 vectors. More...
 
int8x16 copysign (int8x16 a, int8x16 b)
 Copies sign from the values of one int8x16 vector to another. More...
 
int16x8 copysign (int16x8 a, int16x8 b)
 Copies sign from the values of one int16x8 vector to another. More...
 
int32x4 copysign (int32x4 a, int32x4 b)
 Copies sign from the values of one int32x4 vector to another. More...
 
uint16x8 hadd2 (uint16x8 a, uint16x8 b)
 Adds values in adjacent pairs of two int16x8 vectors. More...
 
uint32x4 hadd2 (uint32x4 a, uint32x4 b)
 Adds values in adjacent pairs of two int32x4 vectors. More...
 
uint64x2 hadd2 (uint64x2 a, uint64x2 b)
 Adds values in adjacent pairs of two int64x2 vectors. More...
 
int16x8 hadds2 (int16x8 a, int16x8 b)
 Adds and saturates values in adjacent pairs of two signed int16x8 vectors. More...
 
uint32x4 hadd4 (uint32x4 a, uint32x4 b, uint32x4 c, uint32x4 d)
 Sums the values within each of four int32x4 vectors. More...
 
uint16x8 hsub2 (uint16x8 a, uint16x8 b)
 Subtracts values in adjacent pairs of two int16x8 vectors. More...
 
uint32x4 hsub2 (uint32x4 a, uint32x4 b)
 Subtracts values in adjacent pairs of two int32x4 vectors. More...
 
uint64x2 hsub2 (uint64x2 a, uint64x2 b)
 Subtracts values in adjacent pairs of two int64x2 vectors. More...
 
int16x8 hsubs2 (int16x8 a, int16x8 b)
 Subtracts and saturates values in adjacent pairs of two signed int16x8 vectors. More...
 
void store_masked (void *p, uint8x16 a, uint8x16 mask)
 Stores bytes in a 128-bit integer vector according to a mask. More...
 
void store_masked (void *p, uint16x8 a, uint16x8 mask)
 
void store_masked (void *p, uint32x4 a, uint32x4 mask)
 
void store_masked (void *p, uint64x2 a, uint64x2 mask)
 
uint8x16 extract_lo (uint8x32 a)
 Extracts the lower half of a 256-bit vector. More...
 
uint16x8 extract_lo (uint16x16 a)
 Extracts the lower half of a 256-bit vector. More...
 
uint32x4 extract_lo (uint32x8 a)
 Extracts the lower half of a 256-bit vector. More...
 
uint64x2 extract_lo (uint64x4 a)
 Extracts the lower half of a 256-bit vector. More...
 
float32x4 extract_lo (float32x8 a)
 Extracts the lower half of a 256-bit vector. More...
 
float64x2 extract_lo (float64x4 a)
 Extracts the lower half of a 256-bit vector. More...
 
uint8x16 extract_hi (uint8x32 a)
 Extracts the higher half of a 256-bit vector. More...
 
uint16x8 extract_hi (uint16x16 a)
 Extracts the higher half of a 256-bit vector. More...
 
uint32x4 extract_hi (uint32x8 a)
 Extracts the higher half of a 256-bit vector. More...
 
uint64x2 extract_hi (uint64x4 a)
 Extracts the higher half of a 256-bit vector. More...
 
float32x4 extract_hi (float32x8 a)
 Extracts the higher half of a 256-bit vector. More...
 
float64x2 extract_hi (float64x4 a)
 Extracts the higher half of a 256-bit vector. More...
 
template<unsigned P, unsigned N>
void load_lane (uint8x16 &a, const void *p)
 Loads the first N elements of a 128-bit vector from memory. More...
 
template<unsigned P, unsigned N>
void load_lane (uint16x8 &a, const void *p)
 Loads the first N elements of a 128-bit vector from memory. More...
 
template<unsigned P, unsigned N>
void load_lane (uint32x4 &a, const void *p)
 Loads the first N elements of a 128-bit vector from memory. More...
 
template<unsigned P, unsigned N>
void load_lane (uint64x2 &a, const void *p)
 Loads the first N elements of a 128-bit vector from memory. More...
 
template<unsigned P, unsigned N>
void load_lane (float32x4 &a, const void *p)
 Loads the first N elements of a 128-bit vector from memory. More...
 
template<unsigned P, unsigned N>
void load_lane (float64x2 &a, const void *p)
 Loads the first N elements of a 128-bit vector from memory. More...
 
template<unsigned P, unsigned N>
void store_lane (void *p, uint8x16 a)
 Stores the first N elements of a 128-bit vector to memory. More...
 
template<unsigned P, unsigned N>
void store_lane (void *p, uint16x8 a)
 Stores the first N elements of a 128-bit vector to memory. More...
 
template<unsigned P, unsigned N>
void store_lane (void *p, uint32x4 a)
 Stores the first N elements of a 128-bit vector to memory. More...
 
template<unsigned P, unsigned N>
void store_lane (void *p, uint64x2 a)
 Stores the first N elements of a 128-bit vector to memory. More...
 
template<unsigned P, unsigned N>
void store_lane (void *p, float32x4 a)
 Stores the first N elements of a 128-bit vector to memory. More...
 
template<unsigned P, unsigned N>
void store_lane (void *p, float64x2 a)
 Stores the first N elements of a 128-bit vector to memory. More...
 
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
uint16x8 permute_lo (uint16x8 a)
 Permutes the first 4 16-bit values of each set of 8 consecutive values. More...
 
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
uint16x16 permute_lo (uint16x16 a)
 Permutes the first 4 16-bit values of each set of 8 consecutive values. More...
 
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
uint16x8 permute_hi (uint16x8 a)
 Permutes the last 4 16-bit values of each set of 8 consecutive values. More...
 
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
uint16x16 permute_hi (uint16x16 a)
 Permutes the last 4 16-bit values of each set of 8 consecutive values. More...
 

Function Documentation

int8x16 simdpp::sse::copysign ( int8x16  a,
int8x16  b 
)
inline

Copies sign from the values of one int8x16 vector to another.

r0 = (b0 > 0) ? a0 : ((b0 == 0) ? 0 : -a0)
...
r15 = (b15 > 0) ? a15 : ((b15 == 0) ? 0 : -a15)
  • Not implemented for SSE2 and SSE3.
int16x8 simdpp::sse::copysign ( int16x8  a,
int16x8  b 
)
inline

Copies sign from the values of one int16x8 vector to another.

r0 = (b0 > 0) ? a0 : ((b0 == 0) ? 0 : -a0)
...
r7 = (b7 > 0) ? a7 : ((b7 == 0) ? 0 : -a7)
  • Not implemented for SSE2 and SSE3.
int32x4 simdpp::sse::copysign ( int32x4  a,
int32x4  b 
)
inline

Copies sign from the values of one int32x4 vector to another.

r0 = (b0 > 0) ? a0 : ((b0 == 0) ? 0 : -a0)
r1 = (b1 > 0) ? a1 : ((b1 == 0) ? 0 : -a1)
r2 = (b2 > 0) ? a2 : ((b2 == 0) ? 0 : -a2)
r3 = (b3 > 0) ? a3 : ((b3 == 0) ? 0 : -a3)
  • Not implemented for SSE2 and SSE3.
uint8x16 simdpp::sse::extract_hi ( uint8x32  a)
inline

Extracts the higher half of a 256-bit vector.

  • This intrinsic results in at least 0 instructions.
uint16x8 simdpp::sse::extract_hi ( uint16x16  a)
inline

Extracts the higher half of a 256-bit vector.

  • This intrinsic results in at least 0 instructions.
uint32x4 simdpp::sse::extract_hi ( uint32x8  a)
inline

Extracts the higher half of a 256-bit vector.

  • This intrinsic results in at least 0 instructions.
uint64x2 simdpp::sse::extract_hi ( uint64x4  a)
inline

Extracts the higher half of a 256-bit vector.

  • This intrinsic results in at least 0 instructions.
float32x4 simdpp::sse::extract_hi ( float32x8  a)
inline

Extracts the higher half of a 256-bit vector.

  • This intrinsic results in at least 0 instructions.
float64x2 simdpp::sse::extract_hi ( float64x4  a)
inline

Extracts the higher half of a 256-bit vector.

  • This intrinsic results in at least 0 instructions.
uint8x16 simdpp::sse::extract_lo ( uint8x32  a)
inline

Extracts the lower half of a 256-bit vector.

  • This intrinsic results in at least 0 instructions.
uint16x8 simdpp::sse::extract_lo ( uint16x16  a)
inline

Extracts the lower half of a 256-bit vector.

  • This intrinsic results in at least 0 instructions.
uint32x4 simdpp::sse::extract_lo ( uint32x8  a)
inline

Extracts the lower half of a 256-bit vector.

  • This intrinsic results in at least 0 instructions.
uint64x2 simdpp::sse::extract_lo ( uint64x4  a)
inline

Extracts the lower half of a 256-bit vector.

  • This intrinsic results in at least 0 instructions.
float32x4 simdpp::sse::extract_lo ( float32x8  a)
inline

Extracts the lower half of a 256-bit vector.

  • This intrinsic results in at least 0 instructions.
float64x2 simdpp::sse::extract_lo ( float64x4  a)
inline

Extracts the lower half of a 256-bit vector.

  • This intrinsic results in at least 0 instructions.
float32x4 simdpp::sse::hadd2 ( float32x4  a,
float32x4  b 
)
inline

Adds the values in adjacent pairs of two float32x4 vectors.

r0 = a0 + a1
r1 = a2 + a3
r2 = b0 + b1
r3 = b2 + b3
  • Not implemented for SSE2.
float32x8 simdpp::sse::hadd2 ( float32x8  a,
float32x8  b 
)
inline

Adds the values in adjacent pairs of two float32x8 vectors.

r0 = a0 + a1
r1 = a2 + a3
r2 = b0 + b1
r3 = b2 + b3
r4 = a4 + a5
r5 = a6 + a7
r6 = b4 + b5
r7 = b6 + b7
  • Not implemented for SSE2 and SSE3.
float64x2 simdpp::sse::hadd2 ( float64x2  a,
float64x2  b 
)
inline

Adds the values in adjacent pairs of two float64x2 vectors.

r0 = a0 + a1
r1 = b0 + b1
  • Not implemented for SSE2.
uint16x8 simdpp::sse::hadd2 ( uint16x8  a,
uint16x8  b 
)
inline

Adds values in adjacent pairs of two int16x8 vectors.

r0 = a0 + a1
...
r3 = a6 + a7
r4 = b0 + b1
...
r7 = b6 + b7
  • Not implemented for SSE2 and SSE3.
uint32x4 simdpp::sse::hadd2 ( uint32x4  a,
uint32x4  b 
)
inline

Adds values in adjacent pairs of two int32x4 vectors.

r0 = a0 + a1
r1 = a2 + a3
r2 = b0 + b1
r3 = b2 + b3
  • Not implemented for SSE2 and SSE3.
uint64x2 simdpp::sse::hadd2 ( uint64x2  a,
uint64x2  b 
)
inline

Adds values in adjacent pairs of two int64x2 vectors.

r0 = a0 + a1
r1 = b0 + b1
  • This intrinsic results in at least 3 instructions.
float32x4 simdpp::sse::hadd4 ( float32x4  a)
inline

Sums the values of a float32x4 vector.

r0 = a0 + a1 + a2 + a3
r1 = 0.0f
r2 = 0.0f
r3 = 0.0f
  • Not implemented for SSE2.
float32x4 simdpp::sse::hadd4 ( float32x4  a,
float32x4  b,
float32x4  c,
float32x4  d 
)
inline

Sums the values within each of four float32x4 vectors.

r0 = a0 + a1 + a2 + a3
r1 = b0 + b1 + b2 + b3
r2 = c0 + c1 + c2 + c3
r3 = d0 + d1 + d2 + d3
  • In SSE3, SSSE3 and SSE4.1 this intrinsic results in at least 3 instructions.
  • Not implemented for SSE2.
uint32x4 simdpp::sse::hadd4 ( uint32x4  a,
uint32x4  b,
uint32x4  c,
uint32x4  d 
)
inline

Sums the values within each of four int32x4 vectors.

r0 = a0 + a1 + a2 + a3
r1 = b0 + b1 + b2 + b3
r2 = c0 + c1 + c2 + c3
r3 = d0 + d1 + d2 + d3
  • Not implemented for SSE2 and SSE3.
  • This intrinsic results in at least 3 instructions.
int16x8 simdpp::sse::hadds2 ( int16x8  a,
int16x8  b 
)
inline

Adds and saturates values in adjacent pairs of two signed int16x8 vectors.

r0 = signed_saturate(a0 + a1)
...
r3 = signed_saturate(a6 + a7)
r4 = signed_saturate(b0 + b1)
...
r7 = signed_saturate(b6 + b7)
  • Not implemented for SSE2 and SSE3.
float32x4 simdpp::sse::hsub2 ( float32x4  a,
float32x4  b 
)
inline

Subtracts the values in adjacent pairs of two float32x4 vectors.

r0 = a0 - a1
r1 = a2 - a3
r2 = b0 - b1
r3 = b2 - b3
  • Not implemented for SSE2.
float32x8 simdpp::sse::hsub2 ( float32x8  a,
float32x8  b 
)
inline

Subtracts the values in adjacent pairs of two float32x8 vectors.

r0 = a0 - a1
r1 = a2 - a3
r2 = b0 - b1
r3 = b2 - b3
r4 = a4 - a5
r5 = a6 - a7
r6 = b4 - b5
r7 = b6 - b7
  • Not implemented for SSE2 and SSE3.
float64x2 simdpp::sse::hsub2 ( float64x2  a,
float64x2  b 
)
inline

Subtracts the values in adjacent pairs of two float64x2 vectors.

r0 = a0 - a1
r1 = b0 - b1
  • Not implemented for SSE2.
uint16x8 simdpp::sse::hsub2 ( uint16x8  a,
uint16x8  b 
)
inline

Subtracts values in adjacent pairs of two int16x8 vectors.

r0 = a0 - a1
...
r3 = a6 - a7
r4 = b0 - b1
...
r7 = b6 - b7
  • Not implemented for SSE2 and SSE3.
uint32x4 simdpp::sse::hsub2 ( uint32x4  a,
uint32x4  b 
)
inline

Subtracts values in adjacent pairs of two int32x4 vectors.

r0 = a0 - a1
r1 = a2 - a3
r2 = b0 - b1
r3 = b2 - b3
  • Not implemented for SSE2 and SSE3.
uint64x2 simdpp::sse::hsub2 ( uint64x2  a,
uint64x2  b 
)
inline

Subtracts values in adjacent pairs of two int64x2 vectors.

r0 = a0 - a1
r1 = b0 - b1
  • This intrinsic results in at least 3 instructions.
int16x8 simdpp::sse::hsubs2 ( int16x8  a,
int16x8  b 
)
inline

Subtracts and saturates values in adjacent pairs of two signed int16x8 vectors.

r0 = signed_saturate(a0 - a1)
...
r3 = signed_saturate(a6 - a7)
r4 = signed_saturate(b0 - b1)
...
r7 = signed_saturate(b6 - b7)
  • Not implemented for SSE2 and SSE3.
template<unsigned P, unsigned N>
void simdpp::sse::load_lane ( uint8x16 &  a,
const void *  p 
)

Loads the first N elements of a 128-bit vector from memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.

template<unsigned P, unsigned N>
void simdpp::sse::load_lane ( uint16x8 &  a,
const void *  p 
)

Loads the first N elements of a 128-bit vector from memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.

template<unsigned P, unsigned N>
void simdpp::sse::load_lane ( uint32x4 &  a,
const void *  p 
)

Loads the first N elements of a 128-bit vector from memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.

template<unsigned P, unsigned N>
void simdpp::sse::load_lane ( uint64x2 &  a,
const void *  p 
)

Loads the first N elements of a 128-bit vector from memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.

template<unsigned P, unsigned N>
void simdpp::sse::load_lane ( float32x4 &  a,
const void *  p 
)

Loads the first N elements of a 128-bit vector from memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.

template<unsigned P, unsigned N>
void simdpp::sse::load_lane ( float64x2 &  a,
const void *  p 
)

Loads the first N elements of a 128-bit vector from memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
uint16x8 simdpp::sse::permute_hi ( uint16x8  a)

Permutes the last 4 16-bit values of each set of 8 consecutive values.

The selector values s0, s1, s2 and s3 must be in range [0; 3].

r0 = a0
...
r3 = a3
r4 = a[s0+4]
...
r7 = a[s3+4]
256-bit version:
r8 = a8
...
r11 = a11
r12 = a[s0+12]
...
r15 = a[s3+12]
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
uint16x16 simdpp::sse::permute_hi ( uint16x16  a)

Permutes the last 4 16-bit values of each set of 8 consecutive values.

The selector values s0, s1, s2 and s3 must be in range [0; 3].

r0 = a0
...
r3 = a3
r4 = a[s0+4]
...
r7 = a[s3+4]
256-bit version:
r8 = a8
...
r11 = a11
r12 = a[s0+12]
...
r15 = a[s3+12]
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
uint16x8 simdpp::sse::permute_lo ( uint16x8  a)

Permutes the first 4 16-bit values of each set of 8 consecutive values.

The selector values s0, s1, s2 and s3 must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
r4 = a4
...
r7 = a7
256-bit version:
r8 = a[s0+8]
...
r11 = a[s3+8]
r12 = a12
...
r15 = a15
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
uint16x16 simdpp::sse::permute_lo ( uint16x16  a)

Permutes the first 4 16-bit values of each set of 8 consecutive values.

The selector values s0, s1, s2 and s3 must be in range [0; 3].

r0 = a[s0]
...
r3 = a[s3]
r4 = a4
...
r7 = a7
256-bit version:
r8 = a[s0+8]
...
r11 = a[s3+8]
r12 = a12
...
r15 = a15
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 2 instructions.
template<unsigned P, unsigned N>
void simdpp::sse::store_lane ( void *  p,
uint8x16  a 
)

Stores the first N elements of a 128-bit vector to memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

template<unsigned P, unsigned N>
void simdpp::sse::store_lane ( void *  p,
uint16x8  a 
)

Stores the first N elements of a 128-bit vector to memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

template<unsigned P, unsigned N>
void simdpp::sse::store_lane ( void *  p,
uint32x4  a 
)

Stores the first N elements of a 128-bit vector to memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

template<unsigned P, unsigned N>
void simdpp::sse::store_lane ( void *  p,
uint64x2  a 
)

Stores the first N elements of a 128-bit vector to memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

template<unsigned P, unsigned N>
void simdpp::sse::store_lane ( void *  p,
float32x4  a 
)

Stores the first N elements of a 128-bit vector to memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

template<unsigned P, unsigned N>
void simdpp::sse::store_lane ( void *  p,
float64x2  a 
)

Stores the first N elements of a 128-bit vector to memory.

N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.

void simdpp::sse::store_masked ( void *  p,
uint8x16  a,
uint8x16  mask 
)
inline

Stores bytes in a 128-bit integer vector according to a mask.

The highest bit in the corresponding byte in the mask defines whether the byte will be saved. p does not need to be aligned to 16 bytes.

void simdpp::sse::store_masked ( void *  p,
uint16x8  a,
uint16x8  mask 
)
inline
void simdpp::sse::store_masked ( void *  p,
uint32x4  a,
uint32x4  mask 
)
inline
void simdpp::sse::store_masked ( void *  p,
uint64x2  a,
uint64x2  mask 
)
inline
float32x4 simdpp::sse::sub_add ( float32x4  a,
float32x4  b 
)
inline

Adds or subtracts the values of two float32x4 vectors.

r0 = a0 - b0
r1 = a1 + b1
r2 = a2 - b2
r3 = a3 + b3
  • Not implemented for SSE2.
float64x2 simdpp::sse::sub_add ( float64x2  a,
float64x2  b 
)
inline

Adds or subtracts the values of two float64x2 vectors.

r0 = a0 - b0
r1 = a1 + b1
  • Not implemented for SSE2.
template<class = void>
bool simdpp::sse::test_ones ( uint8x16  a)

Tests if all bits are set in a 128-bit integer.

Returns true if a has all bits set, false otherwise.

  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 3 instructions.
  • In SSE4.1 this intrinsic results in at least 2 instructions.
template<class = void>
bool simdpp::sse::test_ones ( uint16x8  a)
template<class = void>
bool simdpp::sse::test_ones ( uint32x4  a)
template<class = void>
bool simdpp::sse::test_ones ( uint64x2  a)
template<class = void>
bool simdpp::sse::test_ones ( uint8x16  a,
uint8x16  mask 
)

Tests if 128-bit integer consists only of ones when a mask is applied.

Returns true if a & mask has all mask bits set, false otherwise.

  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 4 instructions.
  • In SSE4.1 this intrinsic results in at least 1 instruction.
template<class = void>
bool simdpp::sse::test_ones ( uint16x8  a,
uint16x8  mask 
)
template<class = void>
bool simdpp::sse::test_ones ( uint32x4  a,
uint32x4  mask 
)
template<class = void>
bool simdpp::sse::test_ones ( uint64x2  a,
uint64x2  mask 
)
template<class = void>
bool simdpp::sse::test_zero ( uint8x16  a)

Tests if no bits are set in a 128-bit integer vector.

Returns true if a has all bits unset, false otherwise.

  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 3 instructions.
  • In SSE4.1 this intrinsic results in at least 2 instructions.
template<class = void>
bool simdpp::sse::test_zero ( uint16x8  a)
template<class = void>
bool simdpp::sse::test_zero ( uint32x4  a)
template<class = void>
bool simdpp::sse::test_zero ( uint64x2  a)
template<class = void>
bool simdpp::sse::test_zero ( uint8x16  a,
uint8x16  mask 
)

Tests if 128-bit integer a consists only of zeros when the mask mask is applied.

Returns true if a & mask has all bits unset, false otherwise.

  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 4 instructions.
  • In SSE4.1 this intrinsic results in at least 1 instruction.
template<class = void>
bool simdpp::sse::test_zero ( uint16x8  a,
uint16x8  mask 
)
template<class = void>
bool simdpp::sse::test_zero ( uint32x4  a,
uint32x4  mask 
)
template<class = void>
bool simdpp::sse::test_zero ( uint64x2  a,
uint64x2  mask 
)
uint32x4 simdpp::sse::to_int32x4 ( int8x16  a)
inline

Sign extends the values of a signed int8x16 vector to 32-bits.

r0 = (int32_t) a0
...
r3 = (int32_t) a3
  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 4 instructions.
uint32x4 simdpp::sse::to_int32x4 ( uint8x16  a)
inline

Extends the values of an unsigned int8x16 vector to 32-bits.

r0 = (uint32_t) a0
...
r3 = (uint32_t) a3
  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 3 instructions.
uint32x4 simdpp::sse::to_int32x4 ( float64x2  a)
inline

Converts the values of a float64x2 vector into int32_t representation using truncation.

If the value cannot be represented by int32_t, 0x80000000 is returned.

r0 = (int32_t) a0
r1 = (int32_t) a1
r2 = 0
r3 = 0
uint32x4 simdpp::sse::to_int32x4_r ( float32x4  a)
inline

Converts the values of a float32x4 vector into signed int32_t representation.

If the value cannot be represented by int32_t, 0x80000000 is returned. If only inexact conversion can be performed, the current rounding mode is used.

r0 = (int32_t) a0
r1 = (int32_t) a1
r2 = (int32_t) a2
r3 = (int32_t) a3
uint32x4 simdpp::sse::to_int32x4_r ( float64x2  a)
inline

Converts the values of a float64x2 vector into int32_t representation.

If the value cannot be represented by int32_t, 0x80000000 is returned. If only inexact conversion can be performed, it is rounded according to the current rounding mode.

r0 = (int32_t) a0
r1 = (int32_t) a1
r2 = 0
r3 = 0
uint64x2 simdpp::sse::to_int64x2 ( int8x16  a)
inline

Sign extends the values of a signed int8x16 vector to 64-bits.

r0 = (int64_t) a0
r1 = (int64_t) a1
  • Not implemented for SSE2, SSE3 and SSSE3.
uint64x2 simdpp::sse::to_int64x2 ( int16x8  a)
inline

Sign extends the values of a signed int16x8 vector to 64-bits.

r0 = (int64_t) a0
r1 = (int64_t) a1
  • Not implemented for SSE2, SSE3 and SSSE3.
uint64x2 simdpp::sse::to_int64x2 ( uint8x16  a)
inline

Extends the values of an unsigned int8x16 vector to 64-bits.

r0 = (uint64_t) a0
r1 = (uint64_t) a1
  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 4 instructions.