libsimdpp
1.0
|
Functions | |
void | prefetch_l1 (const void *ptr) |
Prefetches data to level 1 cache. More... | |
void | prefetch_l2 (const void *ptr) |
Prefetches data to level 2 cache. More... | |
void | prefetch_l3 (const void *ptr) |
Prefetches data to level 3 cache. More... | |
void | prefetch_nt (const void *ptr) |
Prefetches data to a non-temporal buffer to be read once. More... | |
template<class = void> | |
bool | test_zero (uint8x16 a) |
Tests that no bits are set in a 128-bit integer vector. More... | |
template<class = void> | |
bool | test_zero (uint16x8 a) |
template<class = void> | |
bool | test_zero (uint32x4 a) |
template<class = void> | |
bool | test_zero (uint64x2 a) |
template<class = void> | |
bool | test_zero (uint8x16 a, uint8x16 mask) |
Tests if 128-bit integer a consists only of zeros when a mask mask is applied. More... | |
template<class = void> | |
bool | test_zero (uint16x8 a, uint16x8 mask) |
template<class = void> | |
bool | test_zero (uint32x4 a, uint32x4 mask) |
template<class = void> | |
bool | test_zero (uint64x2 a, uint64x2 mask) |
template<class = void> | |
bool | test_ones (uint8x16 a) |
Tests if all bits are set in a 128-bit integer. More... | |
template<class = void> | |
bool | test_ones (uint16x8 a) |
template<class = void> | |
bool | test_ones (uint32x4 a) |
template<class = void> | |
bool | test_ones (uint64x2 a) |
template<class = void> | |
bool | test_ones (uint8x16 a, uint8x16 mask) |
Tests if 128-bit integer consists only of ones when a mask is applied. More... | |
template<class = void> | |
bool | test_ones (uint16x8 a, uint16x8 mask) |
template<class = void> | |
bool | test_ones (uint32x4 a, uint32x4 mask) |
template<class = void> | |
bool | test_ones (uint64x2 a, uint64x2 mask) |
uint32x4 | to_int32x4 (int8x16 a) |
Sign extends the values of a signed int8x16 vector to 32-bits. More... | |
uint32x4 | to_int32x4 (uint8x16 a) |
Extends the values of an unsigned int8x16 vector to 32-bits. More... | |
uint64x2 | to_int64x2 (int8x16 a) |
Sign extends the values of a signed int8x16 vector to 64-bits. More... | |
uint64x2 | to_int64x2 (int16x8 a) |
Sign extends the values of a signed int16x8 vector to 64-bits. More... | |
uint64x2 | to_int64x2 (uint8x16 a) |
Extends the values of an unsigned int8x16 vector to 64-bits. More... | |
uint32x4 | to_int32x4_r (float32x4 a) |
Converts the values of a float32x4 vector into signed int32_t representation. More... | |
uint32x4 | to_int32x4 (float64x2 a) |
Converts the values of a float64x2 vector into int32_t representation using truncation. More... | |
uint32x4 | to_int32x4_r (float64x2 a) |
Converts the values of a float64x2 vector into int32_t representation. More... | |
float32x4 | hadd2 (float32x4 a, float32x4 b) |
Adds the values in adjacent pairs of two float32x4 vectors. More... | |
float32x8 | hadd2 (float32x8 a, float32x8 b) |
Adds the values in adjacent pairs of two float32x8 vectors. More... | |
float64x2 | hadd2 (float64x2 a, float64x2 b) |
Adds the values in adjacent pairs of two float64x2 vectors. More... | |
float32x4 | hadd4 (float32x4 a) |
Sums the values of a float32x4 vector. More... | |
float32x4 | hadd4 (float32x4 a, float32x4 b, float32x4 c, float32x4 d) |
Sums the values within each of four float32x4 vectors. More... | |
float32x4 | hsub2 (float32x4 a, float32x4 b) |
Subtracts the values in adjacent pairs of two float32x4 vectors. More... | |
float32x8 | hsub2 (float32x8 a, float32x8 b) |
Subtracts the values in adjacent pairs of two float32x8 vectors. More... | |
float64x2 | hsub2 (float64x2 a, float64x2 b) |
Subtracts the values in adjacent pairs of two float64x2 vectors. More... | |
float32x4 | sub_add (float32x4 a, float32x4 b) |
Adds or subtracts the values of two float32x4 vectors. More... | |
float64x2 | sub_add (float64x2 a, float64x2 b) |
Adds or subtracts the values of two float64x2 vectors. More... | |
int8x16 | copysign (int8x16 a, int8x16 b) |
Copies sign from the values of one int8x16 vector to another. More... | |
int16x8 | copysign (int16x8 a, int16x8 b) |
Copies sign from the values of one int16x8 vector to another. More... | |
int32x4 | copysign (int32x4 a, int32x4 b) |
Copies sign from the values of one int32x4 vector to another. More... | |
uint16x8 | hadd2 (uint16x8 a, uint16x8 b) |
Adds values in adjacent pairs of two int16x8 vectors. More... | |
uint32x4 | hadd2 (uint32x4 a, uint32x4 b) |
Adds values in adjacent pairs of two int32x4 vectors. More... | |
uint64x2 | hadd2 (uint64x2 a, uint64x2 b) |
Adds values in adjacent pairs of two int64x2 vectors. More... | |
int16x8 | hadds2 (int16x8 a, int16x8 b) |
Adds and saturates values in adjacent pairs of two signed int16x8 vectors. More... | |
uint32x4 | hadd4 (uint32x4 a, uint32x4 b, uint32x4 c, uint32x4 d) |
Sums the values within each of four int32x4 vectors. More... | |
uint16x8 | hsub2 (uint16x8 a, uint16x8 b) |
Subtracts values in adjacent pairs of two int16x8 vectors. More... | |
uint32x4 | hsub2 (uint32x4 a, uint32x4 b) |
Subtracts values in adjacent pairs of two int32x4 vectors. More... | |
uint64x2 | hsub2 (uint64x2 a, uint64x2 b) |
Subtracts values in adjacent pairs of two int64x2 vectors. More... | |
int16x8 | hsubs2 (int16x8 a, int16x8 b) |
Subtracts and saturates values in adjacent pairs of two signed int16x8 vectors. More... | |
void | store_masked (void *p, uint8x16 a, uint8x16 mask) |
Stores bytes in a 128-bit integer vector according to a mask. More... | |
void | store_masked (void *p, uint16x8 a, uint16x8 mask) |
void | store_masked (void *p, uint32x4 a, uint32x4 mask) |
void | store_masked (void *p, uint64x2 a, uint64x2 mask) |
uint8x16 | extract_lo (uint8x32 a) |
Extracts the lower half of a 256-bit vector. More... | |
uint16x8 | extract_lo (uint16x16 a) |
Extracts the lower half of a 256-bit vector. More... | |
uint32x4 | extract_lo (uint32x8 a) |
Extracts the lower half of a 256-bit vector. More... | |
uint64x2 | extract_lo (uint64x4 a) |
Extracts the lower half of a 256-bit vector. More... | |
float32x4 | extract_lo (float32x8 a) |
Extracts the lower half of a 256-bit vector. More... | |
float64x2 | extract_lo (float64x4 a) |
Extracts the lower half of a 256-bit vector. More... | |
uint8x16 | extract_hi (uint8x32 a) |
Extracts the higher half of a 256-bit vector. More... | |
uint16x8 | extract_hi (uint16x16 a) |
Extracts the higher half of a 256-bit vector. More... | |
uint32x4 | extract_hi (uint32x8 a) |
Extracts the higher half of a 256-bit vector. More... | |
uint64x2 | extract_hi (uint64x4 a) |
Extracts the higher half of a 256-bit vector. More... | |
float32x4 | extract_hi (float32x8 a) |
Extracts the higher half of a 256-bit vector. More... | |
float64x2 | extract_hi (float64x4 a) |
Extracts the higher half of a 256-bit vector. More... | |
template<unsigned P, unsigned N> | |
void | load_lane (uint8x16 &a, const void *p) |
Loads the first N elements of a 128-bit vector from memory. More... | |
template<unsigned P, unsigned N> | |
void | load_lane (uint16x8 &a, const void *p) |
Loads the first N elements of a 128-bit vector from memory. More... | |
template<unsigned P, unsigned N> | |
void | load_lane (uint32x4 &a, const void *p) |
Loads the first N elements of a 128-bit vector from memory. More... | |
template<unsigned P, unsigned N> | |
void | load_lane (uint64x2 &a, const void *p) |
Loads the first N elements of a 128-bit vector from memory. More... | |
template<unsigned P, unsigned N> | |
void | load_lane (float32x4 &a, const void *p) |
Loads the first N elements of a 128-bit vector from memory. More... | |
template<unsigned P, unsigned N> | |
void | load_lane (float64x2 &a, const void *p) |
Loads the first N elements of a 128-bit vector from memory. More... | |
template<unsigned P, unsigned N> | |
void | store_lane (void *p, uint8x16 a) |
Stores the first N elements of a 128-bit vector to memory. More... | |
template<unsigned P, unsigned N> | |
void | store_lane (void *p, uint16x8 a) |
Stores the first N elements of a 128-bit vector to memory. More... | |
template<unsigned P, unsigned N> | |
void | store_lane (void *p, uint32x4 a) |
Stores the first N elements of a 128-bit vector to memory. More... | |
template<unsigned P, unsigned N> | |
void | store_lane (void *p, uint64x2 a) |
Stores the first N elements of a 128-bit vector to memory. More... | |
template<unsigned P, unsigned N> | |
void | store_lane (void *p, float32x4 a) |
Stores the first N elements of a 128-bit vector to memory. More... | |
template<unsigned P, unsigned N> | |
void | store_lane (void *p, float64x2 a) |
Stores the first N elements of a 128-bit vector to memory. More... | |
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> | |
uint16x8 | permute_lo (uint16x8 a) |
Permutes the first 4 16-bit values of each set of 8 consecutive values. More... | |
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> | |
uint16x16 | permute_lo (uint16x16 a) |
Permutes the first 4 16-bit values of each set of 8 consecutive values. More... | |
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> | |
uint16x8 | permute_hi (uint16x8 a) |
Permutes the last 4 16-bit values of each set of 8 consecutive values. More... | |
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> | |
uint16x16 | permute_hi (uint16x16 a) |
Permutes the last 4 16-bit values of each set of 8 consecutive values. More... | |
Function Documentation
|
inline |
Copies sign from the values of one int8x16 vector to another.
- Not implemented for SSE2 and SSE3.
|
inline |
Copies sign from the values of one int16x8 vector to another.
- Not implemented for SSE2 and SSE3.
|
inline |
Copies sign from the values of one int32x4 vector to another.
- Not implemented for SSE2 and SSE3.
|
inline |
Extracts the higher half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the higher half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the higher half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the higher half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the higher half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the higher half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the lower half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the lower half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the lower half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the lower half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the lower half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the lower half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Adds the values in adjacent pairs of two float32x4 vectors.
- Not implemented for SSE2.
|
inline |
Adds the values in adjacent pairs of two float32x8 vectors.
- Not implemented for SSE2 and SSE3.
|
inline |
Adds the values in adjacent pairs of two float64x2 vectors.
- Not implemented for SSE2.
|
inline |
Adds values in adjacent pairs of two int16x8 vectors.
- Not implemented for SSE2 and SSE3.
|
inline |
Adds values in adjacent pairs of two int32x4 vectors.
- Not implemented for SSE2 and SSE3.
|
inline |
Adds values in adjacent pairs of two int64x2 vectors.
- This intrinsic results in at least 3 instructions.
|
inline |
Sums the values of a float32x4 vector.
- Not implemented for SSE2.
|
inline |
Sums the values within each of four float32x4 vectors.
- In SSE3, SSSE3 and SSE4.1 this intrinsic results in at least 3 instructions.
- Not implemented for SSE2.
|
inline |
Sums the values within each of four int32x4 vectors.
- Not implemented for SSE2 and SSE3.
- This intrinsic results in at least 3 instructions.
|
inline |
Adds and saturates values in adjacent pairs of two signed int16x8 vectors.
- Not implemented for SSE2 and SSE3.
|
inline |
Subtracts the values in adjacent pairs of two float32x4 vectors.
- Not implemented for SSE2.
|
inline |
Subtracts the values in adjacent pairs of two float32x8 vectors.
- Not implemented for SSE2 and SSE3.
|
inline |
Subtracts the values in adjacent pairs of two float64x2 vectors.
- Not implemented for SSE2.
|
inline |
Subtracts values in adjacent pairs of two int16x8 vectors.
- Not implemented for SSE2 and SSE3.
|
inline |
Subtracts values in adjacent pairs of two int32x4 vectors.
- Not implemented for SSE2 and SSE3.
|
inline |
Subtracts values in adjacent pairs of two int64x2 vectors.
- This intrinsic results in at least 3 instructions.
|
inline |
Subtracts and saturates values in adjacent pairs of two signed int16x8 vectors.
- Not implemented for SSE2 and SSE3.
void simdpp::sse::load_lane | ( | uint8x16 & | a, |
const void * | p | ||
) |
Loads the first N elements of a 128-bit vector from memory.
N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.
If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.
void simdpp::sse::load_lane | ( | uint16x8 & | a, |
const void * | p | ||
) |
Loads the first N elements of a 128-bit vector from memory.
N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.
If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.
void simdpp::sse::load_lane | ( | uint32x4 & | a, |
const void * | p | ||
) |
Loads the first N elements of a 128-bit vector from memory.
N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.
If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.
void simdpp::sse::load_lane | ( | uint64x2 & | a, |
const void * | p | ||
) |
Loads the first N elements of a 128-bit vector from memory.
N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.
If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.
void simdpp::sse::load_lane | ( | float32x4 & | a, |
const void * | p | ||
) |
Loads the first N elements of a 128-bit vector from memory.
N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.
If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.
void simdpp::sse::load_lane | ( | float64x2 & | a, |
const void * | p | ||
) |
Loads the first N elements of a 128-bit vector from memory.
N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.
If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.
uint16x8 simdpp::sse::permute_hi | ( | uint16x8 | a) |
Permutes the last 4 16-bit values of each set of 8 consecutive values.
The selector values s0, s1, s2 and s3 must be in range [0; 3].
- 256-bit version:
- In SSE2-AVX this intrinsic results in at least 2 instructions.
uint16x16 simdpp::sse::permute_hi | ( | uint16x16 | a) |
Permutes the last 4 16-bit values of each set of 8 consecutive values.
The selector values s0, s1, s2 and s3 must be in range [0; 3].
- 256-bit version:
- In SSE2-AVX this intrinsic results in at least 2 instructions.
uint16x8 simdpp::sse::permute_lo | ( | uint16x8 | a) |
Permutes the first 4 16-bit values of each set of 8 consecutive values.
The selector values s0, s1, s2 and s3 must be in range [0; 3].
- 256-bit version:
- In SSE2-AVX this intrinsic results in at least 2 instructions.
uint16x16 simdpp::sse::permute_lo | ( | uint16x16 | a) |
Permutes the first 4 16-bit values of each set of 8 consecutive values.
The selector values s0, s1, s2 and s3 must be in range [0; 3].
- 256-bit version:
- In SSE2-AVX this intrinsic results in at least 2 instructions.
void simdpp::sse::store_lane | ( | void * | p, |
uint8x16 | a | ||
) |
Stores the first N elements of a 128-bit vector to memory.
N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.
void simdpp::sse::store_lane | ( | void * | p, |
uint16x8 | a | ||
) |
Stores the first N elements of a 128-bit vector to memory.
N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.
void simdpp::sse::store_lane | ( | void * | p, |
uint32x4 | a | ||
) |
Stores the first N elements of a 128-bit vector to memory.
N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.
void simdpp::sse::store_lane | ( | void * | p, |
uint64x2 | a | ||
) |
Stores the first N elements of a 128-bit vector to memory.
N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.
void simdpp::sse::store_lane | ( | void * | p, |
float32x4 | a | ||
) |
Stores the first N elements of a 128-bit vector to memory.
N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.
void simdpp::sse::store_lane | ( | void * | p, |
float64x2 | a | ||
) |
Stores the first N elements of a 128-bit vector to memory.
N must be a power of 2 and at least M/4 where M is the number of elements within vector. P must be 0 or M/2 if N == M/2.
|
inline |
Stores bytes in a 128-bit integer vector according to a mask.
The highest bit in the corresponding byte in the mask defines whether the byte will be saved. p does not need to be aligned to 16 bytes.
|
inline |
|
inline |
|
inline |
|
inline |
Adds or subtracts the values of two float32x4 vectors.
- Not implemented for SSE2.
|
inline |
Adds or subtracts the values of two float64x2 vectors.
- Not implemented for SSE2.
bool simdpp::sse::test_ones | ( | uint8x16 | a) |
Tests if all bits are set in a 128-bit integer.
Returns true
if a has all bits set, false
otherwise.
- In SSE2, SSE3 and SSSE3 this intrinsic results in at least 3 instructions.
- In SSE4.1 this intrinsic results in at least 2 instructions.
bool simdpp::sse::test_ones | ( | uint16x8 | a) |
bool simdpp::sse::test_ones | ( | uint32x4 | a) |
bool simdpp::sse::test_ones | ( | uint64x2 | a) |
bool simdpp::sse::test_ones | ( | uint8x16 | a, |
uint8x16 | mask | ||
) |
Tests if 128-bit integer consists only of ones when a mask is applied.
Returns true
if a & mask
has all mask bits set, false
otherwise.
- In SSE2, SSE3 and SSSE3 this intrinsic results in at least 4 instructions.
- In SSE4.1 this intrinsic results in at least 1 instruction.
bool simdpp::sse::test_ones | ( | uint16x8 | a, |
uint16x8 | mask | ||
) |
bool simdpp::sse::test_ones | ( | uint32x4 | a, |
uint32x4 | mask | ||
) |
bool simdpp::sse::test_ones | ( | uint64x2 | a, |
uint64x2 | mask | ||
) |
bool simdpp::sse::test_zero | ( | uint8x16 | a) |
Tests that no bits are set in a 128-bit integer vector.
Returns true
if a
has all bits unset, false
otherwise.
- In SSE2, SSE3 and SSSE3 this intrinsic results in at least 3 instructions.
- In SSE4.1 this intrinsic results in at least 2 instructions.
bool simdpp::sse::test_zero | ( | uint16x8 | a) |
bool simdpp::sse::test_zero | ( | uint32x4 | a) |
bool simdpp::sse::test_zero | ( | uint64x2 | a) |
bool simdpp::sse::test_zero | ( | uint8x16 | a, |
uint8x16 | mask | ||
) |
Tests if 128-bit integer a consists only of zeros when a mask mask is applied.
Returns true
if a & mask
has all bits unset, false
otherwise.
- In SSE2, SSE3 and SSSE3 this intrinsic results in at least 4 instructions.
- In SSE4.1 this intrinsic results in at least 1 instruction.
bool simdpp::sse::test_zero | ( | uint16x8 | a, |
uint16x8 | mask | ||
) |
bool simdpp::sse::test_zero | ( | uint32x4 | a, |
uint32x4 | mask | ||
) |
bool simdpp::sse::test_zero | ( | uint64x2 | a, |
uint64x2 | mask | ||
) |
|
inline |
Sign extends the values of a signed int8x16 vector to 32-bits.
- In SSE2, SSE3 and SSSE3 this intrinsic results in at least 4 instructions.
|
inline |
Extends the values of an unsigned int8x16 vector to 32-bits.
- In SSE2, SSE3 and SSSE3 this intrinsic results in at least 3 instructions.
|
inline |
Converts the values of a float64x2 vector into int32_t representation using truncation.
If the value can not be represented by int32_t, 0x80000000
is returned.
|
inline |
Converts the values of a float32x4 vector into signed int32_t representation.
If the value can not be represented by int32_t, 0x80000000
is returned. If only inexact conversion can be performed, the current rounding mode is used.
|
inline |
Converts the values of a float64x2 vector into int32_t representation.
If the value can not be represented by int32_t, 0x80000000
is returned. If only inexact conversion can be performed, it is rounded according to the current rounding mode.
|
inline |
Sign extends the values of a signed int8x16 vector to 64-bits.
- Not implemented for SSE2, SSE3 and SSSE3.
|
inline |
Sign extends the values of a signed int16x8 vector to 64-bits.
- Not implemented for SSE2, SSE3 and SSSE3.
|
inline |
Extends the values of an unsigned int8x16 vector to 64-bits.
- In SSE2, SSE3 and SSSE3 this intrinsic results in at least 4 instructions.
Generated on Tue Apr 8 2014 03:14:35 for libsimdpp by
