libsimdpp
0.9.3
|
Functions | |
basic_int32x4 | to_int32x4 (int8x16 a) |
Sign extends the values of a signed int8x16 vector to 32-bits. More... | |
basic_int32x4 | to_int32x4 (uint8x16 a) |
Extends the values of an unsigned int8x16 vector to 32-bits. More... | |
basic_int64x2 | to_int64x2 (int8x16 a) |
Sign extends the values of a signed int8x16 vector to 64-bits. More... | |
basic_int64x2 | to_int64x2 (int16x8 a) |
Sign extends the values of a signed int16x8 vector to 64-bits. More... | |
basic_int64x2 | to_int64x2 (uint8x16 a) |
Extends the values of an unsigned int8x16 vector to 64-bits. More... | |
basic_int32x4 | to_int32x4_r (float32x4 a) |
Converts the values of a float32x4 vector into signed int32_t representation. More... | |
basic_int32x4 | to_int32x4 (float64x2 a) |
Converts the values of a float64x2 vector into int32_t representation using truncation. More... | |
basic_int32x4 | to_int32x4_r (float64x2 a) |
Converts the values of a float64x2 vector into int32_t representation. More... | |
float32x4 | hadd2 (float32x4 a, float32x4 b) |
Adds the values in adjacent pairs of two float32x4 vectors. More... | |
float64x2 | hadd2 (float64x2 a, float64x2 b) |
Adds the values in adjacent pairs of two float64x2 vectors. More... | |
float32x4 | hadd4 (float32x4 a) |
Sums the values of a float32x4 vector. More... | |
float32x4 | hadd4 (float32x4 a, float32x4 b, float32x4 c, float32x4 d) |
Sums the values within each of four float32x4 vectors. More... | |
float32x4 | hsub2 (float32x4 a, float32x4 b) |
Subtracts the values in adjacent pairs of two float32x4 vectors. More... | |
float64x2 | hsub2 (float64x2 a, float64x2 b) |
Subtracts the values in adjacent pairs of two float64x2 vectors. More... | |
float32x4 | sub_add (float32x4 a, float32x4 b) |
Adds or subtracts the values of two float32x4 vectors. More... | |
float64x2 | sub_add (float64x2 a, float64x2 b) |
Adds or subtracts the values of two float64x2 vectors. More... | |
int128 | copysign (int8x16 a, int8x16 b) |
Copies sign from the values of one int8x16 vector to another. More... | |
int128 | copysign (int16x8 a, int16x8 b) |
Copies sign from the values of one int16x8 vector to another. More... | |
int128 | copysign (int32x4 a, int32x4 b) |
Copies sign from the values of one int32x4 vector to another. More... | |
int128 | hadd2 (basic_int16x8 a, basic_int16x8 b) |
Adds values in adjacent pairs of two int16x8 vectors. More... | |
int128 | hadd2 (basic_int32x4 a, basic_int32x4 b) |
Adds values in adjacent pairs of two int32x4 vectors. More... | |
int128 | hadd2 (basic_int64x2 a, basic_int64x2 b) |
Adds values in adjacent pairs of two int64x2 vectors. More... | |
int128 | hadds2 (int16x8 a, int16x8 b) |
Adds and saturates values in adjacent pairs of two signed int16x8 vectors. More... | |
int128 | hadd4 (basic_int32x4 a, basic_int32x4 b, basic_int32x4 c, basic_int32x4 d) |
Sums the values within each of four int32x4 vectors. More... | |
int128 | hsub2 (basic_int16x8 a, basic_int16x8 b) |
Subtracts values in adjacent pairs of two int16x8 vectors. More... | |
int128 | hsub2 (basic_int32x4 a, basic_int32x4 b) |
Subtracts values in adjacent pairs of two int32x4 vectors. More... | |
int128 | hsub2 (basic_int64x2 a, basic_int64x2 b) |
Subtracts values in adjacent pairs of two int64x2 vectors. More... | |
int128 | hsubs2 (int16x8 a, int16x8 b) |
Subtracts and saturates values in adjacent pairs of two signed int16x8 vectors. More... | |
void | store_masked (void *p, int128 a, int128 mask) |
Stores bytes in a 128-bit integer vector according to a mask. More... | |
int128 | extract_lo (int256 a) |
Extracts the lower half of a 256-bit vector. More... | |
basic_int8x16 | extract_lo (basic_int8x32 a) |
Extracts the lower half of a 256-bit vector. More... | |
basic_int16x8 | extract_lo (basic_int16x16 a) |
Extracts the lower half of a 256-bit vector. More... | |
basic_int32x4 | extract_lo (basic_int32x8 a) |
Extracts the lower half of a 256-bit vector. More... | |
basic_int64x2 | extract_lo (basic_int64x4 a) |
Extracts the lower half of a 256-bit vector. More... | |
float32x4 | extract_lo (float32x8 a) |
Extracts the lower half of a 256-bit vector. More... | |
float64x2 | extract_lo (float64x4 a) |
Extracts the lower half of a 256-bit vector. More... | |
int128 | extract_hi (int256 a) |
Extracts the higher half of a 256-bit vector. More... | |
basic_int8x16 | extract_hi (basic_int8x32 a) |
Extracts the higher half of a 256-bit vector. More... | |
basic_int16x8 | extract_hi (basic_int16x16 a) |
Extracts the higher half of a 256-bit vector. More... | |
basic_int32x4 | extract_hi (basic_int32x8 a) |
Extracts the higher half of a 256-bit vector. More... | |
basic_int64x2 | extract_hi (basic_int64x4 a) |
Extracts the higher half of a 256-bit vector. More... | |
float32x4 | extract_hi (float32x8 a) |
Extracts the higher half of a 256-bit vector. More... | |
float64x2 | extract_hi (float64x4 a) |
Extracts the higher half of a 256-bit vector. More... | |
template<unsigned P, unsigned N> | |
void | load_lane (basic_int8x16 &a, const void *p) |
Loads the first N elements of a 128-bit vector from memory. More... | |
template<unsigned P, unsigned N> | |
void | load_lane (basic_int16x8 &a, const void *p) |
Loads the first N elements of a 128-bit vector from memory. More... | |
template<unsigned P, unsigned N> | |
void | load_lane (basic_int32x4 &a, const void *p) |
Loads the first N elements of a 128-bit vector from memory. More... | |
template<unsigned P, unsigned N> | |
void | load_lane (basic_int64x2 &a, const void *p) |
Loads the first N elements of a 128-bit vector from memory. More... | |
template<unsigned P, unsigned N> | |
void | load_lane (float32x4 &a, const float *p) |
Loads the first N elements of a 128-bit vector from memory. More... | |
template<unsigned P, unsigned N> | |
float64x2 | load_lane (float64x2 &a, const double *p) |
Loads the first N elements of a 128-bit vector from memory. More... | |
template<unsigned P, unsigned N> | |
void | store_lane (void *p, basic_int8x16 a) |
Stores the first N elements of a 128-bit vector to memory. More... | |
template<unsigned P, unsigned N> | |
void | store_lane (void *p, basic_int16x8 a) |
Stores the first N elements of a 128-bit vector to memory. More... | |
template<unsigned P, unsigned N> | |
void | store_lane (void *p, basic_int32x4 a) |
Stores the first N elements of a 128-bit vector to memory. More... | |
template<unsigned P, unsigned N> | |
void | store_lane (void *p, basic_int64x2 a) |
Stores the first N elements of a 128-bit vector to memory. More... | |
template<unsigned P, unsigned N> | |
void | store_lane (float *p, float32x4 a) |
Stores the first N elements of a 128-bit vector to memory. More... | |
template<unsigned P, unsigned N> | |
void | store_lane (double *p, float64x2 a) |
Stores the first N elements of a 128-bit vector to memory. More... | |
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> | |
basic_int16x8 | permute_lo (basic_int16x8 a) |
Permutes the first 4 16-bit values in each set of 8 consecutive values. More... | |
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> | |
basic_int16x16 | permute_lo (basic_int16x16 a) |
Permutes the first 4 16-bit values in each set of 8 consecutive values. More... | |
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> | |
basic_int16x8 | permute_hi (basic_int16x8 a) |
Permutes the last 4 16-bit values in each set of 8 consecutive values. More... | |
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> | |
basic_int16x16 | permute_hi (basic_int16x16 a) |
Permutes the last 4 16-bit values in each set of 8 consecutive values. More... | |
Function Documentation
|
inline |
Copies sign from the values of one int8x16 vector to another.
- Not implemented for SSE2 and SSE3.
|
inline |
Copies sign from the values of one int16x8 vector to another.
- Not implemented for SSE2 and SSE3.
|
inline |
Copies sign from the values of one int32x4 vector to another.
- Not implemented for SSE2 and SSE3.
|
inline |
Extracts the higher half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the higher half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the higher half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the higher half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the higher half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the higher half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the higher half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the lower half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the lower half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the lower half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the lower half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the lower half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the lower half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Extracts the lower half of a 256-bit vector.
- This intrinsic results in at least 0 instructions.
|
inline |
Adds the values in adjacent pairs of two float32x4 vectors.
- Not implemented for SSE2.
|
inline |
Adds the values in adjacent pairs of two float64x2 vectors.
- Not implemented for SSE2.
|
inline |
Adds values in adjacent pairs of two int16x8 vectors.
- Not implemented for SSE2 and SSE3.
|
inline |
Adds values in adjacent pairs of two int32x4 vectors.
- Not implemented for SSE2 and SSE3.
|
inline |
Adds values in adjacent pairs of two int64x2 vectors.
- This intrinsic results in at least 3 instructions.
|
inline |
Sums the values of a float32x4 vector.
- Not implemented for SSE2.
|
inline |
Sums the values within each of four float32x4 vectors.
- In SSE3, SSSE3 and SSE4.1 this intrinsic results in at least 3 instructions.
- Not implemented for SSE2.
|
inline |
Sums the values within each of four int32x4 vectors.
- Not implemented for SSE2 and SSE3.
- This intrinsic results in at least 3 instructions.
|
inline |
Adds and saturates values in adjacent pairs of two signed int16x8 vectors.
- Not implemented for SSE2 and SSE3.
|
inline |
Subtracts the values in adjacent pairs of two float32x4 vectors.
- Not implemented for SSE2.
|
inline |
Subtracts the values in adjacent pairs of two float64x2 vectors.
- Not implemented for SSE2.
|
inline |
Subtracts values in adjacent pairs of two int16x8 vectors.
- Not implemented for SSE2 and SSE3.
|
inline |
Subtracts values in adjacent pairs of two int32x4 vectors.
- Not implemented for SSE2 and SSE3.
|
inline |
Subtracts values in adjacent pairs of two int64x2 vectors.
- This intrinsic results in at least 3 instructions.
|
inline |
Subtracts and saturates values in adjacent pairs of two signed int16x8 vectors.
- Not implemented for SSE2 and SSE3.
void simdpp::sse::load_lane | ( | basic_int8x16 & | a, |
const void * | p | ||
) |
Loads the first N elements of a 128-bit vector from memory.
N must be a power of 2 and at least M/4 where M is the number of elements within the vector. P must be 0 or M/2 if N == M/2.
If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.
void simdpp::sse::load_lane | ( | basic_int16x8 & | a, |
const void * | p | ||
) |
Loads the first N elements of a 128-bit vector from memory.
N must be a power of 2 and at least M/4 where M is the number of elements within the vector. P must be 0 or M/2 if N == M/2.
If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.
void simdpp::sse::load_lane | ( | basic_int32x4 & | a, |
const void * | p | ||
) |
Loads the first N elements of a 128-bit vector from memory.
N must be a power of 2 and at least M/4 where M is the number of elements within the vector. P must be 0 or M/2 if N == M/2.
If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.
void simdpp::sse::load_lane | ( | basic_int64x2 & | a, |
const void * | p | ||
) |
Loads the first N elements of a 128-bit vector from memory.
N must be a power of 2 and at least M/4 where M is the number of elements within the vector. P must be 0 or M/2 if N == M/2.
If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.
void simdpp::sse::load_lane | ( | float32x4 & | a, |
const float * | p | ||
) |
Loads the first N elements of a 128-bit vector from memory.
N must be a power of 2 and at least M/4 where M is the number of elements within the vector. P must be 0 or M/2 if N == M/2.
If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.
float64x2 simdpp::sse::load_lane | ( | float64x2 & | a, |
const double * | p | ||
) |
Loads the first N elements of a 128-bit vector from memory.
N must be a power of 2 and at least M/4 where M is the number of elements within the vector. P must be 0 or M/2 if N == M/2.
If N is M/2, then the values of non-loaded elements are preserved, otherwise, they are set to zero.
basic_int16x8 simdpp::sse::permute_hi | ( | basic_int16x8 | a | ) |
Permutes the last 4 16-bit values in each set of 8 consecutive values.
The selector values s0, s1, s2 and s3 must be in range [0; 3].
- 256-bit version:
- In SSE2-AVX this intrinsic results in at least 2 instructions.
basic_int16x16 simdpp::sse::permute_hi | ( | basic_int16x16 | a | ) |
Permutes the last 4 16-bit values in each set of 8 consecutive values.
The selector values s0, s1, s2 and s3 must be in range [0; 3].
- 256-bit version:
- In SSE2-AVX this intrinsic results in at least 2 instructions.
basic_int16x8 simdpp::sse::permute_lo | ( | basic_int16x8 | a | ) |
Permutes the first 4 16-bit values in each set of 8 consecutive values.
The selector values s0, s1, s2 and s3 must be in range [0; 3].
- 256-bit version:
- In SSE2-AVX this intrinsic results in at least 2 instructions.
basic_int16x16 simdpp::sse::permute_lo | ( | basic_int16x16 | a | ) |
Permutes the first 4 16-bit values in each set of 8 consecutive values.
The selector values s0, s1, s2 and s3 must be in range [0; 3].
- 256-bit version:
- In SSE2-AVX this intrinsic results in at least 2 instructions.
void simdpp::sse::store_lane | ( | void * | p, |
basic_int8x16 | a | ||
) |
Stores the first N elements of a 128-bit vector to memory.
N must be a power of 2 and at least M/4 where M is the number of elements within the vector. P must be 0 or M/2 if N == M/2.
void simdpp::sse::store_lane | ( | void * | p, |
basic_int16x8 | a | ||
) |
Stores the first N elements of a 128-bit vector to memory.
N must be a power of 2 and at least M/4 where M is the number of elements within the vector. P must be 0 or M/2 if N == M/2.
void simdpp::sse::store_lane | ( | void * | p, |
basic_int32x4 | a | ||
) |
Stores the first N elements of a 128-bit vector to memory.
N must be a power of 2 and at least M/4 where M is the number of elements within the vector. P must be 0 or M/2 if N == M/2.
void simdpp::sse::store_lane | ( | void * | p, |
basic_int64x2 | a | ||
) |
Stores the first N elements of a 128-bit vector to memory.
N must be a power of 2 and at least M/4 where M is the number of elements within the vector. P must be 0 or M/2 if N == M/2.
void simdpp::sse::store_lane | ( | float * | p, |
float32x4 | a | ||
) |
Stores the first N elements of a 128-bit vector to memory.
N must be a power of 2 and at least M/4 where M is the number of elements within the vector. P must be 0 or M/2 if N == M/2.
void simdpp::sse::store_lane | ( | double * | p, |
float64x2 | a | ||
) |
Stores the first N elements of a 128-bit vector to memory.
N must be a power of 2 and at least M/4 where M is the number of elements within the vector. P must be 0 or M/2 if N == M/2.
|
inline |
Stores bytes in a 128-bit integer vector according to a mask.
The highest bit in the corresponding byte in the mask defines whether the byte will be saved. p does not need to be aligned to 16 bytes.
|
inline |
Adds or subtracts the values of two float32x4 vectors.
- Not implemented for SSE2.
|
inline |
Adds or subtracts the values of two float64x2 vectors.
- Not implemented for SSE2.
|
inline |
Sign extends the values of a signed int8x16 vector to 32-bits.
- In SSE2, SSE3 and SSSE3 this intrinsic results in at least 4 instructions.
|
inline |
Extends the values of an unsigned int8x16 vector to 32-bits.
- In SSE2, SSE3 and SSSE3 this intrinsic results in at least 3 instructions.
|
inline |
Converts the values of a float64x2 vector into int32_t representation using truncation.
If the value cannot be represented by int32_t, 0x80000000
is returned.
|
inline |
Converts the values of a float32x4 vector into signed int32_t representation.
If the value cannot be represented by int32_t, 0x80000000
is returned. If only inexact conversion can be performed, the current rounding mode is used.
|
inline |
Converts the values of a float64x2 vector into int32_t representation.
If the value cannot be represented by int32_t, 0x80000000
is returned. If only inexact conversion can be performed, it is rounded according to the current rounding mode.
|
inline |
Sign extends the values of a signed int8x16 vector to 64-bits.
- Not implemented for SSE2, SSE3 and SSSE3.
|
inline |
Sign extends the values of a signed int16x8 vector to 64-bits.
- Not implemented for SSE2, SSE3 and SSSE3.
|
inline |
Extends the values of an unsigned int8x16 vector to 64-bits.
- In SSE2, SSE3 and SSSE3 this intrinsic results in at least 4 instructions.
Generated on Thu Oct 31 2013 04:08:52 for libsimdpp by 1.8.3.1