libsimdpp
0.9.3
|
Functions | |
void | simdpp::store (void *p, int128 a) |
Stores a 128-bit or 256-bit integer vector to an aligned memory location. More... | |
void | simdpp::store (void *p, int256 a) |
void | simdpp::store (float *p, float32x4 a) |
void | simdpp::store (float *p, float32x8 a) |
void | simdpp::store (double *p, float64x2 a) |
void | simdpp::store (double *p, float64x4 a) |
void | simdpp::stream (void *p, int128 a) |
Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More... | |
void | simdpp::stream (void *p, int256 a) |
Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More... | |
void | simdpp::stream (float *p, float32x4 a) |
Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More... | |
void | simdpp::stream (float *p, float32x8 a) |
Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More... | |
void | simdpp::stream (double *p, float64x2 a) |
Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More... | |
void | simdpp::stream (double *p, float64x4 a) |
Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More... | |
void | simdpp::store_first (void *p, basic_int8x16 a, unsigned n) |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_first (void *p, basic_int8x32 a, unsigned n) |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_first (void *p, basic_int16x8 a, unsigned n) |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_first (void *p, basic_int16x16 a, unsigned n) |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_first (void *p, basic_int32x4 a, unsigned n) |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_first (void *p, basic_int32x8 a, unsigned n) |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_first (void *p, basic_int64x2 a, unsigned n) |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_first (void *p, basic_int64x4 a, unsigned n) |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_first (float *p, float32x4 a, unsigned n) |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_first (float *p, float32x8 a, unsigned n) |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_first (double *p, float64x2 a, unsigned n) |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_first (double *p, float64x4 a, unsigned n) |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_last (void *p, basic_int8x16 a, unsigned n) |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_last (void *p, basic_int8x32 a, unsigned n) |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_last (void *p, basic_int16x8 a, unsigned n) |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_last (void *p, basic_int16x16 a, unsigned n) |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_last (void *p, basic_int32x4 a, unsigned n) |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_last (void *p, basic_int32x8 a, unsigned n) |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_last (void *p, basic_int64x2 a, unsigned n) |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_last (void *p, basic_int64x4 a, unsigned n) |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_last (float *p, float32x4 a, unsigned n) |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_last (float *p, float32x8 a, unsigned n) |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_last (double *p, float64x2 a, unsigned n) |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_last (double *p, float64x4 a, unsigned n) |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More... | |
void | simdpp::store_packed2 (void *p, basic_int8x16 a, basic_int8x16 b) |
Interleaves 8-bit values from two vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed2 (void *p, basic_int8x32 a, basic_int8x32 b) |
Interleaves 8-bit values from two vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed2 (void *p, basic_int16x8 a, basic_int16x8 b) |
Interleaves 16-bit values from two vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed2 (void *p, basic_int16x16 a, basic_int16x16 b) |
Interleaves 16-bit values from two vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed2 (void *p, basic_int32x4 a, basic_int32x4 b) |
Interleaves 32-bit values from two vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed2 (void *p, basic_int32x8 a, basic_int32x8 b) |
Interleaves 32-bit values from two vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed2 (void *p, basic_int64x2 a, basic_int64x2 b) |
Interleaves 64-bit values from two vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed2 (void *p, basic_int64x4 a, basic_int64x4 b) |
Interleaves 64-bit values from two vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed2 (float *p, float32x4 a, float32x4 b) |
Interleaves 32-bit floating-point values from two vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed2 (float *p, float32x8 a, float32x8 b) |
Interleaves 32-bit floating-point values from two vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed2 (double *p, float64x2 a, float64x2 b) |
Interleaves 64-bit floating-point values from two vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed2 (double *p, float64x4 a, float64x4 b) |
Interleaves 64-bit floating-point values from two vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed3 (void *p, basic_int8x16 a, basic_int8x16 b, basic_int8x16 c) |
Interleaves 8-bit values from three vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed3 (void *p, basic_int8x32 a, basic_int8x32 b, basic_int8x32 c) |
Interleaves 8-bit values from three vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed3 (void *p, basic_int16x8 a, basic_int16x8 b, basic_int16x8 c) |
Interleaves 16-bit values from three vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed3 (void *p, basic_int16x16 a, basic_int16x16 b, basic_int16x16 c) |
Interleaves 16-bit values from three vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed3 (void *p, basic_int32x4 a, basic_int32x4 b, basic_int32x4 c) |
Interleaves 32-bit values from three vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed3 (void *p, basic_int32x8 a, basic_int32x8 b, basic_int32x8 c) |
Interleaves 32-bit values from three vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed3 (void *p, basic_int64x2 a, basic_int64x2 b, basic_int64x2 c) |
Interleaves 64-bit values from three vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed3 (void *p, basic_int64x4 a, basic_int64x4 b, basic_int64x4 c) |
Interleaves 64-bit values from three vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed3 (float *p, float32x4 a, float32x4 b, float32x4 c) |
Interleaves 32-bit floating-point values from three vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed3 (float *p, float32x8 a, float32x8 b, float32x8 c) |
Interleaves 32-bit floating-point values from three vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed3 (double *p, float64x2 a, float64x2 b, float64x2 c) |
Interleaves 64-bit floating-point values from three vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed3 (double *p, float64x4 a, float64x4 b, float64x4 c) |
Interleaves 64-bit floating-point values from three vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed4 (void *p, basic_int8x16 a, basic_int8x16 b, basic_int8x16 c, basic_int8x16 d) |
Interleaves 8-bit values from four vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed4 (void *p, basic_int8x32 a, basic_int8x32 b, basic_int8x32 c, basic_int8x32 d) |
Interleaves 8-bit values from four vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed4 (void *p, basic_int16x8 a, basic_int16x8 b, basic_int16x8 c, basic_int16x8 d) |
Interleaves 16-bit values from four vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed4 (void *p, basic_int16x16 a, basic_int16x16 b, basic_int16x16 c, basic_int16x16 d) |
Interleaves 16-bit values from four vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed4 (void *p, basic_int32x4 a, basic_int32x4 b, basic_int32x4 c, basic_int32x4 d) |
Interleaves 32-bit values from four vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed4 (void *p, basic_int32x8 a, basic_int32x8 b, basic_int32x8 c, basic_int32x8 d) |
Interleaves 32-bit values from four vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed4 (void *p, basic_int64x2 a, basic_int64x2 b, basic_int64x2 c, basic_int64x2 d) |
Interleaves 64-bit values from four vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed4 (void *p, basic_int64x4 a, basic_int64x4 b, basic_int64x4 c, basic_int64x4 d) |
Interleaves 64-bit values from four vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed4 (float *p, float32x4 a, float32x4 b, float32x4 c, float32x4 d) |
Interleaves 32-bit floating-point values from four vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed4 (float *p, float32x8 a, float32x8 b, float32x8 c, float32x8 d) |
Interleaves 32-bit floating-point values from four vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed4 (double *p, float64x2 a, float64x2 b, float64x2 c, float64x2 d) |
Interleaves 64-bit floating-point values from four vectors and stores the result into successive locations starting from p. More... | |
void | simdpp::store_packed4 (double *p, float64x4 a, float64x4 b, float64x4 c, float64x4 d) |
Interleaves 64-bit floating-point values from four vectors and stores the result into successive locations starting from p. More... | |
Detailed Description
Function Documentation
|
inline |
Stores a 128-bit or 256-bit integer vector to an aligned memory location.
- 128-bit version:
p must be aligned to 16 bytes.
- 256-bit version:
p must be aligned to 32 bytes.
- In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
- In AVX (integer vectors) this intrinsic results in at least 2 instructions.
|
inline |
|
inline |
|
inline |
|
inline |
|
inline |
|
inline |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.
n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.
This function results in several instructions. It is best not to use it in inner loops.
- 128-bit version:
- p must be aligned to 16 bytes.
- 256-bit version:
- p must be aligned to 32 bytes.
|
inline |
Interleaves 8-bit values from two vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.[ *(p), *(p+2), *(p+4), ... , *(p+30) ] = a[ *(p+1), *(p+3), *(p+5), ... , *(p+31) ] = b
- 256-bit version:
- p must be aligned to 32 bytes.[ *(p), *(p+2), *(p+4), ... , *(p+62) ] = a[ *(p+1), *(p+3), *(p+5), ... , *(p+63) ] = b
|
inline |
Interleaves 8-bit values from two vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.[ *(p), *(p+2), *(p+4), ... , *(p+30) ] = a[ *(p+1), *(p+3), *(p+5), ... , *(p+31) ] = b
- 256-bit version:
- p must be aligned to 32 bytes.[ *(p), *(p+2), *(p+4), ... , *(p+62) ] = a[ *(p+1), *(p+3), *(p+5), ... , *(p+63) ] = b
|
inline |
Interleaves 16-bit values from two vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.[ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b
- 256-bit version:
- p must be aligned to 32 bytes.[ *(p), *(p+2), *(p+4), ... , *(p+30) ] = a[ *(p+1), *(p+3), *(p+5), ... , *(p+31) ] = b
|
inline |
Interleaves 16-bit values from two vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.[ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b
- 256-bit version:
- p must be aligned to 32 bytes.[ *(p), *(p+2), *(p+4), ... , *(p+30) ] = a[ *(p+1), *(p+3), *(p+5), ... , *(p+31) ] = b
|
inline |
Interleaves 32-bit values from two vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.[ *(p), *(p+2), *(p+4), *(p+6) ] = a[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b
- 256-bit version:
- p must be aligned to 32 bytes.[ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b
|
inline |
Interleaves 32-bit values from two vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.[ *(p), *(p+2), *(p+4), *(p+6) ] = a[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b
- 256-bit version:
- p must be aligned to 32 bytes.[ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b
|
inline |
Interleaves 64-bit values from two vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.[ *(p), *(p+2) ] = a[ *(p+1), *(p+3) ] = b
- 256-bit version:
- p must be aligned to 32 bytes. [ *(p), *(p+2), *(p+4), *(p+6) ] = a; [ *(p+1), *(p+3), *(p+5), *(p+7) ] = b
|
inline |
Interleaves 64-bit values from two vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.[ *(p), *(p+2) ] = a[ *(p+1), *(p+3) ] = b
- 256-bit version:
- p must be aligned to 32 bytes. [ *(p), *(p+2), *(p+4), *(p+6) ] = a; [ *(p+1), *(p+3), *(p+5), *(p+7) ] = b
|
inline |
Interleaves 32-bit floating-point values from two vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes. [ *(p), *(p+2), *(p+4), *(p+6) ] = a; [ *(p+1), *(p+3), *(p+5), *(p+7) ] = b
- 256-bit version:
- p must be aligned to 32 bytes.[ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b
|
inline |
Interleaves 32-bit floating-point values from two vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes. [ *(p), *(p+2), *(p+4), *(p+6) ] = a; [ *(p+1), *(p+3), *(p+5), *(p+7) ] = b
- 256-bit version:
- p must be aligned to 32 bytes.[ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b
|
inline |
Interleaves 64-bit floating-point values from two vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.[ *(p), *(p+2) ] = a[ *(p+1), *(p+3) ] = b
- 256-bit version:
- p must be aligned to 32 bytes. [ *(p), *(p+2), *(p+4), *(p+6) ] = a; [ *(p+1), *(p+3), *(p+5), *(p+7) ] = b
|
inline |
Interleaves 64-bit floating-point values from two vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.[ *(p), *(p+2) ] = a[ *(p+1), *(p+3) ] = b
- 256-bit version:
- p must be aligned to 32 bytes. [ *(p), *(p+2), *(p+4), *(p+6) ] = a; [ *(p+1), *(p+3), *(p+5), *(p+7) ] = b
|
inline |
Interleaves 8-bit values from three vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.[ *(p), *(p+3), *(p+6), ... , *(p+45) ] = a[ *(p+1), *(p+4), *(p+7), ... , *(p+46) ] = b[ *(p+2), *(p+5), *(p+8), ... , *(p+47) ] = c
- 256-bit version:
- p must be aligned to 32 bytes.[ *(p), *(p+3), *(p+6), ... , *(p+93) ] = a[ *(p+1), *(p+4), *(p+7), ... , *(p+94) ] = b[ *(p+2), *(p+5), *(p+8), ... , *(p+95) ] = c
|
inline |
Interleaves 8-bit values from three vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.[ *(p), *(p+3), *(p+6), ... , *(p+45) ] = a[ *(p+1), *(p+4), *(p+7), ... , *(p+46) ] = b[ *(p+2), *(p+5), *(p+8), ... , *(p+47) ] = c
- 256-bit version:
- p must be aligned to 32 bytes.[ *(p), *(p+3), *(p+6), ... , *(p+93) ] = a[ *(p+1), *(p+4), *(p+7), ... , *(p+94) ] = b[ *(p+2), *(p+5), *(p+8), ... , *(p+95) ] = c
|
inline |
Interleaves 16-bit values from three vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.[ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a[ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b[ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c
- 256-bit version:
- p must be aligned to 32 bytes.[ *(p), *(p+3), *(p+6), ... , *(p+45) ] = a[ *(p+1), *(p+4), *(p+7), ... , *(p+46) ] = b[ *(p+2), *(p+5), *(p+8), ... , *(p+47) ] = c
|
inline |
Interleaves 16-bit values from three vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.[ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a[ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b[ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c
- 256-bit version:
- p must be aligned to 32 bytes.[ *(p), *(p+3), *(p+6), ... , *(p+45) ] = a[ *(p+1), *(p+4), *(p+7), ... , *(p+46) ] = b[ *(p+2), *(p+5), *(p+8), ... , *(p+47) ] = c
|
inline |
Interleaves 32-bit values from three vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.[ *(p), *(p+3), *(p+6), *(p+9) ] = a[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c
- 256-bit version:
- p must be aligned to 32 bytes.[ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a[ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b[ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c
|
inline |
Interleaves 32-bit values from three vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.[ *(p), *(p+3), *(p+6), *(p+9) ] = a[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c
- 256-bit version:
- p must be aligned to 32 bytes.[ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a[ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b[ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c
|
inline |
Interleaves 64-bit values from three vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+3) ] = a
    [ *(p+1), *(p+4) ] = b
    [ *(p+2), *(p+5) ] = c
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+3), *(p+6), *(p+9) ] = a
    [ *(p+1), *(p+4), *(p+7), *(p+10) ] = b
    [ *(p+2), *(p+5), *(p+8), *(p+11) ] = c
|
inline |
Interleaves 64-bit values from three vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+3) ] = a
    [ *(p+1), *(p+4) ] = b
    [ *(p+2), *(p+5) ] = c
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+3), *(p+6), *(p+9) ] = a
    [ *(p+1), *(p+4), *(p+7), *(p+10) ] = b
    [ *(p+2), *(p+5), *(p+8), *(p+11) ] = c
|
inline |
Interleaves 32-bit floating-point values from three vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+3), *(p+6), *(p+9) ] = a
    [ *(p+1), *(p+4), *(p+7), *(p+10) ] = b
    [ *(p+2), *(p+5), *(p+8), *(p+11) ] = c
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a
    [ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b
    [ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c
|
inline |
Interleaves 32-bit floating-point values from three vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+3), *(p+6), *(p+9) ] = a
    [ *(p+1), *(p+4), *(p+7), *(p+10) ] = b
    [ *(p+2), *(p+5), *(p+8), *(p+11) ] = c
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a
    [ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b
    [ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c
|
inline |
Interleaves 64-bit floating-point values from three vectors and stores the result into successive locations starting from p.
This is the inverse of the operation that loads 64-bit floating-point values packed in triplets, de-interleaves them and stores the result into three vectors.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+3) ] = a
    [ *(p+1), *(p+4) ] = b
    [ *(p+2), *(p+5) ] = c
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+3), *(p+6), *(p+9) ] = a
    [ *(p+1), *(p+4), *(p+7), *(p+10) ] = b
    [ *(p+2), *(p+5), *(p+8), *(p+11) ] = c
|
inline |
Interleaves 64-bit floating-point values from three vectors and stores the result into successive locations starting from p.
This is the inverse of the operation that loads 64-bit floating-point values packed in triplets, de-interleaves them and stores the result into three vectors.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+3) ] = a
    [ *(p+1), *(p+4) ] = b
    [ *(p+2), *(p+5) ] = c
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+3), *(p+6), *(p+9) ] = a
    [ *(p+1), *(p+4), *(p+7), *(p+10) ] = b
    [ *(p+2), *(p+5), *(p+8), *(p+11) ] = c
|
inline |
Interleaves 8-bit values from four vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+4), *(p+8), ... , *(p+60) ] = a
    [ *(p+1), *(p+5), *(p+9), ... , *(p+61) ] = b
    [ *(p+2), *(p+6), *(p+10), ... , *(p+62) ] = c
    [ *(p+3), *(p+7), *(p+11), ... , *(p+63) ] = d
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+4), *(p+8), ... , *(p+124) ] = a
    [ *(p+1), *(p+5), *(p+9), ... , *(p+125) ] = b
    [ *(p+2), *(p+6), *(p+10), ... , *(p+126) ] = c
    [ *(p+3), *(p+7), *(p+11), ... , *(p+127) ] = d
|
inline |
Interleaves 8-bit values from four vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+4), *(p+8), ... , *(p+60) ] = a
    [ *(p+1), *(p+5), *(p+9), ... , *(p+61) ] = b
    [ *(p+2), *(p+6), *(p+10), ... , *(p+62) ] = c
    [ *(p+3), *(p+7), *(p+11), ... , *(p+63) ] = d
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+4), *(p+8), ... , *(p+124) ] = a
    [ *(p+1), *(p+5), *(p+9), ... , *(p+125) ] = b
    [ *(p+2), *(p+6), *(p+10), ... , *(p+126) ] = c
    [ *(p+3), *(p+7), *(p+11), ... , *(p+127) ] = d
|
inline |
Interleaves 16-bit values from four vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a
    [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b
    [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c
    [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+4), *(p+8), ... , *(p+60) ] = a
    [ *(p+1), *(p+5), *(p+9), ... , *(p+61) ] = b
    [ *(p+2), *(p+6), *(p+10), ... , *(p+62) ] = c
    [ *(p+3), *(p+7), *(p+11), ... , *(p+63) ] = d
|
inline |
Interleaves 16-bit values from four vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a
    [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b
    [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c
    [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+4), *(p+8), ... , *(p+60) ] = a
    [ *(p+1), *(p+5), *(p+9), ... , *(p+61) ] = b
    [ *(p+2), *(p+6), *(p+10), ... , *(p+62) ] = c
    [ *(p+3), *(p+7), *(p+11), ... , *(p+63) ] = d
|
inline |
Interleaves 32-bit values from four vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+4), *(p+8), *(p+12) ] = a
    [ *(p+1), *(p+5), *(p+9), *(p+13) ] = b
    [ *(p+2), *(p+6), *(p+10), *(p+14) ] = c
    [ *(p+3), *(p+7), *(p+11), *(p+15) ] = d
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a
    [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b
    [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c
    [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d
|
inline |
Interleaves 32-bit values from four vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+4), *(p+8), *(p+12) ] = a
    [ *(p+1), *(p+5), *(p+9), *(p+13) ] = b
    [ *(p+2), *(p+6), *(p+10), *(p+14) ] = c
    [ *(p+3), *(p+7), *(p+11), *(p+15) ] = d
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a
    [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b
    [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c
    [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d
|
inline |
Interleaves 64-bit values from four vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+4) ] = a
    [ *(p+1), *(p+5) ] = b
    [ *(p+2), *(p+6) ] = c
    [ *(p+3), *(p+7) ] = d
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+4), *(p+8), *(p+12) ] = a
    [ *(p+1), *(p+5), *(p+9), *(p+13) ] = b
    [ *(p+2), *(p+6), *(p+10), *(p+14) ] = c
    [ *(p+3), *(p+7), *(p+11), *(p+15) ] = d
|
inline |
Interleaves 64-bit values from four vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+4) ] = a
    [ *(p+1), *(p+5) ] = b
    [ *(p+2), *(p+6) ] = c
    [ *(p+3), *(p+7) ] = d
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+4), *(p+8), *(p+12) ] = a
    [ *(p+1), *(p+5), *(p+9), *(p+13) ] = b
    [ *(p+2), *(p+6), *(p+10), *(p+14) ] = c
    [ *(p+3), *(p+7), *(p+11), *(p+15) ] = d
|
inline |
Interleaves 32-bit floating-point values from four vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+4), *(p+8), *(p+12) ] = a
    [ *(p+1), *(p+5), *(p+9), *(p+13) ] = b
    [ *(p+2), *(p+6), *(p+10), *(p+14) ] = c
    [ *(p+3), *(p+7), *(p+11), *(p+15) ] = d
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a
    [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b
    [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c
    [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d
|
inline |
Interleaves 32-bit floating-point values from four vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+4), *(p+8), *(p+12) ] = a
    [ *(p+1), *(p+5), *(p+9), *(p+13) ] = b
    [ *(p+2), *(p+6), *(p+10), *(p+14) ] = c
    [ *(p+3), *(p+7), *(p+11), *(p+15) ] = d
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a
    [ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b
    [ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c
    [ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d
|
inline |
Interleaves 64-bit floating-point values from four vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+4) ] = a
    [ *(p+1), *(p+5) ] = b
    [ *(p+2), *(p+6) ] = c
    [ *(p+3), *(p+7) ] = d
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+4), *(p+8), *(p+12) ] = a
    [ *(p+1), *(p+5), *(p+9), *(p+13) ] = b
    [ *(p+2), *(p+6), *(p+10), *(p+14) ] = c
    [ *(p+3), *(p+7), *(p+11), *(p+15) ] = d
|
inline |
Interleaves 64-bit floating-point values from four vectors and stores the result into successive locations starting from p.
- 128-bit version:
- p must be aligned to 16 bytes.
    [ *(p), *(p+4) ] = a
    [ *(p+1), *(p+5) ] = b
    [ *(p+2), *(p+6) ] = c
    [ *(p+3), *(p+7) ] = d
- 256-bit version:
- p must be aligned to 32 bytes.
    [ *(p), *(p+4), *(p+8), *(p+12) ] = a
    [ *(p+1), *(p+5), *(p+9), *(p+13) ] = b
    [ *(p+2), *(p+6), *(p+10), *(p+14) ] = c
    [ *(p+3), *(p+7), *(p+11), *(p+15) ] = d
|
inline |
Stores a 128-bit or 256-bit vector (integer, 32-bit floating-point or 64-bit floating-point) to memory without polluting the caches, if possible.
- 128-bit version:
p must be aligned to 16 bytes.
- 256-bit version:
p must be aligned to 32 bytes.
- In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
- In AVX (integer vectors) this intrinsic results in at least 2 instructions.
|
inline |
Stores a 128-bit or 256-bit vector (integer, 32-bit floating-point or 64-bit floating-point) to memory without polluting the caches, if possible.
- 128-bit version:
p must be aligned to 16 bytes.
- 256-bit version:
p must be aligned to 32 bytes.
- In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
- In AVX (integer vectors) this intrinsic results in at least 2 instructions.
|
inline |
Stores a 128-bit or 256-bit vector (integer, 32-bit floating-point or 64-bit floating-point) to memory without polluting the caches, if possible.
- 128-bit version:
p must be aligned to 16 bytes.
- 256-bit version:
p must be aligned to 32 bytes.
- In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
- In AVX (integer vectors) this intrinsic results in at least 2 instructions.
|
inline |
Stores a 128-bit or 256-bit vector (integer, 32-bit floating-point or 64-bit floating-point) to memory without polluting the caches, if possible.
- 128-bit version:
p must be aligned to 16 bytes.
- 256-bit version:
p must be aligned to 32 bytes.
- In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
- In AVX (integer vectors) this intrinsic results in at least 2 instructions.
|
inline |
Stores a 128-bit or 256-bit vector (integer, 32-bit floating-point or 64-bit floating-point) to memory without polluting the caches, if possible.
- 128-bit version:
p must be aligned to 16 bytes.
- 256-bit version:
p must be aligned to 32 bytes.
- In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
- In AVX (integer vectors) this intrinsic results in at least 2 instructions.
|
inline |
Stores a 128-bit or 256-bit vector (integer, 32-bit floating-point or 64-bit floating-point) to memory without polluting the caches, if possible.
- 128-bit version:
p must be aligned to 16 bytes.
- 256-bit version:
p must be aligned to 32 bytes.
- In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
- In AVX (integer vectors) this intrinsic results in at least 2 instructions.
Generated on Thu Oct 31 2013 04:08:51 for libsimdpp by 1.8.3.1