libsimdpp  0.9.3
Operations: store a register to memory

Functions

void simdpp::store (void *p, int128 a)
 Stores a 128-bit or 256-bit integer vector to an aligned memory location. More...
 
void simdpp::store (void *p, int256 a)
 
void simdpp::store (float *p, float32x4 a)
 
void simdpp::store (float *p, float32x8 a)
 
void simdpp::store (double *p, float64x2 a)
 
void simdpp::store (double *p, float64x4 a)
 
void simdpp::stream (void *p, int128 a)
 Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More...
 
void simdpp::stream (void *p, int256 a)
 Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More...
 
void simdpp::stream (float *p, float32x4 a)
 Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More...
 
void simdpp::stream (float *p, float32x8 a)
 Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More...
 
void simdpp::stream (double *p, float64x2 a)
 Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More...
 
void simdpp::stream (double *p, float64x4 a)
 Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More...
 
void simdpp::store_first (void *p, basic_int8x16 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_first (void *p, basic_int8x32 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_first (void *p, basic_int16x8 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_first (void *p, basic_int16x16 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_first (void *p, basic_int32x4 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_first (void *p, basic_int32x8 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_first (void *p, basic_int64x2 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_first (void *p, basic_int64x4 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_first (float *p, float32x4 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_first (float *p, float32x8 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_first (double *p, float64x2 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_first (double *p, float64x4 a, unsigned n)
 Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_last (void *p, basic_int8x16 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_last (void *p, basic_int8x32 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_last (void *p, basic_int16x8 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_last (void *p, basic_int16x16 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_last (void *p, basic_int32x4 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_last (void *p, basic_int32x8 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_last (void *p, basic_int64x2 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_last (void *p, basic_int64x4 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_last (float *p, float32x4 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_last (float *p, float32x8 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_last (double *p, float64x2 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_last (double *p, float64x4 a, unsigned n)
 Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...
 
void simdpp::store_packed2 (void *p, basic_int8x16 a, basic_int8x16 b)
 Interleaves 8-bit values from two vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed2 (void *p, basic_int8x32 a, basic_int8x32 b)
 Interleaves 8-bit values from two vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed2 (void *p, basic_int16x8 a, basic_int16x8 b)
 Interleaves 16-bit values from two vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed2 (void *p, basic_int16x16 a, basic_int16x16 b)
 Interleaves 16-bit values from two vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed2 (void *p, basic_int32x4 a, basic_int32x4 b)
 Interleaves 32-bit values from two vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed2 (void *p, basic_int32x8 a, basic_int32x8 b)
 Interleaves 32-bit values from two vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed2 (void *p, basic_int64x2 a, basic_int64x2 b)
 Interleaves 64-bit values from two vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed2 (void *p, basic_int64x4 a, basic_int64x4 b)
 Interleaves 64-bit values from two vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed2 (float *p, float32x4 a, float32x4 b)
 Interleaves 32-bit floating-point values from two vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed2 (float *p, float32x8 a, float32x8 b)
 Interleaves 32-bit floating-point values from two vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed2 (double *p, float64x2 a, float64x2 b)
 Interleaves 64-bit floating-point values from two vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed2 (double *p, float64x4 a, float64x4 b)
 Interleaves 64-bit floating-point values from two vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed3 (void *p, basic_int8x16 a, basic_int8x16 b, basic_int8x16 c)
 Interleaves 8-bit values from three vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed3 (void *p, basic_int8x32 a, basic_int8x32 b, basic_int8x32 c)
 Interleaves 8-bit values from three vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed3 (void *p, basic_int16x8 a, basic_int16x8 b, basic_int16x8 c)
 Interleaves 16-bit values from three vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed3 (void *p, basic_int16x16 a, basic_int16x16 b, basic_int16x16 c)
 Interleaves 16-bit values from three vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed3 (void *p, basic_int32x4 a, basic_int32x4 b, basic_int32x4 c)
 Interleaves 32-bit values from three vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed3 (void *p, basic_int32x8 a, basic_int32x8 b, basic_int32x8 c)
 Interleaves 32-bit values from three vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed3 (void *p, basic_int64x2 a, basic_int64x2 b, basic_int64x2 c)
 Interleaves 64-bit values from three vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed3 (void *p, basic_int64x4 a, basic_int64x4 b, basic_int64x4 c)
 Interleaves 64-bit values from three vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed3 (float *p, float32x4 a, float32x4 b, float32x4 c)
 Interleaves 32-bit floating-point values from three vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed3 (float *p, float32x8 a, float32x8 b, float32x8 c)
 Interleaves 32-bit floating-point values from three vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed3 (double *p, float64x2 a, float64x2 b, float64x2 c)
 Interleaves 64-bit floating-point values from three vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed3 (double *p, float64x4 a, float64x4 b, float64x4 c)
 Interleaves 64-bit floating-point values from three vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed4 (void *p, basic_int8x16 a, basic_int8x16 b, basic_int8x16 c, basic_int8x16 d)
 Interleaves 8-bit values from four vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed4 (void *p, basic_int8x32 a, basic_int8x32 b, basic_int8x32 c, basic_int8x32 d)
 Interleaves 8-bit values from four vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed4 (void *p, basic_int16x8 a, basic_int16x8 b, basic_int16x8 c, basic_int16x8 d)
 Interleaves 16-bit values from four vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed4 (void *p, basic_int16x16 a, basic_int16x16 b, basic_int16x16 c, basic_int16x16 d)
 Interleaves 16-bit values from four vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed4 (void *p, basic_int32x4 a, basic_int32x4 b, basic_int32x4 c, basic_int32x4 d)
 Interleaves 32-bit values from four vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed4 (void *p, basic_int32x8 a, basic_int32x8 b, basic_int32x8 c, basic_int32x8 d)
 Interleaves 32-bit values from four vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed4 (void *p, basic_int64x2 a, basic_int64x2 b, basic_int64x2 c, basic_int64x2 d)
 Interleaves 64-bit values from four vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed4 (void *p, basic_int64x4 a, basic_int64x4 b, basic_int64x4 c, basic_int64x4 d)
 Interleaves 64-bit values from four vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed4 (float *p, float32x4 a, float32x4 b, float32x4 c, float32x4 d)
 Interleaves 32-bit floating-point values from four vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed4 (float *p, float32x8 a, float32x8 b, float32x8 c, float32x8 d)
 Interleaves 32-bit floating-point values from four vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed4 (double *p, float64x2 a, float64x2 b, float64x2 c, float64x2 d)
 Interleaves 64-bit floating-point values from four vectors and stores the result into successive locations starting from p. More...
 
void simdpp::store_packed4 (double *p, float64x4 a, float64x4 b, float64x4 c, float64x4 d)
 Interleaves 64-bit floating-point values from four vectors and stores the result into successive locations starting from p. More...
 

Detailed Description

Function Documentation

void simdpp::store ( void *  p,
int128  a 
)
inline

Stores a 128-bit or 256-bit integer vector to an aligned memory location.

128-bit version:
*(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:
*(p) = a[0..255]

p must be aligned to 32 bytes.

  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
  • In AVX (integer vectors) this intrinsic results in at least 2 instructions.
void simdpp::store ( void *  p,
int256  a 
)
inline
void simdpp::store ( float *  p,
float32x4  a 
)
inline
void simdpp::store ( float *  p,
float32x8  a 
)
inline
void simdpp::store ( double *  p,
float64x2  a 
)
inline
void simdpp::store ( double *  p,
float64x4  a 
)
inline
void simdpp::store_first ( void *  p,
basic_int8x16  a,
unsigned  n 
)
inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_first ( void *  p,
basic_int8x32  a,
unsigned  n 
)
inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_first ( void *  p,
basic_int16x8  a,
unsigned  n 
)
inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_first ( void *  p,
basic_int16x16  a,
unsigned  n 
)
inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_first ( void *  p,
basic_int32x4  a,
unsigned  n 
)
inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_first ( void *  p,
basic_int32x8  a,
unsigned  n 
)
inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_first ( void *  p,
basic_int64x2  a,
unsigned  n 
)
inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_first ( void *  p,
basic_int64x4  a,
unsigned  n 
)
inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_first ( float *  p,
float32x4  a,
unsigned  n 
)
inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_first ( float *  p,
float32x8  a,
unsigned  n 
)
inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_first ( double *  p,
float64x2  a,
unsigned  n 
)
inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_first ( double *  p,
float64x4  a,
unsigned  n 
)
inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_last ( void *  p,
basic_int8x16  a,
unsigned  n 
)
inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_last ( void *  p,
basic_int8x32  a,
unsigned  n 
)
inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_last ( void *  p,
basic_int16x8  a,
unsigned  n 
)
inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_last ( void *  p,
basic_int16x16  a,
unsigned  n 
)
inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_last ( void *  p,
basic_int32x4  a,
unsigned  n 
)
inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_last ( void *  p,
basic_int32x8  a,
unsigned  n 
)
inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_last ( void *  p,
basic_int64x2  a,
unsigned  n 
)
inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_last ( void *  p,
basic_int64x4  a,
unsigned  n 
)
inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_last ( float *  p,
float32x4  a,
unsigned  n 
)
inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_last ( float *  p,
float32x8  a,
unsigned  n 
)
inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_last ( double *  p,
float64x2  a,
unsigned  n 
)
inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_last ( double *  p,
float64x4  a,
unsigned  n 
)
inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:
p must be aligned to 16 bytes.
256-bit version:
p must be aligned to 32 bytes.
void simdpp::store_packed2 ( void *  p,
basic_int8x16  a,
basic_int8x16  b 
)
inline

Interleaves 8-bit values from two vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+2), *(p+4), ... , *(p+30) ] = a
[ *(p+1), *(p+3), *(p+5), ... , *(p+31) ] = b
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+2), *(p+4), ... , *(p+62) ] = a
[ *(p+1), *(p+3), *(p+5), ... , *(p+63) ] = b
p must be aligned to 32 bytes.
void simdpp::store_packed2 ( void *  p,
basic_int8x32  a,
basic_int8x32  b 
)
inline

Interleaves 8-bit values from two vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+2), *(p+4), ... , *(p+30) ] = a
[ *(p+1), *(p+3), *(p+5), ... , *(p+31) ] = b
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+2), *(p+4), ... , *(p+62) ] = a
[ *(p+1), *(p+3), *(p+5), ... , *(p+63) ] = b
p must be aligned to 32 bytes.
void simdpp::store_packed2 ( void *  p,
basic_int16x8  a,
basic_int16x8  b 
)
inline

Interleaves 16-bit values from two vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a
[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+2), *(p+4), ... , *(p+30) ] = a
[ *(p+1), *(p+3), *(p+5), ... , *(p+31) ] = b
p must be aligned to 32 bytes.
void simdpp::store_packed2 ( void *  p,
basic_int16x16  a,
basic_int16x16  b 
)
inline

Interleaves 16-bit values from two vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a
[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+2), *(p+4), ... , *(p+30) ] = a
[ *(p+1), *(p+3), *(p+5), ... , *(p+31) ] = b
p must be aligned to 32 bytes.
void simdpp::store_packed2 ( void *  p,
basic_int32x4  a,
basic_int32x4  b 
)
inline

Interleaves 32-bit values from two vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+2), *(p+4), *(p+6) ] = a
[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a
[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b
p must be aligned to 32 bytes.
void simdpp::store_packed2 ( void *  p,
basic_int32x8  a,
basic_int32x8  b 
)
inline

Interleaves 32-bit values from two vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+2), *(p+4), *(p+6) ] = a
[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a
[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b
p must be aligned to 32 bytes.
void simdpp::store_packed2 ( void *  p,
basic_int64x2  a,
basic_int64x2  b 
)
inline

Interleaves 64-bit values from two vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+2) ] = a
[ *(p+1), *(p+3) ] = b
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+2), *(p+4), *(p+6) ] = a
[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b
p must be aligned to 32 bytes.
void simdpp::store_packed2 ( void *  p,
basic_int64x4  a,
basic_int64x4  b 
)
inline

Interleaves 64-bit values from two vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+2) ] = a
[ *(p+1), *(p+3) ] = b
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+2), *(p+4), *(p+6) ] = a
[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b
p must be aligned to 32 bytes.
void simdpp::store_packed2 ( float *  p,
float32x4  a,
float32x4  b 
)
inline

Interleaves 32-bit floating-point values from two vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+2), *(p+4), *(p+6) ] = a
[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a
[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b
p must be aligned to 32 bytes.
void simdpp::store_packed2 ( float *  p,
float32x8  a,
float32x8  b 
)
inline

Interleaves 32-bit floating-point values from two vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+2), *(p+4), *(p+6) ] = a
[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a
[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b
p must be aligned to 32 bytes.
void simdpp::store_packed2 ( double *  p,
float64x2  a,
float64x2  b 
)
inline

Interleaves 64-bit floating-point values from two vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+2) ] = a
[ *(p+1), *(p+3) ] = b
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+2), *(p+4), *(p+6) ] = a
[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b
p must be aligned to 32 bytes.
void simdpp::store_packed2 ( double *  p,
float64x4  a,
float64x4  b 
)
inline

Interleaves 64-bit floating-point values from two vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+2) ] = a
[ *(p+1), *(p+3) ] = b
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+2), *(p+4), *(p+6) ] = a
[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b
p must be aligned to 32 bytes.
void simdpp::store_packed3 ( void *  p,
basic_int8x16  a,
basic_int8x16  b,
basic_int8x16  c 
)
inline

Interleaves 8-bit values from three vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+3), *(p+6), ... , *(p+45) ] = a
[ *(p+1), *(p+4), *(p+7), ... , *(p+46) ] = b
[ *(p+2), *(p+5), *(p+8), ... , *(p+47) ] = c
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+3), *(p+6), ... , *(p+93) ] = a
[ *(p+1), *(p+4), *(p+7), ... , *(p+94) ] = b
[ *(p+2), *(p+5), *(p+8), ... , *(p+95) ] = c
p must be aligned to 32 bytes.
void simdpp::store_packed3 ( void *  p,
basic_int8x32  a,
basic_int8x32  b,
basic_int8x32  c 
)
inline

Interleaves 8-bit values from three vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+3), *(p+6), ... , *(p+45) ] = a
[ *(p+1), *(p+4), *(p+7), ... , *(p+46) ] = b
[ *(p+2), *(p+5), *(p+8), ... , *(p+47) ] = c
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+3), *(p+6), ... , *(p+93) ] = a
[ *(p+1), *(p+4), *(p+7), ... , *(p+94) ] = b
[ *(p+2), *(p+5), *(p+8), ... , *(p+95) ] = c
p must be aligned to 32 bytes.
void simdpp::store_packed3 ( void *  p,
basic_int16x8  a,
basic_int16x8  b,
basic_int16x8  c 
)
inline

Interleaves 16-bit values from three vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a
[ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b
[ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+3), *(p+6), ... , *(p+45) ] = a
[ *(p+1), *(p+4), *(p+7), ... , *(p+46) ] = b
[ *(p+2), *(p+5), *(p+8), ... , *(p+47) ] = c
p must be aligned to 32 bytes.
void simdpp::store_packed3 ( void *  p,
basic_int16x16  a,
basic_int16x16  b,
basic_int16x16  c 
)
inline

Interleaves 16-bit values from three vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a
[ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b
[ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+3), *(p+6), ... , *(p+45) ] = a
[ *(p+1), *(p+4), *(p+7), ... , *(p+46) ] = b
[ *(p+2), *(p+5), *(p+8), ... , *(p+47) ] = c
p must be aligned to 32 bytes.
void simdpp::store_packed3 ( void *  p,
basic_int32x4  a,
basic_int32x4  b,
basic_int32x4  c 
)
inline

Interleaves 32-bit values from three vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+3), *(p+6), *(p+9) ] = a
[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b
[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a
[ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b
[ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c
p must be aligned to 32 bytes.
void simdpp::store_packed3 ( void *  p,
basic_int32x8  a,
basic_int32x8  b,
basic_int32x8  c 
)
inline

Interleaves 32-bit values from three vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+3), *(p+6), *(p+9) ] = a
[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b
[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a
[ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b
[ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c
p must be aligned to 32 bytes.
void simdpp::store_packed3 ( void *  p,
basic_int64x2  a,
basic_int64x2  b,
basic_int64x2  c 
)
inline

Interleaves 64-bit values from three vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+3) ] = a
[ *(p+1), *(p+4) ] = b
[ *(p+2), *(p+5) ] = c
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+3), *(p+6), *(p+9) ] = a
[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b
[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c
p must be aligned to 32 bytes.
void simdpp::store_packed3 ( void *  p,
basic_int64x4  a,
basic_int64x4  b,
basic_int64x4  c 
)
inline

Interleaves 64-bit values from three vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+3) ] = a
[ *(p+1), *(p+4) ] = b
[ *(p+2), *(p+5) ] = c
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+3), *(p+6), *(p+9) ] = a
[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b
[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c
p must be aligned to 32 bytes.
void simdpp::store_packed3 ( float *  p,
float32x4  a,
float32x4  b,
float32x4  c 
)
inline

Interleaves 32-bit floating-point values from three vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+3), *(p+6), *(p+9) ] = a
[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b
[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a
[ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b
[ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c
p must be aligned to 32 bytes.
void simdpp::store_packed3 ( float *  p,
float32x8  a,
float32x8  b,
float32x8  c 
)
inline

Interleaves 32-bit floating-point values from three vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+3), *(p+6), *(p+9) ] = a
[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b
[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a
[ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b
[ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c
p must be aligned to 32 bytes.
void simdpp::store_packed3 ( double *  p,
float64x2  a,
float64x2  b,
float64x2  c 
)
inline

Interleaves 64-bit floating-point values from three vectors and stores the result into successive locations starting from p.

The values are written to memory packed in triplets: each group of three consecutive locations holds one element from a, b and c, in that order.

128-bit version:
[ *(p), *(p+3) ] = a
[ *(p+1), *(p+4) ] = b
[ *(p+2), *(p+5) ] = c
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+3), *(p+6), *(p+9) ] = a
[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b
[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c
p must be aligned to 32 bytes.
void simdpp::store_packed3 ( double *  p,
float64x4  a,
float64x4  b,
float64x4  c 
)
inline

Interleaves 64-bit floating-point values from three vectors and stores the result into successive locations starting from p.

The values are written to memory packed in triplets: each group of three consecutive locations holds one element from a, b and c, in that order.

128-bit version:
[ *(p), *(p+3) ] = a
[ *(p+1), *(p+4) ] = b
[ *(p+2), *(p+5) ] = c
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+3), *(p+6), *(p+9) ] = a
[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b
[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c
p must be aligned to 32 bytes.
void simdpp::store_packed4 ( void *  p,
basic_int8x16  a,
basic_int8x16  b,
basic_int8x16  c,
basic_int8x16  d 
)
inline

Interleaves 8-bit values from four vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+4), *(p+8), ... , *(p+60) ] = a
[ *(p+1), *(p+5), *(p+9), ... , *(p+61) ] = b
[ *(p+2), *(p+6), *(p+10), ... , *(p+62) ] = c
[ *(p+3), *(p+7), *(p+11), ... , *(p+63) ] = d
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+4), *(p+8), ... , *(p+124) ] = a
[ *(p+1), *(p+5), *(p+9), ... , *(p+125) ] = b
[ *(p+2), *(p+6), *(p+10), ... , *(p+126) ] = c
[ *(p+3), *(p+7), *(p+11), ... , *(p+127) ] = d
p must be aligned to 32 bytes.
void simdpp::store_packed4 ( void *  p,
basic_int8x32  a,
basic_int8x32  b,
basic_int8x32  c,
basic_int8x32  d 
)
inline

Interleaves 8-bit values from four vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+4), *(p+8), ... , *(p+60) ] = a
[ *(p+1), *(p+5), *(p+9), ... , *(p+61) ] = b
[ *(p+2), *(p+6), *(p+10), ... , *(p+62) ] = c
[ *(p+3), *(p+7), *(p+11), ... , *(p+63) ] = d
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+4), *(p+8), ... , *(p+124) ] = a
[ *(p+1), *(p+5), *(p+9), ... , *(p+125) ] = b
[ *(p+2), *(p+6), *(p+10), ... , *(p+126) ] = c
[ *(p+3), *(p+7), *(p+11), ... , *(p+127) ] = d
p must be aligned to 32 bytes.
void simdpp::store_packed4 ( void *  p,
basic_int16x8  a,
basic_int16x8  b,
basic_int16x8  c,
basic_int16x8  d 
)
inline

Interleaves 16-bit values from four vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a
[ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b
[ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c
[ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+4), *(p+8), ... , *(p+60) ] = a
[ *(p+1), *(p+5), *(p+9), ... , *(p+61) ] = b
[ *(p+2), *(p+6), *(p+10), ... , *(p+62) ] = c
[ *(p+3), *(p+7), *(p+11), ... , *(p+63) ] = d
p must be aligned to 32 bytes.
void simdpp::store_packed4 ( void *  p,
basic_int16x16  a,
basic_int16x16  b,
basic_int16x16  c,
basic_int16x16  d 
)
inline

Interleaves 16-bit values from four vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a
[ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b
[ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c
[ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+4), *(p+8), ... , *(p+60) ] = a
[ *(p+1), *(p+5), *(p+9), ... , *(p+61) ] = b
[ *(p+2), *(p+6), *(p+10), ... , *(p+62) ] = c
[ *(p+3), *(p+7), *(p+11), ... , *(p+63) ] = d
p must be aligned to 32 bytes.
void simdpp::store_packed4 ( void *  p,
basic_int32x4  a,
basic_int32x4  b,
basic_int32x4  c,
basic_int32x4  d 
)
inline

Interleaves 32-bit values from four vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+4), *(p+8), *(p+12) ] = a
[ *(p+1), *(p+5), *(p+9), *(p+13) ] = b
[ *(p+2), *(p+6), *(p+10), *(p+14) ] = c
[ *(p+3), *(p+7), *(p+11), *(p+15) ] = d
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a
[ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b
[ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c
[ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d
p must be aligned to 32 bytes.
void simdpp::store_packed4 ( void *  p,
basic_int32x8  a,
basic_int32x8  b,
basic_int32x8  c,
basic_int32x8  d 
)
inline

Interleaves 32-bit values from four vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+4), *(p+8), *(p+12) ] = a
[ *(p+1), *(p+5), *(p+9), *(p+13) ] = b
[ *(p+2), *(p+6), *(p+10), *(p+14) ] = c
[ *(p+3), *(p+7), *(p+11), *(p+15) ] = d
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a
[ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b
[ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c
[ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d
p must be aligned to 32 bytes.
void simdpp::store_packed4 ( void *  p,
basic_int64x2  a,
basic_int64x2  b,
basic_int64x2  c,
basic_int64x2  d 
)
inline

Interleaves 64-bit values from four vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+4) ] = a
[ *(p+1), *(p+5) ] = b
[ *(p+2), *(p+6) ] = c
[ *(p+3), *(p+7) ] = d
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+4), *(p+8), *(p+12) ] = a
[ *(p+1), *(p+5), *(p+9), *(p+13) ] = b
[ *(p+2), *(p+6), *(p+10), *(p+14) ] = c
[ *(p+3), *(p+7), *(p+11), *(p+15) ] = d
p must be aligned to 32 bytes.
void simdpp::store_packed4 ( void *  p,
basic_int64x4  a,
basic_int64x4  b,
basic_int64x4  c,
basic_int64x4  d 
)
inline

Interleaves 64-bit values from four vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+4) ] = a
[ *(p+1), *(p+5) ] = b
[ *(p+2), *(p+6) ] = c
[ *(p+3), *(p+7) ] = d
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+4), *(p+8), *(p+12) ] = a
[ *(p+1), *(p+5), *(p+9), *(p+13) ] = b
[ *(p+2), *(p+6), *(p+10), *(p+14) ] = c
[ *(p+3), *(p+7), *(p+11), *(p+15) ] = d
p must be aligned to 32 bytes.
void simdpp::store_packed4 ( float *  p,
float32x4  a,
float32x4  b,
float32x4  c,
float32x4  d 
)
inline

Interleaves 32-bit floating-point values from four vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+4), *(p+8), *(p+12) ] = a
[ *(p+1), *(p+5), *(p+9), *(p+13) ] = b
[ *(p+2), *(p+6), *(p+10), *(p+14) ] = c
[ *(p+3), *(p+7), *(p+11), *(p+15) ] = d
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a
[ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b
[ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c
[ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d
p must be aligned to 32 bytes.
void simdpp::store_packed4 ( float *  p,
float32x8  a,
float32x8  b,
float32x8  c,
float32x8  d 
)
inline

Interleaves 32-bit floating-point values from four vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+4), *(p+8), *(p+12) ] = a
[ *(p+1), *(p+5), *(p+9), *(p+13) ] = b
[ *(p+2), *(p+6), *(p+10), *(p+14) ] = c
[ *(p+3), *(p+7), *(p+11), *(p+15) ] = d
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a
[ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b
[ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c
[ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d
p must be aligned to 32 bytes.
void simdpp::store_packed4 ( double *  p,
float64x2  a,
float64x2  b,
float64x2  c,
float64x2  d 
)
inline

Interleaves 64-bit floating-point values from four vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+4) ] = a
[ *(p+1), *(p+5) ] = b
[ *(p+2), *(p+6) ] = c
[ *(p+3), *(p+7) ] = d
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+4), *(p+8), *(p+12) ] = a
[ *(p+1), *(p+5), *(p+9), *(p+13) ] = b
[ *(p+2), *(p+6), *(p+10), *(p+14) ] = c
[ *(p+3), *(p+7), *(p+11), *(p+15) ] = d
p must be aligned to 32 bytes.
void simdpp::store_packed4 ( double *  p,
float64x4  a,
float64x4  b,
float64x4  c,
float64x4  d 
)
inline

Interleaves 64-bit floating-point values from four vectors and stores the result into successive locations starting from p.

128-bit version:
[ *(p), *(p+4) ] = a
[ *(p+1), *(p+5) ] = b
[ *(p+2), *(p+6) ] = c
[ *(p+3), *(p+7) ] = d
p must be aligned to 16 bytes.
256-bit version:
[ *(p), *(p+4), *(p+8), *(p+12) ] = a
[ *(p+1), *(p+5), *(p+9), *(p+13) ] = b
[ *(p+2), *(p+6), *(p+10), *(p+14) ] = c
[ *(p+3), *(p+7), *(p+11), *(p+15) ] = d
p must be aligned to 32 bytes.
void simdpp::stream ( void *  p,
int128  a 
)
inline

Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible.

128-bit version:
*(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:
*(p) = a[0..255]

p must be aligned to 32 bytes.

  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
  • In AVX (integer vectors) this intrinsic results in at least 2 instructions.
void simdpp::stream ( void *  p,
int256  a 
)
inline

Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible.

128-bit version:
*(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:
*(p) = a[0..255]

p must be aligned to 32 bytes.

  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
  • In AVX (integer vectors) this intrinsic results in at least 2 instructions.
void simdpp::stream ( float *  p,
float32x4  a 
)
inline

Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible.

128-bit version:
*(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:
*(p) = a[0..255]

p must be aligned to 32 bytes.

  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
  • In AVX (integer vectors) this intrinsic results in at least 2 instructions.
void simdpp::stream ( float *  p,
float32x8  a 
)
inline

Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible.

128-bit version:
*(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:
*(p) = a[0..255]

p must be aligned to 32 bytes.

  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
  • In AVX (integer vectors) this intrinsic results in at least 2 instructions.
void simdpp::stream ( double *  p,
float64x2  a 
)
inline

Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible.

128-bit version:
*(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:
*(p) = a[0..255]

p must be aligned to 32 bytes.

  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
  • In AVX (integer vectors) this intrinsic results in at least 2 instructions.
void simdpp::stream ( double *  p,
float64x4  a 
)
inline

Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible.

128-bit version:
*(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:
*(p) = a[0..255]

p must be aligned to 32 bytes.

  • In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
  • In AVX (integer vectors) this intrinsic results in at least 2 instructions.