Functions
void	simdpp::store (void *p, int128 a)
	Stores a 128-bit or 256-bit integer vector to an aligned memory location. More...

void	simdpp::store (void *p, int256 a)

void	simdpp::store (float *p, float32x4 a)

void	simdpp::store (float *p, float32x8 a)

void	simdpp::store (double *p, float64x2 a)

void	simdpp::store (double *p, float64x4 a)

void	simdpp::stream (void *p, int128 a)
	Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More...

void	simdpp::stream (void *p, int256 a)
	Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More...

void	simdpp::stream (float *p, float32x4 a)
	Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More...

void	simdpp::stream (float *p, float32x8 a)
	Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More...

void	simdpp::stream (double *p, float64x2 a)
	Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More...

void	simdpp::stream (double *p, float64x4 a)
	Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible. More...

void	simdpp::store_first (void *p, basic_int8x16 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_first (void *p, basic_int8x32 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_first (void *p, basic_int16x8 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_first (void *p, basic_int16x16 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_first (void *p, basic_int32x4 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_first (void *p, basic_int32x8 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_first (void *p, basic_int64x2 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_first (void *p, basic_int64x4 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_first (float *p, float32x4 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_first (float *p, float32x8 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_first (double *p, float64x2 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_first (double *p, float64x4 a, unsigned n)
	Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_last (void *p, basic_int8x16 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_last (void *p, basic_int8x32 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_last (void *p, basic_int16x8 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_last (void *p, basic_int16x16 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_last (void *p, basic_int32x4 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_last (void *p, basic_int32x8 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_last (void *p, basic_int64x2 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_last (void *p, basic_int64x4 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_last (float *p, float32x4 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_last (float *p, float32x8 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_last (double *p, float64x2 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_last (double *p, float64x4 a, unsigned n)
	Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory. More...

void	simdpp::store_packed2 (void *p, basic_int8x16 a, basic_int8x16 b)
	Interleaves 8-bit values from two vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed2 (void *p, basic_int8x32 a, basic_int8x32 b)
	Interleaves 8-bit values from two vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed2 (void *p, basic_int16x8 a, basic_int16x8 b)
	Interleaves 16-bit values from two vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed2 (void *p, basic_int16x16 a, basic_int16x16 b)
	Interleaves 16-bit values from two vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed2 (void *p, basic_int32x4 a, basic_int32x4 b)
	Interleaves 32-bit values from two vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed2 (void *p, basic_int32x8 a, basic_int32x8 b)
	Interleaves 32-bit values from two vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed2 (void *p, basic_int64x2 a, basic_int64x2 b)
	Interleaves 64-bit values from two vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed2 (void *p, basic_int64x4 a, basic_int64x4 b)
	Interleaves 64-bit values from two vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed2 (float *p, float32x4 a, float32x4 b)
	Interleaves 32-bit floating-point values from two vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed2 (float *p, float32x8 a, float32x8 b)
	Interleaves 32-bit floating-point values from two vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed2 (double *p, float64x2 a, float64x2 b)
	Interleaves 64-bit floating-point values from two vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed2 (double *p, float64x4 a, float64x4 b)
	Interleaves 64-bit floating-point values from two vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed3 (void *p, basic_int8x16 a, basic_int8x16 b, basic_int8x16 c)
	Interleaves 8-bit values from three vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed3 (void *p, basic_int8x32 a, basic_int8x32 b, basic_int8x32 c)
	Interleaves 8-bit values from three vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed3 (void *p, basic_int16x8 a, basic_int16x8 b, basic_int16x8 c)
	Interleaves 16-bit values from three vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed3 (void *p, basic_int16x16 a, basic_int16x16 b, basic_int16x16 c)
	Interleaves 16-bit values from three vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed3 (void *p, basic_int32x4 a, basic_int32x4 b, basic_int32x4 c)
	Interleaves 32-bit values from three vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed3 (void *p, basic_int32x8 a, basic_int32x8 b, basic_int32x8 c)
	Interleaves 32-bit values from three vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed3 (void *p, basic_int64x2 a, basic_int64x2 b, basic_int64x2 c)
	Interleaves 64-bit values from three vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed3 (void *p, basic_int64x4 a, basic_int64x4 b, basic_int64x4 c)
	Interleaves 64-bit values from three vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed3 (float *p, float32x4 a, float32x4 b, float32x4 c)
	Interleaves 32-bit floating-point values from three vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed3 (float *p, float32x8 a, float32x8 b, float32x8 c)
	Interleaves 32-bit floating-point values from three vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed3 (double *p, float64x2 a, float64x2 b, float64x2 c)
	Interleaves 64-bit floating-point values from three vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed3 (double *p, float64x4 a, float64x4 b, float64x4 c)
	Interleaves 64-bit floating-point values from three vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed4 (void *p, basic_int8x16 a, basic_int8x16 b, basic_int8x16 c, basic_int8x16 d)
	Interleaves 8-bit values from four vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed4 (void *p, basic_int8x32 a, basic_int8x32 b, basic_int8x32 c, basic_int8x32 d)
	Interleaves 8-bit values from four vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed4 (void *p, basic_int16x8 a, basic_int16x8 b, basic_int16x8 c, basic_int16x8 d)
	Interleaves 16-bit values from four vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed4 (void *p, basic_int16x16 a, basic_int16x16 b, basic_int16x16 c, basic_int16x16 d)
	Interleaves 16-bit values from four vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed4 (void *p, basic_int32x4 a, basic_int32x4 b, basic_int32x4 c, basic_int32x4 d)
	Interleaves 32-bit values from four vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed4 (void *p, basic_int32x8 a, basic_int32x8 b, basic_int32x8 c, basic_int32x8 d)
	Interleaves 32-bit values from four vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed4 (void *p, basic_int64x2 a, basic_int64x2 b, basic_int64x2 c, basic_int64x2 d)
	Interleaves 64-bit values from four vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed4 (void *p, basic_int64x4 a, basic_int64x4 b, basic_int64x4 c, basic_int64x4 d)
	Interleaves 64-bit values from four vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed4 (float *p, float32x4 a, float32x4 b, float32x4 c, float32x4 d)
	Interleaves 32-bit floating-point values from four vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed4 (float *p, float32x8 a, float32x8 b, float32x8 c, float32x8 d)
	Interleaves 32-bit floating-point values from four vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed4 (double *p, float64x2 a, float64x2 b, float64x2 c, float64x2 d)
	Interleaves 64-bit floating-point values from four vectors and stores the result into successive locations starting from p. More...

void	simdpp::store_packed4 (double *p, float64x4 a, float64x4 b, float64x4 c, float64x4 d)
	Interleaves 64-bit floating-point values from four vectors and stores the result into successive locations starting from p. More...

Detailed Description

Function Documentation

void simdpp::store	(	void *	p,
		int128	a
	)

inline

Stores a 128-bit or 256-bit integer vector to an aligned memory location.

128-bit version:

*(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:

*(p) = a[0..255]

p must be aligned to 32 bytes.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In AVX (integer vectors) this intrinsic results in at least 2 instructions.

void simdpp::store	(	void *	p,
		int256	a
	)

inline

void simdpp::store	(	float *	p,
		float32x4	a
	)

inline

void simdpp::store	(	float *	p,
		float32x8	a
	)

inline

void simdpp::store	(	double *	p,
		float64x2	a
	)

inline

void simdpp::store	(	double *	p,
		float64x4	a
	)

inline

void simdpp::store_first	(	void *	p,
		basic_int8x16	a,
		unsigned	n
	)

inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_first	(	void *	p,
		basic_int8x32	a,
		unsigned	n
	)

inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_first	(	void *	p,
		basic_int16x8	a,
		unsigned	n
	)

inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_first	(	void *	p,
		basic_int16x16	a,
		unsigned	n
	)

inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_first	(	void *	p,
		basic_int32x4	a,
		unsigned	n
	)

inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_first	(	void *	p,
		basic_int32x8	a,
		unsigned	n
	)

inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_first	(	void *	p,
		basic_int64x2	a,
		unsigned	n
	)

inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_first	(	void *	p,
		basic_int64x4	a,
		unsigned	n
	)

inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_first	(	float *	p,
		float32x4	a,
		unsigned	n
	)

inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_first	(	float *	p,
		float32x8	a,
		unsigned	n
	)

inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_first	(	double *	p,
		float64x2	a,
		unsigned	n
	)

inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_first	(	double *	p,
		float64x4	a,
		unsigned	n
	)

inline

Stores the first n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p) = a0
*(p+1) = a1
...
*(p+n-1) = a{n-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_last	(	void *	p,
		basic_int8x16	a,
		unsigned	n
	)

inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_last	(	void *	p,
		basic_int8x32	a,
		unsigned	n
	)

inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_last	(	void *	p,
		basic_int16x8	a,
		unsigned	n
	)

inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_last	(	void *	p,
		basic_int16x16	a,
		unsigned	n
	)

inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_last	(	void *	p,
		basic_int32x4	a,
		unsigned	n
	)

inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_last	(	void *	p,
		basic_int32x8	a,
		unsigned	n
	)

inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_last	(	void *	p,
		basic_int64x2	a,
		unsigned	n
	)

inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_last	(	void *	p,
		basic_int64x4	a,
		unsigned	n
	)

inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_last	(	float *	p,
		float32x4	a,
		unsigned	n
	)

inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_last	(	float *	p,
		float32x8	a,
		unsigned	n
	)

inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_last	(	double *	p,
		float64x2	a,
		unsigned	n
	)

inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_last	(	double *	p,
		float64x4	a,
		unsigned	n
	)

inline

Stores the last n elements of a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory.

n must be in range [0..N-1] where N is the number of elements in the vector. If n is zero, no store is made.

*(p+N-n) = a{N-n}
...
*(p+N-2) = a{N-2}
*(p+N-1) = a{N-1}

This function results in several instructions. It is best not to use it in inner loops.

128-bit version:: p must be aligned to 16 bytes.

256-bit version:: p must be aligned to 32 bytes.

void simdpp::store_packed2	(	void *	p,
		basic_int8x16	a,
		basic_int8x16	b
	)

inline

Interleaves 8-bit values from two vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+2), *(p+4), ... , *(p+30) ] = a

[ *(p+1), *(p+3), *(p+5), ... , *(p+31) ] = b

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+2), *(p+4), ... , *(p+62) ] = a

[ *(p+1), *(p+3), *(p+5), ... , *(p+63) ] = b

p must be aligned to 32 bytes.

void simdpp::store_packed2	(	void *	p,
		basic_int8x32	a,
		basic_int8x32	b
	)

inline

Interleaves 8-bit values from two vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+2), *(p+4), ... , *(p+30) ] = a

[ *(p+1), *(p+3), *(p+5), ... , *(p+31) ] = b

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+2), *(p+4), ... , *(p+62) ] = a

[ *(p+1), *(p+3), *(p+5), ... , *(p+63) ] = b

p must be aligned to 32 bytes.

void simdpp::store_packed2	(	void *	p,
		basic_int16x8	a,
		basic_int16x8	b
	)

inline

Interleaves 16-bit values from two vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a

[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+2), *(p+4), ... , *(p+30) ] = a

[ *(p+1), *(p+3), *(p+5), ... , *(p+31) ] = b

p must be aligned to 32 bytes.

void simdpp::store_packed2	(	void *	p,
		basic_int16x16	a,
		basic_int16x16	b
	)

inline

Interleaves 16-bit values from two vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a

[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+2), *(p+4), ... , *(p+30) ] = a

[ *(p+1), *(p+3), *(p+5), ... , *(p+31) ] = b

p must be aligned to 32 bytes.

void simdpp::store_packed2	(	void *	p,
		basic_int32x4	a,
		basic_int32x4	b
	)

inline

Interleaves 32-bit values from two vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+2), *(p+4), *(p+6) ] = a

[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a

[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b

p must be aligned to 32 bytes.

void simdpp::store_packed2	(	void *	p,
		basic_int32x8	a,
		basic_int32x8	b
	)

inline

Interleaves 32-bit values from two vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+2), *(p+4), *(p+6) ] = a

[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a

[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b

p must be aligned to 32 bytes.

void simdpp::store_packed2	(	void *	p,
		basic_int64x2	a,
		basic_int64x2	b
	)

inline

Interleaves 64-bit values from two vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+2) ] = a

[ *(p+1), *(p+3) ] = b

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+2), *(p+4), *(p+6) ] = a

[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b

p must be aligned to 32 bytes.

void simdpp::store_packed2	(	void *	p,
		basic_int64x4	a,
		basic_int64x4	b
	)

inline

Interleaves 64-bit values from two vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+2) ] = a

[ *(p+1), *(p+3) ] = b

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+2), *(p+4), *(p+6) ] = a

[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b

p must be aligned to 32 bytes.

void simdpp::store_packed2	(	float *	p,
		float32x4	a,
		float32x4	b
	)

inline

Interleaves 32-bit floating-point values from two vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+2), *(p+4), *(p+6) ] = a

[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a

[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b

p must be aligned to 32 bytes.

void simdpp::store_packed2	(	float *	p,
		float32x8	a,
		float32x8	b
	)

inline

Interleaves 32-bit floating-point values from two vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+2), *(p+4), *(p+6) ] = a

[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+2), *(p+4), ... , *(p+14) ] = a

[ *(p+1), *(p+3), *(p+5), ... , *(p+15) ] = b

p must be aligned to 32 bytes.

void simdpp::store_packed2	(	double *	p,
		float64x2	a,
		float64x2	b
	)

inline

Interleaves 64-bit floating-point values from two vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+2) ] = a

[ *(p+1), *(p+3) ] = b

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+2), *(p+4), *(p+6) ] = a

[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b

p must be aligned to 32 bytes.

void simdpp::store_packed2	(	double *	p,
		float64x4	a,
		float64x4	b
	)

inline

Interleaves 64-bit floating-point values from two vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+2) ] = a

[ *(p+1), *(p+3) ] = b

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+2), *(p+4), *(p+6) ] = a

[ *(p+1), *(p+3), *(p+5), *(p+7) ] = b

p must be aligned to 32 bytes.

void simdpp::store_packed3	(	void *	p,
		basic_int8x16	a,
		basic_int8x16	b,
		basic_int8x16	c
	)

inline

Interleaves 8-bit values from three vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+3), *(p+6), ... , *(p+45) ] = a

[ *(p+1), *(p+4), *(p+7), ... , *(p+46) ] = b

[ *(p+2), *(p+5), *(p+8), ... , *(p+47) ] = c

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+3), *(p+6), ... , *(p+93) ] = a

[ *(p+1), *(p+4), *(p+7), ... , *(p+94) ] = b

[ *(p+2), *(p+5), *(p+8), ... , *(p+95) ] = c

p must be aligned to 32 bytes.

void simdpp::store_packed3	(	void *	p,
		basic_int8x32	a,
		basic_int8x32	b,
		basic_int8x32	c
	)

inline

Interleaves 8-bit values from three vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+3), *(p+6), ... , *(p+45) ] = a

[ *(p+1), *(p+4), *(p+7), ... , *(p+46) ] = b

[ *(p+2), *(p+5), *(p+8), ... , *(p+47) ] = c

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+3), *(p+6), ... , *(p+93) ] = a

[ *(p+1), *(p+4), *(p+7), ... , *(p+94) ] = b

[ *(p+2), *(p+5), *(p+8), ... , *(p+95) ] = c

p must be aligned to 32 bytes.

void simdpp::store_packed3	(	void *	p,
		basic_int16x8	a,
		basic_int16x8	b,
		basic_int16x8	c
	)

inline

Interleaves 16-bit values from three vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a

[ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b

[ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+3), *(p+6), ... , *(p+45) ] = a

[ *(p+1), *(p+4), *(p+7), ... , *(p+46) ] = b

[ *(p+2), *(p+5), *(p+8), ... , *(p+47) ] = c

p must be aligned to 32 bytes.

void simdpp::store_packed3	(	void *	p,
		basic_int16x16	a,
		basic_int16x16	b,
		basic_int16x16	c
	)

inline

Interleaves 16-bit values from three vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a

[ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b

[ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+3), *(p+6), ... , *(p+45) ] = a

[ *(p+1), *(p+4), *(p+7), ... , *(p+46) ] = b

[ *(p+2), *(p+5), *(p+8), ... , *(p+47) ] = c

p must be aligned to 32 bytes.

void simdpp::store_packed3	(	void *	p,
		basic_int32x4	a,
		basic_int32x4	b,
		basic_int32x4	c
	)

inline

Interleaves 32-bit values from three vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+3), *(p+6), *(p+9) ] = a

[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b

[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a

[ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b

[ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c

p must be aligned to 32 bytes.

void simdpp::store_packed3	(	void *	p,
		basic_int32x8	a,
		basic_int32x8	b,
		basic_int32x8	c
	)

inline

Interleaves 32-bit values from three vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+3), *(p+6), *(p+9) ] = a

[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b

[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a

[ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b

[ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c

p must be aligned to 32 bytes.

void simdpp::store_packed3	(	void *	p,
		basic_int64x2	a,
		basic_int64x2	b,
		basic_int64x2	c
	)

inline

Interleaves 64-bit values from three vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+3) ] = a

[ *(p+1), *(p+4) ] = b

[ *(p+2), *(p+5) ] = c

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+3), *(p+6), *(p+9) ] = a

[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b

[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c

p must be aligned to 32 bytes.

void simdpp::store_packed3	(	void *	p,
		basic_int64x4	a,
		basic_int64x4	b,
		basic_int64x4	c
	)

inline

Interleaves 64-bit values from three vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+3) ] = a

[ *(p+1), *(p+4) ] = b

[ *(p+2), *(p+5) ] = c

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+3), *(p+6), *(p+9) ] = a

[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b

[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c

p must be aligned to 32 bytes.

void simdpp::store_packed3	(	float *	p,
		float32x4	a,
		float32x4	b,
		float32x4	c
	)

inline

Interleaves 32-bit floating-point values from three vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+3), *(p+6), *(p+9) ] = a

[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b

[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a

[ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b

[ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c

p must be aligned to 32 bytes.

void simdpp::store_packed3	(	float *	p,
		float32x8	a,
		float32x8	b,
		float32x8	c
	)

inline

Interleaves 32-bit floating-point values from three vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+3), *(p+6), *(p+9) ] = a

[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b

[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+3), *(p+6), ... , *(p+21) ] = a

[ *(p+1), *(p+4), *(p+7), ... , *(p+22) ] = b

[ *(p+2), *(p+5), *(p+8), ... , *(p+23) ] = c

p must be aligned to 32 bytes.

void simdpp::store_packed3	(	double *	p,
		float64x2	a,
		float64x2	b,
		float64x2	c
	)

inline

Interleaves 64-bit floating-point values from three vectors and stores the result into successive locations starting from p.

The 64-bit floating-point values are written to memory packed in triplets; this is the inverse of loading packed triplets and de-interleaving them into three vectors.

128-bit version:: [ *(p), *(p+3) ] = a

[ *(p+1), *(p+4) ] = b

[ *(p+2), *(p+5) ] = c

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+3), *(p+6), *(p+9) ] = a

[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b

[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c

p must be aligned to 32 bytes.

void simdpp::store_packed3	(	double *	p,
		float64x4	a,
		float64x4	b,
		float64x4	c
	)

inline

Interleaves 64-bit floating-point values from three vectors and stores the result into successive locations starting from p.

The 64-bit floating-point values are written to memory packed in triplets; this is the inverse of loading packed triplets and de-interleaving them into three vectors.

128-bit version:: [ *(p), *(p+3) ] = a

[ *(p+1), *(p+4) ] = b

[ *(p+2), *(p+5) ] = c

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+3), *(p+6), *(p+9) ] = a

[ *(p+1), *(p+4), *(p+7), *(p+10) ] = b

[ *(p+2), *(p+5), *(p+8), *(p+11) ] = c

p must be aligned to 32 bytes.

void simdpp::store_packed4	(	void *	p,
		basic_int8x16	a,
		basic_int8x16	b,
		basic_int8x16	c,
		basic_int8x16	d
	)

inline

Interleaves 8-bit values from four vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+4), *(p+8), ... , *(p+60) ] = a

[ *(p+1), *(p+5), *(p+9), ... , *(p+61) ] = b

[ *(p+2), *(p+6), *(p+10), ... , *(p+62) ] = c

[ *(p+3), *(p+7), *(p+11), ... , *(p+63) ] = d

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+4), *(p+8), ... , *(p+124) ] = a

[ *(p+1), *(p+5), *(p+9), ... , *(p+125) ] = b

[ *(p+2), *(p+6), *(p+10), ... , *(p+126) ] = c

[ *(p+3), *(p+7), *(p+11), ... , *(p+127) ] = d

p must be aligned to 32 bytes.

void simdpp::store_packed4	(	void *	p,
		basic_int8x32	a,
		basic_int8x32	b,
		basic_int8x32	c,
		basic_int8x32	d
	)

inline

Interleaves 8-bit values from four vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+4), *(p+8), ... , *(p+60) ] = a

[ *(p+1), *(p+5), *(p+9), ... , *(p+61) ] = b

[ *(p+2), *(p+6), *(p+10), ... , *(p+62) ] = c

[ *(p+3), *(p+7), *(p+11), ... , *(p+63) ] = d

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+4), *(p+8), ... , *(p+124) ] = a

[ *(p+1), *(p+5), *(p+9), ... , *(p+125) ] = b

[ *(p+2), *(p+6), *(p+10), ... , *(p+126) ] = c

[ *(p+3), *(p+7), *(p+11), ... , *(p+127) ] = d

p must be aligned to 32 bytes.

void simdpp::store_packed4	(	void *	p,
		basic_int16x8	a,
		basic_int16x8	b,
		basic_int16x8	c,
		basic_int16x8	d
	)

inline

Interleaves 16-bit values from four vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a

[ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b

[ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c

[ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+4), *(p+8), ... , *(p+60) ] = a

[ *(p+1), *(p+5), *(p+9), ... , *(p+61) ] = b

[ *(p+2), *(p+6), *(p+10), ... , *(p+62) ] = c

[ *(p+3), *(p+7), *(p+11), ... , *(p+63) ] = d

p must be aligned to 32 bytes.

void simdpp::store_packed4	(	void *	p,
		basic_int16x16	a,
		basic_int16x16	b,
		basic_int16x16	c,
		basic_int16x16	d
	)

inline

Interleaves 16-bit values from four vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a

[ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b

[ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c

[ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+4), *(p+8), ... , *(p+60) ] = a

[ *(p+1), *(p+5), *(p+9), ... , *(p+61) ] = b

[ *(p+2), *(p+6), *(p+10), ... , *(p+62) ] = c

[ *(p+3), *(p+7), *(p+11), ... , *(p+63) ] = d

p must be aligned to 32 bytes.

void simdpp::store_packed4	(	void *	p,
		basic_int32x4	a,
		basic_int32x4	b,
		basic_int32x4	c,
		basic_int32x4	d
	)

inline

Interleaves 32-bit values from four vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+4), *(p+8), *(p+12) ] = a

[ *(p+1), *(p+5), *(p+9), *(p+13) ] = b

[ *(p+2), *(p+6), *(p+10), *(p+14) ] = c

[ *(p+3), *(p+7), *(p+11), *(p+15) ] = d

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a

[ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b

[ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c

[ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d

p must be aligned to 32 bytes.

void simdpp::store_packed4	(	void *	p,
		basic_int32x8	a,
		basic_int32x8	b,
		basic_int32x8	c,
		basic_int32x8	d
	)

inline

Interleaves 32-bit values from four vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+4), *(p+8), *(p+12) ] = a

[ *(p+1), *(p+5), *(p+9), *(p+13) ] = b

[ *(p+2), *(p+6), *(p+10), *(p+14) ] = c

[ *(p+3), *(p+7), *(p+11), *(p+15) ] = d

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a

[ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b

[ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c

[ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d

p must be aligned to 32 bytes.

void simdpp::store_packed4	(	void *	p,
		basic_int64x2	a,
		basic_int64x2	b,
		basic_int64x2	c,
		basic_int64x2	d
	)

inline

Interleaves 64-bit values from four vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+4) ] = a

[ *(p+1), *(p+5) ] = b

[ *(p+2), *(p+6) ] = c

[ *(p+3), *(p+7) ] = d

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+4), *(p+8), *(p+12) ] = a

[ *(p+1), *(p+5), *(p+9), *(p+13) ] = b

[ *(p+2), *(p+6), *(p+10), *(p+14) ] = c

[ *(p+3), *(p+7), *(p+11), *(p+15) ] = d

p must be aligned to 32 bytes.

void simdpp::store_packed4	(	void *	p,
		basic_int64x4	a,
		basic_int64x4	b,
		basic_int64x4	c,
		basic_int64x4	d
	)

inline

Interleaves 64-bit values from four vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+4) ] = a

[ *(p+1), *(p+5) ] = b

[ *(p+2), *(p+6) ] = c

[ *(p+3), *(p+7) ] = d

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+4), *(p+8), *(p+12) ] = a

[ *(p+1), *(p+5), *(p+9), *(p+13) ] = b

[ *(p+2), *(p+6), *(p+10), *(p+14) ] = c

[ *(p+3), *(p+7), *(p+11), *(p+15) ] = d

p must be aligned to 32 bytes.

void simdpp::store_packed4	(	float *	p,
		float32x4	a,
		float32x4	b,
		float32x4	c,
		float32x4	d
	)

inline

Interleaves 32-bit floating-point values from four vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+4), *(p+8), *(p+12) ] = a

[ *(p+1), *(p+5), *(p+9), *(p+13) ] = b

[ *(p+2), *(p+6), *(p+10), *(p+14) ] = c

[ *(p+3), *(p+7), *(p+11), *(p+15) ] = d

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a

[ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b

[ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c

[ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d

p must be aligned to 32 bytes.

void simdpp::store_packed4	(	float *	p,
		float32x8	a,
		float32x8	b,
		float32x8	c,
		float32x8	d
	)

inline

Interleaves 32-bit floating-point values from four vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+4), *(p+8), *(p+12) ] = a

[ *(p+1), *(p+5), *(p+9), *(p+13) ] = b

[ *(p+2), *(p+6), *(p+10), *(p+14) ] = c

[ *(p+3), *(p+7), *(p+11), *(p+15) ] = d

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+4), *(p+8), ... , *(p+28) ] = a

[ *(p+1), *(p+5), *(p+9), ... , *(p+29) ] = b

[ *(p+2), *(p+6), *(p+10), ... , *(p+30) ] = c

[ *(p+3), *(p+7), *(p+11), ... , *(p+31) ] = d

p must be aligned to 32 bytes.

void simdpp::store_packed4	(	double *	p,
		float64x2	a,
		float64x2	b,
		float64x2	c,
		float64x2	d
	)

inline

Interleaves 64-bit floating-point values from four vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+4) ] = a

[ *(p+1), *(p+5) ] = b

[ *(p+2), *(p+6) ] = c

[ *(p+3), *(p+7) ] = d

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+4), *(p+8), *(p+12) ] = a

[ *(p+1), *(p+5), *(p+9), *(p+13) ] = b

[ *(p+2), *(p+6), *(p+10), *(p+14) ] = c

[ *(p+3), *(p+7), *(p+11), *(p+15) ] = d

p must be aligned to 32 bytes.

void simdpp::store_packed4	(	double *	p,
		float64x4	a,
		float64x4	b,
		float64x4	c,
		float64x4	d
	)

inline

Interleaves 64-bit floating-point values from four vectors and stores the result into successive locations starting from p.

128-bit version:: [ *(p), *(p+4) ] = a

[ *(p+1), *(p+5) ] = b

[ *(p+2), *(p+6) ] = c

[ *(p+3), *(p+7) ] = d

p must be aligned to 16 bytes.

256-bit version:: [ *(p), *(p+4), *(p+8), *(p+12) ] = a

[ *(p+1), *(p+5), *(p+9), *(p+13) ] = b

[ *(p+2), *(p+6), *(p+10), *(p+14) ] = c

[ *(p+3), *(p+7), *(p+11), *(p+15) ] = d

p must be aligned to 32 bytes.

void simdpp::stream	(	void *	p,
		int128	a
	)

inline

Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible.

128-bit version:

*(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:

*(p) = a[0..255]

p must be aligned to 32 bytes.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In AVX (integer vectors) this intrinsic results in at least 2 instructions.

void simdpp::stream	(	void *	p,
		int256	a
	)

inline

Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible.

128-bit version:

*(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:

*(p) = a[0..255]

p must be aligned to 32 bytes.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In AVX (integer vectors) this intrinsic results in at least 2 instructions.

void simdpp::stream	(	float *	p,
		float32x4	a
	)

inline

Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible.

128-bit version:

*(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:

*(p) = a[0..255]

p must be aligned to 32 bytes.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In AVX (integer vectors) this intrinsic results in at least 2 instructions.

void simdpp::stream	(	float *	p,
		float32x8	a
	)

inline

Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible.

128-bit version:

*(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:

*(p) = a[0..255]

p must be aligned to 32 bytes.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In AVX (integer vectors) this intrinsic results in at least 2 instructions.

void simdpp::stream	(	double *	p,
		float64x2	a
	)

inline

Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible.

128-bit version:

*(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:

*(p) = a[0..255]

p must be aligned to 32 bytes.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In AVX (integer vectors) this intrinsic results in at least 2 instructions.

void simdpp::stream	(	double *	p,
		float64x4	a
	)

inline

Stores a 128-bit or 256-bit integer, 32-bit or 64-bit floating point vector to memory without polluting the caches, if possible.

128-bit version:

*(p) = a[0..127]

p must be aligned to 16 bytes.

256-bit version:

*(p) = a[0..255]

p must be aligned to 32 bytes.

In SSE2-SSE4.1, NEON and ALTIVEC this intrinsic results in at least 2 instructions.
In AVX (integer vectors) this intrinsic results in at least 2 instructions.

Functions

Detailed Description

Function Documentation