several vectors More...

Functions
void	simdpp::transpose2 (uint16x8 &a0, uint16x8 &a1)
	Transposes four 2x2 16-bit matrices within two int16x8 vectors. More...

void	simdpp::transpose2 (int16x8 &a0, int16x8 &a1)

void	simdpp::transpose2 (uint16x16 &a0, uint16x16 &a1)

void	simdpp::transpose2 (int16x16 &a0, int16x16 &a1)

void	simdpp::transpose8 (uint8x16 &a0, uint8x16 &a1, uint8x16 &a2, uint8x16 &a3, uint8x16 &a4, uint8x16 &a5, uint8x16 &a6, uint8x16 &a7)
	Transposes two 8x8 8-bit matrices within eight int8x16 vectors. More...

void	simdpp::transpose8 (int8x16 &a0, int8x16 &a1, int8x16 &a2, int8x16 &a3, int8x16 &a4, int8x16 &a5, int8x16 &a6, int8x16 &a7)

void	simdpp::transpose8 (uint8x32 &a0, uint8x32 &a1, uint8x32 &a2, uint8x32 &a3, uint8x32 &a4, uint8x32 &a5, uint8x32 &a6, uint8x32 &a7)

void	simdpp::transpose8 (int8x32 &a0, int8x32 &a1, int8x32 &a2, int8x32 &a3, int8x32 &a4, int8x32 &a5, int8x32 &a6, int8x32 &a7)

void	simdpp::transpose2 (uint32x4 &a0, uint32x4 &a1)
	Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...

void	simdpp::transpose2 (int32x4 &a0, int32x4 &a1)
	Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...

void	simdpp::transpose2 (uint32x8 &a0, uint32x8 &a1)
	Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...

void	simdpp::transpose2 (int32x8 &a0, int32x8 &a1)
	Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...

void	simdpp::transpose2 (uint64x2 &a0, uint64x2 &a1)
	Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...

void	simdpp::transpose2 (int64x2 &a0, int64x2 &a1)
	Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...

void	simdpp::transpose2 (uint64x4 &a0, uint64x4 &a1)
	Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...

void	simdpp::transpose2 (int64x4 &a0, int64x4 &a1)
	Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...

void	simdpp::transpose2 (float32x4 &a0, float32x4 &a1)
	Transposes two 2x2 32-bit matrices within two float32x4 vectors. More...

void	simdpp::transpose2 (float32x8 &a0, float32x8 &a1)
	Transposes two 2x2 32-bit matrices within two float32x4 vectors. More...

void	simdpp::transpose2 (float64x2 &a0, float64x2 &a1)
	Transposes a 2x2 64-bit matrix within two float64x2 vectors. More...

void	simdpp::transpose2 (float64x4 &a0, float64x4 &a1)
	Transposes a 2x2 64-bit matrix within two float64x2 vectors. More...

void	simdpp::transpose4 (uint8x16 &a0, uint8x16 &a1, uint8x16 &a2, uint8x16 &a3)
	Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...

void	simdpp::transpose4 (int8x16 &a0, int8x16 &a1, int8x16 &a2, int8x16 &a3)
	Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...

void	simdpp::transpose4 (uint32x8 &a0, uint32x8 &a1, uint32x8 &a2, uint32x8 &a3)
	Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...

void	simdpp::transpose4 (uint8x32 &a0, uint8x32 &a1, uint8x32 &a2, uint8x32 &a3)
	Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...

void	simdpp::transpose4 (int8x32 &a0, int8x32 &a1, int8x32 &a2, int8x32 &a3)
	Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...

void	simdpp::transpose4 (uint16x8 &a0, uint16x8 &a1, uint16x8 &a2, uint16x8 &a3)
	Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...

void	simdpp::transpose4 (int16x8 &a0, int16x8 &a1, int16x8 &a2, int16x8 &a3)
	Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...

void	simdpp::transpose4 (uint16x16 &a0, uint16x16 &a1, uint16x16 &a2, uint16x16 &a3)
	Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...

void	simdpp::transpose4 (int16x16 &a0, int16x16 &a1, int16x16 &a2, int16x16 &a3)
	Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...

void	simdpp::transpose4 (uint32x4 &a0, uint32x4 &a1, uint32x4 &a2, uint32x4 &a3)
	Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...

void	simdpp::transpose4 (int32x4 &a0, int32x4 &a1, int32x4 &a2, int32x4 &a3)
	Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...

void	simdpp::transpose4 (int32x8 &a0, int32x8 &a1, int32x8 &a2, int32x8 &a3)
	Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...

void	simdpp::transpose4 (float32x4 &a0, float32x4 &a1, float32x4 &a2, float32x4 &a3)
	Transposes a 4x4 32-bit matrix within four float32x4 vectors. More...

void	simdpp::transpose4 (float32x8 &a0, float32x8 &a1, float32x8 &a2, float32x8 &a3)
	Transposes a 4x4 32-bit matrix within four float32x4 vectors. More...

Detailed Description

several vectors

Function Documentation

void simdpp::transpose2	(	uint16x8 &	a0,
		uint16x8 &	a1
	)

inline

Transposes four 2x2 16-bit matrices within two int16x8 vectors.

r0 = [ a0_0; a1_0 ; ... ; a0_6; a1_6 ]

r1 = [ a0_1; a1_1 ; ... ; a0_7; a1_7 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 8 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 4-6 instructions.

void simdpp::transpose2	(	int16x8 &	a0,
		int16x8 &	a1
	)

inline

void simdpp::transpose2	(	uint16x16 &	a0,
		uint16x16 &	a1
	)

inline

void simdpp::transpose2	(	int16x16 &	a0,
		int16x16 &	a1
	)

inline

void simdpp::transpose2	(	uint32x4 &	a0,
		uint32x4 &	a1
	)

inline

Transposes two 2x2 32-bit matrices within two int32x4 vectors.

r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]

r1 = [ a0_1; a1_1 ; a0_3; a1_3 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 8 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 4-6 instructions.

void simdpp::transpose2	(	int32x4 &	a0,
		int32x4 &	a1
	)

inline

Transposes two 2x2 32-bit matrices within two int32x4 vectors.

r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]

r1 = [ a0_1; a1_1 ; a0_3; a1_3 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 8 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 4-6 instructions.

void simdpp::transpose2	(	uint32x8 &	a0,
		uint32x8 &	a1
	)

inline

Transposes two 2x2 32-bit matrices within two int32x4 vectors.

r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]

r1 = [ a0_1; a1_1 ; a0_3; a1_3 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 8 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 4-6 instructions.

void simdpp::transpose2	(	int32x8 &	a0,
		int32x8 &	a1
	)

inline

Transposes two 2x2 32-bit matrices within two int32x4 vectors.

r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]

r1 = [ a0_1; a1_1 ; a0_3; a1_3 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 8 instructions.
In AVX2 this intrinsic results in at least 4 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 4-6 instructions.

void simdpp::transpose2	(	uint64x2 &	a0,
		uint64x2 &	a1
	)

inline

Transposes a 2x2 64-bit matrix within two int64x2 vectors.

r0 = [ a0_0; a1_0 ]

r1 = [ a0_1; a1_1 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 4 instructions.
In AVX2 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 4-6 instructions.

void simdpp::transpose2	(	int64x2 &	a0,
		int64x2 &	a1
	)

inline

Transposes a 2x2 64-bit matrix within two int64x2 vectors.

r0 = [ a0_0; a1_0 ]

r1 = [ a0_1; a1_1 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 4 instructions.
In AVX2 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 4-6 instructions.

void simdpp::transpose2	(	uint64x4 &	a0,
		uint64x4 &	a1
	)

inline

Transposes a 2x2 64-bit matrix within two int64x2 vectors.

r0 = [ a0_0; a1_0 ]

r1 = [ a0_1; a1_1 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 4 instructions.
In AVX2 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 4-6 instructions.

void simdpp::transpose2	(	int64x4 &	a0,
		int64x4 &	a1
	)

inline

Transposes a 2x2 64-bit matrix within two int64x2 vectors.

r0 = [ a0_0; a1_0 ]

r1 = [ a0_1; a1_1 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 4 instructions.
In AVX2 this intrinsic results in at least 2 instructions.
In NEON this intrinsic results in at least 2 instructions.
In ALTIVEC this intrinsic results in at least 4-6 instructions.

void simdpp::transpose2	(	float32x4 &	a0,
		float32x4 &	a1
	)

inline

Transposes two 2x2 32-bit matrices within two float32x4 vectors.

r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]

r1 = [ a0_1; a1_1 ; a0_3; a1_3 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSE4.1 this intrinsic results in at least 8 instructions.
In AVX-AVX2 this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 4-6 instructions.
In NEON this intrinsic results in at least 2 instructions.

void simdpp::transpose2	(	float32x8 &	a0,
		float32x8 &	a1
	)

inline

Transposes two 2x2 32-bit matrices within two float32x4 vectors.

r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]

r1 = [ a0_1; a1_1 ; a0_3; a1_3 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 2-4 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSE4.1 this intrinsic results in at least 8 instructions.
In AVX-AVX2 this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 4-6 instructions.
In NEON this intrinsic results in at least 2 instructions.

void simdpp::transpose2	(	float64x2 &	a0,
		float64x2 &	a1
	)

inline

Transposes a 2x2 64-bit matrix within two float64x2 vectors.

r0 = [ a0_0; a1_0 ]

r1 = [ a0_1; a1_1 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSE4.1 this intrinsic results in at least 4 instructions.
In AVX-AVX2 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

void simdpp::transpose2	(	float64x4 &	a0,
		float64x4 &	a1
	)

inline

Transposes a 2x2 64-bit matrix within two float64x2 vectors.

r0 = [ a0_0; a1_0 ]

r1 = [ a0_1; a1_1 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-SSE4.1 this intrinsic results in at least 4 instructions.
In AVX-AVX2 this intrinsic results in at least 2 instructions.
Not vectorized in NEON and ALTIVEC.

void simdpp::transpose4	(	uint32x4 &	a0,
		uint32x4 &	a1,
		uint32x4 &	a2,
		uint32x4 &	a3
	)

inline

Transposes a 4x4 32-bit matrix within four int32x4 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 8-12 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 24 instructions.
In AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 8 instructions.
In ALTIVEC this intrinsic results in at least 16-20 instructions.

The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

void simdpp::transpose4	(	uint8x16 &	a0,
		uint8x16 &	a1,
		uint8x16 &	a2,
		uint8x16 &	a3
	)

inline

Transposes four 4x4 8-bit matrices within four int8x16 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ...]
r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ...]
r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ...]
r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ...]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 16 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 8-12 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 32 instructions.
In AVX2 this intrinsic results in at least 16 instructions.
In NEON this intrinsic results in at least 8 instructions.
In ALTIVEC this intrinsic results in at least 16-20 instructions.

void simdpp::transpose4	(	int8x16 &	a0,
		int8x16 &	a1,
		int8x16 &	a2,
		int8x16 &	a3
	)

inline

Transposes four 4x4 8-bit matrices within four int8x16 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ...]
r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ...]
r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ...]
r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ...]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 16 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 8-12 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 32 instructions.
In AVX2 this intrinsic results in at least 16 instructions.
In NEON this intrinsic results in at least 8 instructions.
In ALTIVEC this intrinsic results in at least 16-20 instructions.

void simdpp::transpose4	(	uint32x8 &	a0,
		uint32x8 &	a1,
		uint32x8 &	a2,
		uint32x8 &	a3
	)

inline

Transposes a 4x4 32-bit matrix within four int32x4 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 8-12 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 24 instructions.
In AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 8 instructions.
In ALTIVEC this intrinsic results in at least 16-20 instructions.

void simdpp::transpose4	(	uint8x32 &	a0,
		uint8x32 &	a1,
		uint8x32 &	a2,
		uint8x32 &	a3
	)

inline

Transposes four 4x4 8-bit matrices within four int8x16 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ...]
r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ...]
r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ...]
r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ...]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 16 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 8-12 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 32 instructions.
In AVX2 this intrinsic results in at least 16 instructions.
In NEON this intrinsic results in at least 8 instructions.
In ALTIVEC this intrinsic results in at least 16-20 instructions.

void simdpp::transpose4	(	int8x32 &	a0,
		int8x32 &	a1,
		int8x32 &	a2,
		int8x32 &	a3
	)

inline

Transposes four 4x4 8-bit matrices within four int8x16 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ...]
r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ...]
r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ...]
r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ...]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 16 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 8-12 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 32 instructions.
In AVX2 this intrinsic results in at least 16 instructions.
In NEON this intrinsic results in at least 8 instructions.
In ALTIVEC this intrinsic results in at least 16-20 instructions.

void simdpp::transpose4	(	uint16x8 &	a0,
		uint16x8 &	a1,
		uint16x8 &	a2,
		uint16x8 &	a3
	)

inline

Transposes two 4x4 16-bit matrices within four int16x8 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 8-12 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 24 instructions.
In AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 8 instructions.
In ALTIVEC this intrinsic results in at least 16-20 instructions.

void simdpp::transpose4	(	int16x8 &	a0,
		int16x8 &	a1,
		int16x8 &	a2,
		int16x8 &	a3
	)

inline

Transposes two 4x4 16-bit matrices within four int16x8 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 8-12 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 24 instructions.
In AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 8 instructions.
In ALTIVEC this intrinsic results in at least 16-20 instructions.

void simdpp::transpose4	(	uint16x16 &	a0,
		uint16x16 &	a1,
		uint16x16 &	a2,
		uint16x16 &	a3
	)

inline

Transposes two 4x4 16-bit matrices within four int16x8 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 8-12 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 24 instructions.
In AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 8 instructions.
In ALTIVEC this intrinsic results in at least 16-20 instructions.

void simdpp::transpose4	(	int16x16 &	a0,
		int16x16 &	a1,
		int16x16 &	a2,
		int16x16 &	a3
	)

inline

Transposes two 4x4 16-bit matrices within four int16x8 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 8-12 instructions.

256-bit version: The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

In SSE2-AVX this intrinsic results in at least 24 instructions.
In AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 8 instructions.
In ALTIVEC this intrinsic results in at least 16-20 instructions.

void simdpp::transpose4	(	int32x4 &	a0,
		int32x4 &	a1,
		int32x4 &	a2,
		int32x4 &	a3
	)

inline

Transposes a 4x4 32-bit matrix within four int32x4 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 8-12 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 24 instructions.
In AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 8 instructions.
In ALTIVEC this intrinsic results in at least 16-20 instructions.

The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

void simdpp::transpose4	(	int32x8 &	a0,
		int32x8 &	a1,
		int32x8 &	a2,
		int32x8 &	a3
	)

inline

Transposes a 4x4 32-bit matrix within four int32x4 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 8-12 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 24 instructions.
In AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 8 instructions.
In ALTIVEC this intrinsic results in at least 16-20 instructions.

The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

void simdpp::transpose4	(	float32x4 &	a0,
		float32x4 &	a1,
		float32x4 &	a2,
		float32x4 &	a3
	)

inline

Transposes a 4x4 32-bit matrix within four float32x4 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 8-12 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 24 instructions.
In AVX-AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 8 instructions.
In ALTIVEC this intrinsic results in at least 16-20 instructions.

The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

void simdpp::transpose4	(	float32x8 &	a0,
		float32x8 &	a1,
		float32x8 &	a2,
		float32x8 &	a3
	)

inline

Transposes a 4x4 32-bit matrix within four float32x4 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 4 instructions.
In ALTIVEC this intrinsic results in at least 8-12 instructions.

256-bit version:

In SSE2-SSE4.1 this intrinsic results in at least 24 instructions.
In AVX-AVX2 this intrinsic results in at least 12 instructions.
In NEON this intrinsic results in at least 8 instructions.
In ALTIVEC this intrinsic results in at least 16-20 instructions.

The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

void simdpp::transpose8	(	uint8x16 &	a0,
		uint8x16 &	a1,
		uint8x16 &	a2,
		uint8x16 &	a3,
		uint8x16 &	a4,
		uint8x16 &	a5,
		uint8x16 &	a6,
		uint8x16 &	a7
	)

inline

Transposes two 8x8 8-bit matrices within eight int8x16 vectors.

r0 = [ a0_0; ...; a7_0 ; a0_8; ...; a7_8 ]
r1 = [ a0_1; ...; a7_1 ; a0_9; ...; a7_9 ]
...
r7 = [ a0_7; ...; a7_7 ; a0_15; ...; a7_15 ]

128-bit version:

In SSE2-AVX2 this intrinsic results in at least 32 instructions.
In NEON this intrinsic results in at least 12 instructions.
In ALTIVEC this intrinsic results in at least 24-30 instructions.

256-bit version:

In SSE2-AVX this intrinsic results in at least 64 instructions.
In AVX2 this intrinsic results in at least 32 instructions.
In NEON this intrinsic results in at least 24 instructions.
In ALTIVEC this intrinsic results in at least 48-54 instructions.

The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.

void simdpp::transpose8	(	int8x16 &	a0,
		int8x16 &	a1,
		int8x16 &	a2,
		int8x16 &	a3,
		int8x16 &	a4,
		int8x16 &	a5,
		int8x16 &	a6,
		int8x16 &	a7
	)

inline

void simdpp::transpose8	(	uint8x32 &	a0,
		uint8x32 &	a1,
		uint8x32 &	a2,
		uint8x32 &	a3,
		uint8x32 &	a4,
		uint8x32 &	a5,
		uint8x32 &	a6,
		uint8x32 &	a7
	)

inline

void simdpp::transpose8	(	int8x32 &	a0,
		int8x32 &	a1,
		int8x32 &	a2,
		int8x32 &	a3,
		int8x32 &	a4,
		int8x32 &	a5,
		int8x32 &	a6,
		int8x32 &	a7
	)

inline

Functions

Detailed Description

Function Documentation