libsimdpp  1.0
Operations: transpose matrices consisting of several vectors More...

Functions

void simdpp::transpose2 (uint16x8 &a0, uint16x8 &a1)
 Transposes four 2x2 16-bit matrices within two int16x8 vectors. More...
 
void simdpp::transpose2 (int16x8 &a0, int16x8 &a1)
 
void simdpp::transpose2 (uint16x16 &a0, uint16x16 &a1)
 
void simdpp::transpose2 (int16x16 &a0, int16x16 &a1)
 
void simdpp::transpose8 (uint8x16 &a0, uint8x16 &a1, uint8x16 &a2, uint8x16 &a3, uint8x16 &a4, uint8x16 &a5, uint8x16 &a6, uint8x16 &a7)
 Transposes two 8x8 8-bit matrices within eight int8x16 vectors. More...
 
void simdpp::transpose8 (int8x16 &a0, int8x16 &a1, int8x16 &a2, int8x16 &a3, int8x16 &a4, int8x16 &a5, int8x16 &a6, int8x16 &a7)
 
void simdpp::transpose8 (uint8x32 &a0, uint8x32 &a1, uint8x32 &a2, uint8x32 &a3, uint8x32 &a4, uint8x32 &a5, uint8x32 &a6, uint8x32 &a7)
 
void simdpp::transpose8 (int8x32 &a0, int8x32 &a1, int8x32 &a2, int8x32 &a3, int8x32 &a4, int8x32 &a5, int8x32 &a6, int8x32 &a7)
 
void simdpp::transpose2 (uint32x4 &a0, uint32x4 &a1)
 Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...
 
void simdpp::transpose2 (int32x4 &a0, int32x4 &a1)
 Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...
 
void simdpp::transpose2 (uint32x8 &a0, uint32x8 &a1)
 Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...
 
void simdpp::transpose2 (int32x8 &a0, int32x8 &a1)
 Transposes two 2x2 32-bit matrices within two int32x4 vectors. More...
 
void simdpp::transpose2 (uint64x2 &a0, uint64x2 &a1)
 Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...
 
void simdpp::transpose2 (int64x2 &a0, int64x2 &a1)
 Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...
 
void simdpp::transpose2 (uint64x4 &a0, uint64x4 &a1)
 Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...
 
void simdpp::transpose2 (int64x4 &a0, int64x4 &a1)
 Transposes a 2x2 64-bit matrix within two int64x2 vectors. More...
 
void simdpp::transpose2 (float32x4 &a0, float32x4 &a1)
 Transposes two 2x2 32-bit matrices within two float32x4 vectors. More...
 
void simdpp::transpose2 (float32x8 &a0, float32x8 &a1)
 Transposes two 2x2 32-bit matrices within two float32x4 vectors. More...
 
void simdpp::transpose2 (float64x2 &a0, float64x2 &a1)
 Transposes a 2x2 64-bit matrix within two float64x2 vectors. More...
 
void simdpp::transpose2 (float64x4 &a0, float64x4 &a1)
 Transposes a 2x2 64-bit matrix within two float64x2 vectors. More...
 
void simdpp::transpose4 (uint8x16 &a0, uint8x16 &a1, uint8x16 &a2, uint8x16 &a3)
 Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...
 
void simdpp::transpose4 (int8x16 &a0, int8x16 &a1, int8x16 &a2, int8x16 &a3)
 Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...
 
void simdpp::transpose4 (uint32x8 &a0, uint32x8 &a1, uint32x8 &a2, uint32x8 &a3)
 Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...
 
void simdpp::transpose4 (uint8x32 &a0, uint8x32 &a1, uint8x32 &a2, uint8x32 &a3)
 Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...
 
void simdpp::transpose4 (int8x32 &a0, int8x32 &a1, int8x32 &a2, int8x32 &a3)
 Transposes four 4x4 8-bit matrices within four int8x16 vectors. More...
 
void simdpp::transpose4 (uint16x8 &a0, uint16x8 &a1, uint16x8 &a2, uint16x8 &a3)
 Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...
 
void simdpp::transpose4 (int16x8 &a0, int16x8 &a1, int16x8 &a2, int16x8 &a3)
 Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...
 
void simdpp::transpose4 (uint16x16 &a0, uint16x16 &a1, uint16x16 &a2, uint16x16 &a3)
 Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...
 
void simdpp::transpose4 (int16x16 &a0, int16x16 &a1, int16x16 &a2, int16x16 &a3)
 Transposes two 4x4 16-bit matrices within four int16x8 vectors. More...
 
void simdpp::transpose4 (uint32x4 &a0, uint32x4 &a1, uint32x4 &a2, uint32x4 &a3)
 Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...
 
void simdpp::transpose4 (int32x4 &a0, int32x4 &a1, int32x4 &a2, int32x4 &a3)
 Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...
 
void simdpp::transpose4 (int32x8 &a0, int32x8 &a1, int32x8 &a2, int32x8 &a3)
 Transposes a 4x4 32-bit matrix within four int32x4 vectors. More...
 
void simdpp::transpose4 (float32x4 &a0, float32x4 &a1, float32x4 &a2, float32x4 &a3)
 Transposes a 4x4 32-bit matrix within four float32x4 vectors. More...
 
void simdpp::transpose4 (float32x8 &a0, float32x8 &a1, float32x8 &a2, float32x8 &a3)
 Transposes a 4x4 32-bit matrix within four float32x4 vectors. More...
 

Detailed Description

Operations that transpose matrices consisting of several vectors.

Function Documentation

void simdpp::transpose2 ( uint16x8 &  a0,
uint16x8 &  a1 
)
inline

Transposes four 2x2 16-bit matrices within two int16x8 vectors.

r0 = [ a0_0; a1_0 ; ... ; a0_6; a1_6 ]
r1 = [ a0_1; a1_1 ; ... ; a0_7; a1_7 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 8 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 4-6 instructions.
void simdpp::transpose2 ( int16x8 &  a0,
int16x8 &  a1 
)
inline
void simdpp::transpose2 ( uint16x16 &  a0,
uint16x16 &  a1 
)
inline
void simdpp::transpose2 ( int16x16 &  a0,
int16x16 &  a1 
)
inline
void simdpp::transpose2 ( uint32x4 &  a0,
uint32x4 &  a1 
)
inline

Transposes two 2x2 32-bit matrices within two int32x4 vectors.

r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]
r1 = [ a0_1; a1_1 ; a0_3; a1_3 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 8 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 4-6 instructions.
void simdpp::transpose2 ( int32x4 &  a0,
int32x4 &  a1 
)
inline

Transposes two 2x2 32-bit matrices within two int32x4 vectors.

r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]
r1 = [ a0_1; a1_1 ; a0_3; a1_3 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 8 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 4-6 instructions.
void simdpp::transpose2 ( uint32x8 &  a0,
uint32x8 &  a1 
)
inline

Transposes two 2x2 32-bit matrices within two int32x4 vectors.

r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]
r1 = [ a0_1; a1_1 ; a0_3; a1_3 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 8 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 4-6 instructions.
void simdpp::transpose2 ( int32x8 &  a0,
int32x8 &  a1 
)
inline

Transposes two 2x2 32-bit matrices within two int32x4 vectors.

r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]
r1 = [ a0_1; a1_1 ; a0_3; a1_3 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 8 instructions.
  • In AVX2 this intrinsic results in at least 4 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 4-6 instructions.
void simdpp::transpose2 ( uint64x2 &  a0,
uint64x2 &  a1 
)
inline

Transposes a 2x2 64-bit matrix within two int64x2 vectors.

r0 = [ a0_0; a1_0 ]
r1 = [ a0_1; a1_1 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 4 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 4-6 instructions.
void simdpp::transpose2 ( int64x2 &  a0,
int64x2 &  a1 
)
inline

Transposes a 2x2 64-bit matrix within two int64x2 vectors.

r0 = [ a0_0; a1_0 ]
r1 = [ a0_1; a1_1 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 4 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 4-6 instructions.
void simdpp::transpose2 ( uint64x4 &  a0,
uint64x4 &  a1 
)
inline

Transposes a 2x2 64-bit matrix within two int64x2 vectors.

r0 = [ a0_0; a1_0 ]
r1 = [ a0_1; a1_1 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 4 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 4-6 instructions.
void simdpp::transpose2 ( int64x4 &  a0,
int64x4 &  a1 
)
inline

Transposes a 2x2 64-bit matrix within two int64x2 vectors.

r0 = [ a0_0; a1_0 ]
r1 = [ a0_1; a1_1 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 4 instructions.
  • In AVX2 this intrinsic results in at least 2 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 4-6 instructions.
void simdpp::transpose2 ( float32x4 &  a0,
float32x4 &  a1 
)
inline

Transposes two 2x2 32-bit matrices within two float32x4 vectors.

r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]
r1 = [ a0_1; a1_1 ; a0_3; a1_3 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-SSE4.1 this intrinsic results in at least 8 instructions.
  • In AVX-AVX2 this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 4-6 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
void simdpp::transpose2 ( float32x8 &  a0,
float32x8 &  a1 
)
inline

Transposes two 2x2 32-bit matrices within two float32x4 vectors.

r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]
r1 = [ a0_1; a1_1 ; a0_3; a1_3 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 2-4 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-SSE4.1 this intrinsic results in at least 8 instructions.
  • In AVX-AVX2 this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 4-6 instructions.
  • In NEON this intrinsic results in at least 2 instructions.
void simdpp::transpose2 ( float64x2 &  a0,
float64x2 &  a1 
)
inline

Transposes a 2x2 64-bit matrix within two float64x2 vectors.

r0 = [ a0_0; a1_0 ]
r1 = [ a0_1; a1_1 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-SSE4.1 this intrinsic results in at least 4 instructions.
  • In AVX-AVX2 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
void simdpp::transpose2 ( float64x4 &  a0,
float64x4 &  a1 
)
inline

Transposes a 2x2 64-bit matrix within two float64x2 vectors.

r0 = [ a0_0; a1_0 ]
r1 = [ a0_1; a1_1 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-SSE4.1 this intrinsic results in at least 4 instructions.
  • In AVX-AVX2 this intrinsic results in at least 2 instructions.
  • Not vectorized in NEON and ALTIVEC.
void simdpp::transpose4 ( uint32x4 &  a0,
uint32x4 &  a1,
uint32x4 &  a2,
uint32x4 &  a3 
)
inline

Transposes a 4x4 32-bit matrix within four int32x4 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 8-12 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 24 instructions.
  • In AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 8 instructions.
  • In ALTIVEC this intrinsic results in at least 16-20 instructions.
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
void simdpp::transpose4 ( uint8x16 &  a0,
uint8x16 &  a1,
uint8x16 &  a2,
uint8x16 &  a3 
)
inline

Transposes four 4x4 8-bit matrices within four int8x16 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ...]
r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ...]
r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ...]
r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ...]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 16 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 8-12 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 32 instructions.
  • In AVX2 this intrinsic results in at least 16 instructions.
  • In NEON this intrinsic results in at least 8 instructions.
  • In ALTIVEC this intrinsic results in at least 16-20 instructions.
void simdpp::transpose4 ( int8x16 &  a0,
int8x16 &  a1,
int8x16 &  a2,
int8x16 &  a3 
)
inline

Transposes four 4x4 8-bit matrices within four int8x16 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ...]
r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ...]
r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ...]
r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ...]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 16 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 8-12 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 32 instructions.
  • In AVX2 this intrinsic results in at least 16 instructions.
  • In NEON this intrinsic results in at least 8 instructions.
  • In ALTIVEC this intrinsic results in at least 16-20 instructions.
void simdpp::transpose4 ( uint32x8 &  a0,
uint32x8 &  a1,
uint32x8 &  a2,
uint32x8 &  a3 
)
inline

Transposes a 4x4 32-bit matrix within four int32x4 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 8-12 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 24 instructions.
  • In AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 8 instructions.
  • In ALTIVEC this intrinsic results in at least 16-20 instructions.
void simdpp::transpose4 ( uint8x32 &  a0,
uint8x32 &  a1,
uint8x32 &  a2,
uint8x32 &  a3 
)
inline

Transposes four 4x4 8-bit matrices within four int8x16 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ...]
r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ...]
r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ...]
r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ...]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 16 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 8-12 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 32 instructions.
  • In AVX2 this intrinsic results in at least 16 instructions.
  • In NEON this intrinsic results in at least 8 instructions.
  • In ALTIVEC this intrinsic results in at least 16-20 instructions.
void simdpp::transpose4 ( int8x32 &  a0,
int8x32 &  a1,
int8x32 &  a2,
int8x32 &  a3 
)
inline

Transposes four 4x4 8-bit matrices within four int8x16 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ...]
r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ...]
r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ...]
r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ...]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 16 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 8-12 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 32 instructions.
  • In AVX2 this intrinsic results in at least 16 instructions.
  • In NEON this intrinsic results in at least 8 instructions.
  • In ALTIVEC this intrinsic results in at least 16-20 instructions.
void simdpp::transpose4 ( uint16x8 &  a0,
uint16x8 &  a1,
uint16x8 &  a2,
uint16x8 &  a3 
)
inline

Transposes two 4x4 16-bit matrices within four int16x8 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 8-12 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 24 instructions.
  • In AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 8 instructions.
  • In ALTIVEC this intrinsic results in at least 16-20 instructions.
void simdpp::transpose4 ( int16x8 &  a0,
int16x8 &  a1,
int16x8 &  a2,
int16x8 &  a3 
)
inline

Transposes two 4x4 16-bit matrices within four int16x8 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 8-12 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 24 instructions.
  • In AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 8 instructions.
  • In ALTIVEC this intrinsic results in at least 16-20 instructions.
void simdpp::transpose4 ( uint16x16 &  a0,
uint16x16 &  a1,
uint16x16 &  a2,
uint16x16 &  a3 
)
inline

Transposes two 4x4 16-bit matrices within four int16x8 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 8-12 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 24 instructions.
  • In AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 8 instructions.
  • In ALTIVEC this intrinsic results in at least 16-20 instructions.
void simdpp::transpose4 ( int16x16 &  a0,
int16x16 &  a1,
int16x16 &  a2,
int16x16 &  a3 
)
inline

Transposes two 4x4 16-bit matrices within four int16x8 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 8-12 instructions.
256-bit version:
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
  • In SSE2-AVX this intrinsic results in at least 24 instructions.
  • In AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 8 instructions.
  • In ALTIVEC this intrinsic results in at least 16-20 instructions.
void simdpp::transpose4 ( int32x4 &  a0,
int32x4 &  a1,
int32x4 &  a2,
int32x4 &  a3 
)
inline

Transposes a 4x4 32-bit matrix within four int32x4 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 8-12 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 24 instructions.
  • In AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 8 instructions.
  • In ALTIVEC this intrinsic results in at least 16-20 instructions.
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
void simdpp::transpose4 ( int32x8 &  a0,
int32x8 &  a1,
int32x8 &  a2,
int32x8 &  a3 
)
inline

Transposes a 4x4 32-bit matrix within four int32x4 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 8-12 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 24 instructions.
  • In AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 8 instructions.
  • In ALTIVEC this intrinsic results in at least 16-20 instructions.
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
void simdpp::transpose4 ( float32x4 &  a0,
float32x4 &  a1,
float32x4 &  a2,
float32x4 &  a3 
)
inline

Transposes a 4x4 32-bit matrix within four float32x4 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 8-12 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 24 instructions.
  • In AVX-AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 8 instructions.
  • In ALTIVEC this intrinsic results in at least 16-20 instructions.
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
void simdpp::transpose4 ( float32x8 &  a0,
float32x8 &  a1,
float32x8 &  a2,
float32x8 &  a3 
)
inline

Transposes a 4x4 32-bit matrix within four float32x4 vectors.

r0 = [ a0_0; a1_0; a2_0; a3_0 ]
r1 = [ a0_1; a1_1; a2_1; a3_1 ]
r2 = [ a0_2; a1_2; a2_2; a3_2 ]
r3 = [ a0_3; a1_3; a2_3; a3_3 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 8-12 instructions.
256-bit version:
  • In SSE2-SSE4.1 this intrinsic results in at least 24 instructions.
  • In AVX-AVX2 this intrinsic results in at least 12 instructions.
  • In NEON this intrinsic results in at least 8 instructions.
  • In ALTIVEC this intrinsic results in at least 16-20 instructions.
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
void simdpp::transpose8 ( uint8x16 &  a0,
uint8x16 &  a1,
uint8x16 &  a2,
uint8x16 &  a3,
uint8x16 &  a4,
uint8x16 &  a5,
uint8x16 &  a6,
uint8x16 &  a7 
)
inline

Transposes two 8x8 8-bit matrices within eight int8x16 vectors.

r0 = [ a0_0; ...; a7_0 ; a0_8; ...; a7_8 ]
r1 = [ a0_1; ...; a7_1 ; a0_9; ...; a7_9 ]
...
r7 = [ a0_7; ...; a7_7 ; a0_15; ...; a7_15 ]
128-bit version:
  • In SSE2-AVX2 this intrinsic results in at least 32 instructions.
  • In NEON this intrinsic results in at least 12 instructions.
  • In ALTIVEC this intrinsic results in at least 24-30 instructions.
256-bit version:
  • In SSE2-AVX this intrinsic results in at least 64 instructions.
  • In AVX2 this intrinsic results in at least 32 instructions.
  • In NEON this intrinsic results in at least 24 instructions.
  • In ALTIVEC this intrinsic results in at least 48-54 instructions.
The lower and higher 128-bit halves are processed as if 128-bit instruction was applied to each of them separately.
void simdpp::transpose8 ( int8x16 &  a0,
int8x16 &  a1,
int8x16 &  a2,
int8x16 &  a3,
int8x16 &  a4,
int8x16 &  a5,
int8x16 &  a6,
int8x16 &  a7 
)
inline
void simdpp::transpose8 ( uint8x32 &  a0,
uint8x32 &  a1,
uint8x32 &  a2,
uint8x32 &  a3,
uint8x32 &  a4,
uint8x32 &  a5,
uint8x32 &  a6,
uint8x32 &  a7 
)
inline
void simdpp::transpose8 ( int8x32 &  a0,
int8x32 &  a1,
int8x32 &  a2,
int8x32 &  a3,
int8x32 &  a4,
int8x32 &  a5,
int8x32 &  a6,
int8x32 &  a7 
)
inline