libsimdpp  1.0
Operations: insert or extract a single element from a vector

Functions

template<unsigned id>
uint8x16 simdpp::insert (uint8x16 a, uint8_t x)
 Inserts an element into uint8x16 vector at the position identified by id. More...
 
template<unsigned id>
uint16x8 simdpp::insert (uint16x8 a, uint16_t x)
 Inserts an element into uint16x8 vector at the position identified by id. More...
 
template<unsigned id>
uint32x4 simdpp::insert (uint32x4 a, uint32_t x)
 Inserts an element into uint32x4 vector at the position identified by id. More...
 
template<unsigned id>
uint64x2 simdpp::insert (uint64x2 a, uint64_t x)
 Inserts an element into uint64x2 vector at the position identified by id. More...
 
template<unsigned id>
float32x4 simdpp::insert (float32x4 a, float x)
 Inserts an element into float32x4 vector at the position identified by id. More...
 
template<unsigned id>
float64x2 simdpp::insert (float64x2 a, double x)
 Inserts an element into float64x2 vector at the position identified by id. More...
 
template<unsigned id>
uint8_t simdpp::extract (uint8x16 a)
 Extracts the id-th element from uint8x16 vector. More...
 
template<unsigned id>
int8_t simdpp::extract (int8x16 a)
 Extracts the id-th element from int8x16 vector. More...
 
template<class E1 , class E2 >
uint8x32 simdpp::combine (uint8< 16, E1 > a, uint8< 16, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class E1 , class E2 >
uint16x16 simdpp::combine (uint16< 8, E1 > a, uint16< 8, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class E1 , class E2 >
uint32x8 simdpp::combine (uint32< 4, E1 > a, uint32< 4, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class E1 , class E2 >
uint64x4 simdpp::combine (uint64< 2, E1 > a, uint64< 2, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class E1 , class E2 >
int16x16 simdpp::combine (int16< 8, E1 > a, int16< 8, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class E1 , class E2 >
int32x8 simdpp::combine (int32< 4, E1 > a, int32< 4, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class E1 , class E2 >
int64x4 simdpp::combine (int64< 2, E1 > a, int64< 2, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class E1 , class E2 >
float32x8 simdpp::combine (float32< 4, E1 > a, float32< 4, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<class E1 , class E2 >
float64x4 simdpp::combine (float64< 2, E1 > a, float64< 2, E2 > b)
 Combines two 128-bit vectors into a 256-bit vector. More...
 
template<unsigned N, class E1 , class E2 >
uint8< N *2 > simdpp::combine (uint8< N, E1 > a1, uint8< N, E2 > a2)
 Combines two vectors of N elements each into a single vector of 2*N elements. More...
 
template<unsigned N, class E1 , class E2 >
uint16< N *2 > simdpp::combine (uint16< N, E1 > a1, uint16< N, E2 > a2)
 Combines two vectors of N elements each into a single vector of 2*N elements. More...
 
template<unsigned N, class E1 , class E2 >
uint32< N *2 > simdpp::combine (uint32< N, E1 > a1, uint32< N, E2 > a2)
 Combines two vectors of N elements each into a single vector of 2*N elements. More...
 
template<unsigned N, class E1 , class E2 >
uint64< N *2 > simdpp::combine (uint64< N, E1 > a1, uint64< N, E2 > a2)
 Combines two vectors of N elements each into a single vector of 2*N elements. More...
 
template<unsigned N, class E1 , class E2 >
int8< N *2 > simdpp::combine (int8< N, E1 > a1, int8< N, E2 > a2)
 Combines two vectors of N elements each into a single vector of 2*N elements. More...
 
template<unsigned N, class E1 , class E2 >
int16< N *2 > simdpp::combine (int16< N, E1 > a1, int16< N, E2 > a2)
 Combines two vectors of N elements each into a single vector of 2*N elements. More...
 
template<unsigned N, class E1 , class E2 >
int32< N *2 > simdpp::combine (int32< N, E1 > a1, int32< N, E2 > a2)
 Combines two vectors of N elements each into a single vector of 2*N elements. More...
 
template<unsigned N, class E1 , class E2 >
int64< N *2 > simdpp::combine (int64< N, E1 > a1, int64< N, E2 > a2)
 Combines two vectors of N elements each into a single vector of 2*N elements. More...
 
template<unsigned N, class E1 , class E2 >
float32< N *2 > simdpp::combine (float32< N, E1 > a1, float32< N, E2 > a2)
 Combines two vectors of N elements each into a single vector of 2*N elements. More...
 
template<unsigned N, class E1 , class E2 >
float64< N *2 > simdpp::combine (float64< N, E1 > a1, float64< N, E2 > a2)
 Combines two vectors of N elements each into a single vector of 2*N elements. More...
 

Detailed Description

Function Documentation

template<class E1 , class E2 >
uint8x32 simdpp::combine ( uint8< 16, E1 >  a,
uint8< 16, E2 >  b 
)

Combines two 128-bit vectors into a 256-bit vector.

r = [ a, b ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<class E1 , class E2 >
uint16x16 simdpp::combine ( uint16< 8, E1 >  a,
uint16< 8, E2 >  b 
)

Combines two 128-bit vectors into a 256-bit vector.

r = [ a, b ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<class E1 , class E2 >
uint32x8 simdpp::combine ( uint32< 4, E1 >  a,
uint32< 4, E2 >  b 
)

Combines two 128-bit vectors into a 256-bit vector.

r = [ a, b ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<class E1 , class E2 >
uint64x4 simdpp::combine ( uint64< 2, E1 >  a,
uint64< 2, E2 >  b 
)

Combines two 128-bit vectors into a 256-bit vector.

r = [ a, b ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<class E1 , class E2 >
int16x16 simdpp::combine ( int16< 8, E1 >  a,
int16< 8, E2 >  b 
)

Combines two 128-bit vectors into a 256-bit vector.

r = [ a, b ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<class E1 , class E2 >
int32x8 simdpp::combine ( int32< 4, E1 >  a,
int32< 4, E2 >  b 
)

Combines two 128-bit vectors into a 256-bit vector.

r = [ a, b ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<class E1 , class E2 >
int64x4 simdpp::combine ( int64< 2, E1 >  a,
int64< 2, E2 >  b 
)

Combines two 128-bit vectors into a 256-bit vector.

r = [ a, b ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<class E1 , class E2 >
float32x8 simdpp::combine ( float32< 4, E1 >  a,
float32< 4, E2 >  b 
)

Combines two 128-bit vectors into a 256-bit vector.

r = [ a, b ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<class E1 , class E2 >
float64x4 simdpp::combine ( float64< 2, E1 >  a,
float64< 2, E2 >  b 
)

Combines two 128-bit vectors into a 256-bit vector.

r = [ a, b ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N, class E1 , class E2 >
uint8<N*2> simdpp::combine ( uint8< N, E1 >  a1,
uint8< N, E2 >  a2 
)

Combines two vectors of N elements each into a single vector of 2*N elements.

r = [ a1, a2 ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N, class E1 , class E2 >
uint16<N*2> simdpp::combine ( uint16< N, E1 >  a1,
uint16< N, E2 >  a2 
)

Combines two vectors of N elements each into a single vector of 2*N elements.

r = [ a1, a2 ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N, class E1 , class E2 >
uint32<N*2> simdpp::combine ( uint32< N, E1 >  a1,
uint32< N, E2 >  a2 
)

Combines two vectors of N elements each into a single vector of 2*N elements.

r = [ a1, a2 ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N, class E1 , class E2 >
uint64<N*2> simdpp::combine ( uint64< N, E1 >  a1,
uint64< N, E2 >  a2 
)

Combines two vectors of N elements each into a single vector of 2*N elements.

r = [ a1, a2 ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N, class E1 , class E2 >
int8<N*2> simdpp::combine ( int8< N, E1 >  a1,
int8< N, E2 >  a2 
)

Combines two vectors of N elements each into a single vector of 2*N elements.

r = [ a1, a2 ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N, class E1 , class E2 >
int16<N*2> simdpp::combine ( int16< N, E1 >  a1,
int16< N, E2 >  a2 
)

Combines two vectors of N elements each into a single vector of 2*N elements.

r = [ a1, a2 ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N, class E1 , class E2 >
int32<N*2> simdpp::combine ( int32< N, E1 >  a1,
int32< N, E2 >  a2 
)

Combines two vectors of N elements each into a single vector of 2*N elements.

r = [ a1, a2 ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N, class E1 , class E2 >
int64<N*2> simdpp::combine ( int64< N, E1 >  a1,
int64< N, E2 >  a2 
)

Combines two vectors of N elements each into a single vector of 2*N elements.

r = [ a1, a2 ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N, class E1 , class E2 >
float32<N*2> simdpp::combine ( float32< N, E1 >  a1,
float32< N, E2 >  a2 
)

Combines two vectors of N elements each into a single vector of 2*N elements.

r = [ a1, a2 ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned N, class E1 , class E2 >
float64<N*2> simdpp::combine ( float64< N, E1 >  a1,
float64< N, E2 >  a2 
)

Combines two vectors of N elements each into a single vector of 2*N elements.

r = [ a1, a2 ]
  • In AVX2 this intrinsic results in at least 1 instruction.
  • In SSE2-AVX, NEON and ALTIVEC this intrinsic results in at least 0 instructions.
template<unsigned id>
uint8_t simdpp::extract ( uint8x16  a)

Extracts the id-th element from uint8x16 vector.

r = a[id]

This function may have very high latency.

  • In SSE2-SSSE3 this intrinsic results in at least 1-2 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 1 instruction.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
int8_t simdpp::extract ( int8x16  a)

Extracts the id-th element from int8x16 vector.

r = a[id]

This function may have very high latency.

  • In SSE2-SSSE3 this intrinsic results in at least 1-2 instructions.
  • In SSE4.1-AVX this intrinsic results in at least 1 instruction.
  • In ALTIVEC this intrinsic results in at least 2 instructions.
template<unsigned id>
uint8x16 simdpp::insert ( uint8x16  a,
uint8_t  x 
)

Inserts an element into uint8x16 vector at the position identified by id.

r0 = (id == 0) ? x : a0
...
r15 = (id == 15) ? x : a15

This function may have very high latency.

  • In SSE2-SSSE3 this intrinsic results in at least 4-5 instructions.
  • In ALTIVEC this intrinsic results in at least 3 instructions.
template<unsigned id>
uint16x8 simdpp::insert ( uint16x8  a,
uint16_t  x 
)

Inserts an element into uint16x8 vector at the position identified by id.

r0 = (id == 0) ? x : a0
...
r7 = (id == 7) ? x : a7

This function may have very high latency.

  • In ALTIVEC this intrinsic results in at least 3 instructions.
template<unsigned id>
uint32x4 simdpp::insert ( uint32x4  a,
uint32_t  x 
)

Inserts an element into uint32x4 vector at the position identified by id.

r0 = (id == 0) ? x : a0
r1 = (id == 1) ? x : a1
r2 = (id == 2) ? x : a2
r3 = (id == 3) ? x : a3

This function may have very high latency.

  • In SSE2-SSSE3 this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 3 instructions.
template<unsigned id>
uint64x2 simdpp::insert ( uint64x2  a,
uint64_t  x 
)

Inserts an element into uint64x2 vector at the position identified by id.

r0 = (id == 0) ? x : a0
r1 = (id == 1) ? x : a1

This function may have very high latency.

  • In SSE2, SSE3 and SSSE3 this intrinsic results in at least 2 instructions.
  • In SSE4_1 this intrinsic results in at least 1 instruction.
  • In SSE2_32bit, SSE3_32bit and SSSE3_32bit this intrinsic results in at least 4 instructions.
  • In SSE4_1_32bit this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 3 instructions.
template<unsigned id>
float32x4 simdpp::insert ( float32x4  a,
float  x 
)

Inserts an element into float32x4 vector at the position identified by id.

r0 = (id == 0) ? x : a0
r1 = (id == 1) ? x : a1
r2 = (id == 2) ? x : a2
r3 = (id == 3) ? x : a3

This function may have very high latency.

  • In SSE2-SSSE3 this intrinsic results in at least 4 instructions.
  • In ALTIVEC this intrinsic results in at least 3 instructions.
template<unsigned id>
float64x2 simdpp::insert ( float64x2  a,
double  x 
)

Inserts an element into float64x2 vector at the position identified by id.

r0 = (id == 0) ? x : a0
r1 = (id == 1) ? x : a1

This function may have very high latency.

  • In SSE2-SSSE3 this intrinsic results in at least 2 instructions.
  • In ALTIVEC this intrinsic results in at least 3 instructions.