Cog/Frameworks/lazyusf2/lazyusf2/rsp_lle/vu/shuffle.h

317 lines
10 KiB
C

/******************************************************************************\
* Authors: Iconoclast *
* Release: 2013.12.04 *
* License: CC0 Public Domain Dedication *
* *
* To the extent possible under law, the author(s) have dedicated all copyright *
* and related and neighboring rights to this software to the public domain *
* worldwide. This software is distributed without any warranty. *
* *
* You should have received a copy of the CC0 Public Domain Dedication along *
* with this software. *
* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>. *
\******************************************************************************/
#ifndef _SHUFFLE_H
#define _SHUFFLE_H
/*
* for ANSI compliance (null INLINE attribute if not already set to `inline`)
* Include "rsp.h" for active, non-ANSI inline definition.
*/
#ifndef INLINE
#define INLINE
#endif
#ifdef ARCH_MIN_ARM_NEON
/*
 * Byte-shuffle keys: one 16-byte row per element specifier e (0..15).
 * Every 16-bit lane of the result is picked by a pair of byte indices
 * (2*n, 2*n + 1) naming 16-bit element n of the source vector, so each row
 * below is written as the eight source element numbers it selects.
 */
#define SMASK_PAIR(n) (2 * (n)), (2 * (n) + 1)
#define SMASK_ROW(a, b, c, d, e, f, g, h) { \
    SMASK_PAIR(a), SMASK_PAIR(b), SMASK_PAIR(c), SMASK_PAIR(d), \
    SMASK_PAIR(e), SMASK_PAIR(f), SMASK_PAIR(g), SMASK_PAIR(h) }
static const unsigned char smask[16][16] = {
    SMASK_ROW(0, 1, 2, 3, 4, 5, 6, 7), /* 0x0: identity */
    SMASK_ROW(0, 1, 2, 3, 4, 5, 6, 7), /* 0x1: identity */
    SMASK_ROW(0, 0, 2, 2, 4, 4, 6, 6), /* 0x2: even element of each pair */
    SMASK_ROW(1, 1, 3, 3, 5, 5, 7, 7), /* 0x3: odd element of each pair */
    SMASK_ROW(0, 0, 0, 0, 4, 4, 4, 4), /* 0x4: element 0 of each half */
    SMASK_ROW(1, 1, 1, 1, 5, 5, 5, 5), /* 0x5: element 1 of each half */
    SMASK_ROW(2, 2, 2, 2, 6, 6, 6, 6), /* 0x6: element 2 of each half */
    SMASK_ROW(3, 3, 3, 3, 7, 7, 7, 7), /* 0x7: element 3 of each half */
    SMASK_ROW(0, 0, 0, 0, 0, 0, 0, 0), /* 0x8: broadcast element 0 */
    SMASK_ROW(1, 1, 1, 1, 1, 1, 1, 1), /* 0x9: broadcast element 1 */
    SMASK_ROW(2, 2, 2, 2, 2, 2, 2, 2), /* 0xA: broadcast element 2 */
    SMASK_ROW(3, 3, 3, 3, 3, 3, 3, 3), /* 0xB: broadcast element 3 */
    SMASK_ROW(4, 4, 4, 4, 4, 4, 4, 4), /* 0xC: broadcast element 4 */
    SMASK_ROW(5, 5, 5, 5, 5, 5, 5, 5), /* 0xD: broadcast element 5 */
    SMASK_ROW(6, 6, 6, 6, 6, 6, 6, 6), /* 0xE: broadcast element 6 */
    SMASK_ROW(7, 7, 7, 7, 7, 7, 7, 7)  /* 0xF: broadcast element 7 */
};
#undef SMASK_ROW
#undef SMASK_PAIR
/* Split a 128-bit vector into the two 64-bit table registers vtbl2_s8 takes. */
#define conv816_to_882(v) ((int8x8x2_t) { vget_low_s8(v), vget_high_s8(v) })
/*
 * NEON implementation: permute the eight 16-bit elements of VT according to
 * element specifier e and store the result to VD.
 *
 * VD  destination, eight shorts
 * VT  source, eight shorts
 * e   row of smask to use as the byte-shuffle key; must be in 0..15
 *
 * The whole source is loaded before anything is stored.
 */
INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e)
{
    const int8x16_t source = vld1q_s8((const int8_t *)VT);
    /* smask holds unsigned char; vld1q_s8 wants const int8_t *, so cast
     * explicitly instead of relying on an implicit sign-mismatched pointer
     * conversion. */
    const int8x16_t key = vld1q_s8((const int8_t *)smask[e]);
    /* Build the lookup table once; it serves both halves of the key. */
    const int8x8x2_t table = conv816_to_882(source);
    const int8x8_t low = vtbl2_s8(table, vget_low_s8(key));
    const int8x8_t high = vtbl2_s8(table, vget_high_s8(key));

    vst1q_s8((int8_t *)VD, vcombine_s8(low, high));
}
#endif
#ifdef ARCH_MIN_SSSE3
/*
 * Byte-shuffle keys: one 16-byte row per element specifier e (0..15).
 * Every 16-bit lane of the result is picked by a pair of byte indices
 * (2*n, 2*n + 1) naming 16-bit element n of the source vector, so each row
 * below is written as the eight source element numbers it selects.
 */
#define SMASK_PAIR(n) (2 * (n)), (2 * (n) + 1)
#define SMASK_ROW(a, b, c, d, e, f, g, h) { \
    SMASK_PAIR(a), SMASK_PAIR(b), SMASK_PAIR(c), SMASK_PAIR(d), \
    SMASK_PAIR(e), SMASK_PAIR(f), SMASK_PAIR(g), SMASK_PAIR(h) }
static const unsigned char smask[16][16] = {
    SMASK_ROW(0, 1, 2, 3, 4, 5, 6, 7), /* 0x0: identity */
    SMASK_ROW(0, 1, 2, 3, 4, 5, 6, 7), /* 0x1: identity */
    SMASK_ROW(0, 0, 2, 2, 4, 4, 6, 6), /* 0x2: even element of each pair */
    SMASK_ROW(1, 1, 3, 3, 5, 5, 7, 7), /* 0x3: odd element of each pair */
    SMASK_ROW(0, 0, 0, 0, 4, 4, 4, 4), /* 0x4: element 0 of each half */
    SMASK_ROW(1, 1, 1, 1, 5, 5, 5, 5), /* 0x5: element 1 of each half */
    SMASK_ROW(2, 2, 2, 2, 6, 6, 6, 6), /* 0x6: element 2 of each half */
    SMASK_ROW(3, 3, 3, 3, 7, 7, 7, 7), /* 0x7: element 3 of each half */
    SMASK_ROW(0, 0, 0, 0, 0, 0, 0, 0), /* 0x8: broadcast element 0 */
    SMASK_ROW(1, 1, 1, 1, 1, 1, 1, 1), /* 0x9: broadcast element 1 */
    SMASK_ROW(2, 2, 2, 2, 2, 2, 2, 2), /* 0xA: broadcast element 2 */
    SMASK_ROW(3, 3, 3, 3, 3, 3, 3, 3), /* 0xB: broadcast element 3 */
    SMASK_ROW(4, 4, 4, 4, 4, 4, 4, 4), /* 0xC: broadcast element 4 */
    SMASK_ROW(5, 5, 5, 5, 5, 5, 5, 5), /* 0xD: broadcast element 5 */
    SMASK_ROW(6, 6, 6, 6, 6, 6, 6, 6), /* 0xE: broadcast element 6 */
    SMASK_ROW(7, 7, 7, 7, 7, 7, 7, 7)  /* 0xF: broadcast element 7 */
};
#undef SMASK_ROW
#undef SMASK_PAIR
/*
 * SSSE3 implementation: permute the eight 16-bit elements of VT according to
 * element specifier e and store the result to VD.
 *
 * VD  destination, eight shorts; written with an aligned 128-bit store
 * VT  source, eight shorts; read with an aligned 128-bit load
 * e   row of smask to use as the byte-shuffle key; must be in 0..15
 */
INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e)
{ /* SSSE3 shuffling method was written entirely by CEN64 author MarathonMan. */
    __m128i xmm;
    __m128i key;

    xmm = _mm_load_si128((__m128i *)VT);
    /*
     * smask is a plain unsigned char array with no declared alignment, so
     * the aligned load is not guaranteed to be legal on its rows.  Use the
     * unaligned load for the key, and keep the table const-qualified.
     */
    key = _mm_loadu_si128((const __m128i *)smask[e]);
    xmm = _mm_shuffle_epi8(xmm, key);
    _mm_store_si128((__m128i *)VD, xmm);
    return;
}
#endif
#if defined ARCH_MIN_SSE2 && !defined ARCH_MIN_SSSE3
/* Keep only the low two bits of x: a position within a 4-element half. */
#define B(x) ((x) & 0x3)
/*
 * Pack four 2-bit selectors into one 8-bit shuffle immediate:
 * a lands in bits 1:0, b in bits 3:2, c in bits 5:4 and d in bits 7:6.
 */
#define SHUFFLE(a,b,c,d) (B(a) | (B(b) << 2) | (B(c) << 4) | (B(d) << 6))
/*
 * Shuffle immediates indexed by element specifier e (0..15).
 * SHUFFLE() masks every argument to two bits, so the entries for elements
 * 4..7 hold the same values as the entries for elements 0..3.
 */
static const int simm[16] = {
    /* 0x0, 0x1: vector operands */
    SHUFFLE(0, 1, 2, 3),
    SHUFFLE(0, 1, 2, 3),
    /* 0x2, 0x3: scalar quarters */
    SHUFFLE(0, 0, 2, 2),
    SHUFFLE(1, 1, 3, 3),
    /* 0x4 .. 0x7: scalar halves */
    SHUFFLE(0, 0, 0, 0),
    SHUFFLE(1, 1, 1, 1),
    SHUFFLE(2, 2, 2, 2),
    SHUFFLE(3, 3, 3, 3),
    /* 0x8 .. 0xF: scalar wholes */
    SHUFFLE(0, 0, 0, 0),
    SHUFFLE(1, 1, 1, 1),
    SHUFFLE(2, 2, 2, 2),
    SHUFFLE(3, 3, 3, 3),
    SHUFFLE(4, 4, 4, 4),
    SHUFFLE(5, 5, 5, 5),
    SHUFFLE(6, 6, 6, 6),
    SHUFFLE(7, 7, 7, 7)
};
/*
 * Identity shuffle for the plain vector operand: the eight 16-bit elements
 * are returned exactly as they were passed in.
 */
static __m128i shuffle_none(__m128i xmm)
{
    return xmm;
}
static __m128i shuffle_0q(__m128i xmm)
{
//const int order = simm[0x2];
xmm = _mm_shufflehi_epi16(xmm, SHUFFLE(00, 00, 02, 02));
xmm = _mm_shufflelo_epi16(xmm, SHUFFLE(00, 00, 02, 02));
return (xmm);
}
static __m128i shuffle_1q(__m128i xmm)
{
//const int order = simm[0x3];
xmm = _mm_shufflehi_epi16(xmm, SHUFFLE(01, 01, 03, 03));
xmm = _mm_shufflelo_epi16(xmm, SHUFFLE(01, 01, 03, 03));
return (xmm);
}
static __m128i shuffle_0h(__m128i xmm)
{
//const int order = simm[0x4];
xmm = _mm_shufflehi_epi16(xmm, SHUFFLE(00, 00, 00, 00));
xmm = _mm_shufflelo_epi16(xmm, SHUFFLE(00, 00, 00, 00));
return (xmm);
}
static __m128i shuffle_1h(__m128i xmm)
{
//const int order = simm[0x5];
xmm = _mm_shufflehi_epi16(xmm, SHUFFLE(01, 01, 01, 01));
xmm = _mm_shufflelo_epi16(xmm, SHUFFLE(01, 01, 01, 01));
return (xmm);
}
static __m128i shuffle_2h(__m128i xmm)
{
//const int order = simm[0x6];
xmm = _mm_shufflehi_epi16(xmm, SHUFFLE(02, 02, 02, 02));
xmm = _mm_shufflelo_epi16(xmm, SHUFFLE(02, 02, 02, 02));
return (xmm);
}
static __m128i shuffle_3h(__m128i xmm)
{
//const int order = simm[0x7];
xmm = _mm_shufflehi_epi16(xmm, SHUFFLE(03, 03, 03, 03));
xmm = _mm_shufflelo_epi16(xmm, SHUFFLE(03, 03, 03, 03));
return (xmm);
}
/*
 * 0W: broadcast element 0 to all eight lanes.
 * The low half is filled with element 0, then interleaved with itself.
 */
static __m128i shuffle_0w(__m128i xmm)
{
    const __m128i low_filled = _mm_shufflelo_epi16(xmm, SHUFFLE(0, 0, 0, 0));

    return _mm_unpacklo_epi16(low_filled, low_filled);
}
/*
 * 1W: broadcast element 1 to all eight lanes.
 * The low half is filled with element 1, then interleaved with itself.
 */
static __m128i shuffle_1w(__m128i xmm)
{
    const __m128i low_filled = _mm_shufflelo_epi16(xmm, SHUFFLE(1, 1, 1, 1));

    return _mm_unpacklo_epi16(low_filled, low_filled);
}
/*
 * 2W: broadcast element 2 to all eight lanes.
 * The low half is filled with element 2, then interleaved with itself.
 */
static __m128i shuffle_2w(__m128i xmm)
{
    const __m128i low_filled = _mm_shufflelo_epi16(xmm, SHUFFLE(2, 2, 2, 2));

    return _mm_unpacklo_epi16(low_filled, low_filled);
}
/*
 * 3W: broadcast element 3 to all eight lanes.
 * The low half is filled with element 3, then interleaved with itself.
 */
static __m128i shuffle_3w(__m128i xmm)
{
    const __m128i low_filled = _mm_shufflelo_epi16(xmm, SHUFFLE(3, 3, 3, 3));

    return _mm_unpacklo_epi16(low_filled, low_filled);
}
/*
 * 4W: broadcast element 4 to all eight lanes.
 * SHUFFLE() masks 4 down to position 0 of the high half; the high half is
 * filled with that element, then interleaved with itself.
 */
static __m128i shuffle_4w(__m128i xmm)
{
    const __m128i high_filled = _mm_shufflehi_epi16(xmm, SHUFFLE(4, 4, 4, 4));

    return _mm_unpackhi_epi16(high_filled, high_filled);
}
/*
 * 5W: broadcast element 5 to all eight lanes.
 * SHUFFLE() masks 5 down to position 1 of the high half; the high half is
 * filled with that element, then interleaved with itself.
 */
static __m128i shuffle_5w(__m128i xmm)
{
    const __m128i high_filled = _mm_shufflehi_epi16(xmm, SHUFFLE(5, 5, 5, 5));

    return _mm_unpackhi_epi16(high_filled, high_filled);
}
/*
 * 6W: broadcast element 6 to all eight lanes.
 * SHUFFLE() masks 6 down to position 2 of the high half; the high half is
 * filled with that element, then interleaved with itself.
 */
static __m128i shuffle_6w(__m128i xmm)
{
    const __m128i high_filled = _mm_shufflehi_epi16(xmm, SHUFFLE(6, 6, 6, 6));

    return _mm_unpackhi_epi16(high_filled, high_filled);
}
/*
 * 7W: broadcast element 7 to all eight lanes.
 * SHUFFLE() masks 7 down to position 3 of the high half; the high half is
 * filled with that element, then interleaved with itself.
 */
static __m128i shuffle_7w(__m128i xmm)
{
    const __m128i high_filled = _mm_shufflehi_epi16(xmm, SHUFFLE(7, 7, 7, 7));

    return _mm_unpackhi_epi16(high_filled, high_filled);
}
/*
 * Dispatch table: entry e (0..15) is the helper that applies element
 * specifier e to a vector of eight 16-bit elements.
 *
 * The pointer array is const-qualified: the table is fixed at build time,
 * so it should not be writable at run time, and a const table lets the
 * compiler resolve calls through it when e is known.
 */
static __m128i (* const SSE2_SHUFFLE_16[16])(__m128i) = {
    shuffle_none, shuffle_none,                     /* 0x0, 0x1: vector operand */
    shuffle_0q, shuffle_1q,                         /* 0x2, 0x3: quarters */
    shuffle_0h, shuffle_1h, shuffle_2h, shuffle_3h, /* 0x4 .. 0x7: halves */
    shuffle_0w, shuffle_1w, shuffle_2w, shuffle_3w, /* 0x8 .. 0xB: wholes 0..3 */
    shuffle_4w, shuffle_5w, shuffle_6w, shuffle_7w  /* 0xC .. 0xF: wholes 4..7 */
};
/*
 * SSE2 implementation: permute the eight 16-bit elements of VT according to
 * element specifier e and store the result to VD.
 *
 * VD  destination, eight shorts; written with an aligned 128-bit store
 * VT  source, eight shorts; read with an aligned 128-bit load
 * e   index into SSE2_SHUFFLE_16; must be in 0..15
 */
INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e)
{
    const __m128i source = _mm_load_si128((__m128i *)VT);

    _mm_store_si128((__m128i *)VD, SSE2_SHUFFLE_16[e](source));
}
#endif
#if !defined ARCH_MIN_ARM_NEON && !defined ARCH_MIN_SSE2 && !defined ARCH_MIN_SSSE3
/*
* vector-scalar element decoding
* Obsolete. Consider using at least the SSE2 algorithms instead.
*/
/*
 * Source element index for every (element specifier, destination lane)
 * pair: ei[e][i] is the element of the source vector that lane i receives
 * under specifier e.
 */
static const int ei[16][8] = {
    { 0, 1, 2, 3, 4, 5, 6, 7 }, /* 0x0: none (vector-only operand) */
    { 0, 1, 2, 3, 4, 5, 6, 7 }, /* 0x1: none (vector-only operand) */
    { 0, 0, 2, 2, 4, 4, 6, 6 }, /* 0x2: 0Q */
    { 1, 1, 3, 3, 5, 5, 7, 7 }, /* 0x3: 1Q */
    { 0, 0, 0, 0, 4, 4, 4, 4 }, /* 0x4: 0H */
    { 1, 1, 1, 1, 5, 5, 5, 5 }, /* 0x5: 1H */
    { 2, 2, 2, 2, 6, 6, 6, 6 }, /* 0x6: 2H */
    { 3, 3, 3, 3, 7, 7, 7, 7 }, /* 0x7: 3H */
    { 0, 0, 0, 0, 0, 0, 0, 0 }, /* 0x8: 0 */
    { 1, 1, 1, 1, 1, 1, 1, 1 }, /* 0x9: 1 */
    { 2, 2, 2, 2, 2, 2, 2, 2 }, /* 0xA: 2 */
    { 3, 3, 3, 3, 3, 3, 3, 3 }, /* 0xB: 3 */
    { 4, 4, 4, 4, 4, 4, 4, 4 }, /* 0xC: 4 */
    { 5, 5, 5, 5, 5, 5, 5, 5 }, /* 0xD: 5 */
    { 6, 6, 6, 6, 6, 6, 6, 6 }, /* 0xE: 6 */
    { 7, 7, 7, 7, 7, 7, 7, 7 }  /* 0xF: 7 */
};
/*
 * For each element specifier e, the low lane-index bits that are taken from
 * e instead of from the destination lane number.
 */
static const int sub_mask[16] = {
    0x0, 0x0,                              /* 0x0, 0x1: no bits replaced */
    0x1, 0x1,                              /* 0x2, 0x3: bit 0 replaced */
    0x3, 0x3, 0x3, 0x3,                    /* 0x4 .. 0x7: bits 1:0 replaced */
    0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7 /* 0x8 .. 0xF: bits 2:0 replaced */
};
INLINE static void SHUFFLE_VECTOR(short* VD, short* VT, const int e)
{
ALIGNED short SV[8];
register int i, j;
#if (0 == 0)
j = sub_mask[e];
for (i = 0; i < N; i++)
SV[i] = VT[(i & ~j) | (e & j)];
#else
if (e & 0x8)
for (i = 0; i < N; i++)
SV[i] = VT[(i & 0x0) | (e & 0x7)];
else if (e & 0x4)
for (i = 0; i < N; i++)
SV[i] = VT[(i & 0xC) | (e & 0x3)];
else if (e & 0x2)
for (i = 0; i < N; i++)
SV[i] = VT[(i & 0xE) | (e & 0x1)];
else /* if ((e == 0b0000) || (e == 0b0001)) */
for (i = 0; i < N; i++)
SV[i] = VT[(i & 0x7) | (e & 0x0)];
#endif
for (i = 0; i < N; i++)
*(VD + i) = *(SV + i);
return;
}
#endif
#endif