Use the right intrinsic (set4x32/set8x32 instead of set32x4/set32x8)

This commit is contained in:
Konstantinos Margaritis 2020-10-06 13:45:52 +03:00
parent 5952c64066
commit e91082d477

View File

@@ -150,7 +150,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]), u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
expand32(v[2], m[2]), expand32(v[3], m[3]) }; expand32(v[2], m[2]), expand32(v[3], m[3]) };
return set32x4(x[3], x[2], x[1], x[0]); return set4x32(x[3], x[2], x[1], x[0]);
} }
#endif #endif
@@ -264,10 +264,10 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) {
expand32(v[6], m[6]), expand32(v[7], m[7]) }; expand32(v[6], m[6]), expand32(v[7], m[7]) };
#if !defined(HAVE_AVX2) #if !defined(HAVE_AVX2)
m256 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), m256 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]),
.hi = set32x4(x[7], x[6], x[5], x[4]) }; .hi = set4x32(x[7], x[6], x[5], x[4]) };
#else #else
m256 xvec = set32x8(x[7], x[6], x[5], x[4], m256 xvec = set8x32(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]); x[3], x[2], x[1], x[0]);
#endif #endif
return xvec; return xvec;
@@ -402,9 +402,9 @@ m384 loadcompressed384_32bit(const void *ptr, m384 mvec) {
expand32(v[8], m[8]), expand32(v[9], m[9]), expand32(v[8], m[8]), expand32(v[9], m[9]),
expand32(v[10], m[10]), expand32(v[11], m[11]) }; expand32(v[10], m[10]), expand32(v[11], m[11]) };
m384 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), m384 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]),
.mid = set32x4(x[7], x[6], x[5], x[4]), .mid = set4x32(x[7], x[6], x[5], x[4]),
.hi = set32x4(x[11], x[10], x[9], x[8]) }; .hi = set4x32(x[11], x[10], x[9], x[8]) };
return xvec; return xvec;
} }
#endif #endif
@@ -553,15 +553,15 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) {
x[7], x[6], x[5], x[4], x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]); x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2) #elif defined(HAVE_AVX2)
xvec.lo = set32x8(x[7], x[6], x[5], x[4], xvec.lo = set8x32(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]); x[3], x[2], x[1], x[0]);
xvec.hi = set32x8(x[15], x[14], x[13], x[12], xvec.hi = set8x32(x[15], x[14], x[13], x[12],
x[11], x[10], x[9], x[8]); x[11], x[10], x[9], x[8]);
#else #else
xvec.lo.lo = set32x4(x[3], x[2], x[1], x[0]); xvec.lo.lo = set4x32(x[3], x[2], x[1], x[0]);
xvec.lo.hi = set32x4(x[7], x[6], x[5], x[4]); xvec.lo.hi = set4x32(x[7], x[6], x[5], x[4]);
xvec.hi.lo = set32x4(x[11], x[10], x[9], x[8]); xvec.hi.lo = set4x32(x[11], x[10], x[9], x[8]);
xvec.hi.hi = set32x4(x[15], x[14], x[13], x[12]); xvec.hi.hi = set4x32(x[15], x[14], x[13], x[12]);
#endif #endif
return xvec; return xvec;
} }