simd_utils: fix undefined instruction issue on 32-bit systems

Fixes GitHub issue #292.
2026-01-17 16:00:26 +03:00 · 2021-01-27 11:57:51 +00:00
parent 62e35c910b
commit 1ecb3aef8b
1 changed files with 13 additions and 13 deletions
--- a/src/util/simd_utils.h
+++ b/src/util/simd_utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2020, Intel Corporation
+ * Copyright (c) 2015-2021, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -156,6 +156,16 @@ static really_inline u32 movd(const m128 in) {
    return _mm_cvtsi128_si32(in);
 }
+static really_inline u64a movq(const m128 in) {
+#if defined(ARCH_X86_64)
+    return _mm_cvtsi128_si64(in);
+#else // 32-bit - this is horrific
+    u32 lo = movd(in);
+    u32 hi = movd(_mm_srli_epi64(in, 32));
+    return (u64a)hi << 32 | lo;
+#endif
+}
 #if defined(HAVE_AVX512)
 static really_inline u32 movd512(const m512 in) {
    // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in),
@@ -166,20 +176,10 @@ static really_inline u32 movd512(const m512 in) {
 static really_inline u64a movq512(const m512 in) {
    // NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in),
    //       so we use 2-step convertions to work around.
-    return _mm_cvtsi128_si64(_mm512_castsi512_si128(in));
+    return movq(_mm512_castsi512_si128(in));
 }
 #endif
-static really_inline u64a movq(const m128 in) {
-#if defined(ARCH_X86_64)
-    return _mm_cvtsi128_si64(in);
-#else // 32-bit - this is horrific
-    u32 lo = movd(in);
-    u32 hi = movd(_mm_srli_epi64(in, 32));
-    return (u64a)hi << 32 | lo;
-#endif
-}
 /* another form of movq */
 static really_inline
 m128 load_m128_from_u64a(const u64a *p) {
@@ -791,7 +791,7 @@ m128 movdq_lo(m256 x) {
 #define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed)
 #define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2)
 #define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4)
-#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a))
+#define extractlow64from256(a) movq(cast256to128(a))
 #define extractlow32from256(a) movd(cast256to128(a))
 #define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b)
 #define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b)