diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h
index fd13d676..52b4eb65 100644
--- a/src/util/arch/x86/simd_utils.h
+++ b/src/util/arch/x86/simd_utils.h
@@ -188,7 +188,7 @@ static really_inline m128 or128(m128 a, m128 b) {
 }
 
 #if defined(HAVE_AVX512VBMI)
-static really_inline m512 expand128(m128 a) {
+static really_inline m512 broadcast128(m128 a) {
     return _mm512_broadcast_i32x4(a);
 }
 #endif
@@ -381,7 +381,7 @@ static really_inline m256 or256(m256 a, m256 b) {
 }
 
 #if defined(HAVE_AVX512VBMI)
-static really_inline m512 expand256(m256 a) {
+static really_inline m512 broadcast256(m256 a) {
     return _mm512_broadcast_i64x4(a);
 }
 #endif
@@ -450,7 +450,7 @@ static really_inline m256 loadu256(const void *ptr) {
     return _mm256_loadu_si256((const m256 *)ptr);
 }
 
-static really_inline
+static really_really_inline
 m256 loadu_maskz_m256(__mmask32 k, const void *ptr) {
     return _mm256_maskz_loadu_epi8(k, ptr);
 }
@@ -535,7 +535,7 @@ m128 movdq_lo(m256 x) {
 #define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed)
 #define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2)
 #define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4)
-#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a))
+#define extractlow64from256(a) movq(cast256to128(a))
 #define extractlow32from256(a) movd(cast256to128(a))
 #define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b)
 #define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b)
@@ -591,7 +591,7 @@ static really_inline u32 movd512(const m512 in) {
 static really_inline u64a movq512(const m512 in) {
     // NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in),
-    //       so we use 2-step convertions to work around.
+    //       so we use 2-step conversions to work around.
-    return _mm_cvtsi128_si64(_mm512_castsi512_si128(in));
+    return movq(_mm512_castsi512_si128(in));
 }
 
 static really_inline
@@ -639,7 +639,7 @@ m512 set1_8x64(u64a a) {
 }
 
 static really_inline
-m512 set16x32(u32 a) {
+m512 set1_16x32(u32 a) {
     return _mm512_set1_epi32(a);
 }
 
@@ -652,7 +652,7 @@ m512 set8x64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0,
 
 static really_inline
 m512 swap256in512(m512 a) {
-    m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL);
+    m512 idx = set8x64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL);
     return vpermq512(idx, a);
 }
 
@@ -683,7 +683,7 @@ m512 sub_u8_m512(m512 a, m512 b) {
 
 static really_inline m512
 add512(m512 a, m512 b) {
-    return _mm512_add_epu64(a, b);
+    return _mm512_add_epi64(a, b);
 }
 
 static really_inline
@@ -697,7 +697,7 @@ m512 or512(m512 a, m512 b) {
 }
 
 #if defined(HAVE_AVX512VBMI)
-static really_inline m512 expand384(m384 a) {
+static really_inline m512 broadcast384(m384 a) {
     u64a *lo = (u64a*)&a.lo;
     u64a *mid = (u64a*)&a.mid;
     u64a *hi = (u64a*)&a.hi;
diff --git a/src/util/uniform_ops.h b/src/util/uniform_ops.h
index 262104ac..1c39c936 100644
--- a/src/util/uniform_ops.h
+++ b/src/util/uniform_ops.h
@@ -102,10 +102,10 @@
 #define or_m512(a, b)       (or512(a, b))
 
 #if defined(HAVE_AVX512VBMI)
-#define expand_m128(a)      (expand128(a))
-#define expand_m256(a)      (expand256(a))
-#define expand_m384(a)      (expand384(a))
-#define expand_m512(a)      (a)
+#define broadcast_m128(a)      (broadcast128(a))
+#define broadcast_m256(a)      (broadcast256(a))
+#define broadcast_m384(a)      (broadcast384(a))
+#define broadcast_m512(a)      (a)
 
 #define shuffle_byte_m128(a, b)       (pshufb_m512(b, a))
 #define shuffle_byte_m256(a, b)       (vpermb512(a, b))