diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h index f2706d70..e86b8d44 100644 --- a/src/util/arch/common/bitutils.h +++ b/src/util/arch/common/bitutils.h @@ -351,6 +351,36 @@ u64a pext64_impl_c(u64a x, u64a mask) { return result; } +static really_inline +u64a pdep64_impl_c(u64a x, u64a _m) { + /* Taken from: + * https://gcc.gnu.org/legacy-ml/gcc-patches/2017-06/msg01408.html + */ + + u64a result = 0x0UL; + const u64a mask = 0x8000000000000000UL; + u64a m = _m; + u64a c, t; + u64a p; + + /* The pop-count of the mask gives the number of the bits from + source to process. This is also needed to shift bits from the + source into the correct position for the result. */ + p = 64 - __builtin_popcountl (_m); + + /* The loop is for the number of '1' bits in the mask and clearing + each mask bit as it is processed. */ + while (m != 0) + { + c = __builtin_clzl (m); + t = x << (p - c); + m ^= (mask >> c); + result |= (t & (mask >> c)); + p++; + } + return (result); +} + /* compilers don't reliably synthesize the 32-bit ANDN instruction here, * so we force its generation. */