diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h
new file mode 100644
index 00000000..85d5dc49
--- /dev/null
+++ b/src/util/arch/common/bitutils.h
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Bit-twiddling primitives (ctz, compress etc)
+ */
+
+#ifndef BITUTILS_ARCH_COMMON_H
+#define BITUTILS_ARCH_COMMON_H
+
+#include "util/popcount.h"
+
+static really_inline
+u32 clz32_impl_c(u32 x) {
+    return (u32)__builtin_clz(x);
+}
+
+static really_inline
+u32 clz64_impl_c(u64a x) {
+    return (u32)__builtin_clzll(x);
+}
+
+// CTZ (count trailing zero) implementations.
+static really_inline
+u32 ctz32_impl_c(u32 x) {
+    return (u32)__builtin_ctz(x);
+}
+
+static really_inline
+u32 ctz64_impl_c(u64a x) {
+    return (u32)__builtin_ctzll(x);
+}
+
+static really_inline
+u32 lg2_impl_c(u32 x) {
+    if (!x) {
+        return 0;
+    }
+    return 31 - clz32_impl_c(x);
+}
+
+static really_inline
+u64a lg2_64_impl_c(u64a x) {
+    if (!x) {
+        return 0;
+    }
+    return 63 - clz64_impl_c(x);
+}
+
+static really_inline
+u32 findAndClearLSB_32_impl_c(u32 *v) {
+    u32 val = *v;
+    u32 offset = ctz32_impl_c(val);
+    *v = val & (val - 1);
+
+    assert(offset < 32);
+    return offset;
+}
+
+static really_inline
+u32 findAndClearLSB_64_impl_c(u64a *v) {
+#ifdef ARCH_64_BIT
+    // generic variant using gcc's builtin on 64-bit
+    u64a val = *v, offset;
+    offset = ctz64_impl_c(val);
+    *v = val & (val - 1);
+#else
+    // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
+    // inline calls to __builtin_ctzll
+    u32 v1 = (u32)*v;
+    u32 v2 = (u32)(*v >> 32);
+    u32 offset;
+    if (v1) {
+        offset = findAndClearLSB_32_impl_c(&v1);
+        *v = (u64a)v1 | ((u64a)v2 << 32);
+    } else {
+        offset = findAndClearLSB_32_impl_c(&v2) + 32;
+        *v = (u64a)v2 << 32;
+    }
+#endif
+
+    assert(offset < 64);
+    return (u32)offset;
+}
+
+static really_inline
+u32 findAndClearMSB_32_impl_c(u32 *v) {
+    u32 val = *v;
+    u32 offset = 31 - clz32_impl_c(val);
+    *v = val & ~(1U << offset);
+
+    assert(offset < 32);
+    return offset;
+}
+
+static really_inline
+u32 findAndClearMSB_64_impl_c(u64a *v) {
+#ifdef ARCH_64_BIT
+    // generic variant using gcc's builtin on 64-bit
+    u64a val = *v, offset;
+    offset = 63 - clz64_impl_c(val);
+    *v = val & ~(1ULL << offset);
+#else
+    // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
+    // inline calls to __builtin_clzll
+    u32 v1 = (u32)*v;
+    u32 v2 = (u32)(*v >> 32);
+    u32 offset;
+    if (v2) {
+        offset = findAndClearMSB_32_impl_c(&v2) + 32;
+        *v = ((u64a)v2 << 32) | (u64a)v1;
+    } else {
+        offset = findAndClearMSB_32_impl_c(&v1);
+        *v = (u64a)v1;
+    }
+#endif
+
+    assert(offset < 64);
+    return (u32)offset;
+}
+
+static really_inline
+u32 compress32_impl_c(u32 x, u32 m) {
+    // Return zero quickly on trivial cases
+    if ((x & m) == 0) {
+        return 0;
+    }
+
+    u32 mk, mp, mv, t;
+
+    x &= m; // clear irrelevant bits
+
+    mk = ~m << 1; // we will count 0's to right
+    for (u32 i = 0; i < 5; i++) {
+        mp = mk ^ (mk << 1);
+        mp ^= mp << 2;
+        mp ^= mp << 4;
+        mp ^= mp << 8;
+        mp ^= mp << 16;
+
+        mv = mp & m; // bits to move
+        m = (m ^ mv) | (mv >> (1 << i)); // compress m
+        t = x & mv;
+        x = (x ^ t) | (t >> (1 << i)); // compress x
+        mk = mk & ~mp;
+    }
+
+    return x;
+}
+
+static really_inline
+u64a compress64_impl_c(u64a x, u64a m) {
+    // Return zero quickly on trivial cases
+    if ((x & m) == 0) {
+        return 0;
+    }
+
+    u64a mk, mp, mv, t;
+
+    x &= m; // clear irrelevant bits
+
+    mk = ~m << 1; // we will count 0's to right
+    for (u32 i = 0; i < 6; i++) {
+        mp = mk ^ (mk << 1);
+        mp ^= mp << 2;
+        mp ^= mp << 4;
+        mp ^= mp << 8;
+        mp ^= mp << 16;
+        mp ^= mp << 32;
+
+        mv = mp & m; // bits to move
+        m = (m ^ mv) | (mv >> (1 << i)); // compress m
+        t = x & mv;
+        x = (x ^ t) | (t >> (1 << i)); // compress x
+        mk = mk & ~mp;
+    }
+
+    return x;
+}
+
+static really_inline
+u32 expand32_impl_c(u32 x, u32 m) {
+    // Return zero quickly on trivial cases
+    if (!x || !m) {
+        return 0;
+    }
+
+    u32 m0, mk, mp, mv, t;
+    u32 array[5];
+
+    m0 = m;       // save original mask
+    mk = ~m << 1; // we will count 0's to right
+
+    for (int i = 0; i < 5; i++) {
+        mp = mk ^ (mk << 1); // parallel suffix
+        mp = mp ^ (mp << 2);
+        mp = mp ^ (mp << 4);
+        mp = mp ^ (mp << 8);
+        mp = mp ^ (mp << 16);
+        mv = mp & m; // bits to move
+        array[i] = mv;
+        m = (m ^ mv) | (mv >> (1 << i)); // compress m
+        mk = mk & ~mp;
+    }
+
+    for (int i = 4; i >= 0; i--) {
+        mv = array[i];
+        t = x << (1 << i);
+        x = (x & ~mv) | (t & mv);
+    }
+
+    return x & m0; // clear out extraneous bits
+}
+
+static really_inline
+u64a expand64_impl_c(u64a x, u64a m) {
+    // Return zero quickly on trivial cases
+    if (!x || !m) {
+        return 0;
+    }
+
+    u64a m0, mk, mp, mv, t;
+    u64a array[6];
+
+    m0 = m;       // save original mask
+    mk = ~m << 1; // we will count 0's to right
+
+    for (int i = 0; i < 6; i++) {
+        mp = mk ^ (mk << 1); // parallel suffix
+        mp = mp ^ (mp << 2);
+        mp = mp ^ (mp << 4);
+        mp = mp ^ (mp << 8);
+        mp = mp ^ (mp << 16);
+        mp = mp ^ (mp << 32);
+        mv = mp & m; // bits to move
+        array[i] = mv;
+        m = (m ^ mv) | (mv >> (1 << i)); // compress m
+        mk = mk & ~mp;
+    }
+
+    for (int i = 5; i >= 0; i--) {
+        mv = array[i];
+        t = x << (1 << i);
+        x = (x & ~mv) | (t & mv);
+    }
+
+    return x & m0; // clear out extraneous bits
+}
+
+/* returns the first set bit after begin (if not ~0U). If no bit is set after
+ * begin returns ~0U
+ */
+static really_inline
+u32 bf64_iterate_impl_c(u64a bitfield, u32 begin) {
+    if (begin != ~0U) {
+        /* switch off all bits at or below begin. Note: not legal to shift
+         * by the size of the datatype or larger. */
+        assert(begin <= 63);
+        bitfield &= ~((2ULL << begin) - 1);
+    }
+
+    if (!bitfield) {
+        return ~0U;
+    }
+
+    return ctz64_impl_c(bitfield);
+}
+
+static really_inline
+char bf64_set_impl_c(u64a *bitfield, u32 i) {
+    assert(i < 64);
+    u64a mask = 1ULL << i;
+    char was_set = !!(*bitfield & mask);
+    *bitfield |= mask;
+
+    return was_set;
+}
+
+static really_inline
+void bf64_unset_impl_c(u64a *bitfield, u32 i) {
+    assert(i < 64);
+    *bitfield &= ~(1ULL << i);
+}
+
+static really_inline
+u32 rank_in_mask32_impl_c(u32 mask, u32 bit) {
+    assert(bit < sizeof(u32) * 8);
+    assert(mask & (u32)(1U << bit));
+    mask &= (u32)(1U << bit) - 1;
+    return popcount32(mask);
+}
+
+static really_inline
+u32 rank_in_mask64_impl_c(u64a mask, u32 bit) {
+    assert(bit < sizeof(u64a) * 8);
+    assert(mask & (u64a)(1ULL << bit));
+    mask &= (u64a)(1ULL << bit) - 1;
+    return popcount64(mask);
+}
+
+static really_inline
+u32 pext32_impl_c(u32 x, u32 mask) {
+    u32 result = 0, num = 1;
+    while (mask != 0) {
+        u32 bit = findAndClearLSB_32_impl_c(&mask);
+        if (x & (1U << bit)) {
+            assert(num != 0); // more than 32 bits!
+            result |= num;
+        }
+        num <<= 1;
+    }
+    return result;
+}
+
+static really_inline
+u64a pext64_impl_c(u64a x, u64a mask) {
+    u64a result = 0, num = 1;
+    while (mask != 0) {
+        u32 bit = findAndClearLSB_64_impl_c(&mask);
+        if (x & (1ULL << bit)) {
+            assert(num != 0); // more than 64 bits!
+            result |= num;
+        }
+        num <<= 1;
+    }
+    return result;
+}
+
+#endif // BITUTILS_ARCH_COMMON_H
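
The compress/expand loops above follow the well-known Hacker's Delight construction and emulate BMI2 PEXT/PDEP: compress gathers the bits of `x` selected by `m` into the low-order bits, and expand scatters them back under the mask, so the two are inverses on the masked bits. A minimal sketch of that round-trip property, assuming the usual ue2common.h build environment (`u32`, `really_inline`); the test function name is illustrative:

```c
#include "util/arch/common/bitutils.h"
#include <assert.h>

// compress32 packs the x-bits selected by m down to bit 0; expand32 undoes it.
static void compress_expand_roundtrip_example(void) {
    u32 x = 0x00001234, m = 0x0000ff00;
    u32 packed = compress32_impl_c(x, m);          // bits 8..15 of x, i.e. 0x12
    assert(packed == 0x12);
    assert(expand32_impl_c(packed, m) == (x & m)); // scattered back: 0x1200
}
```
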
diff --git a/src/util/arch/x86/bitutils.h b/src/util/arch/x86/bitutils.h
new file mode 100644
index 00000000..da7c747e
--- /dev/null
+++ b/src/util/arch/x86/bitutils.h
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Bit-twiddling primitives (ctz, compress etc)
+ */
+
+#ifndef BITUTILS_ARCH_X86_H
+#define BITUTILS_ARCH_X86_H
+
+#include "ue2common.h"
+#include "util/popcount.h"
+#include "util/arch.h"
+#include "util/intrinsics.h"
+
+#include "util/arch/common/bitutils.h"
+
+static really_inline
+u32 clz32_impl(u32 x) {
+#if defined(_WIN32)
+    unsigned long r;
+    _BitScanReverse(&r, x);
+    return 31 - r;
+#else
+    return clz32_impl_c(x);
+#endif
+}
+
+static really_inline
+u32 clz64_impl(u64a x) {
+#if defined(_WIN64)
+    unsigned long r;
+    _BitScanReverse64(&r, x);
+    return 63 - r;
+#elif defined(_WIN32)
+    unsigned long x1 = (u32)x;
+    unsigned long x2 = (u32)(x >> 32);
+    unsigned long r;
+    if (x2) {
+        _BitScanReverse(&r, x2);
+        return (u32)(31 - r);
+    }
+    _BitScanReverse(&r, (u32)x1);
+    return (u32)(63 - r);
+#else
+    return clz64_impl_c(x);
+#endif
+}
+
+// CTZ (count trailing zero) implementations.
+static really_inline
+u32 ctz32_impl(u32 x) {
+#if defined(_WIN32)
+    unsigned long r;
+    _BitScanForward(&r, x);
+    return r;
+#else
+    return ctz32_impl_c(x);
+#endif
+}
+
+static really_inline
+u32 ctz64_impl(u64a x) {
+#if defined(_WIN64)
+    unsigned long r;
+    _BitScanForward64(&r, x);
+    return r;
+#elif defined(_WIN32)
+    unsigned long r;
+    if (_BitScanForward(&r, (u32)x)) {
+        return (u32)r;
+    }
+    _BitScanForward(&r, x >> 32);
+    return (u32)(r + 32);
+#else
+    return ctz64_impl_c(x);
+#endif
+}
+
+static really_inline
+u32 lg2_impl(u32 x) {
+    return lg2_impl_c(x);
+}
+
+static really_inline
+u64a lg2_64_impl(u64a x) {
+    return lg2_64_impl_c(x);
+}
+
+static really_inline
+u32 findAndClearLSB_32_impl(u32 *v) {
+#ifndef NO_ASM
+    u32 val = *v, offset;
+    __asm__ ("bsf %1, %0\n"
+             "btr %0, %1\n"
+             : "=r" (offset), "=r" (val)
+             : "1" (val));
+    *v = val;
+
+    assert(offset < 32);
+    return offset;
+#else
+    return findAndClearLSB_32_impl_c(v);
+#endif
+}
+
+static really_inline
+u32 findAndClearLSB_64_impl(u64a *v) {
+#ifdef ARCH_64_BIT
+#if !defined(NO_ASM)
+    u64a val = *v, offset;
+    __asm__ ("bsfq %1, %0\n"
+             "btrq %0, %1\n"
+             : "=r" (offset), "=r" (val)
+             : "1" (val));
+    *v = val;
+#else
+    // generic variant using gcc's builtin on 64-bit
+    u64a val = *v, offset;
+    offset = ctz64_impl(val);
+    *v = val & (val - 1);
+#endif // NO_ASM
+    assert(offset < 64);
+    return (u32)offset;
+#else
+    return findAndClearLSB_64_impl_c(v);
+#endif
+}
+
+static really_inline
+u32 findAndClearMSB_32_impl(u32 *v) {
+#if !defined(NO_ASM)
+    u32 val = *v, offset;
+    __asm__ ("bsr %1, %0\n"
+             "btr %0, %1\n"
+             : "=r" (offset), "=r" (val)
+             : "1" (val));
+    *v = val;
+#else
+    u32 val = *v;
+    u32 offset = 31 - clz32_impl(val);
+    *v = val & ~(1U << offset);
+#endif
+    assert(offset < 32);
+    return offset;
+}
+
+static really_inline
+u32 findAndClearMSB_64_impl(u64a *v) {
+#ifdef ARCH_64_BIT
+#if !defined(NO_ASM)
+    u64a val = *v, offset;
+    __asm__ ("bsrq %1, %0\n"
+             "btrq %0, %1\n"
+             : "=r" (offset), "=r" (val)
+             : "1" (val));
+    *v = val;
+#else
+    // generic variant using gcc's builtin on 64-bit
+    u64a val = *v, offset;
+    offset = 63 - clz64_impl(val);
+    *v = val & ~(1ULL << offset);
+#endif // NO_ASM
+    assert(offset < 64);
+    return (u32)offset;
+#else
+    return findAndClearMSB_64_impl_c(v);
+#endif
+}
+
+static really_inline
+u32 compress32_impl(u32 x, u32 m) {
+#if defined(HAVE_BMI2)
+    // BMI2 has a single instruction for this operation.
+    return _pext_u32(x, m);
+#else
+    return compress32_impl_c(x, m);
+#endif
+}
+
+static really_inline
+u64a compress64_impl(u64a x, u64a m) {
+#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
+    // BMI2 has a single instruction for this operation.
+    return _pext_u64(x, m);
+#else
+    return compress64_impl_c(x, m);
+#endif
+}
+
+static really_inline
+u32 expand32_impl(u32 x, u32 m) {
+#if defined(HAVE_BMI2)
+    // BMI2 has a single instruction for this operation.
+    return _pdep_u32(x, m);
+#else
+    return expand32_impl_c(x, m);
+#endif
+}
+
+static really_inline
+u64a expand64_impl(u64a x, u64a m) {
+#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
+    // BMI2 has a single instruction for this operation.
+    return _pdep_u64(x, m);
+#else
+    return expand64_impl_c(x, m);
+#endif
+}
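
Since each `*_impl` above is a compile-time dispatcher between the intrinsic and the portable `*_impl_c` fallback, a unit test can pin the two paths against each other. A minimal sketch, assuming a BMI2-enabled 64-bit build (`HAVE_BMI2`, `ARCH_X86_64`) and the usual ue2common.h environment; the test function name is illustrative:

```c
#include "util/arch/x86/bitutils.h"
#include <assert.h>

// On a BMI2 build, the PEXT/PDEP paths must agree with the C loops bit-for-bit.
static void check_bmi2_matches_fallback(void) {
    u64a x = 0xdeadbeefcafef00dULL;
    u64a m = 0x00ff00ff00ff00ffULL;
    assert(compress64_impl(x, m) == compress64_impl_c(x, m)); // _pext_u64 vs loop
    assert(expand64_impl(x, m) == expand64_impl_c(x, m));     // _pdep_u64 vs loop
}
```
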
+
+/* returns the first set bit after begin (if not ~0U). If no bit is set after
+ * begin returns ~0U
+ */
+static really_inline
+u32 bf64_iterate_impl(u64a bitfield, u32 begin) {
+    if (begin != ~0U) {
+        /* switch off all bits at or below begin. Note: not legal to shift
+         * by the size of the datatype or larger. */
+        assert(begin <= 63);
+        bitfield &= ~((2ULL << begin) - 1);
+    }
+
+    if (!bitfield) {
+        return ~0U;
+    }
+
+    return ctz64_impl(bitfield);
+}
+
+static really_inline
+char bf64_set_impl(u64a *bitfield, u32 i) {
+    return bf64_set_impl_c(bitfield, i);
+}
+
+static really_inline
+void bf64_unset_impl(u64a *bitfield, u32 i) {
+    bf64_unset_impl_c(bitfield, i);
+}
+
+static really_inline
+u32 rank_in_mask32_impl(u32 mask, u32 bit) {
+    return rank_in_mask32_impl_c(mask, bit);
+}
+
+static really_inline
+u32 rank_in_mask64_impl(u64a mask, u32 bit) {
+    return rank_in_mask64_impl_c(mask, bit);
+}
+
+static really_inline
+u32 pext32_impl(u32 x, u32 mask) {
+#if defined(HAVE_BMI2)
+    // Intel BMI2 can do this operation in one instruction.
+    return _pext_u32(x, mask);
+#else
+    return pext32_impl_c(x, mask);
+#endif
+}
+
+static really_inline
+u64a pext64_impl(u64a x, u64a mask) {
+#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
+    // Intel BMI2 can do this operation in one instruction.
+    return _pext_u64(x, mask);
+#else
+    return pext64_impl_c(x, mask);
+#endif
+}
+
+#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
+static really_inline
+u64a pdep64(u64a x, u64a mask) {
+    return _pdep_u64(x, mask);
+}
+#endif
+
+#endif // BITUTILS_ARCH_X86_H
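
The portable pext loop packs the `x`-bits selected by `mask` in LSB-first order, which is exactly the `_pext` semantics the BMI2 path uses. A small worked example, assuming the usual ue2common.h environment; the function name is illustrative:

```c
#include "util/arch/x86/bitutils.h"
#include <assert.h>

// x = 10110010b, mask = 11110000b: the selected bits 4..7 of x are 1011b,
// so both the BMI2 path and the portable loop must return 0xb.
static void pext_example(void) {
    assert(pext32_impl_c(0xb2, 0xf0) == 0xb); // portable loop
    assert(pext32_impl(0xb2, 0xf0) == 0xb);   // dispatched path, same answer
}
```
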
diff --git a/src/util/bitutils.h b/src/util/bitutils.h
index c545ee18..651e5f93 100644
--- a/src/util/bitutils.h
+++ b/src/util/bitutils.h
@@ -33,6 +33,7 @@
 #ifndef BITUTILS_H
 #define BITUTILS_H
 
+#include "config.h"
 #include "ue2common.h"
 #include "popcount.h"
 #include "util/arch.h"
@@ -43,351 +44,88 @@
 #define DOUBLE_CASE_CLEAR 0xdfdf
 #define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL
 
+
+#if defined(_WIN32) || defined(_WIN64) || defined(ARCH_IA32) || defined(ARCH_X86_64)
+#include "util/arch/x86/bitutils.h"
+#endif
+
 static really_inline
 u32 clz32(u32 x) {
     assert(x); // behaviour not defined for x == 0
-#if defined(_WIN32)
-    unsigned long r;
-    _BitScanReverse(&r, x);
-    return 31 - r;
-#else
-    return (u32)__builtin_clz(x);
-#endif
+
+    return clz32_impl(x);
 }
 
 static really_inline
 u32 clz64(u64a x) {
     assert(x); // behaviour not defined for x == 0
-#if defined(_WIN64)
-    unsigned long r;
-    _BitScanReverse64(&r, x);
-    return 63 - r;
-#elif defined(_WIN32)
-    unsigned long x1 = (u32)x;
-    unsigned long x2 = (u32)(x >> 32);
-    unsigned long r;
-    if (x2) {
-        _BitScanReverse(&r, x2);
-        return (u32)(31 - r);
-    }
-    _BitScanReverse(&r, (u32)x1);
-    return (u32)(63 - r);
-#else
-    return (u32)__builtin_clzll(x);
-#endif
+
+    return clz64_impl(x);
 }
 
 // CTZ (count trailing zero) implementations.
 static really_inline
 u32 ctz32(u32 x) {
     assert(x); // behaviour not defined for x == 0
-#if defined(_WIN32)
-    unsigned long r;
-    _BitScanForward(&r, x);
-    return r;
-#else
-    return (u32)__builtin_ctz(x);
-#endif
+
+    return ctz32_impl(x);
 }
 
 static really_inline
 u32 ctz64(u64a x) {
     assert(x); // behaviour not defined for x == 0
-#if defined(_WIN64)
-    unsigned long r;
-    _BitScanForward64(&r, x);
-    return r;
-#elif defined(_WIN32)
-    unsigned long r;
-    if (_BitScanForward(&r, (u32)x)) {
-        return (u32)r;
-    }
-    _BitScanForward(&r, x >> 32);
-    return (u32)(r + 32);
-#else
-    return (u32)__builtin_ctzll(x);
-#endif
+
+    return ctz64_impl(x);
 }
 
 static really_inline
 u32 lg2(u32 x) {
-    if (!x) {
-        return 0;
-    }
-    return 31 - clz32(x);
+    return lg2_impl(x);
 }
 
 static really_inline
 u64a lg2_64(u64a x) {
-    if (!x) {
-        return 0;
-    }
-    return 63 - clz64(x);
+    return lg2_64_impl(x);
 }
 
 static really_inline
 u32 findAndClearLSB_32(u32 *v) {
-    assert(*v != 0); // behaviour not defined in this case
-#ifndef NO_ASM
-    u32 val = *v, offset;
-    __asm__ ("bsf %1, %0\n"
-             "btr %0, %1\n"
-             : "=r" (offset), "=r" (val)
-             : "1" (val));
-    *v = val;
-#else
-    u32 val = *v;
-    u32 offset = ctz32(val);
-    *v = val & (val - 1);
-#endif
-
-    assert(offset < 32);
-    return offset;
+    return findAndClearLSB_32_impl(v);
 }
 
 static really_inline
 u32 findAndClearLSB_64(u64a *v) {
-    assert(*v != 0); // behaviour not defined in this case
-
-#ifdef ARCH_64_BIT
-#if defined(ARCH_X86_64) && !defined(NO_ASM)
-    u64a val = *v, offset;
-    __asm__ ("bsfq %1, %0\n"
-             "btrq %0, %1\n"
-             : "=r" (offset), "=r" (val)
-             : "1" (val));
-    *v = val;
-#else
-    // generic variant using gcc's builtin on 64-bit
-    u64a val = *v, offset;
-    offset = ctz64(val);
-    *v = val & (val - 1);
-#endif // ARCH_X86_64
-#else
-    // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
-    // inline calls to __builtin_ctzll
-    u32 v1 = (u32)*v;
-    u32 v2 = (u32)(*v >> 32);
-    u32 offset;
-    if (v1) {
-        offset = findAndClearLSB_32(&v1);
-        *v = (u64a)v1 | ((u64a)v2 << 32);
-    } else {
-        offset = findAndClearLSB_32(&v2) + 32;
-        *v = (u64a)v2 << 32;
-    }
-#endif
-
-    assert(offset < 64);
-    return (u32)offset;
+    return findAndClearLSB_64_impl(v);
 }
 
 static really_inline
 u32 findAndClearMSB_32(u32 *v) {
-    assert(*v != 0); // behaviour not defined in this case
-#ifndef NO_ASM
-    u32 val = *v, offset;
-    __asm__ ("bsr %1, %0\n"
-             "btr %0, %1\n"
-             : "=r" (offset), "=r" (val)
-             : "1" (val));
-    *v = val;
-#else
-    u32 val = *v;
-    u32 offset = 31 - clz32(val);
-    *v = val & ~(1 << offset);
-#endif
-    assert(offset < 32);
-    return offset;
+    return findAndClearMSB_32_impl(v);
 }
 
 static really_inline
 u32 findAndClearMSB_64(u64a *v) {
-    assert(*v != 0); // behaviour not defined in this case
-
-#ifdef ARCH_64_BIT
-#if defined(ARCH_X86_64) && !defined(NO_ASM)
-    u64a val = *v, offset;
-    __asm__ ("bsrq %1, %0\n"
-             "btrq %0, %1\n"
-             : "=r" (offset), "=r" (val)
-             : "1" (val));
-    *v = val;
-#else
-    // generic variant using gcc's builtin on 64-bit
-    u64a val = *v, offset;
-    offset = 63 - clz64(val);
-    *v = val & ~(1ULL << offset);
-#endif // ARCH_X86_64
-#else
-    // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
-    // inline calls to __builtin_ctzll
-    u32 v1 = (u32)*v;
-    u32 v2 = (*v >> 32);
-    u32 offset;
-    if (v2) {
-        offset = findAndClearMSB_32(&v2) + 32;
-        *v = ((u64a)v2 << 32) | (u64a)v1;
-    } else {
-        offset = findAndClearMSB_32(&v1);
-        *v = (u64a)v1;
-    }
-#endif
-
-    assert(offset < 64);
-    return (u32)offset;
+    return findAndClearMSB_64_impl(v);
 }
 
 static really_inline
 u32 compress32(u32 x, u32 m) {
-#if defined(HAVE_BMI2)
-    // BMI2 has a single instruction for this operation.
-    return _pext_u32(x, m);
-#else
-
-    // Return zero quickly on trivial cases
-    if ((x & m) == 0) {
-        return 0;
-    }
-
-    u32 mk, mp, mv, t;
-
-    x &= m; // clear irrelevant bits
-
-    mk = ~m << 1; // we will count 0's to right
-    for (u32 i = 0; i < 5; i++) {
-        mp = mk ^ (mk << 1);
-        mp ^= mp << 2;
-        mp ^= mp << 4;
-        mp ^= mp << 8;
-        mp ^= mp << 16;
-
-        mv = mp & m; // bits to move
-        m = (m ^ mv) | (mv >> (1 << i)); // compress m
-        t = x & mv;
-        x = (x ^ t) | (t >> (1 << i)); // compress x
-        mk = mk & ~mp;
-    }
-
-    return x;
-#endif
+    return compress32_impl(x, m);
 }
 
 static really_inline
 u64a compress64(u64a x, u64a m) {
-#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
-    // BMI2 has a single instruction for this operation.
-    return _pext_u64(x, m);
-#else
-
-    // Return zero quickly on trivial cases
-    if ((x & m) == 0) {
-        return 0;
-    }
-
-    u64a mk, mp, mv, t;
-
-    x &= m; // clear irrelevant bits
-
-    mk = ~m << 1; // we will count 0's to right
-    for (u32 i = 0; i < 6; i++) {
-        mp = mk ^ (mk << 1);
-        mp ^= mp << 2;
-        mp ^= mp << 4;
-        mp ^= mp << 8;
-        mp ^= mp << 16;
-        mp ^= mp << 32;
-
-        mv = mp & m; // bits to move
-        m = (m ^ mv) | (mv >> (1 << i)); // compress m
-        t = x & mv;
-        x = (x ^ t) | (t >> (1 << i)); // compress x
-        mk = mk & ~mp;
-    }
-
-    return x;
-#endif
+    return compress64_impl(x, m);
 }
 
 static really_inline
 u32 expand32(u32 x, u32 m) {
-#if defined(HAVE_BMI2)
-    // BMI2 has a single instruction for this operation.
-    return _pdep_u32(x, m);
-#else
-
-    // Return zero quickly on trivial cases
-    if (!x || !m) {
-        return 0;
-    }
-
-    u32 m0, mk, mp, mv, t;
-    u32 array[5];
-
-    m0 = m; // save original mask
-    mk = ~m << 1; // we will count 0's to right
-
-    for (int i = 0; i < 5; i++) {
-        mp = mk ^ (mk << 1); // parallel suffix
-        mp = mp ^ (mp << 2);
-        mp = mp ^ (mp << 4);
-        mp = mp ^ (mp << 8);
-        mp = mp ^ (mp << 16);
-        mv = mp & m; // bits to move
-        array[i] = mv;
-        m = (m ^ mv) | (mv >> (1 << i)); // compress m
-        mk = mk & ~mp;
-    }
-
-    for (int i = 4; i >= 0; i--) {
-        mv = array[i];
-        t = x << (1 << i);
-        x = (x & ~mv) | (t & mv);
-    }
-
-    return x & m0; // clear out extraneous bits
-#endif
+    return expand32_impl(x, m);
 }
 
 static really_inline
 u64a expand64(u64a x, u64a m) {
-#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
-    // BMI2 has a single instruction for this operation.
-    return _pdep_u64(x, m);
-#else
-
-    // Return zero quickly on trivial cases
-    if (!x || !m) {
-        return 0;
-    }
-
-    u64a m0, mk, mp, mv, t;
-    u64a array[6];
-
-    m0 = m; // save original mask
-    mk = ~m << 1; // we will count 0's to right
-
-    for (int i = 0; i < 6; i++) {
-        mp = mk ^ (mk << 1); // parallel suffix
-        mp = mp ^ (mp << 2);
-        mp = mp ^ (mp << 4);
-        mp = mp ^ (mp << 8);
-        mp = mp ^ (mp << 16);
-        mp = mp ^ (mp << 32);
-        mv = mp & m; // bits to move
-        array[i] = mv;
-        m = (m ^ mv) | (mv >> (1 << i)); // compress m
-        mk = mk & ~mp;
-    }
-
-    for (int i = 5; i >= 0; i--) {
-        mv = array[i];
-        t = x << (1 << i);
-        x = (x & ~mv) | (t & mv);
-    }
-
-    return x & m0; // clear out extraneous bits
-#endif
+    return expand64_impl(x, m);
 }
 
 
@@ -396,97 +134,37 @@ u64a expand64(u64a x, u64a m) {
  */
 static really_inline
 u32 bf64_iterate(u64a bitfield, u32 begin) {
-    if (begin != ~0U) {
-        /* switch off all bits at or below begin. Note: not legal to shift by
-         * by size of the datatype or larger. */
-        assert(begin <= 63);
-        bitfield &= ~((2ULL << begin) - 1);
-    }
-
-    if (!bitfield) {
-        return ~0U;
-    }
-
-    return ctz64(bitfield);
+    return bf64_iterate_impl(bitfield, begin);
 }
 
 static really_inline
 char bf64_set(u64a *bitfield, u32 i) {
-    assert(i < 64);
-    u64a mask = 1ULL << i;
-    char was_set = !!(*bitfield & mask);
-    *bitfield |= mask;
-
-    return was_set;
+    return bf64_set_impl(bitfield, i);
 }
 
 static really_inline
 void bf64_unset(u64a *bitfield, u32 i) {
-    assert(i < 64);
-    *bitfield &= ~(1ULL << i);
+    bf64_unset_impl(bitfield, i);
 }
 
 static really_inline
 u32 rank_in_mask32(u32 mask, u32 bit) {
-    assert(bit < sizeof(u32) * 8);
-    assert(mask & (u32)(1U << bit));
-    mask &= (u32)(1U << bit) - 1;
-    return popcount32(mask);
+    return rank_in_mask32_impl(mask, bit);
 }
 
 static really_inline
 u32 rank_in_mask64(u64a mask, u32 bit) {
-    assert(bit < sizeof(u64a) * 8);
-    assert(mask & (u64a)(1ULL << bit));
-    mask &= (u64a)(1ULL << bit) - 1;
-    return popcount64(mask);
+    return rank_in_mask64_impl(mask, bit);
 }
 
 static really_inline
 u32 pext32(u32 x, u32 mask) {
-#if defined(HAVE_BMI2)
-    // Intel BMI2 can do this operation in one instruction.
-    return _pext_u32(x, mask);
-#else
-
-    u32 result = 0, num = 1;
-    while (mask != 0) {
-        u32 bit = findAndClearLSB_32(&mask);
-        if (x & (1U << bit)) {
-            assert(num != 0); // more than 32 bits!
-            result |= num;
-        }
-        num <<= 1;
-    }
-    return result;
-#endif
+    return pext32_impl(x, mask);
 }
 
 static really_inline
 u64a pext64(u64a x, u64a mask) {
-#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
-    // Intel BMI2 can do this operation in one instruction.
-    return _pext_u64(x, mask);
-#else
-
-    u32 result = 0, num = 1;
-    while (mask != 0) {
-        u32 bit = findAndClearLSB_64(&mask);
-        if (x & (1ULL << bit)) {
-            assert(num != 0); // more than 32 bits!
-            result |= num;
-        }
-        num <<= 1;
-    }
-    return result;
-#endif
+    return pext64_impl(x, mask);
 }
 
-#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
-static really_inline
-u64a pdep64(u64a x, u64a mask) {
-    return _pdep_u64(x, mask);
-}
-#endif
-
 #endif // BITUTILS_H
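
With the wrappers reduced to one-line forwards, the public API is behaviourally unchanged. A minimal usage sketch of the set-bit iteration and rank primitives through the new dispatch chain, assuming the usual ue2common.h environment; the function name is illustrative:

```c
#include "util/bitutils.h"
#include <assert.h>

// Walk the set bits of a word with bf64_iterate (seeded with ~0U) and check
// that rank_in_mask64 reports how many set bits lie below each one.
static void bitfield_iteration_example(void) {
    u64a bits = (1ULL << 3) | (1ULL << 17) | (1ULL << 63);
    u32 count = 0;
    for (u32 i = bf64_iterate(bits, ~0U); i != ~0U; i = bf64_iterate(bits, i)) {
        assert(rank_in_mask64(bits, i) == count);
        count++;
    }
    assert(count == 3); // visited bits 3, 17, 63 in ascending order
}
```
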