/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Bit-twiddling primitives (ctz, compress etc)
 */

#ifndef BITUTILS_H
#define BITUTILS_H

#include "ue2common.h"
#include "popcount.h"
#include "util/arch.h"
#include "util/intrinsics.h"

#define CASE_BIT 0x20
#define CASE_CLEAR 0xdf
#define DOUBLE_CASE_CLEAR 0xdfdf
#define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL

static really_inline
u32 clz32(u32 x) {
    assert(x); // behaviour not defined for x == 0
#if defined(_WIN32)
    unsigned long r;
    _BitScanReverse(&r, x);
    return 31 - r;
#else
    return (u32)__builtin_clz(x);
#endif
}

static really_inline
u32 clz64(u64a x) {
    assert(x); // behaviour not defined for x == 0
#if defined(_WIN64)
    unsigned long r;
    _BitScanReverse64(&r, x);
    return 63 - r;
#elif defined(_WIN32)
    unsigned long x1 = (u32)x;
    unsigned long x2 = (u32)(x >> 32);
    unsigned long r;
    if (x2) {
        _BitScanReverse(&r, x2);
        return (u32)(31 - r);
    }
    _BitScanReverse(&r, (u32)x1);
    return (u32)(63 - r);
#else
    return (u32)__builtin_clzll(x);
#endif
}
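
/*
 * Usage sketch (illustrative values only): the CLZ functions return the
 * number of leading zero bits, i.e. 31 (or 63) minus the index of the most
 * significant set bit; zero input is undefined, hence the asserts above.
 *
 *     clz32(0x80000000) == 0;  clz32(1) == 31;
 *     clz64(0x0000000100000000ULL) == 31;
 */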

// CTZ (count trailing zero) implementations.

static really_inline
u32 ctz32(u32 x) {
    assert(x); // behaviour not defined for x == 0
#if defined(_WIN32)
    unsigned long r;
    _BitScanForward(&r, x);
    return r;
#else
    return (u32)__builtin_ctz(x);
#endif
}

static really_inline
u32 ctz64(u64a x) {
    assert(x); // behaviour not defined for x == 0
#if defined(_WIN64)
    unsigned long r;
    _BitScanForward64(&r, x);
    return r;
#elif defined(_WIN32)
    unsigned long r;
    if (_BitScanForward(&r, (u32)x)) {
        return (u32)r;
    }
    _BitScanForward(&r, (u32)(x >> 32));
    return (u32)(r + 32);
#else
    return (u32)__builtin_ctzll(x);
#endif
}

static really_inline
u32 lg2(u32 x) {
    if (!x) {
        return 0;
    }
    return 31 - clz32(x); // floor of log2 for non-zero x
}

static really_inline
u64a lg2_64(u64a x) {
    if (!x) {
        return 0;
    }
    return 63 - clz64(x); // floor of log2 for non-zero x
}

static really_inline
u32 findAndClearLSB_32(u32 *v) {
    assert(*v != 0); // behaviour not defined in this case
#ifndef NO_ASM
    u32 val = *v, offset;
    __asm__ ("bsf %1, %0\n"
             "btr %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    u32 val = *v;
    u32 offset = ctz32(val);
    *v = val & (val - 1); // clear the lowest set bit
#endif

    assert(offset < 32);
    return offset;
}

static really_inline
u32 findAndClearLSB_64(u64a *v) {
    assert(*v != 0); // behaviour not defined in this case

#ifdef ARCH_64_BIT
#if defined(ARCH_X86_64) && !defined(NO_ASM)
    u64a val = *v, offset;
    __asm__ ("bsfq %1, %0\n"
             "btrq %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    // generic variant using gcc's builtin on 64-bit
    u64a val = *v, offset;
    offset = ctz64(val);
    *v = val & (val - 1); // clear the lowest set bit
#endif // ARCH_X86_64
#else
    // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
    // inline calls to __builtin_ctzll
    u32 v1 = (u32)*v;
    u32 v2 = (u32)(*v >> 32);
    u32 offset;
    if (v1) {
        offset = findAndClearLSB_32(&v1);
        *v = (u64a)v1 | ((u64a)v2 << 32);
    } else {
        offset = findAndClearLSB_32(&v2) + 32;
        *v = (u64a)v2 << 32;
    }
#endif

    assert(offset < 64);
    return (u32)offset;
}

static really_inline
u32 findAndClearMSB_32(u32 *v) {
    assert(*v != 0); // behaviour not defined in this case
#ifndef NO_ASM
    u32 val = *v, offset;
    __asm__ ("bsr %1, %0\n"
             "btr %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    u32 val = *v;
    u32 offset = 31 - clz32(val);
    *v = val & ~(1U << offset); // unsigned literal: 1 << 31 would overflow int
#endif
    assert(offset < 32);
    return offset;
}

static really_inline
u32 findAndClearMSB_64(u64a *v) {
    assert(*v != 0); // behaviour not defined in this case

#ifdef ARCH_64_BIT
#if defined(ARCH_X86_64) && !defined(NO_ASM)
    u64a val = *v, offset;
    __asm__ ("bsrq %1, %0\n"
             "btrq %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    // generic variant using gcc's builtin on 64-bit
    u64a val = *v, offset;
    offset = 63 - clz64(val);
    *v = val & ~(1ULL << offset);
#endif // ARCH_X86_64
#else
    // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
    // inline calls to __builtin_clzll
    u32 v1 = (u32)*v;
    u32 v2 = (u32)(*v >> 32);
    u32 offset;
    if (v2) {
        offset = findAndClearMSB_32(&v2) + 32;
        *v = ((u64a)v2 << 32) | (u64a)v1;
    } else {
        offset = findAndClearMSB_32(&v1);
        *v = (u64a)v1;
    }
#endif

    assert(offset < 64);
    return (u32)offset;
}
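
/*
 * Usage sketch (illustrative values only): the findAndClear helpers combine
 * a bit scan with clearing the bit they report, which is the usual way to
 * walk the set bits of a mask.
 *
 *     u32 v = 0x28;                    // bits 3 and 5 set
 *     u32 a = findAndClearLSB_32(&v);  // a == 3, v == 0x20
 *     u32 b = findAndClearLSB_32(&v);  // b == 5, v == 0
 */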

static really_inline
u32 compress32(u32 x, u32 m) {
#if defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pext_u32(x, m);
#else

    // Return zero quickly on trivial cases
    if ((x & m) == 0) {
        return 0;
    }

    u32 mk, mp, mv, t;

    x &= m; // clear irrelevant bits

    mk = ~m << 1; // we will count 0's to right
    for (u32 i = 0; i < 5; i++) {
        mp = mk ^ (mk << 1);
        mp ^= mp << 2;
        mp ^= mp << 4;
        mp ^= mp << 8;
        mp ^= mp << 16;

        mv = mp & m; // bits to move
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        t = x & mv;
        x = (x ^ t) | (t >> (1 << i)); // compress x
        mk = mk & ~mp;
    }

    return x;
#endif
}

static really_inline
u64a compress64(u64a x, u64a m) {
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pext_u64(x, m);
#else

    // Return zero quickly on trivial cases
    if ((x & m) == 0) {
        return 0;
    }

    u64a mk, mp, mv, t;

    x &= m; // clear irrelevant bits

    mk = ~m << 1; // we will count 0's to right
    for (u32 i = 0; i < 6; i++) {
        mp = mk ^ (mk << 1);
        mp ^= mp << 2;
        mp ^= mp << 4;
        mp ^= mp << 8;
        mp ^= mp << 16;
        mp ^= mp << 32;

        mv = mp & m; // bits to move
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        t = x & mv;
        x = (x ^ t) | (t >> (1 << i)); // compress x
        mk = mk & ~mp;
    }

    return x;
#endif
}

static really_inline
u32 expand32(u32 x, u32 m) {
#if defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pdep_u32(x, m);
#else

    // Return zero quickly on trivial cases
    if (!x || !m) {
        return 0;
    }

    u32 m0, mk, mp, mv, t;
    u32 array[5];

    m0 = m; // save original mask
    mk = ~m << 1; // we will count 0's to right

    for (int i = 0; i < 5; i++) {
        mp = mk ^ (mk << 1); // parallel suffix
        mp = mp ^ (mp << 2);
        mp = mp ^ (mp << 4);
        mp = mp ^ (mp << 8);
        mp = mp ^ (mp << 16);
        mv = mp & m; // bits to move
        array[i] = mv;
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        mk = mk & ~mp;
    }

    for (int i = 4; i >= 0; i--) {
        mv = array[i];
        t = x << (1 << i);
        x = (x & ~mv) | (t & mv);
    }

    return x & m0; // clear out extraneous bits
#endif
}

static really_inline
u64a expand64(u64a x, u64a m) {
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pdep_u64(x, m);
#else

    // Return zero quickly on trivial cases
    if (!x || !m) {
        return 0;
    }

    u64a m0, mk, mp, mv, t;
    u64a array[6];

    m0 = m; // save original mask
    mk = ~m << 1; // we will count 0's to right

    for (int i = 0; i < 6; i++) {
        mp = mk ^ (mk << 1); // parallel suffix
        mp = mp ^ (mp << 2);
        mp = mp ^ (mp << 4);
        mp = mp ^ (mp << 8);
        mp = mp ^ (mp << 16);
        mp = mp ^ (mp << 32);
        mv = mp & m; // bits to move
        array[i] = mv;
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        mk = mk & ~mp;
    }

    for (int i = 5; i >= 0; i--) {
        mv = array[i];
        t = x << (1 << i);
        x = (x & ~mv) | (t & mv);
    }

    return x & m0; // clear out extraneous bits
#endif
}
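
/*
 * Usage sketch (illustrative values only): compress32 behaves like PEXT
 * (gather the bits of x selected by m into the low-order positions) and
 * expand32 like the inverse PDEP, so a round trip restores x & m.
 *
 *     compress32(0xdeadbeef, 0x000000f0) == 0xe;  // bits 4..7 of x
 *     expand32(0xe, 0x000000f0) == 0xe0;
 *     expand32(compress32(x, m), m) == (x & m);
 */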

/* Returns the first set bit strictly after begin (pass ~0U to start from bit
 * 0). If no bit is set after begin, returns ~0U. */
static really_inline
u32 bf64_iterate(u64a bitfield, u32 begin) {
    if (begin != ~0U) {
        /* switch off all bits at or below begin. Note: not legal to shift by
         * the size of the datatype or larger. */
        assert(begin <= 63);
        bitfield &= ~((2ULL << begin) - 1);
    }

    if (!bitfield) {
        return ~0U;
    }

    return ctz64(bitfield);
}

static really_inline
char bf64_set(u64a *bitfield, u32 i) {
    assert(i < 64);
    u64a mask = 1ULL << i;
    char was_set = !!(*bitfield & mask);
    *bitfield |= mask;

    return was_set;
}

static really_inline
void bf64_unset(u64a *bitfield, u32 i) {
    assert(i < 64);
    *bitfield &= ~(1ULL << i);
}

static really_inline
u32 rank_in_mask32(u32 mask, u32 bit) {
    assert(bit < sizeof(u32) * 8);
    assert(mask & (u32)(1U << bit));
    mask &= (u32)(1U << bit) - 1; // keep only the bits below 'bit'
    return popcount32(mask);
}

static really_inline
u32 rank_in_mask64(u64a mask, u32 bit) {
    assert(bit < sizeof(u64a) * 8);
    assert(mask & (u64a)(1ULL << bit));
    mask &= (u64a)(1ULL << bit) - 1; // keep only the bits below 'bit'
    return popcount64(mask);
}

static really_inline
u32 pext32(u32 x, u32 mask) {
#if defined(HAVE_BMI2)
    // Intel BMI2 can do this operation in one instruction.
    return _pext_u32(x, mask);
#else

    u32 result = 0, num = 1;
    while (mask != 0) {
        u32 bit = findAndClearLSB_32(&mask);
        if (x & (1U << bit)) {
            assert(num != 0); // more than 32 bits!
            result |= num;
        }
        num <<= 1;
    }
    return result;
#endif
}

static really_inline
u64a pext64(u64a x, u64a mask) {
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
    // Intel BMI2 can do this operation in one instruction.
    return _pext_u64(x, mask);
#else

    // Note: result and num must be 64-bit; 32-bit accumulators would silently
    // truncate the result for masks with more than 32 set bits.
    u64a result = 0, num = 1;
    while (mask != 0) {
        u32 bit = findAndClearLSB_64(&mask);
        if (x & (1ULL << bit)) {
            assert(num != 0); // more than 64 bits!
            result |= num;
        }
        num <<= 1;
    }
    return result;
#endif
}

#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
static really_inline
u64a pdep64(u64a x, u64a mask) {
    return _pdep_u64(x, mask);
}
#endif

#endif // BITUTILS_H
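
/*
 * Usage sketch (illustrative values only): bf64_set and bf64_iterate maintain
 * and walk a 64-bit bitfield, and rank_in_mask32 gives a bit's rank among the
 * set bits of a mask (the queried bit must itself be set).
 *
 *     u64a bf = 0;
 *     bf64_set(&bf, 4);             // returns 0 (was unset), bf == 0x10
 *     bf64_set(&bf, 6);             // bf == 0x50
 *     bf64_iterate(bf, ~0U) == 4;   // pass ~0U to fetch the first set bit
 *     bf64_iterate(bf, 4) == 6;
 *     bf64_iterate(bf, 6) == ~0U;   // no set bit above 6
 *     rank_in_mask32(0x50, 6) == 1; // bit 6 is the second set bit
 */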