mirror of https://github.com/VectorCamp/vectorscan.git

move x86 bitutils.h implementations to util/arch/x86/bitutils.h

parent 8ed5f4ac75
commit aac1f0f1dc
src/util/arch/common/bitutils.h | 353 (new file)

@@ -0,0 +1,353 @@
/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Bit-twiddling primitives (ctz, compress etc)
 */

#ifndef BITUTILS_ARCH_COMMON_H
#define BITUTILS_ARCH_COMMON_H

#include "ue2common.h"
#include "util/popcount.h"

static really_inline
u32 clz32_impl_c(u32 x) {
    return (u32)__builtin_clz(x);
}

static really_inline
u32 clz64_impl_c(u64a x) {
    return (u32)__builtin_clzll(x);
}

// CTZ (count trailing zero) implementations.
static really_inline
u32 ctz32_impl_c(u32 x) {
    return (u32)__builtin_ctz(x);
}

static really_inline
u32 ctz64_impl_c(u64a x) {
    return (u32)__builtin_ctzll(x);
}

static really_inline
u32 lg2_impl_c(u32 x) {
    if (!x) {
        return 0;
    }
    return 31 - clz32_impl_c(x);
}

static really_inline
u64a lg2_64_impl_c(u64a x) {
    if (!x) {
        return 0;
    }
    return 63 - clz64_impl_c(x);
}

static really_inline
u32 findAndClearLSB_32_impl_c(u32 *v) {
    u32 val = *v;
    u32 offset = ctz32_impl_c(val);
    *v = val & (val - 1);

    assert(offset < 32);
    return offset;
}

static really_inline
u32 findAndClearLSB_64_impl_c(u64a *v) {
#ifdef ARCH_64_BIT
    // generic variant using gcc's builtin on 64-bit
    u64a val = *v, offset;
    offset = ctz64_impl_c(val);
    *v = val & (val - 1);
#else
    // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
    // inline calls to __builtin_ctzll
    u32 v1 = (u32)*v;
    u32 v2 = (u32)(*v >> 32);
    u32 offset;
    if (v1) {
        offset = findAndClearLSB_32_impl_c(&v1);
        *v = (u64a)v1 | ((u64a)v2 << 32);
    } else {
        offset = findAndClearLSB_32_impl_c(&v2) + 32;
        *v = (u64a)v2 << 32;
    }
#endif

    assert(offset < 64);
    return (u32)offset;
}

static really_inline
u32 findAndClearMSB_32_impl_c(u32 *v) {
    u32 val = *v;
    u32 offset = 31 - clz32_impl_c(val);
    *v = val & ~(1U << offset);

    assert(offset < 32);
    return offset;
}

static really_inline
u32 findAndClearMSB_64_impl_c(u64a *v) {
#ifdef ARCH_64_BIT
    // generic variant using gcc's builtin on 64-bit
    u64a val = *v, offset;
    offset = 63 - clz64_impl_c(val);
    *v = val & ~(1ULL << offset);
#else
    // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
    // inline calls to __builtin_ctzll
    u32 v1 = (u32)*v;
    u32 v2 = (u32)(*v >> 32);
    u32 offset;
    if (v2) {
        offset = findAndClearMSB_32_impl_c(&v2) + 32;
        *v = ((u64a)v2 << 32) | (u64a)v1;
    } else {
        offset = findAndClearMSB_32_impl_c(&v1);
        *v = (u64a)v1;
    }
#endif

    assert(offset < 64);
    return (u32)offset;
}

static really_inline
u32 compress32_impl_c(u32 x, u32 m) {
    // Return zero quickly on trivial cases
    if ((x & m) == 0) {
        return 0;
    }

    u32 mk, mp, mv, t;

    x &= m; // clear irrelevant bits

    mk = ~m << 1; // we will count 0's to right
    for (u32 i = 0; i < 5; i++) {
        mp = mk ^ (mk << 1);
        mp ^= mp << 2;
        mp ^= mp << 4;
        mp ^= mp << 8;
        mp ^= mp << 16;

        mv = mp & m; // bits to move
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        t = x & mv;
        x = (x ^ t) | (t >> (1 << i)); // compress x
        mk = mk & ~mp;
    }

    return x;
}

static really_inline
u64a compress64_impl_c(u64a x, u64a m) {
    // Return zero quickly on trivial cases
    if ((x & m) == 0) {
        return 0;
    }

    u64a mk, mp, mv, t;

    x &= m; // clear irrelevant bits

    mk = ~m << 1; // we will count 0's to right
    for (u32 i = 0; i < 6; i++) {
        mp = mk ^ (mk << 1);
        mp ^= mp << 2;
        mp ^= mp << 4;
        mp ^= mp << 8;
        mp ^= mp << 16;
        mp ^= mp << 32;

        mv = mp & m; // bits to move
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        t = x & mv;
        x = (x ^ t) | (t >> (1 << i)); // compress x
        mk = mk & ~mp;
    }

    return x;
}

static really_inline
u32 expand32_impl_c(u32 x, u32 m) {
    // Return zero quickly on trivial cases
    if (!x || !m) {
        return 0;
    }

    u32 m0, mk, mp, mv, t;
    u32 array[5];

    m0 = m;       // save original mask
    mk = ~m << 1; // we will count 0's to right

    for (int i = 0; i < 5; i++) {
        mp = mk ^ (mk << 1); // parallel suffix
        mp = mp ^ (mp << 2);
        mp = mp ^ (mp << 4);
        mp = mp ^ (mp << 8);
        mp = mp ^ (mp << 16);
        mv = mp & m; // bits to move
        array[i] = mv;
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        mk = mk & ~mp;
    }

    for (int i = 4; i >= 0; i--) {
        mv = array[i];
        t = x << (1 << i);
        x = (x & ~mv) | (t & mv);
    }

    return x & m0; // clear out extraneous bits
}

static really_inline
u64a expand64_impl_c(u64a x, u64a m) {
    // Return zero quickly on trivial cases
    if (!x || !m) {
        return 0;
    }

    u64a m0, mk, mp, mv, t;
    u64a array[6];

    m0 = m;       // save original mask
    mk = ~m << 1; // we will count 0's to right

    for (int i = 0; i < 6; i++) {
        mp = mk ^ (mk << 1); // parallel suffix
        mp = mp ^ (mp << 2);
        mp = mp ^ (mp << 4);
        mp = mp ^ (mp << 8);
        mp = mp ^ (mp << 16);
        mp = mp ^ (mp << 32);
        mv = mp & m; // bits to move
        array[i] = mv;
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        mk = mk & ~mp;
    }

    for (int i = 5; i >= 0; i--) {
        mv = array[i];
        t = x << (1 << i);
        x = (x & ~mv) | (t & mv);
    }

    return x & m0; // clear out extraneous bits
}

/* returns the first set bit after begin (if not ~0U). If no bit is set after
 * begin returns ~0U
 */
static really_inline
u32 bf64_iterate_impl_c(u64a bitfield, u32 begin) {
    if (begin != ~0U) {
        /* switch off all bits at or below begin. Note: not legal to shift by
         * the size of the datatype or larger. */
        assert(begin <= 63);
        bitfield &= ~((2ULL << begin) - 1);
    }

    if (!bitfield) {
        return ~0U;
    }

    return ctz64_impl_c(bitfield);
}

static really_inline
char bf64_set_impl_c(u64a *bitfield, u32 i) {
    u64a mask = 1ULL << i;
    char was_set = !!(*bitfield & mask);
    *bitfield |= mask;

    return was_set;
}

static really_inline
void bf64_unset_impl_c(u64a *bitfield, u32 i) {
    *bitfield &= ~(1ULL << i);
}

static really_inline
u32 rank_in_mask32_impl_c(u32 mask, u32 bit) {
    mask &= (u32)(1U << bit) - 1;
    return popcount32(mask);
}

static really_inline
u32 rank_in_mask64_impl_c(u64a mask, u32 bit) {
    mask &= (u64a)(1ULL << bit) - 1;
    return popcount64(mask);
}

static really_inline
u32 pext32_impl_c(u32 x, u32 mask) {
    u32 result = 0, num = 1;
    while (mask != 0) {
        u32 bit = findAndClearLSB_32_impl_c(&mask);
        if (x & (1U << bit)) {
            assert(num != 0); // more than 32 bits!
            result |= num;
        }
        num <<= 1;
    }
    return result;
}

static really_inline
u64a pext64_impl_c(u64a x, u64a mask) {
    u64a result = 0, num = 1;
    while (mask != 0) {
        u32 bit = findAndClearLSB_64_impl_c(&mask);
        if (x & (1ULL << bit)) {
            assert(num != 0); // more than 64 bits!
            result |= num;
        }
        num <<= 1;
    }
    return result;
}

#endif // BITUTILS_ARCH_COMMON_H
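The compress/expand fallbacks above are the parallel-suffix routines from Hacker's Delight; compress32_impl_c computes the same result as a PEXT-style gather and expand32_impl_c the same as a PDEP-style scatter. A minimal standalone sanity check of those semantics (naive_compress32/naive_expand32 are hypothetical reference loops written for this note, not code from the commit; plain stdint types stand in for ue2common's u32):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Gather the bits of x selected by m into the low bits of the result. */
static uint32_t naive_compress32(uint32_t x, uint32_t m) {
    uint32_t out = 0, bit = 0;
    for (uint32_t i = 0; i < 32; i++) {
        if (m & (1U << i)) {          // mask selects this source bit
            if (x & (1U << i)) {
                out |= 1U << bit;     // pack it at the next low position
            }
            bit++;
        }
    }
    return out;
}

/* Scatter the low bits of x into the positions selected by m. */
static uint32_t naive_expand32(uint32_t x, uint32_t m) {
    uint32_t out = 0, bit = 0;
    for (uint32_t i = 0; i < 32; i++) {
        if (m & (1U << i)) {
            if (x & (1U << bit)) {
                out |= 1U << i;
            }
            bit++;
        }
    }
    return out;
}

int main(void) {
    uint32_t x = 0xdeadbeef, m = 0x0f0f00ff;
    uint32_t c = naive_compress32(x, m);
    // expand is a right inverse of compress on the masked bits:
    assert(naive_expand32(c, m) == (x & m));
    printf("compress32(0x%x, 0x%x) == 0x%x\n",
           (unsigned)x, (unsigned)m, (unsigned)c);
    return 0;
}

The round-trip invariant expand(compress(x, m), m) == (x & m) is what allows the BMI2 and portable paths to substitute for one another.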
src/util/arch/x86/bitutils.h | 304 (new file)

@@ -0,0 +1,304 @@
/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Bit-twiddling primitives (ctz, compress etc)
 */

#ifndef BITUTILS_ARCH_X86_H
#define BITUTILS_ARCH_X86_H

#include "ue2common.h"
#include "util/popcount.h"
#include "util/arch.h"
#include "util/intrinsics.h"

#include "util/arch/common/bitutils.h"

static really_inline
u32 clz32_impl(u32 x) {
#if defined(_WIN32)
    unsigned long r;
    _BitScanReverse(&r, x);
    return 31 - r;
#else
    return clz32_impl_c(x);
#endif
}

static really_inline
u32 clz64_impl(u64a x) {
#if defined(_WIN64)
    unsigned long r;
    _BitScanReverse64(&r, x);
    return 63 - r;
#elif defined(_WIN32)
    unsigned long x1 = (u32)x;
    unsigned long x2 = (u32)(x >> 32);
    unsigned long r;
    if (x2) {
        _BitScanReverse(&r, x2);
        return (u32)(31 - r);
    }
    _BitScanReverse(&r, (u32)x1);
    return (u32)(63 - r);
#else
    return clz64_impl_c(x);
#endif
}

// CTZ (count trailing zero) implementations.
static really_inline
u32 ctz32_impl(u32 x) {
#if defined(_WIN32)
    unsigned long r;
    _BitScanForward(&r, x);
    return r;
#else
    return ctz32_impl_c(x);
#endif
}

static really_inline
u32 ctz64_impl(u64a x) {
#if defined(_WIN64)
    unsigned long r;
    _BitScanForward64(&r, x);
    return r;
#elif defined(_WIN32)
    unsigned long r;
    if (_BitScanForward(&r, (u32)x)) {
        return (u32)r;
    }
    _BitScanForward(&r, (u32)(x >> 32));
    return (u32)(r + 32);
#else
    return ctz64_impl_c(x);
#endif
}

static really_inline
u32 lg2_impl(u32 x) {
    return lg2_impl_c(x);
}

static really_inline
u64a lg2_64_impl(u64a x) {
    return lg2_64_impl_c(x);
}

static really_inline
u32 findAndClearLSB_32_impl(u32 *v) {
#ifndef NO_ASM
    u32 val = *v, offset;
    __asm__ ("bsf %1, %0\n"
             "btr %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;

    assert(offset < 32);
    return offset;
#else
    return findAndClearLSB_32_impl_c(v);
#endif
}

static really_inline
u32 findAndClearLSB_64_impl(u64a *v) {
#ifdef ARCH_64_BIT
#if !defined(NO_ASM)
    u64a val = *v, offset;
    __asm__ ("bsfq %1, %0\n"
             "btrq %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    // generic variant using gcc's builtin on 64-bit
    u64a val = *v, offset;
    offset = ctz64_impl(val);
    *v = val & (val - 1);
#endif // NO_ASM
    assert(offset < 64);
    return (u32)offset;
#else
    return findAndClearLSB_64_impl_c(v);
#endif
}

static really_inline
u32 findAndClearMSB_32_impl(u32 *v) {
#if !defined(NO_ASM)
    u32 val = *v, offset;
    __asm__ ("bsr %1, %0\n"
             "btr %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    u32 val = *v;
    u32 offset = 31 - clz32_impl(val);
    *v = val & ~(1U << offset);
#endif
    assert(offset < 32);
    return offset;
}

static really_inline
u32 findAndClearMSB_64_impl(u64a *v) {
#ifdef ARCH_64_BIT
#if !defined(NO_ASM)
    u64a val = *v, offset;
    __asm__ ("bsrq %1, %0\n"
             "btrq %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    // generic variant using gcc's builtin on 64-bit
    u64a val = *v, offset;
    offset = 63 - clz64_impl(val);
    *v = val & ~(1ULL << offset);
#endif // NO_ASM
    assert(offset < 64);
    return (u32)offset;
#else
    return findAndClearMSB_64_impl_c(v);
#endif
}

static really_inline
u32 compress32_impl(u32 x, u32 m) {
#if defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pext_u32(x, m);
#else
    return compress32_impl_c(x, m);
#endif
}

static really_inline
u64a compress64_impl(u64a x, u64a m) {
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pext_u64(x, m);
#else
    return compress64_impl_c(x, m);
#endif
}

static really_inline
u32 expand32_impl(u32 x, u32 m) {
#if defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pdep_u32(x, m);
#else
    return expand32_impl_c(x, m);
#endif
}

static really_inline
u64a expand64_impl(u64a x, u64a m) {
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pdep_u64(x, m);
#else
    return expand64_impl_c(x, m);
#endif
}

/* returns the first set bit after begin (if not ~0U). If no bit is set after
 * begin returns ~0U
 */
static really_inline
u32 bf64_iterate_impl(u64a bitfield, u32 begin) {
    if (begin != ~0U) {
        /* switch off all bits at or below begin. Note: not legal to shift by
         * the size of the datatype or larger. */
        assert(begin <= 63);
        bitfield &= ~((2ULL << begin) - 1);
    }

    if (!bitfield) {
        return ~0U;
    }

    return ctz64_impl(bitfield);
}

static really_inline
char bf64_set_impl(u64a *bitfield, u32 i) {
    return bf64_set_impl_c(bitfield, i);
}

static really_inline
void bf64_unset_impl(u64a *bitfield, u32 i) {
    return bf64_unset_impl_c(bitfield, i);
}

static really_inline
u32 rank_in_mask32_impl(u32 mask, u32 bit) {
    return rank_in_mask32_impl_c(mask, bit);
}

static really_inline
u32 rank_in_mask64_impl(u64a mask, u32 bit) {
    return rank_in_mask64_impl_c(mask, bit);
}

static really_inline
u32 pext32_impl(u32 x, u32 mask) {
#if defined(HAVE_BMI2)
    // Intel BMI2 can do this operation in one instruction.
    return _pext_u32(x, mask);
#else
    return pext32_impl_c(x, mask);
#endif
}

static really_inline
u64a pext64_impl(u64a x, u64a mask) {
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
    // Intel BMI2 can do this operation in one instruction.
    return _pext_u64(x, mask);
#else
    return pext64_impl_c(x, mask);
#endif
}

#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
static really_inline
u64a pdep64(u64a x, u64a mask) {
    return _pdep_u64(x, mask);
}
#endif

#endif // BITUTILS_ARCH_X86_H
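Note the layering this commit introduces: the portable *_impl_c routines live in util/arch/common/bitutils.h, this header overrides them with x86 inline asm (BSF/BSR/BTR), MSVC _BitScan* intrinsics, or BMI2 PEXT/PDEP where available, and util/bitutils.h (the diff below) dispatches to whichever *_impl set was included. The findAndClear* helpers are destructive by design: each call reports and clears one bit, so looping on a scratch copy until it hits zero visits every set bit exactly once, which is how pext32_impl_c walks its mask. A small standalone illustration of that idiom (hypothetical harness written for this note, assuming a GCC-compatible compiler for the builtin, not code from the commit):

#include <stdint.h>
#include <stdio.h>

/* Same destructive-iteration idiom as findAndClearLSB_32_impl_c:
 * report the lowest set bit, then clear it with val & (val - 1). */
static unsigned find_and_clear_lsb32(uint32_t *v) {
    uint32_t val = *v;
    unsigned offset = (unsigned)__builtin_ctz(val);
    *v = val & (val - 1);
    return offset;
}

int main(void) {
    uint32_t mask = 0x8124; // bits 2, 5, 8, 15
    while (mask) {
        printf("bit %u\n", find_and_clear_lsb32(&mask)); // 2, 5, 8, 15
    }
    return 0;
}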
src/util/bitutils.h

@@ -33,6 +33,7 @@
 #ifndef BITUTILS_H
 #define BITUTILS_H
 
+#include "config.h"
 #include "ue2common.h"
 #include "popcount.h"
 #include "util/arch.h"
@@ -43,351 +44,88 @@
 #define DOUBLE_CASE_CLEAR 0xdfdf
 #define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL
 
+
+#if defined(_WIN32) || defined(_WIN64) || defined(ARCH_IA32) || defined(ARCH_X86_64)
+#include "util/arch/x86/bitutils.h"
+#endif
+
 static really_inline
 u32 clz32(u32 x) {
     assert(x); // behaviour not defined for x == 0
-#if defined(_WIN32)
-    unsigned long r;
-    _BitScanReverse(&r, x);
-    return 31 - r;
-#else
-    return (u32)__builtin_clz(x);
-#endif
+    return clz32_impl(x);
 }
 
 static really_inline
 u32 clz64(u64a x) {
     assert(x); // behaviour not defined for x == 0
-#if defined(_WIN64)
-    unsigned long r;
-    _BitScanReverse64(&r, x);
-    return 63 - r;
-#elif defined(_WIN32)
-    unsigned long x1 = (u32)x;
-    unsigned long x2 = (u32)(x >> 32);
-    unsigned long r;
-    if (x2) {
-        _BitScanReverse(&r, x2);
-        return (u32)(31 - r);
-    }
-    _BitScanReverse(&r, (u32)x1);
-    return (u32)(63 - r);
-#else
-    return (u32)__builtin_clzll(x);
-#endif
+    return clz64_impl(x);
 }
 
 // CTZ (count trailing zero) implementations.
 static really_inline
 u32 ctz32(u32 x) {
     assert(x); // behaviour not defined for x == 0
-#if defined(_WIN32)
-    unsigned long r;
-    _BitScanForward(&r, x);
-    return r;
-#else
-    return (u32)__builtin_ctz(x);
-#endif
+    return ctz32_impl(x);
 }
 
 static really_inline
 u32 ctz64(u64a x) {
     assert(x); // behaviour not defined for x == 0
-#if defined(_WIN64)
-    unsigned long r;
-    _BitScanForward64(&r, x);
-    return r;
-#elif defined(_WIN32)
-    unsigned long r;
-    if (_BitScanForward(&r, (u32)x)) {
-        return (u32)r;
-    }
-    _BitScanForward(&r, x >> 32);
-    return (u32)(r + 32);
-#else
-    return (u32)__builtin_ctzll(x);
-#endif
+    return ctz64_impl(x);
 }
 
 static really_inline
 u32 lg2(u32 x) {
-    if (!x) {
-        return 0;
-    }
-    return 31 - clz32(x);
+    return lg2_impl(x);
 }
 
 static really_inline
 u64a lg2_64(u64a x) {
-    if (!x) {
-        return 0;
-    }
-    return 63 - clz64(x);
+    return lg2_64_impl(x);
 }
 
 static really_inline
 u32 findAndClearLSB_32(u32 *v) {
     assert(*v != 0); // behaviour not defined in this case
-#ifndef NO_ASM
-    u32 val = *v, offset;
-    __asm__ ("bsf %1, %0\n"
-             "btr %0, %1\n"
-             : "=r" (offset), "=r" (val)
-             : "1" (val));
-    *v = val;
-#else
-    u32 val = *v;
-    u32 offset = ctz32(val);
-    *v = val & (val - 1);
-#endif
-
-    assert(offset < 32);
-    return offset;
+    return findAndClearLSB_32_impl(v);
 }
 
 static really_inline
 u32 findAndClearLSB_64(u64a *v) {
     assert(*v != 0); // behaviour not defined in this case
-
-#ifdef ARCH_64_BIT
-#if defined(ARCH_X86_64) && !defined(NO_ASM)
-    u64a val = *v, offset;
-    __asm__ ("bsfq %1, %0\n"
-             "btrq %0, %1\n"
-             : "=r" (offset), "=r" (val)
-             : "1" (val));
-    *v = val;
-#else
-    // generic variant using gcc's builtin on 64-bit
-    u64a val = *v, offset;
-    offset = ctz64(val);
-    *v = val & (val - 1);
-#endif // ARCH_X86_64
-#else
-    // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
-    // inline calls to __builtin_ctzll
-    u32 v1 = (u32)*v;
-    u32 v2 = (u32)(*v >> 32);
-    u32 offset;
-    if (v1) {
-        offset = findAndClearLSB_32(&v1);
-        *v = (u64a)v1 | ((u64a)v2 << 32);
-    } else {
-        offset = findAndClearLSB_32(&v2) + 32;
-        *v = (u64a)v2 << 32;
-    }
-#endif
-
-    assert(offset < 64);
-    return (u32)offset;
+    return findAndClearLSB_64_impl(v);
 }
 
 static really_inline
 u32 findAndClearMSB_32(u32 *v) {
     assert(*v != 0); // behaviour not defined in this case
-#ifndef NO_ASM
-    u32 val = *v, offset;
-    __asm__ ("bsr %1, %0\n"
-             "btr %0, %1\n"
-             : "=r" (offset), "=r" (val)
-             : "1" (val));
-    *v = val;
-#else
-    u32 val = *v;
-    u32 offset = 31 - clz32(val);
-    *v = val & ~(1 << offset);
-#endif
-    assert(offset < 32);
-    return offset;
+    return findAndClearMSB_32_impl(v);
 }
 
 static really_inline
 u32 findAndClearMSB_64(u64a *v) {
     assert(*v != 0); // behaviour not defined in this case
-
-#ifdef ARCH_64_BIT
-#if defined(ARCH_X86_64) && !defined(NO_ASM)
-    u64a val = *v, offset;
-    __asm__ ("bsrq %1, %0\n"
-             "btrq %0, %1\n"
-             : "=r" (offset), "=r" (val)
-             : "1" (val));
-    *v = val;
-#else
-    // generic variant using gcc's builtin on 64-bit
-    u64a val = *v, offset;
-    offset = 63 - clz64(val);
-    *v = val & ~(1ULL << offset);
-#endif // ARCH_X86_64
-#else
-    // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
-    // inline calls to __builtin_ctzll
-    u32 v1 = (u32)*v;
-    u32 v2 = (*v >> 32);
-    u32 offset;
-    if (v2) {
-        offset = findAndClearMSB_32(&v2) + 32;
-        *v = ((u64a)v2 << 32) | (u64a)v1;
-    } else {
-        offset = findAndClearMSB_32(&v1);
-        *v = (u64a)v1;
-    }
-#endif
-
-    assert(offset < 64);
-    return (u32)offset;
+    return findAndClearMSB_64_impl(v);
 }
 
 static really_inline
 u32 compress32(u32 x, u32 m) {
-#if defined(HAVE_BMI2)
-    // BMI2 has a single instruction for this operation.
-    return _pext_u32(x, m);
-#else
-
-    // Return zero quickly on trivial cases
-    if ((x & m) == 0) {
-        return 0;
-    }
-
-    u32 mk, mp, mv, t;
-
-    x &= m; // clear irrelevant bits
-
-    mk = ~m << 1; // we will count 0's to right
-    for (u32 i = 0; i < 5; i++) {
-        mp = mk ^ (mk << 1);
-        mp ^= mp << 2;
-        mp ^= mp << 4;
-        mp ^= mp << 8;
-        mp ^= mp << 16;
-
-        mv = mp & m; // bits to move
-        m = (m ^ mv) | (mv >> (1 << i)); // compress m
-        t = x & mv;
-        x = (x ^ t) | (t >> (1 << i)); // compress x
-        mk = mk & ~mp;
-    }
-
-    return x;
-#endif
+    return compress32_impl(x, m);
 }
 
 static really_inline
 u64a compress64(u64a x, u64a m) {
-#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
-    // BMI2 has a single instruction for this operation.
-    return _pext_u64(x, m);
-#else
-
-    // Return zero quickly on trivial cases
-    if ((x & m) == 0) {
-        return 0;
-    }
-
-    u64a mk, mp, mv, t;
-
-    x &= m; // clear irrelevant bits
-
-    mk = ~m << 1; // we will count 0's to right
-    for (u32 i = 0; i < 6; i++) {
-        mp = mk ^ (mk << 1);
-        mp ^= mp << 2;
-        mp ^= mp << 4;
-        mp ^= mp << 8;
-        mp ^= mp << 16;
-        mp ^= mp << 32;
-
-        mv = mp & m; // bits to move
-        m = (m ^ mv) | (mv >> (1 << i)); // compress m
-        t = x & mv;
-        x = (x ^ t) | (t >> (1 << i)); // compress x
-        mk = mk & ~mp;
-    }
-
-    return x;
-#endif
+    return compress64_impl(x, m);
 }
 
 static really_inline
 u32 expand32(u32 x, u32 m) {
-#if defined(HAVE_BMI2)
-    // BMI2 has a single instruction for this operation.
-    return _pdep_u32(x, m);
-#else
-
-    // Return zero quickly on trivial cases
-    if (!x || !m) {
-        return 0;
-    }
-
-    u32 m0, mk, mp, mv, t;
-    u32 array[5];
-
-    m0 = m; // save original mask
-    mk = ~m << 1; // we will count 0's to right
-
-    for (int i = 0; i < 5; i++) {
-        mp = mk ^ (mk << 1); // parallel suffix
-        mp = mp ^ (mp << 2);
-        mp = mp ^ (mp << 4);
-        mp = mp ^ (mp << 8);
-        mp = mp ^ (mp << 16);
-        mv = mp & m; // bits to move
-        array[i] = mv;
-        m = (m ^ mv) | (mv >> (1 << i)); // compress m
-        mk = mk & ~mp;
-    }
-
-    for (int i = 4; i >= 0; i--) {
-        mv = array[i];
-        t = x << (1 << i);
-        x = (x & ~mv) | (t & mv);
-    }
-
-    return x & m0; // clear out extraneous bits
-#endif
+    return expand32_impl(x, m);
 }
 
 static really_inline
 u64a expand64(u64a x, u64a m) {
-#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
-    // BMI2 has a single instruction for this operation.
-    return _pdep_u64(x, m);
-#else
-
-    // Return zero quickly on trivial cases
-    if (!x || !m) {
-        return 0;
-    }
-
-    u64a m0, mk, mp, mv, t;
-    u64a array[6];
-
-    m0 = m; // save original mask
-    mk = ~m << 1; // we will count 0's to right
-
-    for (int i = 0; i < 6; i++) {
-        mp = mk ^ (mk << 1); // parallel suffix
-        mp = mp ^ (mp << 2);
-        mp = mp ^ (mp << 4);
-        mp = mp ^ (mp << 8);
-        mp = mp ^ (mp << 16);
-        mp = mp ^ (mp << 32);
-        mv = mp & m; // bits to move
-        array[i] = mv;
-        m = (m ^ mv) | (mv >> (1 << i)); // compress m
-        mk = mk & ~mp;
-    }
-
-    for (int i = 5; i >= 0; i--) {
-        mv = array[i];
-        t = x << (1 << i);
-        x = (x & ~mv) | (t & mv);
-    }
-
-    return x & m0; // clear out extraneous bits
-#endif
+    return expand64_impl(x, m);
 }
 
@@ -396,97 +134,37 @@ u64a expand64(u64a x, u64a m) {
  */
 static really_inline
 u32 bf64_iterate(u64a bitfield, u32 begin) {
-    if (begin != ~0U) {
-        /* switch off all bits at or below begin. Note: not legal to shift by
-         * by size of the datatype or larger. */
-        assert(begin <= 63);
-        bitfield &= ~((2ULL << begin) - 1);
-    }
-
-    if (!bitfield) {
-        return ~0U;
-    }
-
-    return ctz64(bitfield);
+    return bf64_iterate_impl(bitfield, begin);
 }
 
 static really_inline
 char bf64_set(u64a *bitfield, u32 i) {
     assert(i < 64);
-    u64a mask = 1ULL << i;
-    char was_set = !!(*bitfield & mask);
-    *bitfield |= mask;
-
-    return was_set;
+    return bf64_set_impl(bitfield, i);
 }
 
 static really_inline
 void bf64_unset(u64a *bitfield, u32 i) {
     assert(i < 64);
-    *bitfield &= ~(1ULL << i);
+    return bf64_unset_impl(bitfield, i);
 }
 
 static really_inline
 u32 rank_in_mask32(u32 mask, u32 bit) {
     assert(bit < sizeof(u32) * 8);
     assert(mask & (u32)(1U << bit));
-    mask &= (u32)(1U << bit) - 1;
-    return popcount32(mask);
+    return rank_in_mask32_impl(mask, bit);
 }
 
 static really_inline
 u32 rank_in_mask64(u64a mask, u32 bit) {
     assert(bit < sizeof(u64a) * 8);
     assert(mask & (u64a)(1ULL << bit));
-    mask &= (u64a)(1ULL << bit) - 1;
-    return popcount64(mask);
+    return rank_in_mask64_impl(mask, bit);
 }
 
 static really_inline
 u32 pext32(u32 x, u32 mask) {
-#if defined(HAVE_BMI2)
-    // Intel BMI2 can do this operation in one instruction.
-    return _pext_u32(x, mask);
-#else
-
-    u32 result = 0, num = 1;
-    while (mask != 0) {
-        u32 bit = findAndClearLSB_32(&mask);
-        if (x & (1U << bit)) {
-            assert(num != 0); // more than 32 bits!
-            result |= num;
-        }
-        num <<= 1;
-    }
-    return result;
-#endif
+    return pext32_impl(x, mask);
 }
 
 static really_inline
 u64a pext64(u64a x, u64a mask) {
-#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
-    // Intel BMI2 can do this operation in one instruction.
-    return _pext_u64(x, mask);
-#else
-
-    u32 result = 0, num = 1;
-    while (mask != 0) {
-        u32 bit = findAndClearLSB_64(&mask);
-        if (x & (1ULL << bit)) {
-            assert(num != 0); // more than 32 bits!
-            result |= num;
-        }
-        num <<= 1;
-    }
-    return result;
-#endif
+    return pext64_impl(x, mask);
 }
 
-#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
-static really_inline
-u64a pdep64(u64a x, u64a mask) {
-    return _pdep_u64(x, mask);
-}
-#endif
-
 #endif // BITUTILS_H
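For callers of the public wrappers, ~0U doubles as bf64_iterate's starting cursor and its exhaustion sentinel, which gives a simple ascending scan over a 64-bit bitfield. A sketch of the calling convention (illustrative snippet written for this note, assuming the ue2common types and the header above; not part of the diff):

#include "util/bitutils.h" // bf64_set / bf64_iterate as declared above

static void visit_set_bits(void) {
    u64a bits = 0;
    bf64_set(&bits, 3);  // returns 0: bit 3 was previously clear
    bf64_set(&bits, 42);
    // begin == ~0U starts the scan; ~0U returned means no bits remain.
    for (u32 i = bf64_iterate(bits, ~0U); i != ~0U;
         i = bf64_iterate(bits, i)) {
        /* visits i == 3, then i == 42 */
    }
}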