mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-09-30 03:34:25 +03:00
Introduce a 64-bit LimEx model.
On 64-bit platforms, the LimEx 64 model is implemented in ordinary general-purpose registers (GPRs). On 32-bit platforms, however, 128-bit SSE registers are used for the runtime implementation.
This commit is contained in:
@@ -173,6 +173,12 @@ static really_inline u64a movq(const m128 in) {
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Another form of movq, in the opposite direction: reads the 64-bit value
 * that p points to using _mm_loadl_epi64 and returns it as an m128.
 * Per Intel's documentation for that intrinsic, the value is placed in the
 * low 64 bits of the result and the high 64 bits are zeroed. */
|
||||
static really_inline
|
||||
m128 load_m128_from_u64a(const u64a *p) {
|
||||
return _mm_loadl_epi64((const m128 *)p);
|
||||
}
|
||||
|
||||
#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
|
||||
#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)
|
||||
|
||||
@@ -270,12 +276,12 @@ void clearbit128(m128 *ptr, unsigned int n) {
|
||||
|
||||
// tests bit N in the given vector.
|
||||
static really_inline
|
||||
char testbit128(const m128 *ptr, unsigned int n) {
|
||||
char testbit128(m128 val, unsigned int n) {
|
||||
const m128 mask = mask1bit128(n);
|
||||
#if defined(__SSE4_1__)
|
||||
return !_mm_testz_si128(mask, *ptr);
|
||||
return !_mm_testz_si128(mask, val);
|
||||
#else
|
||||
return isnonzero128(and128(mask, *ptr));
|
||||
return isnonzero128(and128(mask, val));
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -606,13 +612,13 @@ void clearbit256(m256 *ptr, unsigned int n) {
|
||||
|
||||
// tests bit N in the given vector.
|
||||
static really_inline
|
||||
char testbit256(const m256 *ptr, unsigned int n) {
|
||||
assert(n < sizeof(*ptr) * 8);
|
||||
const m128 *sub;
|
||||
char testbit256(m256 val, unsigned int n) {
|
||||
assert(n < sizeof(val) * 8);
|
||||
m128 sub;
|
||||
if (n < 128) {
|
||||
sub = &ptr->lo;
|
||||
sub = val.lo;
|
||||
} else {
|
||||
sub = &ptr->hi;
|
||||
sub = val.hi;
|
||||
n -= 128;
|
||||
}
|
||||
return testbit128(sub, n);
|
||||
@@ -633,9 +639,9 @@ void clearbit256(m256 *ptr, unsigned int n) {
|
||||
|
||||
// tests bit N in the given vector.
|
||||
static really_inline
|
||||
char testbit256(const m256 *ptr, unsigned int n) {
|
||||
char testbit256(m256 val, unsigned int n) {
|
||||
const m256 mask = mask1bit256(n);
|
||||
return !_mm256_testz_si256(mask, *ptr);
|
||||
return !_mm256_testz_si256(mask, val);
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
@@ -827,15 +833,15 @@ void clearbit384(m384 *ptr, unsigned int n) {
|
||||
|
||||
// tests bit N in the given vector.
|
||||
static really_inline
|
||||
char testbit384(const m384 *ptr, unsigned int n) {
|
||||
assert(n < sizeof(*ptr) * 8);
|
||||
const m128 *sub;
|
||||
char testbit384(m384 val, unsigned int n) {
|
||||
assert(n < sizeof(val) * 8);
|
||||
m128 sub;
|
||||
if (n < 128) {
|
||||
sub = &ptr->lo;
|
||||
sub = val.lo;
|
||||
} else if (n < 256) {
|
||||
sub = &ptr->mid;
|
||||
sub = val.mid;
|
||||
} else {
|
||||
sub = &ptr->hi;
|
||||
sub = val.hi;
|
||||
}
|
||||
return testbit128(sub, n % 128);
|
||||
}
|
||||
@@ -1040,26 +1046,26 @@ void clearbit512(m512 *ptr, unsigned int n) {
|
||||
|
||||
// tests bit N in the given vector.
|
||||
static really_inline
|
||||
char testbit512(const m512 *ptr, unsigned int n) {
|
||||
assert(n < sizeof(*ptr) * 8);
|
||||
char testbit512(m512 val, unsigned int n) {
|
||||
assert(n < sizeof(val) * 8);
|
||||
#if !defined(__AVX2__)
|
||||
const m128 *sub;
|
||||
m128 sub;
|
||||
if (n < 128) {
|
||||
sub = &ptr->lo.lo;
|
||||
sub = val.lo.lo;
|
||||
} else if (n < 256) {
|
||||
sub = &ptr->lo.hi;
|
||||
sub = val.lo.hi;
|
||||
} else if (n < 384) {
|
||||
sub = &ptr->hi.lo;
|
||||
sub = val.hi.lo;
|
||||
} else {
|
||||
sub = &ptr->hi.hi;
|
||||
sub = val.hi.hi;
|
||||
}
|
||||
return testbit128(sub, n % 128);
|
||||
#else
|
||||
const m256 *sub;
|
||||
m256 sub;
|
||||
if (n < 256) {
|
||||
sub = &ptr->lo;
|
||||
sub = val.lo;
|
||||
} else {
|
||||
sub = &ptr->hi;
|
||||
sub = val.hi;
|
||||
n -= 256;
|
||||
}
|
||||
return testbit256(sub, n);
|
||||
|
@@ -180,44 +180,52 @@
|
||||
#define partial_load_m384(ptr, sz) loadbytes384(ptr, sz)
|
||||
#define partial_load_m512(ptr, sz) loadbytes512(ptr, sz)
|
||||
|
||||
#define store_compressed_u32(ptr, x, m) storecompressed32(ptr, x, m)
|
||||
#define store_compressed_u64a(ptr, x, m) storecompressed64(ptr, x, m)
|
||||
#define store_compressed_m128(ptr, x, m) storecompressed128(ptr, x, m)
|
||||
#define store_compressed_m256(ptr, x, m) storecompressed256(ptr, x, m)
|
||||
#define store_compressed_m384(ptr, x, m) storecompressed384(ptr, x, m)
|
||||
#define store_compressed_m512(ptr, x, m) storecompressed512(ptr, x, m)
|
||||
#define store_compressed_u32(ptr, x, m, len) storecompressed32(ptr, x, m, len)
|
||||
#define store_compressed_u64a(ptr, x, m, len) storecompressed64(ptr, x, m, len)
|
||||
#define store_compressed_m128(ptr, x, m, len) storecompressed128(ptr, x, m, len)
|
||||
#define store_compressed_m256(ptr, x, m, len) storecompressed256(ptr, x, m, len)
|
||||
#define store_compressed_m384(ptr, x, m, len) storecompressed384(ptr, x, m, len)
|
||||
#define store_compressed_m512(ptr, x, m, len) storecompressed512(ptr, x, m, len)
|
||||
|
||||
#define load_compressed_u32(x, ptr, m) loadcompressed32(x, ptr, m)
|
||||
#define load_compressed_u64a(x, ptr, m) loadcompressed64(x, ptr, m)
|
||||
#define load_compressed_m128(x, ptr, m) loadcompressed128(x, ptr, m)
|
||||
#define load_compressed_m256(x, ptr, m) loadcompressed256(x, ptr, m)
|
||||
#define load_compressed_m384(x, ptr, m) loadcompressed384(x, ptr, m)
|
||||
#define load_compressed_m512(x, ptr, m) loadcompressed512(x, ptr, m)
|
||||
#define load_compressed_u32(x, ptr, m, len) loadcompressed32(x, ptr, m, len)
|
||||
#define load_compressed_u64a(x, ptr, m, len) loadcompressed64(x, ptr, m, len)
|
||||
#define load_compressed_m128(x, ptr, m, len) loadcompressed128(x, ptr, m, len)
|
||||
#define load_compressed_m256(x, ptr, m, len) loadcompressed256(x, ptr, m, len)
|
||||
#define load_compressed_m384(x, ptr, m, len) loadcompressed384(x, ptr, m, len)
|
||||
#define load_compressed_m512(x, ptr, m, len) loadcompressed512(x, ptr, m, len)
|
||||
|
||||
static really_inline void clearbit_u32(u32 *p, u32 n) {
|
||||
static really_inline
|
||||
void clearbit_u32(u32 *p, u32 n) {
|
||||
assert(n < sizeof(*p) * 8);
|
||||
*p &= ~(1U << n);
|
||||
}
|
||||
static really_inline void clearbit_u64a(u64a *p, u32 n) {
|
||||
|
||||
static really_inline
|
||||
void clearbit_u64a(u64a *p, u32 n) {
|
||||
assert(n < sizeof(*p) * 8);
|
||||
*p &= ~(1ULL << n);
|
||||
}
|
||||
|
||||
#define clearbit_m128(ptr, n) (clearbit128(ptr, n))
|
||||
#define clearbit_m256(ptr, n) (clearbit256(ptr, n))
|
||||
#define clearbit_m384(ptr, n) (clearbit384(ptr, n))
|
||||
#define clearbit_m512(ptr, n) (clearbit512(ptr, n))
|
||||
|
||||
static really_inline char testbit_u32(const u32 *p, u32 n) {
|
||||
assert(n < sizeof(*p) * 8);
|
||||
return !!(*p & (1U << n));
|
||||
static really_inline
|
||||
char testbit_u32(u32 val, u32 n) {
|
||||
assert(n < sizeof(val) * 8);
|
||||
return !!(val & (1U << n));
|
||||
}
|
||||
static really_inline char testbit_u64a(const u64a *p, u32 n) {
|
||||
assert(n < sizeof(*p) * 8);
|
||||
return !!(*p & (1ULL << n));
|
||||
|
||||
static really_inline
|
||||
char testbit_u64a(u64a val, u32 n) {
|
||||
assert(n < sizeof(val) * 8);
|
||||
return !!(val & (1ULL << n));
|
||||
}
|
||||
#define testbit_m128(ptr, n) (testbit128(ptr, n))
|
||||
#define testbit_m256(ptr, n) (testbit256(ptr, n))
|
||||
#define testbit_m384(ptr, n) (testbit384(ptr, n))
|
||||
#define testbit_m512(ptr, n) (testbit512(ptr, n))
|
||||
|
||||
#define testbit_m128(val, n) (testbit128(val, n))
|
||||
#define testbit_m256(val, n) (testbit256(val, n))
|
||||
#define testbit_m384(val, n) (testbit384(val, n))
|
||||
#define testbit_m512(val, n) (testbit512(val, n))
|
||||
|
||||
#endif
|
||||
|
Reference in New Issue
Block a user