Merge a5fdbcb873d414d5f14305d924e2c2267de0e0cb into 9e9a10ad01fceb2032ae6e36cb0262c4dbba90c7

Konstantinos Margaritis 2025-06-17 12:35:17 +12:00 committed by GitHub
commit 61e073b092
4 changed files with 226 additions and 170 deletions

View File

@@ -1,5 +1,6 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2024, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -82,44 +83,6 @@ struct zone {
const u8 *floodPtr;
};
static
const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 },
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 },
{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
};
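For reference, the zone_or_mask table above has a regular structure: row shift is 0xff in its first shift bytes and zero elsewhere, with rows 0 and ITER_BYTES left all-zero. A standalone sketch (not part of the diff, assuming ITER_BYTES == 16 as the 16-byte rows suggest) that regenerates it:
/* Standalone sketch: regenerate zone_or_mask for illustration. */
#include <stdint.h>
#include <stdio.h>

#define ITER_BYTES 16   /* assumed; matches the 16-byte rows above */

int main(void) {
    uint8_t zone_or_mask[ITER_BYTES + 1][ITER_BYTES] = {{0}};
    for (int shift = 1; shift < ITER_BYTES; shift++) {
        for (int b = 0; b < shift; b++) {
            zone_or_mask[shift][b] = 0xff;   /* ones in the first `shift` bytes */
        }
    }
    for (int b = 0; b < ITER_BYTES; b++) {
        printf("%02x ", zone_or_mask[5][b]); /* prints row 5: ff ff ff ff ff 00 ... */
    }
    printf("\n");
    return 0;
}
ORing a row into the scanner state sets the state bytes for the positions being rescanned, which, per the zone comment later in this file, keeps already-reported matches from firing again.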
/* generates an initial state mask based on the last byte-ish of history rather
* than being all accepting. If there is no history to consider, the state is
* generated based on the minimum length of each bucket in order to prevent
@@ -141,96 +104,159 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft,
return s;
}
//#include "../print_simd.h"
static really_inline
void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
UNUSED const u8 *end_ptr, u32 domain_mask,
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
/* +1: the zones ensure that we can read the byte at z->end */
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
u64a domain_mask = ~domain_mask_flipped;
u64a it_hi = *(const u64a *)itPtr;
u64a it_lo = *(const u64a *)(itPtr + 8);
u64a reach0 = domain_mask & it_hi;
u64a reach1 = domain_mask & (it_hi >> 8);
u64a reach2 = domain_mask & (it_hi >> 16);
u64a reach3 = domain_mask & (it_hi >> 24);
u64a reach4 = domain_mask & (it_hi >> 32);
u64a reach5 = domain_mask & (it_hi >> 40);
u64a reach6 = domain_mask & (it_hi >> 48);
u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8));
u64a reach8 = domain_mask & it_lo;
u64a reach9 = domain_mask & (it_lo >> 8);
u64a reach10 = domain_mask & (it_lo >> 16);
u64a reach11 = domain_mask & (it_lo >> 24);
u64a reach12 = domain_mask & (it_lo >> 32);
u64a reach13 = domain_mask & (it_lo >> 40);
u64a reach14 = domain_mask & (it_lo >> 48);
u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15);
// u64a ALIGN_ATTR(16) reach[16];
u32 ALIGN_ATTR(16) reach[16];
m128 st0 = load_m128_from_u64a(ft + reach0);
m128 st1 = lshiftbyte_m128(load_m128_from_u64a(ft + reach1), 1);
m128 st2 = lshiftbyte_m128(load_m128_from_u64a(ft + reach2), 2);
m128 st3 = lshiftbyte_m128(load_m128_from_u64a(ft + reach3), 3);
m128 st4 = lshiftbyte_m128(load_m128_from_u64a(ft + reach4), 4);
m128 st5 = lshiftbyte_m128(load_m128_from_u64a(ft + reach5), 5);
m128 st6 = lshiftbyte_m128(load_m128_from_u64a(ft + reach6), 6);
m128 st7 = lshiftbyte_m128(load_m128_from_u64a(ft + reach7), 7);
m128 st8 = load_m128_from_u64a(ft + reach8);
m128 st9 = lshiftbyte_m128(load_m128_from_u64a(ft + reach9), 1);
m128 st10 = lshiftbyte_m128(load_m128_from_u64a(ft + reach10), 2);
m128 st11 = lshiftbyte_m128(load_m128_from_u64a(ft + reach11), 3);
m128 st12 = lshiftbyte_m128(load_m128_from_u64a(ft + reach12), 4);
m128 st13 = lshiftbyte_m128(load_m128_from_u64a(ft + reach13), 5);
m128 st14 = lshiftbyte_m128(load_m128_from_u64a(ft + reach14), 6);
m128 st15 = lshiftbyte_m128(load_m128_from_u64a(ft + reach15), 7);
m128 domain_mask_v = set1_4x32(domain_mask);
// m256 ft_v = set1_4x64((ptrdiff_t)ft);
m128 it_v = loadu128(itPtr);
m128 it_shifted8_v = rshiftbyte_m128(it_v, 1);
m128 it_shifted16_v = rshiftbyte_m128(it_v, 2);
m128 it_shifted24_v = rshiftbyte_m128(it_v, 3);
it_shifted24_v = insert32_m128(it_shifted24_v, unaligned_load_u32(itPtr + 15), 3);
m128 reach_v[4];
// m256 reach64_v[4];
reach_v[0] = and128(domain_mask_v, it_v);
reach_v[1] = and128(domain_mask_v, it_shifted8_v);
reach_v[2] = and128(domain_mask_v, it_shifted16_v);
reach_v[3] = and128(domain_mask_v, it_shifted24_v);
// reach_v[0] = lshift32_m128(reach_v[0], 3);
// reach_v[1] = lshift32_m128(reach_v[1], 3);
// reach_v[2] = lshift32_m128(reach_v[2], 3);
// reach_v[3] = lshift32_m128(reach_v[3], 3);
// reach64_v[0] = widen128(reach_v[0]);
// reach64_v[1] = widen128(reach_v[1]);
// reach64_v[2] = widen128(reach_v[2]);
// reach64_v[3] = widen128(reach_v[3]);
// reach64_v[0] = add256(reach64_v[0], ft_v);
// reach64_v[1] = add256(reach64_v[1], ft_v);
// reach64_v[2] = add256(reach64_v[2], ft_v);
// reach64_v[3] = add256(reach64_v[3], ft_v);
// store256(&reach[0], reach64_v[0]);
// store256(&reach[4], reach64_v[1]);
// store256(&reach[8], reach64_v[2]);
// store256(&reach[12], reach64_v[3]);
store128(&reach[0], reach_v[0]);
store128(&reach[4], reach_v[1]);
store128(&reach[8], reach_v[2]);
store128(&reach[12], reach_v[3]);
m128 st0 = load_m128_from_u64a(ft + reach[0]);
m128 st1 = load_m128_from_u64a(ft + reach[4]);
st1 = lshiftbyte_m128(st1, 1);
st0 = or128(st0, st1);
m128 st2 = load_m128_from_u64a(ft + reach[8]);
st2 = lshiftbyte_m128(st2, 2);
m128 st3 = load_m128_from_u64a(ft + reach[12]);
st3 = lshiftbyte_m128(st3, 3);
st2 = or128(st2, st3);
m128 st4 = load_m128_from_u64a(ft + reach[1]);
st4 = lshiftbyte_m128(st4, 4);
m128 st5 = load_m128_from_u64a(ft + reach[5]);
st5 = lshiftbyte_m128(st5, 5);
st4 = or128(st4, st5);
m128 st6 = load_m128_from_u64a(ft + reach[9]);
st6 = lshiftbyte_m128(st6, 6);
m128 st7 = load_m128_from_u64a(ft + reach[13]);
st7 = lshiftbyte_m128(st7, 7);
st6 = or128(st6, st7);
m128 st8 = load_m128_from_u64a(ft + reach[2]);
m128 st9 = load_m128_from_u64a(ft + reach[6]);
st9 = lshiftbyte_m128(st9, 1);
st8 = or128(st8, st9);
m128 st10 = load_m128_from_u64a(ft + reach[10]);
st10 = lshiftbyte_m128(st10, 2);
m128 st11 = load_m128_from_u64a(ft + reach[14]);
st11 = lshiftbyte_m128(st11, 3);
st10 = or128(st10, st11);
m128 st12 = load_m128_from_u64a(ft + reach[3]);
st12 = lshiftbyte_m128(st12, 4);
m128 st13 = load_m128_from_u64a(ft + reach[7]);
st13 = lshiftbyte_m128(st13, 5);
st12 = or128(st12, st13);
m128 st14 = load_m128_from_u64a(ft + reach[11]);
st14 = lshiftbyte_m128(st14, 6);
m128 st15 = load_m128_from_u64a(ft + reach[15]);
st15 = lshiftbyte_m128(st15, 7);
st14 = or128(st14, st15);
// m128 st0 = load_m128_from_u64a((u64a *)reach[0]);
// m128 st4 = load_m128_from_u64a((u64a *)reach[1]);
// m128 st8 = load_m128_from_u64a((u64a *)reach[2]);
// m128 st12 = load_m128_from_u64a((u64a *)reach[3]);
// m128 st1 = load_m128_from_u64a((u64a *)reach[4]);
// m128 st5 = load_m128_from_u64a((u64a *)reach[5]);
// m128 st9 = load_m128_from_u64a((u64a *)reach[6]);
// m128 st13 = load_m128_from_u64a((u64a *)reach[7]);
// m128 st2 = load_m128_from_u64a((u64a *)reach[8]);
// m128 st6 = load_m128_from_u64a((u64a *)reach[9]);
// m128 st10 = load_m128_from_u64a((u64a *)reach[10]);
// m128 st14 = load_m128_from_u64a((u64a *)reach[11]);
// m128 st3 = load_m128_from_u64a((u64a *)reach[12]);
// m128 st7 = load_m128_from_u64a((u64a *)reach[13]);
// m128 st11 = load_m128_from_u64a((u64a *)reach[14]);
// m128 st15 = load_m128_from_u64a((u64a *)reach[15]);
st0 = or128(st0, st2);
st4 = or128(st4, st6);
st0 = or128(st0, st4);
m128 st = or128(*s, st0);
*conf0 = movq(st) ^ ~0ULL;
st = rshiftbyte_m128(st, 8);
st8 = or128(st8, st9);
st10 = or128(st10, st11);
st12 = or128(st12, st13);
st14 = or128(st14, st15);
st8 = or128(st8, st10);
st12 = or128(st12, st14);
st8 = or128(st8, st12);
m128 st = or128(*s, st0);
*conf0 = movq(st) ^ ~0ULL;
st = rshiftbyte_m128(st, 8);
st = or128(st, st8);
*conf8 = movq(st) ^ ~0ULL;
*s = rshiftbyte_m128(st, 8);
}
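The vectorised path above stores the sixteen per-byte reach values in a transposed order: reach_v[k] holds the 32-bit lanes for bytes k, k+4, k+8 and k+12, so after the four store128() calls the value for byte i sits at reach[4 * (i % 4) + i / 4], which is why st1 reads reach[4], st4 reads reach[1], and so on. A standalone sketch of that mapping (not part of the diff; the mask value is an assumed example, and the equivalence relies on the little-endian lane layout and a domain mask of at most 16 bits, which FDR's scalar code also relies on):
#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void) {
    uint8_t it[20];                      /* 16 iteration bytes plus the overread at +15 */
    for (int i = 0; i < 20; i++) {
        it[i] = (uint8_t)(i * 7 + 1);    /* arbitrary test data */
    }
    uint32_t domain_mask = 0xfff;        /* example; the real value is fdr->domainMask */

    /* scalar reference: masked 32-bit load at each byte offset */
    uint32_t ref[16];
    for (int i = 0; i < 16; i++) {
        uint32_t v;
        memcpy(&v, it + i, sizeof(v));
        ref[i] = v & domain_mask;
    }

    /* layout produced by the four store128() calls: reach_v[k] covers
     * bytes k, k+4, k+8, k+12, so byte i lands in reach[4*(i%4) + i/4] */
    uint32_t reach[16];
    for (int k = 0; k < 4; k++) {        /* k = byte shift applied to it_v */
        for (int lane = 0; lane < 4; lane++) {
            uint32_t v;
            memcpy(&v, it + 4 * lane + k, sizeof(v));
            reach[4 * k + lane] = v & domain_mask;
        }
    }

    for (int i = 0; i < 16; i++) {
        assert(ref[i] == reach[4 * (i % 4) + i / 4]);
    }
    return 0;
}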
static really_inline
void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
UNUSED const u8 *end_ptr, u32 domain_mask,
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
u64a reach0 = andn(domain_mask_flipped, itPtr);
u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
u64a reach6 = andn(domain_mask_flipped, itPtr + 6);
u64a it_hi = *(const u64a *)itPtr;
u64a it_lo = *(const u64a *)(itPtr + 8);
u64a reach0 = domain_mask & it_hi;
u64a reach2 = domain_mask & (it_hi >> 16);
u64a reach4 = domain_mask & (it_hi >> 32);
u64a reach6 = domain_mask & (it_hi >> 48);
u64a reach8 = domain_mask & it_lo;
u64a reach10 = domain_mask & (it_lo >> 16);
u64a reach12 = domain_mask & (it_lo >> 32);
u64a reach14 = domain_mask & (it_lo >> 48);
m128 st0 = load_m128_from_u64a(ft + reach0);
m128 st2 = load_m128_from_u64a(ft + reach2);
m128 st4 = load_m128_from_u64a(ft + reach4);
m128 st6 = load_m128_from_u64a(ft + reach6);
u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
u64a reach10 = andn(domain_mask_flipped, itPtr + 10);
u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
u64a reach14 = andn(domain_mask_flipped, itPtr + 14);
m128 st8 = load_m128_from_u64a(ft + reach8);
m128 st10 = load_m128_from_u64a(ft + reach10);
m128 st12 = load_m128_from_u64a(ft + reach12);
@@ -239,6 +265,9 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
st2 = lshiftbyte_m128(st2, 2);
st4 = lshiftbyte_m128(st4, 4);
st6 = lshiftbyte_m128(st6, 6);
st10 = lshiftbyte_m128(st10, 2);
st12 = lshiftbyte_m128(st12, 4);
st14 = lshiftbyte_m128(st14, 6);
*s = or128(*s, st0);
*s = or128(*s, st2);
@@ -249,10 +278,6 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
*s = rshiftbyte_m128(*s, 8);
*conf0 ^= ~0ULL;
st10 = lshiftbyte_m128(st10, 2);
st12 = lshiftbyte_m128(st12, 4);
st14 = lshiftbyte_m128(st14, 6);
*s = or128(*s, st8);
*s = or128(*s, st10);
*s = or128(*s, st12);
@@ -265,14 +290,16 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
static really_inline
void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
UNUSED const u8 *end_ptr, u32 domain_mask,
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
u64a reach0 = andn(domain_mask_flipped, itPtr);
u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
u64a it_hi = *(const u64a *)itPtr;
u64a it_lo = *(const u64a *)(itPtr + 8);
u64a reach0 = domain_mask & it_hi;
u64a reach4 = domain_mask & (it_hi >> 32);
u64a reach8 = domain_mask & it_lo;
u64a reach12 = domain_mask & (it_lo >> 32);
m128 st0 = load_m128_from_u64a(ft + reach0);
m128 st4 = load_m128_from_u64a(ft + reach4);
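The scalar change in the stride-2 and stride-4 paths rests on two identities: andn(domain_mask_flipped, p) equals domain_mask & load32(p), since domain_mask == ~domain_mask_flipped, and a 64-bit load shifted right by 8*k yields the same masked value as a 32-bit load at p + k on a little-endian target once the mask fits in 16 bits. A small check of the stride-2 offsets (not part of the diff; the mask value is an assumed example):
#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void) {
    uint8_t buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
    uint32_t domain_mask = 0x3ff;                 /* example value */
    uint32_t domain_mask_flipped = ~domain_mask;

    uint64_t it_hi;
    memcpy(&it_hi, buf, sizeof(it_hi));           /* as in: *(const u64a *)itPtr */

    for (int k = 0; k < 4; k++) {                 /* offsets 0, 2, 4, 6 in the real code */
        uint32_t v;
        memcpy(&v, buf + 2 * k, sizeof(v));       /* old form: load at itPtr + 2*k ... */
        uint64_t old_reach = ~domain_mask_flipped & v;            /* ... then andn() */
        uint64_t new_reach = domain_mask & (it_hi >> (16 * k));   /* new form */
        assert(old_reach == new_reach);
    }
    return 0;
}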
@@ -660,41 +687,6 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
#define INVALID_MATCH_ID (~0U)
#define FDR_MAIN_LOOP(zz, s, get_conf_fn) \
do { \
const u8 *tryFloodDetect = zz->floodPtr; \
const u8 *start_ptr = zz->start; \
const u8 *end_ptr = zz->end; \
for (const u8 *itPtr = ROUNDDOWN_PTR(start_ptr, 64); itPtr + 4*ITER_BYTES <= end_ptr; \
itPtr += 4*ITER_BYTES) { \
__builtin_prefetch(itPtr); \
} \
\
for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \
itPtr += ITER_BYTES) { \
if (unlikely(itPtr > tryFloodDetect)) { \
tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\
&floodBackoff, &control, \
ITER_BYTES); \
if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \
return HWLM_TERMINATED; \
} \
} \
__builtin_prefetch(itPtr + ITER_BYTES); \
u64a conf0; \
u64a conf8; \
get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \
ft, &conf0, &conf8, &s); \
do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \
&last_match_id, zz); \
do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, \
&last_match_id, zz); \
if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \
return HWLM_TERMINATED; \
} \
} /* end for loop */ \
} while (0) \
static never_inline
hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
@@ -703,7 +695,7 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
u32 floodBackoff = FLOOD_BACKOFF_START;
u32 last_match_id = INVALID_MATCH_ID;
u32 domain_mask_flipped = ~fdr->domainMask;
u32 domain_mask = fdr->domainMask;
u8 stride = fdr->stride;
const u64a *ft =
(const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR)));
@@ -722,42 +714,51 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
for (size_t curZone = 0; curZone < numZone; curZone++) {
struct zone *z = &zones[curZone];
dumpZoneInfo(z, curZone);
/* When a zone contains less data than is processed in an iteration
* of FDR_MAIN_LOOP(), we need to scan over some extra data.
*
* We have chosen to scan this extra data at the start of the
* iteration. The extra data is either data we have already scanned or
* garbage (if it is earlier than offset 0),
*
* As a result we need to shift the incoming state back so that it will
* properly line up with the data being scanned.
*
* We also need to forbid reporting any matches in the data being
* rescanned as they have already been reported (or are over garbage but
* later stages should also provide that safety guarantee).
*/
u8 shift = z->shift;
state = variable_byte_shift_m128(state, shift);
state = or128(state, variable_byte_shift_m128(ones128(), shift-16));
state = or128(state, load128(zone_or_mask[shift]));
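Why ORing ones into the low state bytes forbids matches over rescanned data, as the comment above explains: conf is the bitwise complement of the low 64 bits of the state (see *conf0 = movq(st) ^ ~0ULL above), so a state byte of 0xff yields a zero conf byte and do_confirm_fdr() finds no candidate bits there. A minimal check (not part of the diff):
#include <assert.h>
#include <stdint.h>

int main(void) {
    uint64_t state_lo = 0xa5ULL;           /* byte 0 has zero bits, i.e. live candidates */
    uint64_t conf = state_lo ^ ~0ULL;      /* as in: *conf0 = movq(st) ^ ~0ULL */
    assert((conf & 0xffULL) == 0x5aULL);   /* candidates still visible in byte 0 */

    state_lo |= 0xffULL;                   /* OR ones over the rescanned byte */
    conf = state_lo ^ ~0ULL;
    assert((conf & 0xffULL) == 0);         /* nothing left to report for byte 0 */
    return 0;
}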
const u8 *tryFloodDetect = z->floodPtr;
const u8 *start_ptr = z->start;
const u8 *end_ptr = z->end;
for (const u8 *itPtr = ROUNDDOWN_PTR(z->start, 64); itPtr + 4*ITER_BYTES <= z->end; itPtr += 4*ITER_BYTES) {
__builtin_prefetch(itPtr + 16*ITER_BYTES);
}
for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr;
itPtr += ITER_BYTES) {
if (unlikely(itPtr > tryFloodDetect)) {
tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,
&floodBackoff, &control,
ITER_BYTES);
if (unlikely(control == HWLM_TERMINATE_MATCHING)) {
return HWLM_TERMINATED;
}
}
u64a conf0;
u64a conf8;
__builtin_prefetch(itPtr + 16*ITER_BYTES);
switch (stride) {
case 1:
FDR_MAIN_LOOP(z, state, get_conf_stride_1);
get_conf_stride_1(itPtr, start_ptr, end_ptr, domain_mask, ft, &conf0, &conf8, &state);
break;
case 2:
FDR_MAIN_LOOP(z, state, get_conf_stride_2);
get_conf_stride_2(itPtr, start_ptr, end_ptr, domain_mask, ft, &conf0, &conf8, &state);
break;
case 4:
FDR_MAIN_LOOP(z, state, get_conf_stride_4);
get_conf_stride_4(itPtr, start_ptr, end_ptr, domain_mask, ft, &conf0, &conf8, &state);
break;
default:
break;
}
do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, &last_match_id, z);
do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, &last_match_id, z);
if (unlikely(control == HWLM_TERMINATE_MATCHING)) {
return HWLM_TERMINATED;
}
} /* end for loop */
}
return HWLM_SUCCESS;

View File

@@ -181,6 +181,10 @@ static really_inline m128 set1_2x64(u64a c) {
return (m128) vdupq_n_u64(c);
}
static really_inline m128 insert32_m128(m128 in, u32 val, const int imm) {
return (m128) vsetq_lane_u32(val, (uint32x4_t)in, imm);
}
static really_inline u32 movd(const m128 in) {
return vgetq_lane_u32((uint32x4_t) in, 0);
}
@@ -449,4 +453,14 @@ m128 set2x64(u64a hi, u64a lo) {
return (m128) vld1q_u64((uint64_t *) data);
}
static really_inline
m128 widenlo128(m128 x) {
return (m128) vmovl_u32(vget_low_u32((uint32x4_t)x));
}
static really_inline
m128 widenhi128(m128 x) {
return (m128) vmovl_u32(vget_high_u32((uint32x4_t)x));
}
#endif // ARCH_ARM_SIMD_UTILS_H
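The new widenlo128()/widenhi128() helpers zero-extend the low and high pairs of 32-bit lanes into 64-bit lanes, via vmovl_u32 here and via an unpack against zeroes128() in the x86 header further down; widen128() in the common header simply combines the two. A portable reference sketch of the intended semantics (plain C, not part of the diff):
#include <assert.h>
#include <stdint.h>

/* Reference semantics assumed for the helpers: widenlo128 zero-extends lanes
 * 0..1 of a 4x32 vector into a 2x64 vector, widenhi128 does lanes 2..3. */
static void widenlo_ref(const uint32_t in[4], uint64_t out[2]) {
    out[0] = in[0];
    out[1] = in[1];
}
static void widenhi_ref(const uint32_t in[4], uint64_t out[2]) {
    out[0] = in[2];
    out[1] = in[3];
}

int main(void) {
    uint32_t v[4] = {0x11111111u, 0x22222222u, 0x33333333u, 0x44444444u};
    uint64_t lo[2], hi[2];
    widenlo_ref(v, lo);
    widenhi_ref(v, hi);
    assert(lo[0] == 0x11111111ULL && lo[1] == 0x22222222ULL);
    assert(hi[0] == 0x33333333ULL && hi[1] == 0x44444444ULL);
    return 0;
}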

View File

@@ -388,6 +388,14 @@ m256 pshufb_m256(m256 a, m256 b) {
return rv;
}
static really_inline
m256 widen128(m128 x) {
m256 rv;
rv.lo = widenlo128(x);
rv.hi = widenhi128(x);
return rv;
}
#endif // HAVE_SIMD_256_BITS
/****

View File

@@ -123,6 +123,17 @@ m128 sub_2x64(m128 a, m128 b) {
return (m128) _mm_sub_epi64(a, b);
}
static really_really_inline
m128 lshift32_m128(m128 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(b)) {
return _mm_slli_epi32(a, b);
}
#endif
m128 x = _mm_cvtsi32_si128(b);
return _mm_sll_epi32(a, x);
}
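lshift32_m128(a, b) shifts each 32-bit lane of a left by b, picking the immediate form when the count is a compile-time constant. A small usage sketch (not part of the diff): a shift by 3 multiplies each lane by 8, which is what the commented-out experiment in fdr.c appears to use it for when scaling table indices to byte offsets.
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t in[4] = {1, 2, 3, 4};
    __m128i v = _mm_loadu_si128((const __m128i *)in);
    __m128i shifted = _mm_slli_epi32(v, 3);   /* constant count, immediate form */
    uint32_t out[4];
    _mm_storeu_si128((__m128i *)out, shifted);
    for (int i = 0; i < 4; i++) {
        printf("%u\n", out[i]);               /* prints 8, 16, 24, 32 */
    }
    return 0;
}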
static really_really_inline
m128 lshift64_m128(m128 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
@@ -158,6 +169,10 @@ static really_inline m128 set1_2x64(u64a c) {
return _mm_set1_epi64x(c);
}
static really_inline m128 insert32_m128(m128 in, u32 val, const int imm) {
return _mm_insert_epi32(in, val, imm);
}
static really_inline u32 movd(const m128 in) {
return _mm_cvtsi128_si32(in);
}
@@ -474,6 +489,18 @@ m128 set2x64(u64a hi, u64a lo) {
return _mm_set_epi64x(hi, lo);
}
#include "../print_simd.h"
static really_inline
m128 widenlo128(m128 x) {
return _mm_unpacklo_epi32(x, zeroes128());
}
static really_inline
m128 widenhi128(m128 x) {
return _mm_unpackhi_epi32(x, zeroes128());
}
/****
**** 256-bit Primitives
****/
@@ -750,6 +777,12 @@ m256 combine2x128(m128 hi, m128 lo) {
return insert128to256(cast128to256(lo), hi, 1);
#endif
}
static really_inline
m256 widen128(m128 x) {
return (m256) _mm256_cvtepu32_epi64(x);
}
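On AVX2, widen128() zero-extends four u32 lanes into four u64 lanes via _mm256_cvtepu32_epi64. A usage sketch (not part of the diff; AVX2 assumed) in the spirit of the commented-out experiment in get_conf_stride_1(), which widens reach indices so they can be added to a 64-bit table base:
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t idx[4] = {1, 2, 3, 4};
    uint64_t base = 0x1000;                                   /* stand-in for a table base */

    __m128i idx32 = _mm_loadu_si128((const __m128i *)idx);
    __m256i idx64 = _mm256_cvtepu32_epi64(idx32);             /* zero-extend u32 -> u64 */
    __m256i addr  = _mm256_add_epi64(idx64, _mm256_set1_epi64x((long long)base));

    uint64_t out[4];
    _mm256_storeu_si256((__m256i *)out, addr);
    for (int i = 0; i < 4; i++) {
        printf("0x%llx\n", (unsigned long long)out[i]);       /* 0x1001 .. 0x1004 */
    }
    return 0;
}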
#endif //AVX2
/****