Mirror of https://github.com/VectorCamp/vectorscan.git, synced 2025-06-28 16:41:01 +03:00

Merge pull request #124 from VectorCamp/develop
Merge develop to master

This commit is contained in: commit 361feb64e3
@@ -326,6 +326,10 @@ ch_error_t catchupPcre(struct HybridContext *hyctx, unsigned int id,
         } else if (cbrv == CH_CALLBACK_SKIP_PATTERN) {
             DEBUG_PRINTF("user callback told us to skip this pattern\n");
             pd->scanStart = hyctx->length;
+            if (top_id == id) {
+                break;
+            }
+            continue;
         }

         if (top_id == id) {
@@ -9,10 +9,10 @@ export CROSS_SYS=<arm-cross-compiler-system-dir>
 # wget -O boost_$BOOST_VERSION.tar.gz https://sourceforge.net/projects/boost/files/boost/$BOOST_DOT_VERSION/boost_$BOOST_VERSION.tar.gz/download
 # tar xf boost_$BOOST_VERSION.tar.gz
 # fi
-if [ ! -d "pcre-8.41" ];
+if [ ! -d "pcre-8.45" ];
 then
-    wget -O pcre-8.41.tar.bz2 https://ftp.pcre.org/pub/pcre/pcre-8.41.tar.bz2
-    tar xf pcre-8.41.tar.bz2
+    wget -O pcre-8.45.tar.bz2 https://sourceforge.net/projects/pcre/files/pcre/8.45/pcre-8.45.tar.bz2/download
+    tar xf pcre-8.45.tar.bz2
     export PCRE_SOURCE=1
 fi

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2020, Intel Corporation
+ * Copyright (c) 2015-2021, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -323,7 +323,8 @@ void addExpression(NG &ng, unsigned index, const char *expression,
     }

     // Ensure that our pattern isn't too long (in characters).
-    if (strlen(expression) > cc.grey.limitPatternLength) {
+    size_t maxlen = cc.grey.limitPatternLength + 1;
+    if (strnlen(expression, maxlen) >= maxlen) {
         throw CompileError("Pattern length exceeds limit.");
     }

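Note on the strnlen change above: strlen() walks the whole input even when it is far past the limit, while strnlen() stops after limitPatternLength + 1 bytes, which is all that is needed to decide "too long". A minimal standalone sketch of the idea (limit stands in for cc.grey.limitPatternLength; this is not the library code itself):

    #include <string.h>

    /* Scans at most limit + 1 bytes: enough to know the pattern is over
     * the limit, without paying for a full strlen() on huge inputs. */
    static int pattern_too_long(const char *expr, size_t limit) {
        size_t maxlen = limit + 1;
        return strnlen(expr, maxlen) >= maxlen;
    }

The predicate is equivalent to the old strlen(expression) > limit, since strnlen returns the smaller of strlen(expr) and maxlen.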
@@ -416,6 +417,10 @@ void addLitExpression(NG &ng, unsigned index, const char *expression,
                            "HS_FLAG_SOM_LEFTMOST are supported in literal API.");
     }

+    if (!strcmp(expression, "")) {
+        throw CompileError("Pure literal API doesn't support empty string.");
+    }
+
     // This expression must be a pure literal, we can build ue2_literal
     // directly based on expression text.
     ParsedLitExpression ple(index, expression, expLength, flags, id);
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2020, Intel Corporation
+ * Copyright (c) 2015-2021, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -517,6 +517,12 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
         return HS_COMPILER_ERROR;
     }

+    if (flags & HS_FLAG_COMBINATION) {
+        *error = generateCompileError("Invalid parameter: unsupported "
+                                      "logical combination expression", -1);
+        return HS_COMPILER_ERROR;
+    }
+
     *info = nullptr;
     *error = nullptr;

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2020, Intel Corporation
+ * Copyright (c) 2015-2021, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -748,10 +748,7 @@ hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error);
  * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
  * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
  *                          when a match is found.
- * - HS_FLAG_COMBINATION - Parse the expression in logical combination
- *                         syntax.
- * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for
- *                   the sub-expressions in logical combinations.
+ * - HS_FLAG_QUIET - This flag will be ignored.
  *
  * @param info
  *     On success, a pointer to the pattern information will be returned in
@@ -814,10 +811,7 @@ hs_error_t HS_CDECL hs_expression_info(const char *expression,
  * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
  * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
  *                          when a match is found.
- * - HS_FLAG_COMBINATION - Parse the expression in logical combination
- *                         syntax.
- * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for
- *                   the sub-expressions in logical combinations.
+ * - HS_FLAG_QUIET - This flag will be ignored.
  *
  * @param ext
  *     A pointer to a filled @ref hs_expr_ext_t structure that defines
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Intel Corporation
+ * Copyright (c) 2019-2021, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -80,7 +80,9 @@ extern "C"
                            | HS_FLAG_PREFILTER \
                            | HS_FLAG_SINGLEMATCH \
                            | HS_FLAG_ALLOWEMPTY \
-                           | HS_FLAG_SOM_LEFTMOST)
+                           | HS_FLAG_SOM_LEFTMOST \
+                           | HS_FLAG_COMBINATION \
+                           | HS_FLAG_QUIET)

 #ifdef __cplusplus
 } /* extern "C" */
@@ -36,7 +36,7 @@ static really_really_inline
 hwlm_error_t single_zscan(const struct noodTable *n, const u8 *d, const u8 *buf,
                           Z_TYPE z, size_t len, const struct cb_info *cbi) {
     while (unlikely(z)) {
-        Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z);
+        Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT;
         size_t matchPos = d - buf + pos;
         DEBUG_PRINTF("match pos %zu\n", matchPos);
         hwlmcb_rv_t rv = final(n, buf, len, n->msk_len != 1, cbi, matchPos);
@@ -49,7 +49,7 @@ static really_really_inline
 hwlm_error_t double_zscan(const struct noodTable *n, const u8 *d, const u8 *buf,
                           Z_TYPE z, size_t len, const struct cb_info *cbi) {
     while (unlikely(z)) {
-        Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z);
+        Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT;
         size_t matchPos = d - buf + pos - 1;
         DEBUG_PRINTF("match pos %zu\n", matchPos);
         hwlmcb_rv_t rv = final(n, buf, len, true, cbi, matchPos);
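Note on the movemask -> comparemask rename running through the hunks above and below: on x86 a vector compare condenses to one mask bit per byte lane, but AArch64 NEON has no pmovmskb equivalent, so the cheap representation keeps mask_width() identical bits per lane inside a 64-bit mask. Z_POSSHIFT is then the log2 of that width, converting a bit index from findAndClearLSB back into a byte offset. A small sketch of the arithmetic (illustrative only; width 4 is the NEON case):

    /* With width bits per lane, lane i owns bits [i*width, i*width + width - 1],
     * so a bit index maps back to a lane by dividing (or shifting) by width. */
    static unsigned lane_from_bit(unsigned bit_index, unsigned width) {
        return bit_index / width; /* == bit_index >> Z_POSSHIFT for power-of-two width */
    }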
@@ -77,9 +77,11 @@ hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf,
     SuperVector<S> v = SuperVector<S>::Zeroes();
     memcpy(&v.u, d, l);

-    typename SuperVector<S>::movemask_type mask = SINGLE_LOAD_MASK(l);
+    typename SuperVector<S>::comparemask_type mask =
+        SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width());
     v = v & caseMask;
-    typename SuperVector<S>::movemask_type z = mask & mask1.eqmask(v);
+    typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v);
+    z = SuperVector<S>::iteration_mask(z);

     return single_zscan(n, d, buf, z, len, cbi);
 }
@@ -103,9 +105,12 @@ hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf,
         return HWLM_SUCCESS;
     }
     size_t buf_off = start - offset;
-    typename SuperVector<S>::movemask_type mask = SINGLE_LOAD_MASK(l) << buf_off;
+    typename SuperVector<S>::comparemask_type mask =
+        SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width())
+        << (buf_off * SuperVector<S>::mask_width());
     SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
-    typename SuperVector<S>::movemask_type z = mask & mask1.eqmask(v);
+    typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v);
+    z = SuperVector<S>::iteration_mask(z);

     return single_zscan(n, d, buf, z, len, cbi);
 }
@@ -126,10 +131,13 @@ hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf,
     memcpy(&v.u, d, l);
     v = v & caseMask;

-    typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l);
-    typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v);
-    typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v);
-    typename SuperVector<S>::movemask_type z = mask & (z1 << 1) & z2;
+    typename SuperVector<S>::comparemask_type mask =
+        DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width());
+    typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v);
+    typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v);
+    typename SuperVector<S>::comparemask_type z =
+        mask & (z1 << (SuperVector<S>::mask_width())) & z2;
+    z = SuperVector<S>::iteration_mask(z);

     return double_zscan(n, d, buf, z, len, cbi);
 }
@@ -148,10 +156,14 @@ hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf,
     }
     SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
     size_t buf_off = start - offset;
-    typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l) << buf_off;
-    typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v);
-    typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v);
-    typename SuperVector<S>::movemask_type z = mask & (z1 << 1) & z2;
+    typename SuperVector<S>::comparemask_type mask =
+        DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width())
+        << (buf_off * SuperVector<S>::mask_width());
+    typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v);
+    typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v);
+    typename SuperVector<S>::comparemask_type z =
+        mask & (z1 << SuperVector<S>::mask_width()) & z2;
+    z = SuperVector<S>::iteration_mask(z);

     return double_zscan(n, d, buf, z, len, cbi);
 }
@@ -191,7 +203,8 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf,
         __builtin_prefetch(base + 256);

         SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
-        typename SuperVector<S>::movemask_type z = mask1.eqmask(v);
+        typename SuperVector<S>::comparemask_type z = mask1.eqmask(v);
+        z = SuperVector<S>::iteration_mask(z);

         hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi);
         RETURN_IF_TERMINATED(rv);
@@ -220,7 +233,7 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf,

     size_t start = offset + n->msk_len - n->key_offset;

-    typename SuperVector<S>::movemask_type lastz1{0};
+    typename SuperVector<S>::comparemask_type lastz1{0};

     const u8 *d = buf + start;
     const u8 *e = buf + end;
@@ -248,10 +261,12 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf,
         __builtin_prefetch(base + 256);

         SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
-        typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v);
-        typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v);
-        typename SuperVector<S>::movemask_type z = (z1 << 1 | lastz1) & z2;
-        lastz1 = z1 >> Z_SHIFT;
+        typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v);
+        typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v);
+        typename SuperVector<S>::comparemask_type z =
+            (z1 << SuperVector<S>::mask_width() | lastz1) & z2;
+        lastz1 = z1 >> (Z_SHIFT * SuperVector<S>::mask_width());
+        z = SuperVector<S>::iteration_mask(z);

         hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi);
         RETURN_IF_TERMINATED(rv);
@@ -53,7 +53,15 @@ really_really_inline
 u32 packedExtract<16>(SuperVector<16> s, const SuperVector<16> permute, const SuperVector<16> compare) {
     SuperVector<16> shuffled = s.pshufb<true>(permute);
     SuperVector<16> compared = shuffled & compare;
-    u16 rv = ~compared.eqmask(shuffled);
+    u64a rv = (~compared.eqmask(shuffled)) & 0xffff;
+    if (SuperVector<16>::mask_width() != 1) {
+        u32 ans = 0;
+        for (u32 i = 0; i < 16; ++i) {
+            ans |= (rv & (1ull << (i * SuperVector<16>::mask_width()))) >>
+                   (i * SuperVector<16>::mask_width() - i);
+        }
+        return ans;
+    }
     return (u32)rv;
 }

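Note on the loop added above: when mask_width() != 1 the compare mask carries several identical bits per lane, while packedExtract needs the classic one-bit-per-lane form, so the loop picks bit i*width of each lane and moves it down to bit i. An equivalent standalone sketch (not the committed code):

    #include <stdint.h>

    /* Compress a "width bits per lane" mask into 1 bit per lane, 16 lanes.
     * Assumes all bits within a lane are identical, so bit i*width suffices. */
    static uint32_t compress_mask(uint64_t rv, unsigned width) {
        uint32_t ans = 0;
        for (unsigned i = 0; i < 16; ++i) {
            ans |= (uint32_t)((rv >> (i * width)) & 1u) << i;
        }
        return ans;
    }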
@@ -62,7 +70,8 @@ really_really_inline
 u32 packedExtract<32>(SuperVector<32> s, const SuperVector<32> permute, const SuperVector<32> compare) {
     SuperVector<32> shuffled = s.pshufb<true>(permute);
     SuperVector<32> compared = shuffled & compare;
-    u32 rv = ~compared.eqmask(shuffled);
+    // TODO(danlark1): Future ARM support might have a bug.
+    u64a rv = (~compared.eqmask(shuffled)) & 0xffffffff;
     return (u32)((rv >> 16) | (rv & 0xffffU));
 }

@@ -71,6 +80,7 @@ really_really_inline
 u32 packedExtract<64>(SuperVector<64> s, const SuperVector<64> permute, const SuperVector<64> compare) {
     SuperVector<64> shuffled = s.pshufb<true>(permute);
     SuperVector<64> compared = shuffled & compare;
+    // TODO(danlark1): Future ARM support might have a bug.
     u64a rv = ~compared.eqmask(shuffled);
     rv = rv >> 32 | rv;
     return (u32)(((rv >> 16) | rv) & 0xffffU);
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2020, Intel Corporation
+ * Copyright (c) 2015-2021, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -1081,7 +1081,9 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id, bool using8bit,
     // Use the daddy already set for this state so long as it isn't already
     // a Sherman state.
     dstate_id_t daddy = currState.daddy;
-    if (!info.is_sherman(daddy) && !info.is_widestate(daddy)) {
+    if (info.is_widestate(daddy)) {
+        return;
+    } else if (!info.is_sherman(daddy)) {
         hinted.insert(currState.daddy);
     } else {
         // Fall back to granddaddy, which has already been processed (due
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2020, Intel Corporation
+ * Copyright (c) 2015-2021, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -3092,6 +3092,7 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t,

     const char in_catchup = prog_flags & ROSE_PROG_FLAG_IN_CATCHUP;
     const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV;
+    const char skip_mpv_catchup = prog_flags & ROSE_PROG_FLAG_SKIP_MPV_CATCHUP;

     const char *pc_base = getByOffset(t, programOffset);
     const char *pc = pc_base;
@@ -3188,6 +3189,17 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t,
             }
             L_PROGRAM_NEXT_INSTRUCTION

+            L_PROGRAM_CASE(CATCH_UP_MPV) {
+                if (from_mpv || skip_mpv_catchup) {
+                    DEBUG_PRINTF("skipping mpv catchup\n");
+                } else if (roseCatchUpMPV(t,
+                                          end - scratch->core_info.buf_offset,
+                                          scratch) == HWLM_TERMINATE_MATCHING) {
+                    return HWLM_TERMINATE_MATCHING;
+                }
+            }
+            L_PROGRAM_NEXT_INSTRUCTION
+
             L_PROGRAM_CASE(SOM_FROM_REPORT) {
                 som = handleSomExternal(scratch, &ri->som, end);
                 DEBUG_PRINTF("som from report %u is %llu\n", ri->som.onmatch,
@@ -3195,6 +3207,15 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t,
             }
             L_PROGRAM_NEXT_INSTRUCTION

+            L_PROGRAM_CASE(TRIGGER_SUFFIX) {
+                if (roseTriggerSuffix(t, scratch, ri->queue, ri->event, som,
+                                      end) == HWLM_TERMINATE_MATCHING) {
+                    return HWLM_TERMINATE_MATCHING;
+                }
+                work_done = 1;
+            }
+            L_PROGRAM_NEXT_INSTRUCTION
+
             L_PROGRAM_CASE(DEDUPE) {
                 updateSeqPoint(tctxt, end, from_mpv);
                 const char do_som = t->hasSom; // TODO: constant propagate
@@ -47,7 +47,15 @@ namespace ue2 {
 #endif

 /* get us a posix_memalign from somewhere */
-#if !defined(HAVE_POSIX_MEMALIGN)
+#if defined(__MINGW32__) || defined(__MINGW64__)
+#include <stdlib.h>
+#include <intrin.h>
+#include <malloc.h>
+#include <windows.h>
+
+#define posix_memalign(A, B, C) ((*A = (void *)__mingw_aligned_malloc(C, B)) == nullptr)
+
+#elif !defined(HAVE_POSIX_MEMALIGN)
 # if defined(HAVE_MEMALIGN)
     #define posix_memalign(A, B, C) ((*A = (void *)memalign(B, C)) == nullptr)
 # elif defined(HAVE__ALIGNED_MALLOC)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(__MINGW32__) || defined(__MINGW64__)
|
||||||
|
__mingw_aligned_free(ptr);
|
||||||
|
#else
|
||||||
free(ptr);
|
free(ptr);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/** \brief 64-byte aligned, zeroed malloc.
|
/** \brief 64-byte aligned, zeroed malloc.
|
||||||
|
@@ -76,7 +76,11 @@ public:

     T *allocate(std::size_t size) const {
         size_t alloc_size = size * sizeof(T);
-        return static_cast<T *>(aligned_malloc_internal(alloc_size, N));
+        T *ptr = static_cast<T *>(aligned_malloc_internal(alloc_size, N));
+        if (!ptr) {
+            throw std::bad_alloc();
+        }
+        return ptr;
     }

     void deallocate(T *x, std::size_t) const noexcept {
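Note on the bad_alloc change above: the C++ Allocator requirements say allocate() reports failure by throwing, never by returning null, and containers such as std::vector dereference the returned pointer unconditionally. A minimal sketch of the pattern on a POSIX system (hypothetical helper, not this header's class; Align must be a power of two and a multiple of sizeof(void *)):

    #include <cstdlib>
    #include <new>

    template <typename T, std::size_t Align>
    T *allocate_aligned(std::size_t n) {
        void *mem = nullptr;
        if (posix_memalign(&mem, Align, n * sizeof(T)) != 0) {
            throw std::bad_alloc(); // containers rely on non-null returns
        }
        return static_cast<T *>(mem);
    }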
@@ -33,13 +33,13 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 cons
     uint32x4_t m = mask.u.u32x4[0];
     uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
     if (vmax != 0) {
-        typename SuperVector<16>::movemask_type z = mask.movemask();
-        DEBUG_PRINTF("z %08x\n", z);
-        DEBUG_PRINTF("buf %p z %08x \n", buf, z);
-        u32 pos = ctz32(z & 0xffff);
+        typename SuperVector<16>::comparemask_type z = mask.comparemask();
+        DEBUG_PRINTF("z %08llx\n", z);
+        DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+        u32 pos = ctz64(z) / SuperVector<16>::mask_width();
         DEBUG_PRINTF("match @ pos %u\n", pos);
         assert(pos < 16);
-        DEBUG_PRINTF("buf + pos %p\n", buf + pos);
+        DEBUG_PRINTF("buf + pos %p\n", buf + (pos));
         return buf + pos;
     } else {
         return NULL; // no match
@@ -52,13 +52,12 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const
     uint32x4_t m = mask.u.u32x4[0];
     uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
     if (vmax != 0) {
-        typename SuperVector<16>::movemask_type z = mask.movemask();
-        DEBUG_PRINTF("buf %p z %08x \n", buf, z);
-        DEBUG_PRINTF("z %08x\n", z);
-        u32 pos = clz32(z & 0xffff);
+        typename SuperVector<16>::comparemask_type z = mask.comparemask();
+        DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+        DEBUG_PRINTF("z %08llx\n", z);
+        u32 pos = clz64(z) / SuperVector<16>::mask_width();
         DEBUG_PRINTF("match @ pos %u\n", pos);
-        assert(pos >= 16 && pos < 32);
-        return buf + (31 - pos);
+        return buf + (15 - pos);
     } else {
         return NULL; // no match
     }
@@ -70,10 +69,10 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16
     uint32x4_t m = mask.u.u32x4[0];
     uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
     if (vmax != 0) {
-        typename SuperVector<16>::movemask_type z = mask.movemask();
-        DEBUG_PRINTF("z %08x\n", z);
-        DEBUG_PRINTF("buf %p z %08x \n", buf, z);
-        u32 pos = ctz32(z & 0xffff);
+        typename SuperVector<16>::comparemask_type z = mask.comparemask();
+        DEBUG_PRINTF("z %08llx\n", z);
+        DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+        u32 pos = ctz64(z) / SuperVector<16>::mask_width();
         DEBUG_PRINTF("match @ pos %u\n", pos);
         assert(pos < 16);
         DEBUG_PRINTF("buf + pos %p\n", buf + pos);
@@ -89,13 +88,12 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16
     uint32x4_t m = mask.u.u32x4[0];
     uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
     if (vmax != 0) {
-        typename SuperVector<16>::movemask_type z = mask.movemask();
-        DEBUG_PRINTF("buf %p z %08x \n", buf, z);
-        DEBUG_PRINTF("z %08x\n", z);
-        u32 pos = clz32(z & 0xffff);
+        typename SuperVector<16>::comparemask_type z = mask.comparemask();
+        DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+        DEBUG_PRINTF("z %08llx\n", z);
+        u32 pos = clz64(z) / SuperVector<16>::mask_width();
         DEBUG_PRINTF("match @ pos %u\n", pos);
-        assert(pos >= 16 && pos < 32);
-        return buf + (31 - pos);
+        return buf + (15 - pos);
     } else {
         return NULL; // no match
     }
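Note on the four ARM match.hpp hunks above: comparemask() widens z to 64 bits with mask_width() identical bits per byte lane, so bit positions from ctz64/clz64 are divided by the width to recover a lane index, and the DEBUG_PRINTF formats move to %llx accordingly. A sketch of the forward-scan case under the NEON assumption of 4 bits per lane (GCC/Clang builtin; z must be non-zero):

    #include <stdint.h>

    /* First matching byte lane from a mask with 4 identical bits per lane. */
    static unsigned first_lane(uint64_t z) {
        return (unsigned)(__builtin_ctzll(z) / 4);
    }

For the reverse scans, clz64(z) / width counts empty lanes from the top of the 64-bit mask, which is why the returns become buf + (15 - pos) in place of the old 32-bit buf + (31 - pos) form.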
|
@ -53,24 +53,6 @@
|
|||||||
|
|
||||||
#include <string.h> // for memcpy
|
#include <string.h> // for memcpy
|
||||||
|
|
||||||
#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
|
|
||||||
#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
|
|
||||||
#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
|
|
||||||
|
|
||||||
/** \brief LUT for the mask1bit functions. */
|
|
||||||
ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = {
|
|
||||||
ZEROES_32, ZEROES_32,
|
|
||||||
ZEROES_31, 0x01, ZEROES_32,
|
|
||||||
ZEROES_31, 0x02, ZEROES_32,
|
|
||||||
ZEROES_31, 0x04, ZEROES_32,
|
|
||||||
ZEROES_31, 0x08, ZEROES_32,
|
|
||||||
ZEROES_31, 0x10, ZEROES_32,
|
|
||||||
ZEROES_31, 0x20, ZEROES_32,
|
|
||||||
ZEROES_31, 0x40, ZEROES_32,
|
|
||||||
ZEROES_31, 0x80, ZEROES_32,
|
|
||||||
ZEROES_32, ZEROES_32,
|
|
||||||
};
|
|
||||||
|
|
||||||
static really_inline m128 ones128(void) {
|
static really_inline m128 ones128(void) {
|
||||||
return (m128) vdupq_n_s8(0xFF);
|
return (m128) vdupq_n_s8(0xFF);
|
||||||
}
|
}
|
||||||
@@ -86,8 +68,9 @@ static really_inline m128 not128(m128 a) {

 /** \brief Return 1 if a and b are different otherwise 0 */
 static really_inline int diff128(m128 a, m128 b) {
-    int res = vaddvq_s8((int8x16_t) vceqq_s32(a, b));
-    return (-16 != res);
+    uint64_t res = vget_lane_u64(
+        (uint64x1_t)vshrn_n_u16((uint16x8_t)vceqq_s32(a, b), 4), 0);
+    return (~0ull != res);
 }

 static really_inline int isnonzero128(m128 a) {
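Note on the new diff128 above: vshrn_n_u16(x, 4) shifts each 16-bit lane right by 4 and narrows it to 8 bits, keeping the top nibble of the low byte and the bottom nibble of the high byte. Applied to a 0x00/0xFF per-byte compare result, this condenses 128 bits into a 64-bit value with 4 bits per byte lane (the widely used AArch64 substitute for x86 movemask), and all-equal inputs produce exactly ~0ull. A standalone sketch:

    #include <arm_neon.h>
    #include <stdint.h>

    /* Condense a per-byte compare result (lanes are 0x00 or 0xFF) into a
     * 64-bit "nibble mask" with 4 bits per byte lane. */
    static inline uint64_t nibble_mask(uint8x16_t eq) {
        uint8x8_t narrowed = vshrn_n_u16(vreinterpretq_u16_u8(eq), 4);
        return vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);
    }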
@@ -129,43 +112,8 @@ m128 lshift_m128(m128 a, unsigned b) {
         return (m128) vshlq_n_u32((uint32x4_t)a, b);
     }
 #endif
-#define CASE_LSHIFT_m128(a, offset) case offset: return (m128)vshlq_n_u32((uint32x4_t)(a), (offset)); break;
-    switch (b) {
-    case 0: return a; break;
-    CASE_LSHIFT_m128(a, 1);
-    CASE_LSHIFT_m128(a, 2);
-    CASE_LSHIFT_m128(a, 3);
-    CASE_LSHIFT_m128(a, 4);
-    CASE_LSHIFT_m128(a, 5);
-    CASE_LSHIFT_m128(a, 6);
-    CASE_LSHIFT_m128(a, 7);
-    CASE_LSHIFT_m128(a, 8);
-    CASE_LSHIFT_m128(a, 9);
-    CASE_LSHIFT_m128(a, 10);
-    CASE_LSHIFT_m128(a, 11);
-    CASE_LSHIFT_m128(a, 12);
-    CASE_LSHIFT_m128(a, 13);
-    CASE_LSHIFT_m128(a, 14);
-    CASE_LSHIFT_m128(a, 15);
-    CASE_LSHIFT_m128(a, 16);
-    CASE_LSHIFT_m128(a, 17);
-    CASE_LSHIFT_m128(a, 18);
-    CASE_LSHIFT_m128(a, 19);
-    CASE_LSHIFT_m128(a, 20);
-    CASE_LSHIFT_m128(a, 21);
-    CASE_LSHIFT_m128(a, 22);
-    CASE_LSHIFT_m128(a, 23);
-    CASE_LSHIFT_m128(a, 24);
-    CASE_LSHIFT_m128(a, 25);
-    CASE_LSHIFT_m128(a, 26);
-    CASE_LSHIFT_m128(a, 27);
-    CASE_LSHIFT_m128(a, 28);
-    CASE_LSHIFT_m128(a, 29);
-    CASE_LSHIFT_m128(a, 30);
-    CASE_LSHIFT_m128(a, 31);
-    default: return zeroes128(); break;
-    }
-#undef CASE_LSHIFT_m128
+    int32x4_t shift_indices = vdupq_n_s32(b);
+    return (m128) vshlq_s32(a, shift_indices);
 }

 static really_really_inline
@@ -175,43 +123,8 @@ m128 rshift_m128(m128 a, unsigned b) {
         return (m128) vshrq_n_u32((uint32x4_t)a, b);
     }
 #endif
-#define CASE_RSHIFT_m128(a, offset) case offset: return (m128)vshrq_n_u32((uint32x4_t)(a), (offset)); break;
-    switch (b) {
-    case 0: return a; break;
-    CASE_RSHIFT_m128(a, 1);
-    CASE_RSHIFT_m128(a, 2);
-    CASE_RSHIFT_m128(a, 3);
-    CASE_RSHIFT_m128(a, 4);
-    CASE_RSHIFT_m128(a, 5);
-    CASE_RSHIFT_m128(a, 6);
-    CASE_RSHIFT_m128(a, 7);
-    CASE_RSHIFT_m128(a, 8);
-    CASE_RSHIFT_m128(a, 9);
-    CASE_RSHIFT_m128(a, 10);
-    CASE_RSHIFT_m128(a, 11);
-    CASE_RSHIFT_m128(a, 12);
-    CASE_RSHIFT_m128(a, 13);
-    CASE_RSHIFT_m128(a, 14);
-    CASE_RSHIFT_m128(a, 15);
-    CASE_RSHIFT_m128(a, 16);
-    CASE_RSHIFT_m128(a, 17);
-    CASE_RSHIFT_m128(a, 18);
-    CASE_RSHIFT_m128(a, 19);
-    CASE_RSHIFT_m128(a, 20);
-    CASE_RSHIFT_m128(a, 21);
-    CASE_RSHIFT_m128(a, 22);
-    CASE_RSHIFT_m128(a, 23);
-    CASE_RSHIFT_m128(a, 24);
-    CASE_RSHIFT_m128(a, 25);
-    CASE_RSHIFT_m128(a, 26);
-    CASE_RSHIFT_m128(a, 27);
-    CASE_RSHIFT_m128(a, 28);
-    CASE_RSHIFT_m128(a, 29);
-    CASE_RSHIFT_m128(a, 30);
-    CASE_RSHIFT_m128(a, 31);
-    default: return zeroes128(); break;
-    }
-#undef CASE_RSHIFT_m128
+    int32x4_t shift_indices = vdupq_n_s32(-b);
+    return (m128) vshlq_s32(a, shift_indices);
 }

 static really_really_inline
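Note on the two shift rewrites above (and the 64-bit versions below): the old switches existed because vshlq_n_u32/vshrq_n_u32 demand immediate shift counts. NEON's vshlq takes a run-time per-lane count and interprets negative counts as right shifts, which is what lets rshift_m128 reuse vshlq_s32 with vdupq_n_s32(-b). One caveat worth noting: on signed element types a negative count gives an arithmetic right shift, while the unsigned forms keep the logical behaviour, as in this sketch:

    #include <arm_neon.h>

    static inline uint32x4_t shl32(uint32x4_t a, unsigned b) {
        return vshlq_u32(a, vdupq_n_s32((int)b));
    }
    static inline uint32x4_t shr32(uint32x4_t a, unsigned b) {
        /* negative per-lane count == shift right; unsigned type => logical */
        return vshlq_u32(a, vdupq_n_s32(-(int)b));
    }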
@@ -221,75 +134,8 @@ m128 lshift64_m128(m128 a, unsigned b) {
         return (m128) vshlq_n_u64((uint64x2_t)a, b);
     }
 #endif
-#define CASE_LSHIFT64_m128(a, offset) case offset: return (m128)vshlq_n_u64((uint64x2_t)(a), (offset)); break;
-    switch (b) {
-    case 0: return a; break;
-    CASE_LSHIFT64_m128(a, 1);
-    CASE_LSHIFT64_m128(a, 2);
-    CASE_LSHIFT64_m128(a, 3);
-    CASE_LSHIFT64_m128(a, 4);
-    CASE_LSHIFT64_m128(a, 5);
-    CASE_LSHIFT64_m128(a, 6);
-    CASE_LSHIFT64_m128(a, 7);
-    CASE_LSHIFT64_m128(a, 8);
-    CASE_LSHIFT64_m128(a, 9);
-    CASE_LSHIFT64_m128(a, 10);
-    CASE_LSHIFT64_m128(a, 11);
-    CASE_LSHIFT64_m128(a, 12);
-    CASE_LSHIFT64_m128(a, 13);
-    CASE_LSHIFT64_m128(a, 14);
-    CASE_LSHIFT64_m128(a, 15);
-    CASE_LSHIFT64_m128(a, 16);
-    CASE_LSHIFT64_m128(a, 17);
-    CASE_LSHIFT64_m128(a, 18);
-    CASE_LSHIFT64_m128(a, 19);
-    CASE_LSHIFT64_m128(a, 20);
-    CASE_LSHIFT64_m128(a, 21);
-    CASE_LSHIFT64_m128(a, 22);
-    CASE_LSHIFT64_m128(a, 23);
-    CASE_LSHIFT64_m128(a, 24);
-    CASE_LSHIFT64_m128(a, 25);
-    CASE_LSHIFT64_m128(a, 26);
-    CASE_LSHIFT64_m128(a, 27);
-    CASE_LSHIFT64_m128(a, 28);
-    CASE_LSHIFT64_m128(a, 29);
-    CASE_LSHIFT64_m128(a, 30);
-    CASE_LSHIFT64_m128(a, 31);
-    CASE_LSHIFT64_m128(a, 32);
-    CASE_LSHIFT64_m128(a, 33);
-    CASE_LSHIFT64_m128(a, 34);
-    CASE_LSHIFT64_m128(a, 35);
-    CASE_LSHIFT64_m128(a, 36);
-    CASE_LSHIFT64_m128(a, 37);
-    CASE_LSHIFT64_m128(a, 38);
-    CASE_LSHIFT64_m128(a, 39);
-    CASE_LSHIFT64_m128(a, 40);
-    CASE_LSHIFT64_m128(a, 41);
-    CASE_LSHIFT64_m128(a, 42);
-    CASE_LSHIFT64_m128(a, 43);
-    CASE_LSHIFT64_m128(a, 44);
-    CASE_LSHIFT64_m128(a, 45);
-    CASE_LSHIFT64_m128(a, 46);
-    CASE_LSHIFT64_m128(a, 47);
-    CASE_LSHIFT64_m128(a, 48);
-    CASE_LSHIFT64_m128(a, 49);
-    CASE_LSHIFT64_m128(a, 50);
-    CASE_LSHIFT64_m128(a, 51);
-    CASE_LSHIFT64_m128(a, 52);
-    CASE_LSHIFT64_m128(a, 53);
-    CASE_LSHIFT64_m128(a, 54);
-    CASE_LSHIFT64_m128(a, 55);
-    CASE_LSHIFT64_m128(a, 56);
-    CASE_LSHIFT64_m128(a, 57);
-    CASE_LSHIFT64_m128(a, 58);
-    CASE_LSHIFT64_m128(a, 59);
-    CASE_LSHIFT64_m128(a, 60);
-    CASE_LSHIFT64_m128(a, 61);
-    CASE_LSHIFT64_m128(a, 62);
-    CASE_LSHIFT64_m128(a, 63);
-    default: return zeroes128(); break;
-    }
-#undef CASE_LSHIFT64_m128
+    int64x2_t shift_indices = vdupq_n_s64(b);
+    return (m128) vshlq_s64((int64x2_t) a, shift_indices);
 }

 static really_really_inline
@@ -299,75 +145,8 @@ m128 rshift64_m128(m128 a, unsigned b) {
         return (m128) vshrq_n_u64((uint64x2_t)a, b);
     }
 #endif
-#define CASE_RSHIFT64_m128(a, offset) case offset: return (m128)vshrq_n_u64((uint64x2_t)(a), (offset)); break;
-    switch (b) {
-    case 0: return a; break;
-    CASE_RSHIFT64_m128(a, 1);
-    CASE_RSHIFT64_m128(a, 2);
-    CASE_RSHIFT64_m128(a, 3);
-    CASE_RSHIFT64_m128(a, 4);
-    CASE_RSHIFT64_m128(a, 5);
-    CASE_RSHIFT64_m128(a, 6);
-    CASE_RSHIFT64_m128(a, 7);
-    CASE_RSHIFT64_m128(a, 8);
-    CASE_RSHIFT64_m128(a, 9);
-    CASE_RSHIFT64_m128(a, 10);
-    CASE_RSHIFT64_m128(a, 11);
-    CASE_RSHIFT64_m128(a, 12);
-    CASE_RSHIFT64_m128(a, 13);
-    CASE_RSHIFT64_m128(a, 14);
-    CASE_RSHIFT64_m128(a, 15);
-    CASE_RSHIFT64_m128(a, 16);
-    CASE_RSHIFT64_m128(a, 17);
-    CASE_RSHIFT64_m128(a, 18);
-    CASE_RSHIFT64_m128(a, 19);
-    CASE_RSHIFT64_m128(a, 20);
-    CASE_RSHIFT64_m128(a, 21);
-    CASE_RSHIFT64_m128(a, 22);
-    CASE_RSHIFT64_m128(a, 23);
-    CASE_RSHIFT64_m128(a, 24);
-    CASE_RSHIFT64_m128(a, 25);
-    CASE_RSHIFT64_m128(a, 26);
-    CASE_RSHIFT64_m128(a, 27);
-    CASE_RSHIFT64_m128(a, 28);
-    CASE_RSHIFT64_m128(a, 29);
-    CASE_RSHIFT64_m128(a, 30);
-    CASE_RSHIFT64_m128(a, 31);
-    CASE_RSHIFT64_m128(a, 32);
-    CASE_RSHIFT64_m128(a, 33);
-    CASE_RSHIFT64_m128(a, 34);
-    CASE_RSHIFT64_m128(a, 35);
-    CASE_RSHIFT64_m128(a, 36);
-    CASE_RSHIFT64_m128(a, 37);
-    CASE_RSHIFT64_m128(a, 38);
-    CASE_RSHIFT64_m128(a, 39);
-    CASE_RSHIFT64_m128(a, 40);
-    CASE_RSHIFT64_m128(a, 41);
-    CASE_RSHIFT64_m128(a, 42);
-    CASE_RSHIFT64_m128(a, 43);
-    CASE_RSHIFT64_m128(a, 44);
-    CASE_RSHIFT64_m128(a, 45);
-    CASE_RSHIFT64_m128(a, 46);
-    CASE_RSHIFT64_m128(a, 47);
-    CASE_RSHIFT64_m128(a, 48);
-    CASE_RSHIFT64_m128(a, 49);
-    CASE_RSHIFT64_m128(a, 50);
-    CASE_RSHIFT64_m128(a, 51);
-    CASE_RSHIFT64_m128(a, 52);
-    CASE_RSHIFT64_m128(a, 53);
-    CASE_RSHIFT64_m128(a, 54);
-    CASE_RSHIFT64_m128(a, 55);
-    CASE_RSHIFT64_m128(a, 56);
-    CASE_RSHIFT64_m128(a, 57);
-    CASE_RSHIFT64_m128(a, 58);
-    CASE_RSHIFT64_m128(a, 59);
-    CASE_RSHIFT64_m128(a, 60);
-    CASE_RSHIFT64_m128(a, 61);
-    CASE_RSHIFT64_m128(a, 62);
-    CASE_RSHIFT64_m128(a, 63);
-    default: return zeroes128(); break;
-    }
-#undef CASE_RSHIFT64_m128
+    int64x2_t shift_indices = vdupq_n_s64(-b);
+    return (m128) vshlq_s64((int64x2_t) a, shift_indices);
 }

 static really_inline m128 eq128(m128 a, m128 b) {
@@ -594,9 +373,9 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) {
 static really_inline
 m128 mask1bit128(unsigned int n) {
     assert(n < sizeof(m128) * 8);
-    u32 mask_idx = ((n % 8) * 64) + 95;
-    mask_idx -= n / 8;
-    return loadu128(&simd_onebit_masks[mask_idx]);
+    static m128 onebit = { 1, 0 };
+    m128 mask = lshiftbyte_m128( onebit, n / 8 );
+    return lshift64_m128( mask, n % 8 );
 }

 // switches on bit N in the given vector.
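Note on the mask1bit128 rewrite above: instead of an unaligned load from the simd_onebit_masks table (deleted earlier in this file), the one-hot vector is computed — start from {1, 0} (bit 0 set), byte-shift by n / 8, then bit-shift the 64-bit lanes by n % 8. Since n % 8 < 8, the set bit can never cross a 64-bit lane boundary after the byte shift, so lshift64_m128 suffices. The scalar model, for reference:

    #include <stdint.h>

    /* Set bit n of a 128-bit value held as two 64-bit halves. */
    static void mask1bit(uint64_t out[2], unsigned n) {
        out[0] = 0;
        out[1] = 0;
        out[n / 64] = 1ull << (n % 64);
    }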
@@ -88,6 +88,26 @@ static inline void print_m128_2x64(const char *label, m128 vec) {
 #define print_m128_2x64(label, vec) ;
 #endif

+#if !defined(ARCH_IA32) && !defined(ARCH_X86_64)
+#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
+#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
+#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
+
+/** \brief LUT for the mask1bit functions. */
+ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = {
+    ZEROES_32, ZEROES_32,
+    ZEROES_31, 0x01, ZEROES_32,
+    ZEROES_31, 0x02, ZEROES_32,
+    ZEROES_31, 0x04, ZEROES_32,
+    ZEROES_31, 0x08, ZEROES_32,
+    ZEROES_31, 0x10, ZEROES_32,
+    ZEROES_31, 0x20, ZEROES_32,
+    ZEROES_31, 0x40, ZEROES_32,
+    ZEROES_31, 0x80, ZEROES_32,
+    ZEROES_32, ZEROES_32,
+};
+#endif // !defined(ARCH_IA32) && !defined(ARCH_X86_64)
+
 /****
  **** 256-bit Primitives
  ****/
@@ -30,12 +30,12 @@
 template <>
 really_really_inline
 const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
-    SuperVector<16>::movemask_type z = v.movemask();
-    DEBUG_PRINTF("buf %p z %08x \n", buf, z);
-    DEBUG_PRINTF("z %08x\n", z);
+    SuperVector<16>::comparemask_type z = v.comparemask();
+    DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+    DEBUG_PRINTF("z %08llx\n", z);
     if (unlikely(z)) {
         u32 pos = ctz32(z);
-        DEBUG_PRINTF("~z %08x\n", ~z);
+        DEBUG_PRINTF("~z %08llx\n", ~z);
         DEBUG_PRINTF("match @ pos %u\n", pos);
         assert(pos < 16);
         return buf + pos;
@@ -47,9 +47,9 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const U
 template <>
 really_really_inline
 const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
-    SuperVector<16>::movemask_type z = v.movemask();
-    DEBUG_PRINTF("buf %p z %08x \n", buf, z);
-    DEBUG_PRINTF("z %08x\n", z);
+    SuperVector<16>::comparemask_type z = v.comparemask();
+    DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+    DEBUG_PRINTF("z %08llx\n", z);
     if (unlikely(z)) {
         u32 pos = clz32(z);
         DEBUG_PRINTF("match @ pos %u\n", pos);
@@ -63,12 +63,12 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UN
 template <>
 really_really_inline
 const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
-    SuperVector<16>::movemask_type z = v.movemask();
-    DEBUG_PRINTF("buf %p z %08x \n", buf, z);
-    DEBUG_PRINTF("z %08x\n", z);
+    SuperVector<16>::comparemask_type z = v.comparemask();
+    DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+    DEBUG_PRINTF("z %08llx\n", z);
     if (unlikely(z != 0xffff)) {
         u32 pos = ctz32(~z & 0xffff);
-        DEBUG_PRINTF("~z %08x\n", ~z);
+        DEBUG_PRINTF("~z %08llx\n", ~z);
         DEBUG_PRINTF("match @ pos %u\n", pos);
         assert(pos < 16);
         return buf + pos;
@@ -81,12 +81,12 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 co
 template <>
 really_really_inline
 const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) {
-    SuperVector<16>::movemask_type z = v.movemask();
-    DEBUG_PRINTF("buf %p z %08x \n", buf, z);
-    DEBUG_PRINTF("z %08x\n", z);
+    SuperVector<16>::comparemask_type z = v.comparemask();
+    DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+    DEBUG_PRINTF("z %08llx\n", z);
     if (unlikely(z != 0xffff)) {
         u32 pos = clz32(~z & 0xffff);
-        DEBUG_PRINTF("~z %08x\n", ~z);
+        DEBUG_PRINTF("~z %08llx\n", ~z);
         DEBUG_PRINTF("match @ pos %u\n", pos);
         assert(pos >= 16 && pos < 32);
         return buf + (31 - pos);
@@ -54,34 +54,6 @@ typedef __vector signed char int8x16_t;

 typedef unsigned long long int ulong64_t;
 typedef signed long long int long64_t;
-/*
-typedef __vector uint64_t uint64x2_t;
-typedef __vector int64_t int64x2_t;
-typedef __vector uint32_t uint32x4_t;
-typedef __vector int32_t int32x4_t;
-typedef __vector uint16_t uint16x8_t;
-typedef __vector int16_t int16x8_t;
-typedef __vector uint8_t uint8x16_t;
-typedef __vector int8_t int8x16_t;*/
-
-
-#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
-#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
-#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
-
-/** \brief LUT for the mask1bit functions. */
-ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = {
-    ZEROES_32, ZEROES_32,
-    ZEROES_31, 0x01, ZEROES_32,
-    ZEROES_31, 0x02, ZEROES_32,
-    ZEROES_31, 0x04, ZEROES_32,
-    ZEROES_31, 0x08, ZEROES_32,
-    ZEROES_31, 0x10, ZEROES_32,
-    ZEROES_31, 0x20, ZEROES_32,
-    ZEROES_31, 0x40, ZEROES_32,
-    ZEROES_31, 0x80, ZEROES_32,
-    ZEROES_32, ZEROES_32,
-};
-
 static really_inline m128 ones128(void) {
     return (m128) vec_splat_u8(-1);
@@ -115,10 +87,6 @@ static really_inline u32 diffrich128(m128 a, m128 b) {
     m128 mask = (m128) vec_cmpeq(a, b); // _mm_cmpeq_epi32 (a, b);
     mask = vec_and(not128(mask), movemask);
     m128 sum = vec_sums(mask, zeroes128());
-    //sum = vec_sld(zeroes128(), sum, 4);
-    //s32 ALIGN_ATTR(16) x;
-    //vec_ste(sum, 0, &x);
-    //return x; // it could be ~(movemask_128(mask)) & 0x;
     return sum[3];
 }

@@ -131,10 +99,6 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) {
     uint64x2_t mask = (uint64x2_t) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b);
     mask = (uint64x2_t) vec_and((uint64x2_t)not128((m128)mask), movemask);
     m128 sum = vec_sums((m128)mask, zeroes128());
-    //sum = vec_sld(zeroes128(), sum, 4);
-    //s32 ALIGN_ATTR(16) x;
-    //vec_ste(sum, 0, &x);
-    //return x;
     return sum[3];
 }

@@ -150,46 +114,18 @@ m128 sub_2x64(m128 a, m128 b) {

 static really_really_inline
 m128 lshift_m128(m128 a, unsigned b) {
-    switch(b){
-    case 1: return vec_sld(a, zeroes128(), 1); break;
-    case 2: return vec_sld(a, zeroes128(), 2); break;
-    case 3: return vec_sld(a, zeroes128(), 3); break;
-    case 4: return vec_sld(a, zeroes128(), 4); break;
-    case 5: return vec_sld(a, zeroes128(), 5); break;
-    case 6: return vec_sld(a, zeroes128(), 6); break;
-    case 7: return vec_sld(a, zeroes128(), 7); break;
-    case 8: return vec_sld(a, zeroes128(), 8); break;
-    case 9: return vec_sld(a, zeroes128(), 9); break;
-    case 10: return vec_sld(a, zeroes128(), 10); break;
-    case 11: return vec_sld(a, zeroes128(), 11); break;
-    case 12: return vec_sld(a, zeroes128(), 12); break;
-    case 13: return vec_sld(a, zeroes128(), 13); break;
-    case 14: return vec_sld(a, zeroes128(), 14); break;
-    case 15: return vec_sld(a, zeroes128(), 15); break;
-    }
-    return a;
+    if (b == 0) return a;
+    m128 sl = (m128) vec_splats((uint8_t) b << 3);
+    m128 result = (m128) vec_slo((uint8x16_t) a, (uint8x16_t) sl);
+    return result;
 }

 static really_really_inline
 m128 rshift_m128(m128 a, unsigned b) {
-    switch(b){
-    case 1: return vec_sld(zeroes128(), a, 15); break;
-    case 2: return vec_sld(zeroes128(), a, 14); break;
-    case 3: return vec_sld(zeroes128(), a, 13); break;
-    case 4: return vec_sld(zeroes128(), a, 12); break;
-    case 5: return vec_sld(zeroes128(), a, 11); break;
-    case 6: return vec_sld(zeroes128(), a, 10); break;
-    case 7: return vec_sld(zeroes128(), a, 9); break;
-    case 8: return vec_sld(zeroes128(), a, 8); break;
-    case 9: return vec_sld(zeroes128(), a, 7); break;
-    case 10: return vec_sld(zeroes128(), a, 6); break;
-    case 11: return vec_sld(zeroes128(), a, 5); break;
-    case 12: return vec_sld(zeroes128(), a, 4); break;
-    case 13: return vec_sld(zeroes128(), a, 3); break;
-    case 14: return vec_sld(zeroes128(), a, 2); break;
-    case 15: return vec_sld(zeroes128(), a, 1); break;
-    }
-    return a;
+    if (b == 0) return a;
+    m128 sl = (m128) vec_splats((uint8_t) b << 3);
+    m128 result = (m128) vec_sro((uint8x16_t) a, (uint8x16_t) sl);
+    return result;
 }

 static really_really_inline
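Note on the VSX shift rewrites above: vec_slo/vec_sro shift the entire quadword by octets, reading the count from designated bits of the shift vector, so splatting (b << 3) into every byte positions the byte count where the instruction expects it. That replaces fifteen vec_sld immediates with two instructions per shift. A sketch of the pattern:

    #include <altivec.h>

    /* Whole-vector left shift by b bytes; the octet count is encoded in
     * bits of the shift operand, hence the (b << 3). */
    static inline vector unsigned char slo_bytes(vector unsigned char a,
                                                 unsigned b) {
        vector unsigned char sl = vec_splats((unsigned char)(b << 3));
        return vec_slo(a, sl);
    }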
@@ -212,27 +148,13 @@ static really_inline m128 eq64_m128(m128 a, m128 b) {
     return (m128) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b);
 }


 static really_inline u32 movemask128(m128 a) {
-    uint8x16_t s1 = vec_sr((uint8x16_t)a, vec_splat_u8(7));
-
-    uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7));
-    uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff));
-    uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and);
-
-    uint32x4_t ss2 = vec_sr((uint32x4_t)s2, vec_splat_u32(14));
-    uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff));
-    uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2);
-
-    uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28));
-    uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff));
-    uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3);
-
-    uint64x2_t ss4 = vec_sld((uint64x2_t)vec_splats(0), s4, 9);
-    uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff));
-    uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4);
-
-    return s5[0];
+    static uint8x16_t perm = { 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    uint8x16_t bitmask = vec_gb((uint8x16_t) a);
+    bitmask = (uint8x16_t) vec_perm(vec_splat_u8(0), bitmask, perm);
+    u32 movemask;
+    vec_ste((uint32x4_t) bitmask, 0, &movemask);
+    return movemask;
 }

 static really_inline m128 set1_16x8(u8 c) {
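Note on the movemask128 rewrite above: vec_gb (vgbbd) bit-transposes each 8x8 byte/bit matrix per doubleword, so the sign bits of all eight bytes collect into a single byte; the vec_perm with indices {16, 24, ...} then plucks those two gathered bytes (bytes 0 and 8 of the second operand) into the word that vec_ste stores. A scalar model of the gathering step only (lane and bit numbering on big-endian POWER differs in detail):

    #include <stdint.h>

    /* Collect the MSB of each of the 8 bytes in one 64-bit half. */
    static uint8_t gather_msbs(uint64_t half) {
        uint8_t out = 0;
        for (int i = 0; i < 8; ++i) {
            out |= (uint8_t)(((half >> (8 * i + 7)) & 1u) << i);
        }
        return out;
    }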
@@ -363,7 +285,6 @@ m128 loadbytes128(const void *ptr, unsigned int n) {
     return a;
 }

-
 #define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vec_sld((int8x16_t)(b), (int8x16_t)(a), (16 - offset)); break;

 static really_really_inline
@@ -392,42 +313,50 @@ m128 palignr_imm(m128 r, m128 l, int offset) {

 static really_really_inline
 m128 palignr(m128 r, m128 l, int offset) {
-#if defined(HS_OPTIMIZE)
-    // need a faster way to do this.
-    return palignr_imm(r, l, offset);
-#else
-    return palignr_imm(r, l, offset);
+    if (offset == 0) return l;
+    if (offset == 16) return r;
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(offset)) {
+        return (m128)vec_sld((int8x16_t)(r), (int8x16_t)(l), 16 - offset);
+    }
 #endif
+    m128 sl = (m128) vec_splats((uint8_t) (offset << 3));
+    m128 sr = (m128) vec_splats((uint8_t) ((16 - offset) << 3));
+    m128 rhs = (m128) vec_slo((uint8x16_t) r, (uint8x16_t) sr);
+    m128 lhs = (m128) vec_sro((uint8x16_t) l, (uint8x16_t) sl);
+    return or128(lhs, rhs);
 }

 #undef CASE_ALIGN_VECTORS

 static really_really_inline
 m128 rshiftbyte_m128(m128 a, unsigned b) {
-    return rshift_m128(a,b);
+    return palignr_imm(zeroes128(), a, b);
 }

 static really_really_inline
 m128 lshiftbyte_m128(m128 a, unsigned b) {
-    return lshift_m128(a,b);
+    return palignr_imm(a, zeroes128(), 16 - b);
 }

 static really_inline
 m128 variable_byte_shift_m128(m128 in, s32 amount) {
     assert(amount >= -16 && amount <= 16);
     if (amount < 0) {
-        return palignr_imm(zeroes128(), in, -amount);
+        return rshiftbyte_m128(in, -amount);
     } else {
-        return palignr_imm(in, zeroes128(), 16 - amount);
+        return lshiftbyte_m128(in, amount);
     }
 }

 static really_inline
 m128 mask1bit128(unsigned int n) {
     assert(n < sizeof(m128) * 8);
-    u32 mask_idx = ((n % 8) * 64) + 95;
-    mask_idx -= n / 8;
-    return loadu128(&simd_onebit_masks[mask_idx]);
+    static uint64x2_t onebit = { 1, 0 };
+    m128 octets = (m128) vec_splats((uint8_t) ((n / 8) << 3));
+    m128 bits = (m128) vec_splats((uint8_t) ((n % 8)));
+    m128 mask = (m128) vec_slo((uint8x16_t) onebit, (uint8x16_t) octets);
+    return (m128) vec_sll((uint8x16_t) mask, (uint8x16_t) bits);
 }

 // switches on bit N in the given vector.
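Note on the palignr rewrite above: vec_sld needs a compile-time offset, so the fast path is kept behind __builtin_constant_p (itself guarded by HAVE__BUILTIN_CONSTANT_P for compilers lacking the builtin), while the generic path builds the concatenated shift out of vec_slo/vec_sro plus an OR. A scalar model of the semantics, assuming a compiler with unsigned __int128:

    /* palignr(r, l, offset): shift the 32-byte concatenation r:l right by
     * offset bytes and keep the low 16 bytes. */
    static inline unsigned __int128 palignr_model(unsigned __int128 r,
                                                  unsigned __int128 l,
                                                  unsigned offset) {
        if (offset == 0) return l;
        if (offset == 16) return r;
        return (l >> (8 * offset)) | (r << (8 * (16 - offset)));
    }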
@@ -30,12 +30,13 @@
 template <>
 really_really_inline
 const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
-    SuperVector<16>::movemask_type z = v.movemask();
-    DEBUG_PRINTF("buf %p z %08x \n", buf, z);
-    DEBUG_PRINTF("z %08x\n", z);
+    assert(SuperVector<16>::mask_width() == 1);
+    SuperVector<16>::comparemask_type z = v.comparemask();
+    DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+    DEBUG_PRINTF("z %08llx\n", z);
     if (unlikely(z)) {
         u32 pos = ctz32(z);
-        DEBUG_PRINTF("~z %08x\n", ~z);
+        DEBUG_PRINTF("~z %08llx\n", ~z);
         DEBUG_PRINTF("match @ pos %u\n", pos);
         assert(pos < 16);
         return buf + pos;
@ -47,8 +48,9 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const U
|
|||||||
template <>
|
template <>
|
||||||
really_really_inline
|
really_really_inline
|
||||||
const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
|
const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
|
||||||
SuperVector<32>::movemask_type z = v.movemask();
|
assert(SuperVector<32>::mask_width() == 1);
|
||||||
DEBUG_PRINTF("z 0x%08x\n", z);
|
SuperVector<32>::comparemask_type z = v.comparemask();
|
||||||
|
DEBUG_PRINTF("z 0x%08llx\n", z);
|
||||||
if (unlikely(z)) {
|
if (unlikely(z)) {
|
||||||
u32 pos = ctz32(z);
|
u32 pos = ctz32(z);
|
||||||
assert(pos < 32);
|
assert(pos < 32);
|
||||||
@ -61,7 +63,8 @@ const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const U
|
|||||||
template <>
|
template <>
|
||||||
really_really_inline
|
really_really_inline
|
||||||
const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
|
const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
|
||||||
SuperVector<64>::movemask_type z = v.movemask();
|
assert(SuperVector<64>::mask_width() == 1);
|
||||||
|
SuperVector<64>::comparemask_type z = v.comparemask();
|
||||||
DEBUG_PRINTF("z 0x%016llx\n", z);
|
DEBUG_PRINTF("z 0x%016llx\n", z);
|
||||||
u64a mask = (~0ULL) >> (64 - len);
|
u64a mask = (~0ULL) >> (64 - len);
|
||||||
DEBUG_PRINTF("mask %016llx\n", mask);
|
DEBUG_PRINTF("mask %016llx\n", mask);
|
||||||
@ -80,9 +83,10 @@ const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const le
|
|||||||
template <>
|
template <>
|
||||||
really_really_inline
|
really_really_inline
|
||||||
const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
|
const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
|
||||||
SuperVector<16>::movemask_type z = v.movemask();
|
assert(SuperVector<16>::mask_width() == 1);
|
||||||
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
|
SuperVector<16>::comparemask_type z = v.comparemask();
|
||||||
DEBUG_PRINTF("z %08x\n", z);
|
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
|
||||||
|
DEBUG_PRINTF("z %08llx\n", z);
|
||||||
if (unlikely(z)) {
|
if (unlikely(z)) {
|
||||||
u32 pos = clz32(z);
|
u32 pos = clz32(z);
|
||||||
DEBUG_PRINTF("match @ pos %u\n", pos);
|
DEBUG_PRINTF("match @ pos %u\n", pos);
|
||||||
@ -96,8 +100,9 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UN
|
|||||||
template <>
|
template <>
|
||||||
really_really_inline
|
really_really_inline
|
||||||
const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
|
const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
|
||||||
SuperVector<32>::movemask_type z = v.movemask();
|
assert(SuperVector<32>::mask_width() == 1);
|
||||||
DEBUG_PRINTF("z 0x%08x\n", z);
|
SuperVector<32>::comparemask_type z = v.comparemask();
|
||||||
|
DEBUG_PRINTF("z 0x%08llx\n", z);
|
||||||
if (unlikely(z)) {
|
if (unlikely(z)) {
|
||||||
u32 pos = clz32(z);
|
u32 pos = clz32(z);
|
||||||
assert(pos < 32);
|
assert(pos < 32);
|
||||||
@ -110,7 +115,8 @@ const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UN
|
|||||||
template <>
|
template <>
|
||||||
really_really_inline
|
really_really_inline
|
||||||
const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
|
const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
|
||||||
SuperVector<64>::movemask_type z = v.movemask();
|
assert(SuperVector<64>::mask_width() == 1);
|
||||||
|
SuperVector<64>::comparemask_type z = v.comparemask();
|
||||||
DEBUG_PRINTF("z 0x%016llx\n", z);
|
DEBUG_PRINTF("z 0x%016llx\n", z);
|
||||||
u64a mask = (~0ULL) >> (64 - len);
|
u64a mask = (~0ULL) >> (64 - len);
|
||||||
DEBUG_PRINTF("mask %016llx\n", mask);
|
DEBUG_PRINTF("mask %016llx\n", mask);
|
||||||
@ -129,12 +135,13 @@ const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len
|
|||||||
template <>
|
template <>
|
||||||
really_really_inline
|
really_really_inline
|
||||||
const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
|
const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
|
||||||
SuperVector<16>::movemask_type z = v.movemask();
|
assert(SuperVector<16>::mask_width() == 1);
|
||||||
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
|
SuperVector<16>::comparemask_type z = v.comparemask();
|
||||||
DEBUG_PRINTF("z %08x\n", z);
|
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
|
||||||
|
DEBUG_PRINTF("z %08llx\n", z);
|
||||||
if (unlikely(z != 0xffff)) {
|
if (unlikely(z != 0xffff)) {
|
||||||
u32 pos = ctz32(~z & 0xffff);
|
u32 pos = ctz32(~z & 0xffff);
|
||||||
DEBUG_PRINTF("~z %08x\n", ~z);
|
DEBUG_PRINTF("~z %08llx\n", ~z);
|
||||||
DEBUG_PRINTF("match @ pos %u\n", pos);
|
DEBUG_PRINTF("match @ pos %u\n", pos);
|
||||||
assert(pos < 16);
|
assert(pos < 16);
|
||||||
return buf + pos;
|
return buf + pos;
|
||||||
@ -146,10 +153,11 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 co
|
|||||||
template <>
|
template <>
|
||||||
really_really_inline
|
really_really_inline
|
||||||
const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
|
const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
|
||||||
SuperVector<32>::movemask_type z = v.movemask();
|
assert(SuperVector<32>::mask_width() == 1);
|
||||||
DEBUG_PRINTF("z 0x%08x\n", z);
|
SuperVector<32>::comparemask_type z = v.comparemask();
|
||||||
|
DEBUG_PRINTF("z 0x%08llx\n", z);
|
||||||
if (unlikely(z != 0xffffffff)) {
|
if (unlikely(z != 0xffffffff)) {
|
||||||
u32 pos = ctz32(~z);
|
u32 pos = ctz32(~z & 0xffffffffu);
|
||||||
assert(pos < 32);
|
assert(pos < 32);
|
||||||
DEBUG_PRINTF("match @ pos %u\n", pos);
|
DEBUG_PRINTF("match @ pos %u\n", pos);
|
||||||
return buf + pos;
|
return buf + pos;
|
||||||
@ -160,7 +168,8 @@ const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 co
|
|||||||
template <>
|
template <>
|
||||||
really_really_inline
|
really_really_inline
|
||||||
const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
|
const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
|
||||||
SuperVector<64>::movemask_type z = v.movemask();
|
assert(SuperVector<64>::mask_width() == 1);
|
||||||
|
SuperVector<64>::comparemask_type z = v.comparemask();
|
||||||
DEBUG_PRINTF("z 0x%016llx\n", z);
|
DEBUG_PRINTF("z 0x%016llx\n", z);
|
||||||
u64a mask = (~0ULL) >> (64 - len);
|
u64a mask = (~0ULL) >> (64 - len);
|
||||||
DEBUG_PRINTF("mask %016llx\n", mask);
|
DEBUG_PRINTF("mask %016llx\n", mask);
|
||||||
@ -179,12 +188,13 @@ const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 con
|
|||||||
template <>
|
template <>
|
||||||
really_really_inline
|
really_really_inline
|
||||||
const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) {
|
const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) {
|
||||||
SuperVector<16>::movemask_type z = v.movemask();
|
assert(SuperVector<16>::mask_width() == 1);
|
||||||
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
|
SuperVector<16>::comparemask_type z = v.comparemask();
|
||||||
DEBUG_PRINTF("z %08x\n", z);
|
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
|
||||||
|
DEBUG_PRINTF("z %08llx\n", z);
|
||||||
if (unlikely(z != 0xffff)) {
|
if (unlikely(z != 0xffff)) {
|
||||||
u32 pos = clz32(~z & 0xffff);
|
u32 pos = clz32(~z & 0xffffu);
|
||||||
DEBUG_PRINTF("~z %08x\n", ~z);
|
DEBUG_PRINTF("~z %08llx\n", ~z);
|
||||||
DEBUG_PRINTF("match @ pos %u\n", pos);
|
DEBUG_PRINTF("match @ pos %u\n", pos);
|
||||||
assert(pos >= 16 && pos < 32);
|
assert(pos >= 16 && pos < 32);
|
||||||
return buf + (31 - pos);
|
return buf + (31 - pos);
|
||||||
@ -196,9 +206,10 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_
|
|||||||
template<>
|
template<>
|
||||||
really_really_inline
|
really_really_inline
|
||||||
const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_t UNUSED len) {
|
const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_t UNUSED len) {
|
||||||
SuperVector<32>::movemask_type z = v.movemask();
|
assert(SuperVector<32>::mask_width() == 1);
|
||||||
if (unlikely(z != 0xffffffff)) {
|
SuperVector<32>::comparemask_type z = v.comparemask();
|
||||||
u32 pos = clz32(~z & 0xffffffff);
|
if (unlikely(static_cast<u32>(z) != 0xffffffff)) {
|
||||||
|
u32 pos = clz32(~z & 0xffffffffu);
|
||||||
DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
|
DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
|
||||||
assert(pos < 32);
|
assert(pos < 32);
|
||||||
return buf + (31 - pos);
|
return buf + (31 - pos);
|
||||||
@ -210,8 +221,9 @@ const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_
|
|||||||
template <>
|
template <>
|
||||||
really_really_inline
|
really_really_inline
|
||||||
const u8 *last_zero_match_inverted<64>(const u8 *buf, SuperVector<64> v, uint16_t len) {
|
const u8 *last_zero_match_inverted<64>(const u8 *buf, SuperVector<64> v, uint16_t len) {
|
||||||
|
assert(SuperVector<64>::mask_width() == 1);
|
||||||
v.print8("v");
|
v.print8("v");
|
||||||
SuperVector<64>::movemask_type z = v.movemask();
|
SuperVector<64>::comparemask_type z = v.comparemask();
|
||||||
DEBUG_PRINTF("z 0x%016llx\n", z);
|
DEBUG_PRINTF("z 0x%016llx\n", z);
|
||||||
u64a mask = (~0ULL) >> (64 - len);
|
u64a mask = (~0ULL) >> (64 - len);
|
||||||
DEBUG_PRINTF("mask %016llx\n", mask);
|
DEBUG_PRINTF("mask %016llx\n", mask);
|
||||||
|
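The assert(mask_width() == 1) lines encode the new comparemask contract: on x86 and VSX each byte lane contributes one mask bit, so ctz32/clz32 positions map directly to lanes, while on NEON a lane spans four bits and a bit position must be divided by the width. A small sketch of that mapping, assuming the GCC/Clang ctz builtin; the helper name is illustrative only.

    #include <assert.h>
    #include <stdint.h>

    /* Map the lowest set bit of a comparemask back to a byte lane.
       mask_width is 1 on x86/VSX (one bit per lane) and 4 on NEON. */
    static int first_match_lane(uint64_t z, unsigned mask_width) {
        if (z == 0) return -1;
        return (int)((unsigned)__builtin_ctzll(z) / mask_width);
    }

    int main(void) {
        assert(first_match_lane(0x8ull, 1) == 3);    /* 1-bit lanes */
        assert(first_match_lane(0xF000ull, 4) == 3); /* 4-bit lanes */
        return 0;
    }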
@ -165,8 +165,67 @@ m128 load_m128_from_u64a(const u64a *p) {
     return _mm_set_epi64x(0LL, *p);
 }
 
-#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
-#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)
+#define CASE_RSHIFT_VECTOR(a, count) case count: return _mm_srli_si128((m128)(a), (count)); break;
+
+static really_inline
+m128 rshiftbyte_m128(const m128 a, int count_immed) {
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(count_immed)) {
+        return _mm_srli_si128(a, count_immed);
+    }
+#endif
+    switch (count_immed) {
+    case 0: return a; break;
+    CASE_RSHIFT_VECTOR(a, 1);
+    CASE_RSHIFT_VECTOR(a, 2);
+    CASE_RSHIFT_VECTOR(a, 3);
+    CASE_RSHIFT_VECTOR(a, 4);
+    CASE_RSHIFT_VECTOR(a, 5);
+    CASE_RSHIFT_VECTOR(a, 6);
+    CASE_RSHIFT_VECTOR(a, 7);
+    CASE_RSHIFT_VECTOR(a, 8);
+    CASE_RSHIFT_VECTOR(a, 9);
+    CASE_RSHIFT_VECTOR(a, 10);
+    CASE_RSHIFT_VECTOR(a, 11);
+    CASE_RSHIFT_VECTOR(a, 12);
+    CASE_RSHIFT_VECTOR(a, 13);
+    CASE_RSHIFT_VECTOR(a, 14);
+    CASE_RSHIFT_VECTOR(a, 15);
+    default: return zeroes128(); break;
+    }
+}
+#undef CASE_RSHIFT_VECTOR
+
+#define CASE_LSHIFT_VECTOR(a, count) case count: return _mm_slli_si128((m128)(a), (count)); break;
+
+static really_inline
+m128 lshiftbyte_m128(const m128 a, int count_immed) {
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(count_immed)) {
+        return _mm_slli_si128(a, count_immed);
+    }
+#endif
+    switch (count_immed) {
+    case 0: return a; break;
+    CASE_LSHIFT_VECTOR(a, 1);
+    CASE_LSHIFT_VECTOR(a, 2);
+    CASE_LSHIFT_VECTOR(a, 3);
+    CASE_LSHIFT_VECTOR(a, 4);
+    CASE_LSHIFT_VECTOR(a, 5);
+    CASE_LSHIFT_VECTOR(a, 6);
+    CASE_LSHIFT_VECTOR(a, 7);
+    CASE_LSHIFT_VECTOR(a, 8);
+    CASE_LSHIFT_VECTOR(a, 9);
+    CASE_LSHIFT_VECTOR(a, 10);
+    CASE_LSHIFT_VECTOR(a, 11);
+    CASE_LSHIFT_VECTOR(a, 12);
+    CASE_LSHIFT_VECTOR(a, 13);
+    CASE_LSHIFT_VECTOR(a, 14);
+    CASE_LSHIFT_VECTOR(a, 15);
+    default: return zeroes128(); break;
+    }
+}
+#undef CASE_LSHIFT_VECTOR
 
 #if defined(HAVE_SSE41)
 #define extract32from128(a, imm) _mm_extract_epi32(a, imm)
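rshiftbyte_m128/lshiftbyte_m128 can no longer be plain macros because _mm_srli_si128/_mm_slli_si128 require immediate counts: a provably-constant count takes the direct intrinsic, anything else falls through to a switch whose cases each carry a literal. The same dispatch shape in a minimal scalar sketch (assumes GCC/Clang for __builtin_constant_p; the names are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    /* one case per immediate, mirroring CASE_RSHIFT_VECTOR above */
    #define CASE_SHIFT(x, n) case n: return (x) >> (n);

    static inline uint32_t rshift_dispatch(uint32_t x, int count) {
    #if defined(__GNUC__)
        if (__builtin_constant_p(count)) {
            return x >> count; /* folds to a single immediate shift */
        }
    #endif
        switch (count) {
            case 0: return x;
            CASE_SHIFT(x, 1) CASE_SHIFT(x, 2) CASE_SHIFT(x, 3)
            default: return 0; /* out of range: everything shifted out */
        }
    }

    int main(void) {
        printf("%u %u\n", rshift_dispatch(8u, 2), rshift_dispatch(8u, 9));
        return 0;
    }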
@ -255,14 +314,6 @@ m128 loadbytes128(const void *ptr, unsigned int n) {
     memcpy(&a, ptr, n);
     return a;
 }
-/*
-#ifdef __cplusplus
-extern "C" {
-#endif
-extern const u8 simd_onebit_masks[];
-#ifdef __cplusplus
-}
-#endif*/
 
 static really_inline
 m128 mask1bit128(unsigned int n) {
@ -330,6 +381,7 @@ m128 palignr_sw(m128 r, m128 l, int offset) {
         break;
     }
 }
+#undef CASE_ALIGN_VECTORS
 
 static really_really_inline
 m128 palignr(m128 r, m128 l, int offset) {
@ -340,7 +392,6 @@ m128 palignr(m128 r, m128 l, int offset) {
 #endif
     return palignr_sw(r, l, offset);
 }
-#undef CASE_ALIGN_VECTORS
 
 static really_inline
 m128 variable_byte_shift_m128(m128 in, s32 amount) {
@ -189,10 +189,7 @@ public:
         size_t sum = 0;
         size_t i = 0;
         for (; i + 4 <= num_blocks; i += 4) {
-            sum += popcount64(bits[i]);
-            sum += popcount64(bits[i + 1]);
-            sum += popcount64(bits[i + 2]);
-            sum += popcount64(bits[i + 3]);
+            sum += popcount64x4(&bits[i]);
         }
         for (; i < num_blocks; i++) {
             sum += popcount64(bits[i]);
@ -52,6 +52,15 @@ u32 popcount32(u32 x) {
 // #endif
 }
 
+static really_inline
+u32 popcount32x4(u32 const *x) {
+    u32 sum = popcount32(x[0]);
+    sum += popcount32(x[1]);
+    sum += popcount32(x[2]);
+    sum += popcount32(x[3]);
+    return sum;
+}
+
 static really_inline
 u32 popcount64(u64a x) {
     return __builtin_popcountll(x);
@ -73,5 +82,14 @@ u32 popcount64(u64a x) {
 // #endif
 }
 
+static really_inline
+u32 popcount64x4(u64a const *x) {
+    volatile u32 sum = popcount64(x[0]);
+    sum += popcount64(x[1]);
+    sum += popcount64(x[2]);
+    sum += popcount64(x[3]);
+    return sum;
+}
+
 #endif /* UTIL_POPCOUNT_H_ */
 
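The bitfield.h loop now defers to the popcount64x4 helper added above, which keeps the four-way unrolling in one place; the volatile accumulator in the 64-bit version is presumably there to stop the compiler from folding the sequence into a slower autovectorized form — an assumption, the diff itself does not say. Functionally it is just four popcounts, as this sketch checks:

    #include <assert.h>
    #include <stdint.h>

    static unsigned popcount64_model(uint64_t x) {
        return (unsigned)__builtin_popcountll(x); /* GCC/Clang builtin */
    }

    /* same result as summing popcount64 over four consecutive words */
    static unsigned popcount64x4_model(const uint64_t *x) {
        unsigned sum = popcount64_model(x[0]);
        sum += popcount64_model(x[1]);
        sum += popcount64_model(x[2]);
        sum += popcount64_model(x[3]);
        return sum;
    }

    int main(void) {
        uint64_t bits[4] = { 0xFFull, 0x1ull, 0x0ull, ~0ull };
        assert(popcount64x4_model(bits) == 8 + 1 + 0 + 64);
        return 0;
    }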
@ -249,25 +249,25 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons
 }
 
 template <>
-really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void) const
-{
-    SuperVector powers = SuperVector::dup_u64(0x8040201008040201UL);
-
-    // Compute the mask from the input
-    uint8x16_t mask = (uint8x16_t) vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(u.u8x16[0], powers.u.u8x16[0]))));
-    uint64x2_t mask1 = (uint64x2_t) vextq_u8(mask, vdupq_n_u8(0), 7);
-    mask = vorrq_u8(mask, (uint8x16_t) mask1);
-
-    // Get the resulting bytes
-    uint16_t output;
-    vst1q_lane_u16(&output, (uint16x8_t)mask, 0);
-    return static_cast<typename SuperVector<16>::movemask_type>(output);
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::comparemask(void) const {
+    return static_cast<typename SuperVector<16>::comparemask_type>(
+        vget_lane_u64((uint64x1_t)vshrn_n_u16(u.u16x8[0], 4), 0));
 }
 
 template <>
-really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const
-{
-    return eq(b).movemask();
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::eqmask(SuperVector<16> const b) const {
+    return eq(b).comparemask();
+}
+
+template <> really_inline u32 SuperVector<16>::mask_width() { return 4; }
+
+template <>
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::iteration_mask(
+    typename SuperVector<16>::comparemask_type mask) {
+    return mask & 0x1111111111111111ull;
 }
 
 template <>
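The NEON comparemask above is the shrn-by-4 narrowing trick: vshrn_n_u16(v, 4) turns each 0x00/0xFF compare byte into a 0x0/0xF nibble of a single 64-bit scalar, which is why mask_width() is 4 and iteration_mask keeps every fourth bit. A scalar model of the resulting value (illustrative, not the tree's code):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of the narrowed mask: each 0x00/0xFF compare byte
       becomes a 0x0/0xF nibble, i.e. 4 mask bits per byte lane. */
    static uint64_t comparemask_model(const uint8_t cmp[16]) {
        uint64_t z = 0;
        for (int i = 0; i < 16; i++) {
            if (cmp[i] == 0xFF) z |= (uint64_t)0xF << (4 * i);
        }
        return z;
    }

    int main(void) {
        uint8_t cmp[16] = {0};
        cmp[5] = 0xFF;
        uint64_t z = comparemask_model(cmp);
        /* iteration_mask keeps one bit per lane; ctz/4 recovers the lane */
        uint64_t iter = z & 0x1111111111111111ull;
        assert((unsigned)__builtin_ctzll(iter) / 4 == 5);
        return 0;
    }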
@ -374,10 +374,9 @@
 really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u8(v->u.u8x16[0], n)}; });
-    return result;
+    if (N == 8) return Zeroes();
+    int8x16_t shift_indices = vdupq_n_s8(N);
+    return { vshlq_s8(u.s8x16[0], shift_indices) };
 }
 
 template <>
@ -385,9 +384,8 @@ really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u16(v->u.u16x8[0], n)}; });
-    return result;
+    int16x8_t shift_indices = vdupq_n_s16(N);
+    return { vshlq_s16(u.s16x8[0], shift_indices) };
 }
 
 template <>
@ -395,9 +393,8 @@ really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 32) return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u32(v->u.u32x4[0], n)}; });
-    return result;
+    int32x4_t shift_indices = vdupq_n_s32(N);
+    return { vshlq_s32(u.s32x4[0], shift_indices) };
 }
 
 template <>
@ -405,9 +402,8 @@ really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 64) return Zeroes();
-    SuperVector result;
-    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u64(v->u.u64x2[0], n)}; });
-    return result;
+    int64x2_t shift_indices = vdupq_n_s64(N);
+    return { vshlq_s64(u.s64x2[0], shift_indices) };
 }
 
 template <>
@ -415,6 +411,11 @@ really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(N)) {
+        return {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - N)};
+    }
+#endif
     SuperVector result;
     Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), v->u.u8x16[0], 16 - n)}; });
     return result;
@ -431,9 +432,8 @@ really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 8) return Zeroes();
-    SuperVector result;
-    Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u8(v->u.u8x16[0], n)}; });
-    return result;
+    int8x16_t shift_indices = vdupq_n_s8(-N);
+    return { vshlq_s8(u.s8x16[0], shift_indices) };
 }
 
 template <>
@ -441,9 +441,8 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u16(v->u.u16x8[0], n)}; });
-    return result;
+    int16x8_t shift_indices = vdupq_n_s16(-N);
+    return { vshlq_s16(u.s16x8[0], shift_indices) };
 }
 
 template <>
@ -451,9 +450,8 @@ really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 32) return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u32(v->u.u32x4[0], n)}; });
-    return result;
+    int32x4_t shift_indices = vdupq_n_s32(-N);
+    return { vshlq_s32(u.s32x4[0], shift_indices) };
 }
 
 template <>
@ -461,9 +459,8 @@ really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 64) return Zeroes();
-    SuperVector result;
-    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u64(v->u.u64x2[0], n)}; });
-    return result;
+    int64x2_t shift_indices = vdupq_n_s64(-N);
+    return { vshlq_s64(u.s64x2[0], shift_indices) };
 }
 
 template <>
@ -471,6 +468,11 @@ really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(N)) {
+        return {vextq_u8(u.u8x16[0], vdupq_n_u8(0), N)};
+    }
+#endif
     SuperVector result;
     Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(v->u.u8x16[0], vdupq_n_u8(0), n)}; });
     return result;
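These rewrites work because NEON has no element shift-right by a runtime register count: vshlq_s8/s16/s32/s64 shift left by a signed per-lane amount, so the vshr_* variants splat -N and reuse the left shift. Per-lane semantics in scalar form (a sketch under that assumption):

    #include <assert.h>
    #include <stdint.h>

    /* vshl per-lane semantics: positive count shifts left, negative right. */
    static uint8_t vshl_lane_model(uint8_t x, int8_t count) {
        return (count >= 0) ? (uint8_t)(x << count) : (uint8_t)(x >> -count);
    }

    int main(void) {
        assert(vshl_lane_model(0x81, 1) == 0x02);  /* left shift  */
        assert(vshl_lane_model(0x81, -1) == 0x40); /* right shift */
        return 0;
    }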
@ -485,22 +487,12 @@ really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
 {
-#if defined(HAVE__BUILTIN_CONSTANT_P)
-    if (__builtin_constant_p(N)) {
-        return {vextq_u8(u.u8x16[0], vdupq_n_u8(0), N)};
-    }
-#endif
     return vshr_128(N);
 }
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 {
-#if defined(HAVE__BUILTIN_CONSTANT_P)
-    if (__builtin_constant_p(N)) {
-        return {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - N)};
-    }
-#endif
     return vshl_128(N);
 }
 
@ -534,45 +526,23 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
 {
     SuperVector mask = Ones_vshr(16 -len);
-    //mask.print8("mask");
     SuperVector<16> v = loadu(ptr);
-    //v.print8("v");
     return mask & v;
 }
 
 template<>
 really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
 {
+    if (offset == 0) return other;
+    if (offset == 16) return *this;
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(offset)) {
-        if (offset == 16) {
-            return *this;
-        } else {
         return {vextq_u8(other.u.u8x16[0], u.u8x16[0], offset)};
     }
-    }
 #endif
-    switch(offset) {
-    case 0: return other; break;
-    case 1: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 1)}; break;
-    case 2: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 2)}; break;
-    case 3: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 3)}; break;
-    case 4: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 4)}; break;
-    case 5: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 5)}; break;
-    case 6: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 6)}; break;
-    case 7: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 7)}; break;
-    case 8: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 8)}; break;
-    case 9: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 9)}; break;
-    case 10: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 10)}; break;
-    case 11: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 11)}; break;
-    case 12: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 12)}; break;
-    case 13: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 13)}; break;
-    case 14: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 14)}; break;
-    case 15: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 15)}; break;
-    case 16: return *this; break;
-    default: break;
-    }
-    return *this;
+    SuperVector result;
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (offset == n) result = {vextq_u8(other.u.u8x16[0], v->u.u8x16[0], n)}; });
+    return result;
 }
 
 template<>
@ -39,7 +39,7 @@
 #include "util/supervector/supervector.hpp"
 #include <iostream>
 
-// 128-bit Powerpc64le implementation
+// 128-bit IBM Power VSX implementation
 
 template<>
 really_inline SuperVector<16>::SuperVector(SuperVector const &other)
@ -47,6 +47,69 @@ really_inline SuperVector<16>::SuperVector(SuperVector const &other)
     u.v128[0] = other.u.v128[0];
 }
 
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(char __bool __vector v)
+{
+    u.u8x16[0] = (uint8x16_t) v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(int8x16_t const v)
+{
+    u.s8x16[0] = v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(uint8x16_t const v)
+{
+    u.u8x16[0] = v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(int16x8_t const v)
+{
+    u.s16x8[0] = v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(uint16x8_t const v)
+{
+    u.u16x8[0] = v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(int32x4_t const v)
+{
+    u.s32x4[0] = v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(uint32x4_t const v)
+{
+    u.u32x4[0] = v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(int64x2_t const v)
+{
+    u.s64x2[0] = v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(uint64x2_t const v)
+{
+    u.u64x2[0] = v;
+};
+
 template<>
 really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
 {
@ -57,69 +120,69 @@ template<>
 template<>
 really_inline SuperVector<16>::SuperVector(int8_t const other)
 {
-    u.v128[0] = (m128) vec_splats(other);
+    u.s8x16[0] = vec_splats(other);
 }
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(uint8_t const other)
 {
-    u.v128[0] = (m128) vec_splats(static_cast<uint8_t>(other));
+    u.u8x16[0] = vec_splats(static_cast<uint8_t>(other));
 }
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(int16_t const other)
 {
-    u.v128[0] = (m128) vec_splats(other);
+    u.s16x8[0] = vec_splats(other);
 }
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(uint16_t const other)
 {
-    u.v128[0] = (m128) vec_splats(static_cast<uint16_t>(other));
+    u.u16x8[0] = vec_splats(static_cast<uint16_t>(other));
 }
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(int32_t const other)
 {
-    u.v128[0] = (m128) vec_splats(other);
+    u.s32x4[0] = vec_splats(other);
 }
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(uint32_t const other)
 {
-    u.v128[0] = (m128) vec_splats(static_cast<uint32_t>(other));
+    u.u32x4[0] = vec_splats(static_cast<uint32_t>(other));
 }
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(int64_t const other)
 {
-    u.v128[0] = (m128) vec_splats(static_cast<ulong64_t>(other));
+    u.s64x2[0] = (int64x2_t) vec_splats(static_cast<ulong64_t>(other));
 }
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(uint64_t const other)
 {
-    u.v128[0] = (m128) vec_splats(static_cast<ulong64_t>(other));
+    u.u64x2[0] = (uint64x2_t) vec_splats(static_cast<ulong64_t>(other));
 }
 
 // Constants
 template<>
 really_inline SuperVector<16> SuperVector<16>::Ones(void)
 {
-    return {(m128) vec_splat_s8(-1)};
+    return { vec_splat_s8(-1)};
 }
 
 template<>
 really_inline SuperVector<16> SuperVector<16>::Zeroes(void)
 {
-    return {(m128) vec_splat_s8(0)};
+    return { vec_splat_s8(0) };
 }
 
 // Methods
@ -145,27 +208,26 @@ really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const
 {
-    return {(m128) vec_xor(u.v128[0], b.u.v128[0])};
+    return { vec_xor(u.v128[0], b.u.v128[0]) };
 }
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator!() const
 {
-    return {(m128) vec_xor(u.v128[0], u.v128[0])};
+    return { vec_xor(u.v128[0], u.v128[0]) };
 }
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const
 {
-    m128 not_res = vec_xor(u.v128[0], (m128)vec_splat_s8(-1));
-    return {(m128) vec_and(not_res, (m128)b.u.v128[0]) };
+    int8x16_t not_res = vec_xor(u.s8x16[0], vec_splat_s8(-1));
+    return { vec_and(not_res, b.u.s8x16[0]) };
 }
 
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const
 {
-    return {(m128) vec_cmpeq(u.s8x16[0], b.u.s8x16[0])};
+    return { vec_cmpeq(u.s8x16[0], b.u.s8x16[0])};
 }
 
 template <>
@ -177,28 +239,27 @@ really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const
 {
-    return {(m128) vec_cmpgt(u.v128[0], b.u.v128[0])};
+    return { vec_cmpgt(u.s8x16[0], b.u.s8x16[0])};
 }
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const
 {
-    return {(m128) vec_cmpge(u.v128[0], b.u.v128[0])};
+    return { vec_cmpge(u.s8x16[0], b.u.s8x16[0])};
 }
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const
 {
-    return {(m128) vec_cmpgt(b.u.v128[0], u.v128[0])};
+    return { vec_cmpgt(b.u.s8x16[0], u.s8x16[0])};
 }
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const
 {
-    return {(m128) vec_cmpge(b.u.v128[0], u.v128[0])};
+    return { vec_cmpge(b.u.s8x16[0], u.s8x16[0])};
 }
 
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
 {
@ -206,69 +267,64 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons
 }
 
 template <>
-really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const
-{
-    uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7));
-
-    uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7));
-    uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff));
-    uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and);
-
-    uint32x4_t ss2 = vec_sr((uint32x4_t)s2 , vec_splat_u32(14));
-    uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff));
-    uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2);
-
-    uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28));
-    uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff));
-    uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3);
-
-    uint64x2_t ss4 = vec_sld((uint64x2_t) vec_splats(0), s4, 9);
-    uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff));
-    uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4);
-
-    return s5[0];
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::comparemask(void) const {
+    uint8x16_t bitmask = vec_gb( u.u8x16[0]);
+    static uint8x16_t perm = { 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    bitmask = (uint8x16_t) vec_perm(vec_splat_u8(0), bitmask, perm);
+    u32 movemask;
+    vec_ste((uint32x4_t) bitmask, 0, &movemask);
+    return movemask;
 }
 
 template <>
-really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const
-{
-    return eq(b).movemask();
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::eqmask(SuperVector<16> const b) const {
+    return eq(b).comparemask();
+}
+
+template <> really_inline u32 SuperVector<16>::mask_width() { return 1; }
+
+template <>
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::iteration_mask(
+    typename SuperVector<16>::comparemask_type mask) {
+    return mask;
 }
 
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const
 {
-    return { (m128) vec_sl(u.s8x16[0], vec_splats((uint8_t)N)) };
+    return { vec_sl(u.s8x16[0], vec_splat_u8(N)) };
 }
 
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const
 {
-    return { (m128) vec_sl(u.s16x8[0], vec_splats((uint16_t)N)) };
+    return { vec_sl(u.s16x8[0], vec_splat_u16(N)) };
 }
 
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const
 {
-    return { (m128) vec_sl(u.s32x4[0], vec_splats((uint32_t)N)) };
+    return { vec_sl(u.s32x4[0], vec_splat_u32(N)) };
 }
 
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
 {
-    return { (m128) vec_sl(u.s64x2[0], vec_splats((ulong64_t)N)) };
+    return { vec_sl(u.s64x2[0], vec_splats((ulong64_t) N)) };
 }
 
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const
 {
-    return { (m128) vec_sld(u.s8x16[0], (int8x16_t)vec_splat_s8(0), N)};
+    return { vec_sld(u.s8x16[0], vec_splat_s8(0), N)};
 }
 
 template <>
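The new VSX comparemask rests on vec_gb (vgbbd), which transposes the 8x8 bit matrix inside each doubleword: after a bytewise compare, one byte per doubleword holds the eight lane MSBs, and the vec_perm/vec_ste pair packs the two bytes into a 16-bit movemask, one bit per lane. A scalar model of the value the sequence produces — an interpretation of the end result, not the tree's code:

    #include <assert.h>
    #include <stdint.h>

    /* After a bytewise compare each lane is 0x00 or 0xFF; the packed
       comparemask is one bit per lane, lane i -> bit i. */
    static uint32_t comparemask_model(const uint8_t cmp[16]) {
        uint32_t z = 0;
        for (int i = 0; i < 16; i++) {
            z |= (uint32_t)(cmp[i] >> 7) << i; /* MSB of each lane */
        }
        return z;
    }

    int main(void) {
        uint8_t cmp[16] = {0};
        cmp[0] = cmp[9] = 0xFF;
        assert(comparemask_model(cmp) == ((1u << 0) | (1u << 9)));
        return 0;
    }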
@ -282,35 +338,35 @@
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const
 {
-    return { (m128) vec_sr(u.s8x16[0], vec_splats((uint8_t)N)) };
+    return { vec_sr(u.s8x16[0], vec_splat_u8(N)) };
 }
 
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const
 {
-    return { (m128) vec_sr(u.s16x8[0], vec_splats((uint16_t)N)) };
+    return { vec_sr(u.s16x8[0], vec_splat_u16(N)) };
 }
 
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const
 {
-    return { (m128) vec_sr(u.s32x4[0], vec_splats((uint32_t)N)) };
+    return { vec_sr(u.s32x4[0], vec_splat_u32(N)) };
 }
 
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
 {
-    return { (m128) vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) };
+    return { vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) };
 }
 
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const
 {
-    return { (m128) vec_sld((int8x16_t)vec_splat_s8(0), u.s8x16[0], 16 - N) };
+    return { vec_sld(vec_splat_s8(0), u.s8x16[0], 16 - N) };
 }
 
 template <>
@ -340,50 +396,40 @@
 really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s8x16[0], vec_splats((uint8_t)n))}; });
-    return result;
+    uint8x16_t shift_indices = vec_splats((uint8_t) N);
+    return { vec_sl(u.u8x16[0], shift_indices) };
 }
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const UNUSED N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s16x8[0], vec_splats((uint16_t)n))}; });
-    return result;
+    uint16x8_t shift_indices = vec_splats((uint16_t) N);
+    return { vec_sl(u.u16x8[0], shift_indices) };
 }
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s32x4[0], vec_splats((uint32_t)n))}; });
-    return result;
+    uint32x4_t shift_indices = vec_splats((uint32_t) N);
+    return { vec_sl(u.u32x4[0], shift_indices) };
 }
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s64x2[0], vec_splats((ulong64_t)n))}; });
-    return result;
+    uint64x2_t shift_indices = vec_splats((ulong64_t) N);
+    return { vec_sl(u.u64x2[0], shift_indices) };
 }
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld(v->u.s8x16[0], (int8x16_t)vec_splat_s8(0), n)}; });
-    return result;
+    SuperVector sl{N << 3};
+    return { vec_slo(u.u8x16[0], sl.u.u8x16[0]) };
 }
 
 template <>
@ -396,50 +442,40 @@
 really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s8x16[0], vec_splats((uint8_t)n))}; });
-    return result;
+    uint8x16_t shift_indices = vec_splats((uint8_t) N);
+    return { vec_sr(u.u8x16[0], shift_indices) };
 }
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s16x8[0], vec_splats((uint16_t)n))}; });
-    return result;
+    uint16x8_t shift_indices = vec_splats((uint16_t) N);
+    return { vec_sr(u.u16x8[0], shift_indices) };
 }
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s32x4[0], vec_splats((uint32_t)n))}; });
-    return result;
+    uint32x4_t shift_indices = vec_splats((uint32_t) N);
+    return { vec_sr(u.u32x4[0], shift_indices) };
 }
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s64x2[0], vec_splats((ulong64_t)n))}; });
-    return result;
+    uint64x2_t shift_indices = vec_splats((ulong64_t) N);
+    return { vec_sr(u.u64x2[0], shift_indices) };
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const UNUSED N) const
+really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)vec_splat_u8(0), v->u.s8x16[0], 16 - n)}; });
-    return result;
+    SuperVector sr{N << 3};
+    return { vec_sro(u.u8x16[0], sr.u.u8x16[0]) };
 }
 
 template <>
@ -451,51 +487,25 @@ really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
 {
-    switch(N) {
-    case 1: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 15)}; break;
-    case 2: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 14)}; break;
-    case 3: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 13)}; break;
-    case 4: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 12)}; break;
-    case 5: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 11)}; break;
-    case 6: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 10)}; break;
-    case 7: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 9)}; break;
-    case 8: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 8)}; break;
-    case 9: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 7)}; break;
-    case 10: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 6)}; break;
-    case 11: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 5)}; break;
-    case 12: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 4)}; break;
-    case 13: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 3)}; break;
-    case 14: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 2)}; break;
-    case 15: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 1)}; break;
-    case 16: return Zeroes(); break;
-    default: break;
-    }
-    return *this;
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (N == 0) return *this;
+    if (__builtin_constant_p(N)) {
+        return { vec_sld(vec_splat_s8(0), u.s8x16[0], 16 - N) };
+    }
+#endif
+    return vshr_128(N);
 }
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 {
-    switch(N) {
-    case 1: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 1)}; break;
-    case 2: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 2)}; break;
-    case 3: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 3)}; break;
-    case 4: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 4)}; break;
-    case 5: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 5)}; break;
-    case 6: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 6)}; break;
-    case 7: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 7)}; break;
-    case 8: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 8)}; break;
-    case 9: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 9)}; break;
-    case 10: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 10)}; break;
-    case 11: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 11)}; break;
-    case 12: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 12)}; break;
-    case 13: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 13)}; break;
-    case 14: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 14)}; break;
-    case 15: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 15)}; break;
-    case 16: return Zeroes(); break;
-    default: break;
-    }
-    return *this;
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (N == 0) return *this;
+    if (__builtin_constant_p(N)) {
+        return { vec_sld(u.s8x16[0], vec_splat_s8(0), N)};
+    }
+#endif
+    return vshl_128(N);
 }
 
 template<>
@@ -513,50 +523,39 @@ really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N)
 template <>
 really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
 {
-    return (m128) vec_xl(0, (const long64_t*)ptr);
+    return { vec_xl(0, (const long64_t*)ptr) };
 }

 template <>
 really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
 {
     assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
-    return (m128) vec_xl(0, (const long64_t*)ptr);
+    return { vec_xl(0, (const long64_t*)ptr) };
 }

 template <>
 really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
 {
     SuperVector<16> mask = Ones_vshr(16 -len);
-    mask.print8("mask");
     SuperVector<16> v = loadu(ptr);
-    v.print8("v");
     return mask & v;
 }

 template<>
 really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
 {
-    switch(offset) {
-    case 0: return other; break;
-    case 1: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 15)}; break;
-    case 2: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 14)}; break;
-    case 3: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 13)}; break;
-    case 4: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 12)}; break;
-    case 5: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 11)}; break;
-    case 6: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 10)}; break;
-    case 7: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 9)}; break;
-    case 8: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 8)}; break;
-    case 9: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 7)}; break;
-    case 10: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 6)}; break;
-    case 11: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 5)}; break;
-    case 12: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 4)}; break;
-    case 13: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 3)}; break;
-    case 14: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 2)}; break;
-    case 15: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 1)}; break;
-    default: break;
-    }
-    return *this;
+    if (offset == 0) return other;
+    if (offset == 16) return *this;
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(offset)) {
+        return { vec_sld(u.s8x16[0], other.u.s8x16[0], offset) };
+    }
+#endif
+    uint8x16_t sl = vec_splats((uint8_t) (offset << 3));
+    uint8x16_t sr = vec_splats((uint8_t) ((16 - offset) << 3));
+    uint8x16_t rhs = vec_slo(u.u8x16[0], sr);
+    uint8x16_t lhs = vec_sro(other.u.u8x16[0], sl);
+    return { vec_or(lhs, rhs) };
 }

 template<>
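alignr produces a 16-byte window into the concatenation of other and *this; the new fallback builds it from vec_slo/vec_sro, whose byte counts live in a bit-offset field, hence the offset << 3. A scalar sketch of the result implied by the two boundary cases in the hunk (offset 0 yields other, offset 16 yields *this); the function name is ours:

    #include <cstdint>

    // Window of 16 bytes starting at 'offset' within concat(other, self).
    static void alignr_ref(const uint8_t self[16], const uint8_t other[16],
                           unsigned offset, uint8_t out[16]) {
        for (unsigned i = 0; i < 16; i++) {
            unsigned j = offset + i;
            out[i] = (j < 16) ? other[j] : self[j - 16];
        }
    }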
@@ -566,9 +565,9 @@ really_inline SuperVector<16> SuperVector<16>::pshufb<false>(SuperVector<16> b)
     /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf.
        In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane.
        below is the version that is converted from Intel to PPC. */
-    uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], (uint8x16_t)vec_splats((uint8_t)0x80));
+    uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], vec_splats((uint8_t)0x80));
     uint8x16_t res = vec_perm (u.u8x16[0], u.u8x16[0], b.u.u8x16[0]);
-    return (m128) vec_sel(res, (uint8x16_t)vec_splat_s8(0), mask);
+    return { vec_sel(res, vec_splat_u8(0), mask) };
 }

 template<>
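The comment in that hunk is the whole story: Intel's PSHUFB zeroes a lane when the index byte has bit 0x80 set, while vec_perm indexes modulo the table, so the PPC version computes a vec_cmpge mask and selects zero afterwards. A scalar sketch of the Intel-style behaviour being emulated (our naming):

    #include <cstdint>

    // Intel-style shuffle: high bit of the index selects zero,
    // otherwise the low four bits pick a table lane.
    static void pshufb_ref(const uint8_t table[16], const uint8_t idx[16],
                           uint8_t out[16]) {
        for (int i = 0; i < 16; i++) {
            out[i] = (idx[i] & 0x80) ? 0 : table[idx[i] & 0xf];
        }
    }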
@@ -203,15 +203,24 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
 }

 template <>
-really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const
-{
-    return _mm_movemask_epi8(u.v128[0]);
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::comparemask(void) const {
+    return (u32)_mm_movemask_epi8(u.v128[0]);
 }

 template <>
-really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const
-{
-    return eq(b).movemask();
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::eqmask(SuperVector<16> const b) const {
+    return eq(b).comparemask();
+}
+
+template <> really_inline u32 SuperVector<16>::mask_width() { return 1; }
+
+template <>
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::iteration_mask(
+    typename SuperVector<16>::comparemask_type mask) {
+    return mask;
 }

 // template <>
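On x86 the rename is almost free: comparemask() is movemask, every byte lane contributes exactly one bit (mask_width() == 1), and iteration_mask() is the identity. A standalone check of that one-bit-per-lane property using plain SSE2 intrinsics (the test values are ours):

    #include <emmintrin.h>
    #include <cassert>

    int main(void) {
        // Bytes 0 and 3 equal the needle, so exactly bits 0 and 3 are set.
        __m128i hay    = _mm_setr_epi8('a','x','x','a',0,0,0,0,0,0,0,0,0,0,0,0);
        __m128i needle = _mm_set1_epi8('a');
        int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(hay, needle));
        assert(mask == 0x9); // 0b1001: one mask bit per byte lane
        return 0;
    }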
@@ -754,17 +763,25 @@ really_inline SuperVector<32> SuperVector<32>::eq(SuperVector<32> const &b) const
 }

 template <>
-really_inline typename SuperVector<32>::movemask_type SuperVector<32>::movemask(void)const
-{
-    return _mm256_movemask_epi8(u.v256[0]);
+really_inline typename SuperVector<32>::comparemask_type
+SuperVector<32>::comparemask(void) const {
+    return (u32)_mm256_movemask_epi8(u.v256[0]);
 }

 template <>
-really_inline typename SuperVector<32>::movemask_type SuperVector<32>::eqmask(SuperVector<32> const b) const
-{
-    return eq(b).movemask();
+really_inline typename SuperVector<32>::comparemask_type
+SuperVector<32>::eqmask(SuperVector<32> const b) const {
+    return eq(b).comparemask();
 }

+template <> really_inline u32 SuperVector<32>::mask_width() { return 1; }
+
+template <>
+really_inline typename SuperVector<32>::comparemask_type
+SuperVector<32>::iteration_mask(
+    typename SuperVector<32>::comparemask_type mask) {
+    return mask;
+}
+
 // template <>
 // template<uint8_t N>
@@ -1347,42 +1364,48 @@ really_inline SuperVector<64> SuperVector<64>::opandnot(SuperVector<64> const &b
 template <>
 really_inline SuperVector<64> SuperVector<64>::operator==(SuperVector<64> const &b) const
 {
-    SuperVector<64>::movemask_type mask = _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]);
+    SuperVector<64>::comparemask_type mask =
+        _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }

 template <>
 really_inline SuperVector<64> SuperVector<64>::operator!=(SuperVector<64> const &b) const
 {
-    SuperVector<64>::movemask_type mask = _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]);
+    SuperVector<64>::comparemask_type mask =
+        _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }

 template <>
 really_inline SuperVector<64> SuperVector<64>::operator>(SuperVector<64> const &b) const
 {
-    SuperVector<64>::movemask_type mask = _mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]);
+    SuperVector<64>::comparemask_type mask =
+        _mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }

 template <>
 really_inline SuperVector<64> SuperVector<64>::operator<(SuperVector<64> const &b) const
 {
-    SuperVector<64>::movemask_type mask = _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]);
+    SuperVector<64>::comparemask_type mask =
+        _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }

 template <>
 really_inline SuperVector<64> SuperVector<64>::operator>=(SuperVector<64> const &b) const
 {
-    SuperVector<64>::movemask_type mask = _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]);
+    SuperVector<64>::comparemask_type mask =
+        _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }

 template <>
 really_inline SuperVector<64> SuperVector<64>::operator<=(SuperVector<64> const &b) const
 {
-    SuperVector<64>::movemask_type mask = _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]);
+    SuperVector<64>::comparemask_type mask =
+        _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]);
     return {_mm512_movm_epi8(mask)};
 }

@@ -1393,19 +1416,28 @@ really_inline SuperVector<64> SuperVector<64>::eq(SuperVector<64> const &b) const
 }

 template <>
-really_inline typename SuperVector<64>::movemask_type SuperVector<64>::movemask(void)const
-{
+really_inline typename SuperVector<64>::comparemask_type
+SuperVector<64>::comparemask(void) const {
     __m512i msb = _mm512_set1_epi8(0xFF);
     __m512i mask = _mm512_and_si512(msb, u.v512[0]);
     return _mm512_cmpeq_epi8_mask(mask, msb);
 }

 template <>
-really_inline typename SuperVector<64>::movemask_type SuperVector<64>::eqmask(SuperVector<64> const b) const
-{
+really_inline typename SuperVector<64>::comparemask_type
+SuperVector<64>::eqmask(SuperVector<64> const b) const {
     return _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]);
 }

+template <> really_inline u32 SuperVector<64>::mask_width() { return 1; }
+
+template <>
+really_inline typename SuperVector<64>::comparemask_type
+SuperVector<64>::iteration_mask(
+    typename SuperVector<64>::comparemask_type mask) {
+    return mask;
+}
+
 // template <>
 // template<uint8_t N>
 // really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const
@@ -46,19 +46,29 @@
 using Z_TYPE = u64a;
 #define Z_BITS 64
 #define Z_SHIFT 63
+#define Z_POSSHIFT 0
 #define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS -(l)))
 #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
 #elif defined(HAVE_SIMD_256_BITS)
 using Z_TYPE = u32;
 #define Z_BITS 32
 #define Z_SHIFT 31
+#define Z_POSSHIFT 0
 #define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
 #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
 #elif defined(HAVE_SIMD_128_BITS)
+#if defined(ARCH_ARM32) || defined(ARCH_AARCH64)
+using Z_TYPE = u64a;
+#define Z_BITS 64
+#define Z_POSSHIFT 2
+#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS - (l)))
+#else
 using Z_TYPE = u32;
 #define Z_BITS 32
-#define Z_SHIFT 15
+#define Z_POSSHIFT 0
 #define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
+#endif
+#define Z_SHIFT 15
 #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
 #endif
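Z_POSSHIFT encodes, as a shift amount, how many mask bits each byte lane occupies: 0 where masks are one bit per lane, 2 on ARM where the 64-bit comparemask is produced by a narrowing shift that leaves four identical bits per byte. Recovering a byte offset from a bit index is then a single shift; a sketch of that arithmetic under those assumptions (the helper is ours, and __builtin_ctzll stands in for the codebase's find-lowest-bit utility):

    #include <cstdint>

    // With Z_POSSHIFT == 2, bits 0-3 of z belong to byte 0, bits 4-7 to
    // byte 1, and so on; the match offset is the bit index divided by 4.
    static unsigned match_byte_offset(uint64_t z, unsigned z_posshift) {
        unsigned bit = (unsigned)__builtin_ctzll(z);
        return bit >> z_posshift;
    }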
@@ -94,7 +104,7 @@ struct BaseVector
     static constexpr bool is_valid = false;
     static constexpr u16 size = 8;
     using type = void;
-    using movemask_type = void;
+    using comparemask_type = void;
     static constexpr bool has_previous = false;
     using previous_type = void;
     static constexpr u16 previous_size = 4;
@@ -106,7 +116,7 @@ struct BaseVector<128>
     static constexpr bool is_valid = true;
     static constexpr u16 size = 128;
     using type = void;
-    using movemask_type = u64a;
+    using comparemask_type = u64a;
     static constexpr bool has_previous = true;
     using previous_type = m512;
     static constexpr u16 previous_size = 64;
@@ -118,7 +128,7 @@ struct BaseVector<64>
     static constexpr bool is_valid = true;
     static constexpr u16 size = 64;
     using type = m512;
-    using movemask_type = u64a;
+    using comparemask_type = u64a;
     static constexpr bool has_previous = true;
     using previous_type = m256;
     static constexpr u16 previous_size = 32;
@@ -131,7 +141,7 @@ struct BaseVector<32>
     static constexpr bool is_valid = true;
     static constexpr u16 size = 32;
     using type = m256;
-    using movemask_type = u32;
+    using comparemask_type = u64a;
     static constexpr bool has_previous = true;
     using previous_type = m128;
     static constexpr u16 previous_size = 16;
@@ -144,7 +154,7 @@ struct BaseVector<16>
     static constexpr bool is_valid = true;
     static constexpr u16 size = 16;
     using type = m128;
-    using movemask_type = u32;
+    using comparemask_type = u64a;
     static constexpr bool has_previous = false;
     using previous_type = u64a;
     static constexpr u16 previous_size = 8;
@@ -194,7 +204,7 @@ public:
     SuperVector(typename base_type::type const v);

     template<typename T>
-    SuperVector(T other);
+    SuperVector(T const other);

     SuperVector(SuperVector<SIZE/2> const lo, SuperVector<SIZE/2> const hi);
     SuperVector(previous_type const lo, previous_type const hi);
@@ -231,8 +241,17 @@ public:
     SuperVector eq(SuperVector const &b) const;
     SuperVector operator<<(uint8_t const N) const;
     SuperVector operator>>(uint8_t const N) const;
-    typename base_type::movemask_type movemask(void) const;
-    typename base_type::movemask_type eqmask(SuperVector const b) const;
+    // Returns mask_width groups of zeros or ones. To get the mask which can be
+    // iterated, use iteration_mask method, it ensures only one bit is set per
+    // mask_width group.
+    // Precondition: all bytes must be 0 or 0xff.
+    typename base_type::comparemask_type comparemask(void) const;
+    typename base_type::comparemask_type eqmask(SuperVector const b) const;
+    static u32 mask_width();
+    // Returns a mask with at most 1 bit set to 1. It can be used to iterate
+    // over bits through ctz/clz and lowest bit clear.
+    static typename base_type::comparemask_type
+    iteration_mask(typename base_type::comparemask_type mask);

     static SuperVector loadu(void const *ptr);
     static SuperVector load(void const *ptr);
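Those doc comments pin down the portable contract: comparemask() returns mask_width() identical bits per byte lane, and iteration_mask() thins the result to one bit per group so the classic ctz / clear-lowest-bit loop works on every target. A sketch of the consumer loop that contract implies (the free function and the ctz spelling are ours; the SuperVector members are as declared above):

    template <uint16_t S>
    int first_match_offset(SuperVector<S> const &data,
                           SuperVector<S> const &needle) {
        auto z = SuperVector<S>::iteration_mask(data.eqmask(needle));
        if (z) {
            // One bit per mask_width() group survives, so dividing the bit
            // index by the group width recovers the byte lane of the match.
            return (int)(__builtin_ctzll(z) / SuperVector<S>::mask_width());
        }
        return -1; // no lane matched
    }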
@@ -38,6 +38,7 @@

 #if defined(HAVE_SIGACTION)
 #include <signal.h>
+#define STACK_SIZE 8192
 #endif

 #ifdef HAVE_BACKTRACE
@@ -166,7 +167,7 @@ void installSignalHandler(void) {
 }

 #ifdef HAVE_SIGALTSTACK
-static TLS_VARIABLE char alt_stack_loc[SIGSTKSZ];
+static TLS_VARIABLE char alt_stack_loc[STACK_SIZE];
 #endif

 void setSignalStack(void) {
@@ -178,7 +179,7 @@ void setSignalStack(void) {
     stack_t alt_stack;
     memset(&alt_stack, 0, sizeof(alt_stack));
     alt_stack.ss_flags = 0;
-    alt_stack.ss_size = SIGSTKSZ;
+    alt_stack.ss_size = STACK_SIZE;
     alt_stack.ss_sp = alt_stack_loc;
     if (!sigaltstack(&alt_stack, nullptr)) {
         act.sa_flags |= SA_ONSTACK;
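Swapping SIGSTKSZ for a fixed STACK_SIZE matters on recent glibc (2.34 and later), where SIGSTKSZ expands to a sysconf() call and can no longer size a static array. A minimal standalone version of the same setup under that assumption (8192 bytes is the value the diff picks; platforms with a larger MINSIGSTKSZ would need more):

    #include <csignal>
    #include <cstring>

    #define STACK_SIZE 8192 // fixed: SIGSTKSZ may not be a constant expression

    static char alt_stack_loc[STACK_SIZE];

    static bool set_signal_stack(void) {
        stack_t alt_stack;
        memset(&alt_stack, 0, sizeof(alt_stack));
        alt_stack.ss_flags = 0;
        alt_stack.ss_size = STACK_SIZE;
        alt_stack.ss_sp = alt_stack_loc;
        return sigaltstack(&alt_stack, nullptr) == 0; // 0 on success
    }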
@@ -28,6 +28,8 @@

 #include "config.h"

+#include <memory>
+
 #include "gtest/gtest.h"
 #include "ue2common.h"
 #include "util/compile_error.h"
@@ -723,10 +723,59 @@ TEST(SimdUtilsTest, set2x128) {
 }
 #endif

+#define TEST_LSHIFTBYTE128(v1, buf, l) {          \
+    m128 v_shifted = lshiftbyte_m128(v1, l);      \
+    storeu128(res, v_shifted);                    \
+    int i;                                        \
+    for (i=0; i < l; i++) {                       \
+        assert(res[i] == 0);                      \
+    }                                             \
+    for (; i < 16; i++) {                         \
+        assert(res[i] == vec[i - l]);             \
+    }                                             \
+}
+
+TEST(SimdUtilsTest, lshiftbyte128){
+    u8 vec[16];
+    u8 res[16];
+    for (int i=0; i<16; i++) {
+        vec[i]=i;
+    }
+    m128 v1 = loadu128(vec);
+    for (int j = 0; j<16; j++){
+        TEST_LSHIFTBYTE128(v1, vec, j);
+    }
+}
+
+#define TEST_RSHIFTBYTE128(v1, buf, l) {          \
+    m128 v_shifted = rshiftbyte_m128(v1, l);      \
+    storeu128(res, v_shifted);                    \
+    int i;                                        \
+    for (i=15; i >= 16 - l; i--) {                \
+        assert(res[i] == 0);                      \
+    }                                             \
+    for (; i >= 0; i--) {                         \
+        assert(res[i] == vec[i + l]);             \
+    }                                             \
+}
+
+TEST(SimdUtilsTest, rshiftbyte128){
+    u8 vec[16];
+    u8 res[16];
+    for (int i=0; i<16; i++) {
+        vec[i]=i;
+    }
+    m128 v1 = loadu128(vec);
+    for (int j = 0; j<16; j++){
+        TEST_RSHIFTBYTE128(v1, vec, j);
+    }
+}
+
 TEST(SimdUtilsTest, variableByteShift128) {
     char base[] = "0123456789ABCDEF";
     m128 in = loadu128(base);

     EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0),
                          variable_byte_shift_m128(in, 0)));
     EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1),
@@ -773,7 +822,7 @@ TEST(SimdUtilsTest, variableByteShift128) {
     EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10),
                          variable_byte_shift_m128(in, 10)));

-    EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16)));
+    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 15), variable_byte_shift_m128(in, 15)));
     EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, -16)));
 }
@@ -176,9 +176,9 @@ TEST(SuperVectorUtilsTest,Movemask128c){
         }
     }
     auto SP = SuperVector<16>::loadu(vec);
-    u16 mask = SP.movemask();
+    u64a mask = SP.comparemask();
     for (int i = 0; i < 16; i++) {
-        if (mask & (1 << i)) {
+        if (mask & (1ull << (i * SuperVector<16>::mask_width()))) {
             vec2[i] = 0xff;
         }
     }
@@ -195,15 +195,21 @@ TEST(SuperVectorUtilsTest,Eqmask128c){
     for (int i = 0; i<16; i++) { vec2[i]= rand() % 100 + 67;}
     auto SP = SuperVector<16>::loadu(vec);
     auto SP1 = SuperVector<16>::loadu(vec2);
-    int mask = SP.eqmask(SP);
-    ASSERT_EQ(mask,0xFFFF);
+    u64a mask = SP.eqmask(SP);
+    for (u32 i = 0; i < 16; ++i) {
+        ASSERT_TRUE(mask & (1ull << (i * SuperVector<16>::mask_width())));
+    }
     mask = SP.eqmask(SP1);
     ASSERT_EQ(mask,0);
     vec2[0] = vec[0];
     vec2[1] = vec[1];
     auto SP2 = SuperVector<16>::loadu(vec2);
     mask = SP.eqmask(SP2);
-    ASSERT_EQ(mask,3);
+    ASSERT_TRUE(mask & 1);
+    ASSERT_TRUE(mask & (1ull << SuperVector<16>::mask_width()));
+    for (u32 i = 2; i < 16; ++i) {
+        ASSERT_FALSE(mask & (1ull << (i * SuperVector<16>::mask_width())));
+    }
 }

 /*Define LSHIFT128 macro*/
@@ -507,9 +513,9 @@ TEST(SuperVectorUtilsTest,Movemask256c){
         }
     }
     auto SP = SuperVector<32>::loadu(vec);
-    u32 mask = SP.movemask();
+    u64a mask = SP.comparemask();
     for(int i=0; i<32; i++) {
-        if (mask & (1 << i)) {
+        if (mask & (1ull << (i * SuperVector<32>::mask_width()))) {
             vec2[i] = 0xff;
         }
     }
@@ -527,15 +533,21 @@ TEST(SuperVectorUtilsTest,Eqmask256c){
     for (int i = 0; i<32; i++) { vec2[i]= rand() % 100 + 67;}
     auto SP = SuperVector<32>::loadu(vec);
     auto SP1 = SuperVector<32>::loadu(vec2);
-    u32 mask = SP.eqmask(SP);
-    ASSERT_EQ(mask,0xFFFFFFFF);
+    u64a mask = SP.eqmask(SP);
+    for (u32 i = 0; i < 32; ++i) {
+        ASSERT_TRUE(mask & (1ull << (i * SuperVector<32>::mask_width())));
+    }
     mask = SP.eqmask(SP1);
     ASSERT_EQ(mask,0);
     vec2[0] = vec[0];
     vec2[1] = vec[1];
     auto SP2 = SuperVector<32>::loadu(vec2);
     mask = SP.eqmask(SP2);
-    ASSERT_EQ(mask,3);
+    ASSERT_TRUE(mask & 1);
+    ASSERT_TRUE(mask & (1ull << SuperVector<32>::mask_width()));
+    for (u32 i = 2; i < 32; ++i) {
+        ASSERT_FALSE(mask & (1ull << (i * SuperVector<32>::mask_width())));
+    }
 }

 TEST(SuperVectorUtilsTest,pshufb256c) {
@@ -849,7 +861,7 @@ TEST(SuperVectorUtilsTest,Movemask512c){
     }
     auto SP = SuperVector<64>::loadu(vec);
     u8 vec2[64] = {0};
-    u64a mask = SP.movemask();
+    u64a mask = SP.comparemask();
     for(int i=0; i<64; i++) {
         if (mask & (1ULL << i)) {
             vec2[i] = 0xff;
@@ -871,6 +883,8 @@ TEST(SuperVectorUtilsTest,Eqmask512c){
     auto SP = SuperVector<64>::loadu(vec);
     auto SP1 = SuperVector<64>::loadu(vec2);
     u64a mask = SP.eqmask(SP);
+    // Mask width for 64 bit type cannot be more than 1.
+    ASSERT_EQ(SuperVector<64>::mask_width(), 1);
     ASSERT_EQ(mask,0xFFFFFFFFFFFFFFFF);
     mask = SP.eqmask(SP1);
     ASSERT_EQ(mask,0);
@@ -268,12 +268,12 @@ void CorpusEditorUtf8::flip_case(vector<unichar> &corpus) {
 unichar CorpusEditorUtf8::chooseCodePoint(void) {
     /* We need to ensure that we don't pick a surrogate cp */
     const u32 range =
-        MAX_UNICODE + 1 - (UNICODE_SURROGATE_MAX + UNICODE_SURROGATE_MIN + 1);
+        MAX_UNICODE + 1 - (UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1);
     unichar raw = props.rand(0, range - 1);
     if (raw < UNICODE_SURROGATE_MIN) {
         return raw;
     } else {
-        return raw + UNICODE_SURROGATE_MAX + 1;
+        return raw + UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1;
     }
 }

@@ -476,14 +476,14 @@ void CorpusGeneratorUtf8::generateCorpus(vector<string> &data) {
  * that we've been asked for. */
 unichar CorpusGeneratorUtf8::getRandomChar() {
     u32 range = MAX_UNICODE + 1
-                - (UNICODE_SURROGATE_MAX + UNICODE_SURROGATE_MIN + 1);
+                - (UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1);
     range = min(cProps.alphabetSize, range);
     assert(range);

     unichar c = 'a' + cProps.rand(0, range - 1);

     if (c >= UNICODE_SURROGATE_MIN) {
-        c =+ UNICODE_SURROGATE_MAX + 1;
+        c += UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1;
     }

     return c % (MAX_UNICODE + 1);
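The surrogate fix is simple interval arithmetic: with the usual values UNICODE_SURROGATE_MIN = 0xD800 and UNICODE_SURROGATE_MAX = 0xDFFF, the excluded block holds MAX - MIN + 1 = 0x800 code points, which is what the range must shrink by and what a raw value at or past the block must skip over; the old MAX + MIN + 1 subtracted 0x1B800 instead (and the old `=+` discarded c entirely). A quick standalone check that the corrected mapping never lands on a surrogate (constants assumed, not taken from the tree):

    #include <cassert>

    int main(void) {
        const unsigned MAX_UNICODE = 0x10FFFF;
        const unsigned SURR_MIN = 0xD800, SURR_MAX = 0xDFFF;
        const unsigned range = MAX_UNICODE + 1 - (SURR_MAX - SURR_MIN + 1);
        for (unsigned raw = 0; raw < range; raw++) {
            unsigned cp = raw < SURR_MIN ? raw
                                         : raw + SURR_MAX - SURR_MIN + 1;
            assert(cp <= MAX_UNICODE);
            assert(cp < SURR_MIN || cp > SURR_MAX); // never a surrogate
        }
        return 0;
    }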