Speed up truffle with 256b TBL instructions

256b wide SVE vectors allow some simplification of truffle.
Up to 40% speedup on graviton3, going from 12500 MB/s to 17000 MB/s
on the microbenchmark.
SVE2 also offers this capability for 128b vectors, with a speedup of around
25% compared to normal SVE.

Add unit tests and benchmarks for this wide variant.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
This commit is contained in:
Yoan Picchi
2024-04-23 12:04:40 +00:00
parent 154cb6333b
commit 7054378c93
23 changed files with 1125 additions and 78 deletions

View File

@@ -45,6 +45,14 @@
#ifdef HAVE_SVE
static really_inline
svuint8_t blockSingleMask(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars);
static really_inline
svuint8_t blockSingleMaskWide32(svuint8_t shuf_mask_32, svuint8_t chars);
#ifdef HAVE_SVE2
static really_inline
svuint8_t blockSingleMaskWide(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars);
#endif //HAVE_SVE2
#else
template <uint16_t S>
static really_inline
@@ -64,19 +72,36 @@ const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, Supe
#endif
#ifdef HAVE_SVE
const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
template <bool is_wide, bool is_vector_128b>
static really_inline
const u8 *truffleExecSVE(m256 shuf_mask_32,
const u8 *buf, const u8 *buf_end);
const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
template <bool is_wide, bool is_vector_128b>
static really_inline
const u8 *rtruffleExecSVE(m256 shuf_mask_32,
const u8 *buf, const u8 *buf_end);
template <bool is_wide, bool is_vector_128b>
static really_inline
const u8 *scanBlock(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars, const u8 *buf, bool forward) {
const size_t vector_size_int_8 = svcntb();
const svuint8_t result_mask = blockSingleMask(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
const u8 *scanBlock(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset,
svuint8_t chars, const u8 *buf, const size_t vector_size_int_8, bool forward)
{
svuint8_t result_mask;
if(is_wide) {
if(is_vector_128b) {
#ifdef HAVE_SVE2
result_mask = blockSingleMaskWide(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
#else
DEBUG_PRINTF("Wide Truffle is not supported with 128b vectors unless SVE2 is enabled");
assert(false);
#endif
} else {
result_mask = blockSingleMaskWide32(shuf_mask_lo_highclear, chars);
}
} else {
result_mask = blockSingleMask(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
}
uint64_t index;
if (forward) {
index = first_non_zero(vector_size_int_8, result_mask);
@@ -84,25 +109,33 @@ const u8 *scanBlock(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_hig
index = last_non_zero(vector_size_int_8, result_mask);
}
if(index < vector_size_int_8) {
if (index < vector_size_int_8) {
return buf+index;
} else {
return NULL;
}
}
really_inline
const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end) {
template <bool is_wide, bool is_vector_128b>
static really_inline
const u8 *truffleExecSVE(m256 shuf_mask_32, const u8 *buf, const u8 *buf_end) {
const int vect_size_int8 = svcntb();
// Activate only 16 lanes to read the m128 buffers
const svbool_t lane_pred_16 = svwhilelt_b8(0, 16);
assert(buf && buf_end);
assert(buf < buf_end);
DEBUG_PRINTF("truffle %p len %zu\n", buf, buf_end - buf);
DEBUG_PRINTF("b %s\n", buf);
svuint8_t wide_shuf_mask_lo_highclear = svld1(lane_pred_16, (uint8_t*) &shuf_mask_lo_highclear);
svuint8_t wide_shuf_mask_lo_highset = svld1(lane_pred_16, (uint8_t*) &shuf_mask_lo_highset);
svuint8_t wide_shuf_mask_lo_highclear;
svuint8_t wide_shuf_mask_lo_highset;
if (is_wide && !is_vector_128b) {
const svbool_t lane_pred_32 = svwhilelt_b8(0, 32);
wide_shuf_mask_lo_highclear = svld1(lane_pred_32, (uint8_t*) &shuf_mask_32.lo);
wide_shuf_mask_lo_highset = svld1(svpfalse(), (uint8_t*) &shuf_mask_32.hi); /* empty vector */
} else {
const svbool_t lane_pred_16 = svwhilelt_b8(0, 16);
wide_shuf_mask_lo_highclear = svld1(lane_pred_16, (uint8_t*) &shuf_mask_32.lo);
wide_shuf_mask_lo_highset = svld1(lane_pred_16, (uint8_t*) &shuf_mask_32.hi);
}
const u8 *work_buffer = buf;
const u8 *ret_val;
@@ -118,16 +151,16 @@ const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
if (!ISALIGNED_N(work_buffer, vect_size_int8)) {
svuint8_t chars = svld1(svptrue_b8(), work_buffer);
const u8 *alligned_buffer = ROUNDUP_PTR(work_buffer, vect_size_int8);
ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, true);
ret_val = scanBlock<is_wide, is_vector_128b>(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, vect_size_int8, true);
if (ret_val && ret_val < alligned_buffer) return ret_val;
work_buffer = alligned_buffer;
}
while(work_buffer + vect_size_int8 <= buf_end) {
while (work_buffer + vect_size_int8 <= buf_end) {
__builtin_prefetch(work_buffer + 16*64);
DEBUG_PRINTF("work_buffer %p \n", work_buffer);
svuint8_t chars = svld1(svptrue_b8(), work_buffer);
ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, true);
ret_val = scanBlock<is_wide, is_vector_128b>(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, vect_size_int8, true);
if (ret_val) return ret_val;
work_buffer += vect_size_int8;
}
@@ -147,7 +180,7 @@ const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
chars = svld1(svptrue_b8(), buf_end - vect_size_int8);
end_buf = buf_end - vect_size_int8;
}
ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, end_buf, true);
ret_val = scanBlock<is_wide, is_vector_128b>(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, end_buf, vect_size_int8, true);
DEBUG_PRINTF("ret_val %p \n", ret_val);
if (ret_val && ret_val < buf_end) return ret_val;
}
@@ -155,18 +188,26 @@ const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
return buf_end;
}
really_inline
const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end){
template <bool is_wide, bool is_vector_128b>
static really_inline
const u8 *rtruffleExecSVE(m256 shuf_mask_32, const u8 *buf, const u8 *buf_end){
const int vect_size_int8 = svcntb();
// Activate only 16 lanes to read the m128 buffers
const svbool_t lane_pred_16 = svwhilelt_b8(0, 16);
assert(buf && buf_end);
assert(buf < buf_end);
DEBUG_PRINTF("truffle %p len %zu\n", buf, buf_end - buf);
DEBUG_PRINTF("b %s\n", buf);
svuint8_t wide_shuf_mask_lo_highclear = svld1(lane_pred_16, (uint8_t*) &shuf_mask_lo_highclear);
svuint8_t wide_shuf_mask_lo_highset = svld1(lane_pred_16, (uint8_t*) &shuf_mask_lo_highset);
svuint8_t wide_shuf_mask_lo_highclear;
svuint8_t wide_shuf_mask_lo_highset;
if (is_wide && !is_vector_128b) {
const svbool_t lane_pred_32 = svwhilelt_b8(0, 32);
wide_shuf_mask_lo_highclear = svld1(lane_pred_32, (uint8_t*) &shuf_mask_32.lo);
wide_shuf_mask_lo_highset = svld1(svpfalse(), (uint8_t*) &shuf_mask_32.hi); /* empty vector */
} else {
const svbool_t lane_pred_16 = svwhilelt_b8(0, 16);
wide_shuf_mask_lo_highclear = svld1(lane_pred_16, (uint8_t*) &shuf_mask_32.lo);
wide_shuf_mask_lo_highset = svld1(lane_pred_16, (uint8_t*) &shuf_mask_32.hi);
}
const u8 *work_buffer = buf_end;
const u8 *ret_val;
@@ -182,7 +223,7 @@ const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset
if (!ISALIGNED_N(work_buffer, vect_size_int8)) {
svuint8_t chars = svld1(svptrue_b8(), work_buffer - vect_size_int8);
const u8 *alligned_buffer = ROUNDDOWN_PTR(work_buffer, vect_size_int8);
ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer - vect_size_int8, false);
ret_val = scanBlock<is_wide, is_vector_128b>(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer - vect_size_int8, vect_size_int8, false);
DEBUG_PRINTF("ret_val %p \n", ret_val);
if (ret_val >= alligned_buffer) return ret_val;
work_buffer = alligned_buffer;
@@ -195,7 +236,7 @@ const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset
work_buffer -= vect_size_int8;
svuint8_t chars = svld1(svptrue_b8(), work_buffer);
ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, false);
ret_val = scanBlock<is_wide, is_vector_128b>(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, vect_size_int8, false);
if (ret_val) return ret_val;
}
}
@@ -211,7 +252,7 @@ const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset
} else {
chars = svld1(svptrue_b8(), buf);
}
ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, buf, false);
ret_val = scanBlock<is_wide, is_vector_128b>(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, buf, vect_size_int8, false);
DEBUG_PRINTF("ret_val %p \n", ret_val);
if (ret_val && ret_val < buf_end) return ret_val;
}
@@ -253,7 +294,7 @@ const u8 *truffleExecReal(const m128 &shuf_mask_lo_highclear, m128 shuf_mask_lo_
d = dup;
}
while(d + S <= buf_end) {
while (d + S <= buf_end) {
__builtin_prefetch(d + 16*64);
DEBUG_PRINTF("d %p \n", d);
SuperVector<S> chars = SuperVector<S>::load(d);