Speed up truffle with 256b TBL instructions

256b wide SVE vectors allow some simplification of truffle.
Up to 40% speedup on Graviton3, going from 12500 MB/s to 17000 MB/s
on the microbenchmark.
SVE2 also offers this capability for 128b vectors, with a speedup of around
25% compared to normal SVE.

Add unit tests and benchmark for this wide variant

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
This commit is contained in:
Yoan Picchi
2024-04-23 12:04:40 +00:00
parent b312112e87
commit 938c026256
23 changed files with 1125 additions and 78 deletions

View File

@@ -1,5 +1,6 @@
/*
* Copyright (c) 2020, 2021, VectorCamp PC
* Copyright (c) 2023, 2024, Arm Limited
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -34,6 +35,7 @@
#include <iostream>
#include <memory>
#include "util/arch.h"
#include "benchmarks.hpp"
#define MAX_LOOPS 1000000000
@@ -145,11 +147,13 @@ int main(){
sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::shuftiBuildMasks(b.chars, reinterpret_cast<u8 *>(&b.lo), reinterpret_cast<u8 *>(&b.hi));
ue2::shuftiBuildMasks(b.chars,
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark &b) {
return shuftiExec(b.lo, b.hi, b.buf.data(),
return shuftiExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
b.buf.data() + b.size);
});
}
@@ -160,11 +164,13 @@ int main(){
sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::shuftiBuildMasks(b.chars, reinterpret_cast<u8 *>(&b.lo), reinterpret_cast<u8 *>(&b.hi));
ue2::shuftiBuildMasks(b.chars,
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark &b) {
return rshuftiExec(b.lo, b.hi, b.buf.data(),
return rshuftiExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
b.buf.data() + b.size);
});
}
@@ -175,11 +181,13 @@ int main(){
sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::truffleBuildMasks(b.chars, reinterpret_cast<u8 *>(&b.lo), reinterpret_cast<u8 *>(&b.hi));
ue2::truffleBuildMasks(b.chars,
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark &b) {
return truffleExec(b.lo, b.hi, b.buf.data(),
return truffleExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
b.buf.data() + b.size);
});
}
@@ -190,14 +198,47 @@ int main(){
sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::truffleBuildMasks(b.chars, reinterpret_cast<u8 *>(&b.lo), reinterpret_cast<u8 *>(&b.hi));
ue2::truffleBuildMasks(b.chars,
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark &b) {
return rtruffleExec(b.lo, b.hi, b.buf.data(),
return rtruffleExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
b.buf.data() + b.size);
});
}
// Benchmarks for the wide truffle variant, forward ("Truffle Wide") and
// reverse ("Reverse Truffle Wide"). The section is only compiled when
// CAN_USE_WIDE_TRUFFLE is defined, and only executed when it evaluates to true.
#ifdef CAN_USE_WIDE_TRUFFLE
if(CAN_USE_WIDE_TRUFFLE) {
// Forward scan: one benchmark per entry in sizes.
for (size_t i = 0; i < std::size(sizes); i++) {
MicroBenchmark bench("Truffle Wide", sizes[i]);
// The boolean argument is false here and true in the reverse loop below,
// matching the forward/reverse pairs of the non-wide benchmarks.
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
[&](MicroBenchmark &b) {
// Setup: search for 'a' only, build the single wide mask, and fill the
// buffer with 'b'.
b.chars.set('a');
ue2::truffleBuildMasksWide(b.chars, reinterpret_cast<u8 *>(&b.truffle_mask));
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark &b) {
// Measured call: scan the range [buf, buf + size) with the wide mask.
return truffleExecWide(b.truffle_mask, b.buf.data(), b.buf.data() + b.size);
}
);
}
// Reverse scan: same setup as above, measured with rtruffleExecWide.
for (size_t i = 0; i < std::size(sizes); i++) {
MicroBenchmark bench("Reverse Truffle Wide", sizes[i]);
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::truffleBuildMasksWide(b.chars, reinterpret_cast<u8 *>(&b.truffle_mask));
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark &b) {
return rtruffleExecWide(b.truffle_mask, b.buf.data(), b.buf.data() + b.size);
}
);
}
}
#endif // CAN_USE_WIDE_TRUFFLE
for (size_t i = 0; i < std::size(sizes); i++) {
MicroBenchmark bench("Vermicelli", sizes[i]);
@@ -205,7 +246,9 @@ int main(){
sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::truffleBuildMasks(b.chars, reinterpret_cast<u8 *>(&b.lo), reinterpret_cast<u8 *>(&b.hi));
ue2::truffleBuildMasks(b.chars,
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark &b) {
@@ -220,7 +263,9 @@ int main(){
sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::truffleBuildMasks(b.chars, reinterpret_cast<u8 *>(&b.lo), reinterpret_cast<u8 *>(&b.hi));
ue2::truffleBuildMasks(b.chars,
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark &b) {

View File

@@ -1,5 +1,6 @@
/*
* Copyright (c) 2020, 2021, VectorCamp PC
* Copyright (c) 2024, Arm Limited
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -44,7 +45,18 @@ public:
size_t size;
// Shufti/Truffle
m128 lo, hi;
// Mask storage shared by the shufti/truffle benchmarks. The anonymous union
// lets the same bytes be used either as one m256 mask (truffle_mask, passed to
// the *Wide build/exec functions) or as two m128 masks (truffle_mask_lo and
// truffle_mask_hi, passed to the non-wide build/exec functions).
// NOTE(review): the unnamed struct member is a compiler extension rather than
// ISO C++ -- confirm every supported toolchain accepts it.
union {
m256 truffle_mask;
struct {
// The declaration order of the halves follows the target endianness:
// truffle_mask_lo is declared first on little-endian builds, second otherwise.
// Presumably this keeps "lo" aliased to the low-order half of truffle_mask
// -- TODO confirm against the wide mask builder.
#if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
m128 truffle_mask_lo;
m128 truffle_mask_hi;
#else
m128 truffle_mask_hi;
m128 truffle_mask_lo;
#endif
};
};
ue2::CharReach chars;
std::vector<u8> buf;