From 04d79629de8b3da96d8b90f760fff433a6ba01d9 Mon Sep 17 00:00:00 2001 From: "Xu, Chi" Date: Thu, 1 Sep 2016 07:48:04 +0800 Subject: [PATCH] rose: add shufti-based lookaround instructions More lookaround specialisations that use the shufti approach. --- CMakeLists.txt | 1 + src/rose/program_runtime.h | 275 +++++++++++++++++++++++++++++++ src/rose/rose_build_bytecode.cpp | 180 ++++++++++++++++++++ src/rose/rose_build_program.cpp | 54 ++++++ src/rose/rose_build_program.h | 184 +++++++++++++++++++++ src/rose/rose_dump.cpp | 65 ++++++++ src/rose/rose_program.h | 46 ++++++ src/rose/validate_shufti.h | 175 ++++++++++++++++++++ src/util/simd_utils.h | 33 ++++ 9 files changed, 1013 insertions(+) create mode 100644 src/rose/validate_shufti.h diff --git a/CMakeLists.txt b/CMakeLists.txt index de51c016..76d79821 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -555,6 +555,7 @@ set (hs_exec_SRCS src/rose/rose_types.h src/rose/rose_common.h src/rose/validate_mask.h + src/rose/validate_shufti.h src/util/bitutils.h src/util/copybytes.h src/util/exhaust.h diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index 100d9140..57f39bbe 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -45,6 +45,7 @@ #include "rose_program.h" #include "rose_types.h" #include "validate_mask.h" +#include "validate_shufti.h" #include "runtime.h" #include "scratch.h" #include "ue2common.h" @@ -793,6 +794,231 @@ int roseCheckMask32(const struct core_info *ci, const u8 *and_mask, return 0; } +// get 128/256 bits data from history and current buffer. +// return data and valid_data_mask. +static rose_inline +u32 getBufferDataComplex(const struct core_info *ci, const s64a loc, + u8 *data, const u32 data_len) { + assert(data_len == 16 || data_len == 32); + s32 c_shift = 0; // blank bytes after current. + s32 h_shift = 0; // blank bytes before history. + s32 h_len = data_len; // number of bytes from history buffer. 
+ s32 c_len = 0; // number of bytes from current buffer. + if (loc < 0) { + s32 h_offset = 0; // the start offset in history buffer. + if (loc < -(s64a)ci->hlen) { + if (loc + data_len <= -(s64a)ci->hlen) { + DEBUG_PRINTF("all before history\n"); + return 0; + } + h_shift = -(loc + (s64a)ci->hlen); + h_len = data_len - h_shift; + } else { + h_offset = ci->hlen + loc; + } + if (loc + data_len > 0) { + // part in current buffer. + c_len = loc + data_len; + h_len = -(loc + h_shift); + if (c_len > (s64a)ci->len) { + // out of current buffer. + c_shift = c_len - ci->len; + c_len = ci->len; + } + copy_upto_32_bytes(data - loc, ci->buf, c_len); + } + assert(h_shift + h_len + c_len + c_shift == (s32)data_len); + copy_upto_32_bytes(data + h_shift, ci->hbuf + h_offset, h_len); + } else { + if (loc + data_len > (s64a)ci->len) { + if (loc >= (s64a)ci->len) { + DEBUG_PRINTF("all in the future.\n"); + return 0; + } + c_len = ci->len - loc; + c_shift = data_len - c_len; + copy_upto_32_bytes(data, ci->buf + loc, c_len); + } else { + if (data_len == 16) { + storeu128(data, loadu128(ci->buf + loc)); + return 0xffff; + } else { + storeu256(data, loadu256(ci->buf + loc)); + return 0xffffffff; + } + } + } + DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift); + DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len); + + if (data_len == 16) { + return (u16)(0xffff << (h_shift + c_shift)) >> c_shift; + } else { + return (~0u) << (h_shift + c_shift) >> c_shift; + } +} + +static rose_inline +m128 getData128(const struct core_info *ci, s64a offset, u16 *valid_data_mask) { + if (offset > 0 && offset + sizeof(m128) <= ci->len) { + *valid_data_mask = 0xffff; + return loadu128(ci->buf + offset); + } + u8 data[sizeof(m128)] ALIGN_DIRECTIVE; + *valid_data_mask = (u16)getBufferDataComplex(ci, offset, data, 16); + return *(m128 *)data; +} + +static rose_inline +m256 getData256(const struct core_info *ci, s64a offset, u32 *valid_data_mask) { + if (offset > 0 && offset + sizeof(m256) <= ci->len) { + 
*valid_data_mask = ~0u; + return loadu256(ci->buf + offset); + } + u8 data[sizeof(m256)] ALIGN_DIRECTIVE; + *valid_data_mask = getBufferDataComplex(ci, offset, data, 32); + return *(m256 *)data; +} + +static rose_inline +int roseCheckShufti16x8(const struct core_info *ci, const u8 *nib_mask, + const u8 *bucket_select_mask, u32 neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u16 valid_data_mask = 0; + m128 data = getData128(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m256 nib_mask_m256 = loadu256(nib_mask); + m128 bucket_select_mask_m128 = loadu128(bucket_select_mask); + if (validateShuftiMask16x8(data, nib_mask_m256, + bucket_select_mask_m128, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 16x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti16x16(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask, + u32 neg_mask, s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u16 valid_data_mask = 0; + m128 data = getData128(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m256 data_m256 = set2x128(data); + m256 hi_mask_m256 = loadu256(hi_mask); + m256 lo_mask_m256 = loadu256(lo_mask); + m256 
bucket_select_mask_m256 = loadu256(bucket_select_mask); + if (validateShuftiMask16x16(data_m256, hi_mask_m256, lo_mask_m256, + bucket_select_mask_m256, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 16x16 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti32x8(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask, + u32 neg_mask, s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u32 valid_data_mask = 0; + m256 data = getData256(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m128 hi_mask_m128 = loadu128(hi_mask); + m128 lo_mask_m128 = loadu128(lo_mask); + m256 hi_mask_m256 = set2x128(hi_mask_m128); + m256 lo_mask_m256 = set2x128(lo_mask_m128); + m256 bucket_select_mask_m256 = loadu256(bucket_select_mask); + if (validateShuftiMask32x8(data, hi_mask_m256, lo_mask_m256, + bucket_select_mask_m256, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 32x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti32x16(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask_hi, + const u8 *bucket_select_mask_lo, u32 neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } 
+ + u32 valid_data_mask = 0; + m256 data = getData256(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m256 hi_mask_1 = loadu2x128(hi_mask); + m256 hi_mask_2 = loadu2x128(hi_mask + 16); + m256 lo_mask_1 = loadu2x128(lo_mask); + m256 lo_mask_2 = loadu2x128(lo_mask + 16); + + m256 bucket_mask_hi = loadu256(bucket_select_mask_hi); + m256 bucket_mask_lo = loadu256(bucket_select_mask_lo); + if (validateShuftiMask32x16(data, hi_mask_1, hi_mask_2, + lo_mask_1, lo_mask_2, bucket_mask_hi, + bucket_mask_lo, neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 32x16 successfully\n"); + return 1; + } else { + return 0; + } +} + /** * \brief Scan around a literal, checking that that "lookaround" reach masks * are satisfied. @@ -1235,6 +1461,55 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_SHUFTI_16x8) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti16x8(ci, ri->nib_mask, + ri->bucket_select_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri-> fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_32x8) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti32x8(ci, ri->hi_mask, ri->lo_mask, + ri->bucket_select_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri-> fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_16x16) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti16x16(ci, ri->hi_mask, ri->lo_mask, + ri->bucket_select_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri-> fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_32x16) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti32x16(ci, ri->hi_mask, ri->lo_mask, + ri->bucket_select_mask_hi, + ri->bucket_select_mask_lo, + ri->neg_mask, 
ri->offset, end)) { + assert(ri->fail_jump); + pc += ri-> fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_INFIX) { if (!roseTestInfix(t, scratch, ri->queue, ri->lag, ri->report, end)) { diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 3356d214..04ab52ff 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -88,6 +88,7 @@ #include "util/verify_types.h" #include +#include #include #include #include @@ -2888,6 +2889,181 @@ bool makeRoleMask32(const vector &look, return true; } +// Sorting by the size of every bucket. +// Used in map, cmpNibble>. +struct cmpNibble { + bool operator()(const u32 data1, const u32 data2) const{ + u32 size1 = popcount32(data1 >> 16) * popcount32(data1 << 16); + u32 size2 = popcount32(data2 >> 16) * popcount32(data2 << 16); + return std::tie(size1, data1) < std::tie(size2, data2); + } +}; + +// Insert all pairs of bucket and offset into buckets. +static really_inline +void getAllBuckets(const vector &look, + map, cmpNibble> &buckets, u32 &neg_mask) { + s32 base_offset = verify_s32(look.front().offset); + for (const auto &entry : look) { + CharReach cr = entry.reach; + // Flip heavy character classes to save buckets. + if (cr.count() > 128 ) { + cr.flip(); + } else { + neg_mask ^= 1 << (entry.offset - base_offset); + } + map lo2hi; + // We treat Ascii Table as a 16x16 grid. + // Push every row in cr into lo2hi and mark the row number. + for (size_t i = cr.find_first(); i != CharReach::npos;) { + u8 it_hi = i >> 4; + u16 low_encode = 0; + while (i != CharReach::npos && (i >> 4) == it_hi) { + low_encode |= 1 << (i & 0xf); + i = cr.find_next(i); + } + lo2hi[low_encode] |= 1 << it_hi; + } + for (const auto &it : lo2hi) { + u32 hi_lo = (it.second << 16) | it.first; + buckets[hi_lo].push_back(entry.offset); + } + } +} + +// Once we have a new bucket, we'll try to combine it with all old buckets. 
+static really_inline +void nibUpdate(map &nib, u32 hi_lo) { + u16 hi = hi_lo >> 16; + u16 lo = hi_lo & 0xffff; + for (const auto pairs : nib) { + u32 old = pairs.first; + if ((old >> 16) == hi || (old & 0xffff) == lo) { + if (!nib[old | hi_lo]) { + nib[old | hi_lo] = nib[old] | nib[hi_lo]; + } + } + } +} + +static really_inline +void nibMaskUpdate(array &mask, u32 data, u8 bit_index) { + for (u8 index = 0; data > 0; data >>= 1, index++) { + if (data & 1) { + // 0 ~ 7 bucket in first 16 bytes, + // 8 ~ 15 bucket in second 16 bytes. + if (bit_index >= 8) { + mask[index + 16] |= 1 << (bit_index - 8); + } else { + mask[index] |= 1 << bit_index; + } + } + } +} + +static +bool makeRoleShufti(const vector &look, + RoseProgram &program) { + + s32 base_offset = verify_s32(look.front().offset); + if (look.back().offset >= base_offset + 32) { + return false; + } + array hi_mask, lo_mask; + hi_mask.fill(0); + lo_mask.fill(0); + array bucket_select_hi, bucket_select_lo; + bucket_select_hi.fill(0); // will not be used in 16x8 and 32x8. + bucket_select_lo.fill(0); + u8 bit_index = 0; // number of buckets + map nib; // map every bucket to its bucket number. + map, cmpNibble> bucket2offsets; + u32 neg_mask = ~0u; + + getAllBuckets(look, bucket2offsets, neg_mask); + + for (const auto &it : bucket2offsets) { + u32 hi_lo = it.first; + // New bucket. + if (!nib[hi_lo]) { + if (bit_index >= 16) { + return false; + } + nib[hi_lo] = 1 << bit_index; + + nibUpdate(nib, hi_lo); + nibMaskUpdate(hi_mask, hi_lo >> 16, bit_index); + nibMaskUpdate(lo_mask, hi_lo & 0xffff, bit_index); + bit_index++; + } + + DEBUG_PRINTF("hi_lo %x bucket %x\n", hi_lo, nib[hi_lo]); + + // Update bucket_select_mask. 
+ u8 nib_hi = nib[hi_lo] >> 8; + u8 nib_lo = nib[hi_lo] & 0xff; + for (const auto offset : it.second) { + bucket_select_hi[offset - base_offset] |= nib_hi; + bucket_select_lo[offset - base_offset] |= nib_lo; + } + } + + DEBUG_PRINTF("hi_mask %s\n", + convertMaskstoString(hi_mask.data(), 32).c_str()); + DEBUG_PRINTF("lo_mask %s\n", + convertMaskstoString(lo_mask.data(), 32).c_str()); + DEBUG_PRINTF("bucket_select_hi %s\n", + convertMaskstoString(bucket_select_hi.data(), 32).c_str()); + DEBUG_PRINTF("bucket_select_lo %s\n", + convertMaskstoString(bucket_select_lo.data(), 32).c_str()); + + const auto *end_inst = program.end_instruction(); + if (bit_index < 8) { + if (look.back().offset < base_offset + 16) { + neg_mask &= 0xffff; + array nib_mask; + array bucket_select_mask_16; + copy(hi_mask.begin(), hi_mask.begin() + 16, nib_mask.begin()); + copy(lo_mask.begin(), lo_mask.begin() + 16, nib_mask.begin() + 16); + copy(bucket_select_lo.begin(), bucket_select_lo.begin() + 16, + bucket_select_mask_16.begin()); + auto ri = make_unique + (nib_mask, bucket_select_mask_16, + neg_mask, base_offset, end_inst); + program.add_before_end(move(ri)); + } else { + array hi_mask_16; + array lo_mask_16; + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_16.begin()); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_16.begin()); + auto ri = make_unique + (hi_mask_16, lo_mask_16, bucket_select_lo, + neg_mask, base_offset, end_inst); + program.add_before_end(move(ri)); + } + } else { + if (look.back().offset < base_offset + 16) { + neg_mask &= 0xffff; + array bucket_select_mask_32; + copy(bucket_select_lo.begin(), bucket_select_lo.begin() + 16, + bucket_select_mask_32.begin()); + copy(bucket_select_hi.begin(), bucket_select_hi.begin() + 16, + bucket_select_mask_32.begin() + 16); + auto ri = make_unique + (hi_mask, lo_mask, bucket_select_mask_32, + neg_mask, base_offset, end_inst); + program.add_before_end(move(ri)); + } else { + auto ri = make_unique + (hi_mask, 
lo_mask, bucket_select_hi, bucket_select_lo, + neg_mask, base_offset, end_inst); + program.add_before_end(move(ri)); + } + } + return true; +} + /** * Builds a lookaround instruction, or an appropriate specialization if one is * available. @@ -2909,6 +3085,10 @@ void makeLookaroundInstruction(build_context &bc, const vector &look, return; } + if (makeRoleShufti(look, program)) { + return; + } + u32 look_idx = addLookaround(bc, look); u32 look_count = verify_u32(look.size()); diff --git a/src/rose/rose_build_program.cpp b/src/rose/rose_build_program.cpp index 168022f3..69ad31a9 100644 --- a/src/rose/rose_build_program.cpp +++ b/src/rose/rose_build_program.cpp @@ -154,6 +154,60 @@ void RoseInstrCheckByte::write(void *dest, RoseEngineBlob &blob, inst->fail_jump = calc_jump(offset_map, this, target); } +void RoseInstrCheckShufti16x8::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(nib_mask), end(nib_mask), inst->nib_mask); + copy(begin(bucket_select_mask), end(bucket_select_mask), + inst->bucket_select_mask); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckShufti32x8::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask), end(hi_mask), inst->hi_mask); + copy(begin(lo_mask), end(lo_mask), inst->lo_mask); + copy(begin(bucket_select_mask), end(bucket_select_mask), + inst->bucket_select_mask); + + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckShufti16x16::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask), end(hi_mask), 
inst->hi_mask); + copy(begin(lo_mask), end(lo_mask), inst->lo_mask); + copy(begin(bucket_select_mask), end(bucket_select_mask), + inst->bucket_select_mask); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckShufti32x16::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask), end(hi_mask), inst->hi_mask); + copy(begin(lo_mask), end(lo_mask), inst->lo_mask); + copy(begin(bucket_select_mask_hi), end(bucket_select_mask_hi), + inst->bucket_select_mask_hi); + copy(begin(bucket_select_mask_lo), end(bucket_select_mask_lo), + inst->bucket_select_mask_lo); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + void RoseInstrCheckInfix::write(void *dest, RoseEngineBlob &blob, const OffsetMap &offset_map) const { RoseInstrBase::write(dest, blob, offset_map); diff --git a/src/rose/rose_build_program.h b/src/rose/rose_build_program.h index 27aeffbe..309a1b3e 100644 --- a/src/rose/rose_build_program.h +++ b/src/rose/rose_build_program.h @@ -524,6 +524,190 @@ public: } }; +class RoseInstrCheckShufti16x8 + : public RoseInstrBaseOneTarget { +public: + std::array nib_mask; + std::array bucket_select_mask; + u32 neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckShufti16x8(std::array nib_mask_in, + std::array bucket_select_mask_in, + u32 neg_mask_in, s32 offset_in, + const RoseInstruction *target_in) + : nib_mask(move(nib_mask_in)), + bucket_select_mask(move(bucket_select_mask_in)), + neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckShufti16x8 &ri) const { + return nib_mask == ri.nib_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() 
const override { + return hash_all(static_cast(opcode), nib_mask, + bucket_select_mask, neg_mask, offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckShufti16x8 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return nib_mask == ri.nib_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckShufti32x8 + : public RoseInstrBaseOneTarget { +public: + std::array hi_mask; + std::array lo_mask; + std::array bucket_select_mask; + u32 neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckShufti32x8(std::array hi_mask_in, + std::array lo_mask_in, + std::array bucket_select_mask_in, + u32 neg_mask_in, s32 offset_in, + const RoseInstruction *target_in) + : hi_mask(move(hi_mask_in)), lo_mask(move(lo_mask_in)), + bucket_select_mask(move(bucket_select_mask_in)), + neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckShufti32x8 &ri) const { + return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), hi_mask, lo_mask, + bucket_select_mask, neg_mask, offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckShufti32x8 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckShufti16x16 + : public RoseInstrBaseOneTarget 
{ +public: + std::array hi_mask; + std::array lo_mask; + std::array bucket_select_mask; + u32 neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckShufti16x16(std::array hi_mask_in, + std::array lo_mask_in, + std::array bucket_select_mask_in, + u32 neg_mask_in, s32 offset_in, + const RoseInstruction *target_in) + : hi_mask(move(hi_mask_in)), lo_mask(move(lo_mask_in)), + bucket_select_mask(move(bucket_select_mask_in)), + neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckShufti16x16 &ri) const { + return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), hi_mask, lo_mask, + bucket_select_mask, neg_mask, offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckShufti16x16 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckShufti32x16 + : public RoseInstrBaseOneTarget { +public: + std::array hi_mask; + std::array lo_mask; + std::array bucket_select_mask_hi; + std::array bucket_select_mask_lo; + u32 neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckShufti32x16(std::array hi_mask_in, + std::array lo_mask_in, + std::array bucket_select_mask_hi_in, + std::array bucket_select_mask_lo_in, + u32 neg_mask_in, s32 offset_in, + const RoseInstruction *target_in) + : hi_mask(move(hi_mask_in)), lo_mask(move(lo_mask_in)), + bucket_select_mask_hi(move(bucket_select_mask_hi_in)), + bucket_select_mask_lo(move(bucket_select_mask_lo_in)), + 
neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckShufti32x16 &ri) const { + return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && + bucket_select_mask_hi == ri.bucket_select_mask_hi && + bucket_select_mask_lo == ri.bucket_select_mask_lo && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), hi_mask, lo_mask, + bucket_select_mask_hi, bucket_select_mask_lo, + neg_mask, offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckShufti32x16 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && + bucket_select_mask_hi == ri.bucket_select_mask_hi && + bucket_select_mask_lo == ri.bucket_select_mask_lo && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + class RoseInstrCheckInfix : public RoseInstrBaseOneTargetnib_mask, sizeof(ri->nib_mask)) + << endl; + os << " bucket_select_mask " + << dumpStrMask(ri->bucket_select_mask, + sizeof(ri->bucket_select_mask)) + << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_32x8) { + os << " hi_mask " + << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask)) + << endl; + os << " lo_mask " + << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask)) + << endl; + os << " bucket_select_mask " + << dumpStrMask(ri->bucket_select_mask, + sizeof(ri->bucket_select_mask)) + << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_16x16) { + os << " hi_mask " + << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask)) + << endl; + os << " lo_mask " + << 
dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask)) + << endl; + os << " bucket_select_mask " + << dumpStrMask(ri->bucket_select_mask, + sizeof(ri->bucket_select_mask)) + << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_32x16) { + os << " hi_mask " + << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask)) + << endl; + os << " lo_mask " + << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask)) + << endl; + os << " bucket_select_mask_hi " + << dumpStrMask(ri->bucket_select_mask_hi, + sizeof(ri->bucket_select_mask_hi)) + << endl; + os << " bucket_select_mask_lo " + << dumpStrMask(ri->bucket_select_mask_lo, + sizeof(ri->bucket_select_mask_lo)) + << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_INFIX) { os << " queue " << ri->queue << endl; os << " lag " << ri->lag << endl; diff --git a/src/rose/rose_program.h b/src/rose/rose_program.h index ba3e586b..44d5d524 100644 --- a/src/rose/rose_program.h +++ b/src/rose/rose_program.h @@ -52,6 +52,10 @@ enum RoseInstructionCode { ROSE_INSTR_CHECK_MASK, //!< 8-bytes mask check. ROSE_INSTR_CHECK_MASK_32, //!< 32-bytes and/cmp/neg mask check. ROSE_INSTR_CHECK_BYTE, //!< Single Byte check. + ROSE_INSTR_CHECK_SHUFTI_16x8, //!< Check 16-byte data by 8-bucket shufti. + ROSE_INSTR_CHECK_SHUFTI_32x8, //!< Check 32-byte data by 8-bucket shufti. + ROSE_INSTR_CHECK_SHUFTI_16x16, //!< Check 16-byte data by 16-bucket shufti. + ROSE_INSTR_CHECK_SHUFTI_32x16, //!< Check 32-byte data by 16-bucket shufti. ROSE_INSTR_CHECK_INFIX, //!< Infix engine must be in accept state. ROSE_INSTR_CHECK_PREFIX, //!< Prefix engine must be in accept state. ROSE_INSTR_PUSH_DELAYED, //!< Push delayed literal matches. @@ -184,6 +188,48 @@ struct ROSE_STRUCT_CHECK_BYTE { u32 fail_jump; //!< Jump forward this many bytes on failure. 
}; +// Since m128 and m256 could be missaligned in the bytecode, +// we'll use u8[16] and u8[32] instead in all rose_check_shufti structures. +struct ROSE_STRUCT_CHECK_SHUFTI_16x8 { + u8 code; //!< From enum RoseInstructionCode. + u8 nib_mask[32]; //!< High 16 and low 16 bits nibble mask in shufti. + u8 bucket_select_mask[16]; //!< Mask for bucket assigning. + u32 neg_mask; //!< Negation mask in low 16 bits. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_SHUFTI_32x8 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask[16]; //!< High nibble mask in shufti. + u8 lo_mask[16]; //!< Low nibble mask in shufti. + u8 bucket_select_mask[32]; //!< Mask for bucket assigning. + u32 neg_mask; //!< 32 bits negation mask. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_SHUFTI_16x16 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask[32]; //!< High nibble mask in shufti. + u8 lo_mask[32]; //!< Low nibble mask in shufti. + u8 bucket_select_mask[32]; //!< Mask for bucket assigning. + u32 neg_mask; //!< Negation mask in low 16 bits. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_SHUFTI_32x16 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask[32]; //!< High nibble mask in shufti. + u8 lo_mask[32]; //!< Low nibble mask in shufti. + u8 bucket_select_mask_hi[32]; //!< Bucket mask for high 8 buckets. + u8 bucket_select_mask_lo[32]; //!< Bucket mask for low 8 buckets. + u32 neg_mask; //!< 32 bits negation mask. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + struct ROSE_STRUCT_CHECK_INFIX { u8 code; //!< From enum RoseInstructionCode. u32 queue; //!< Queue of leftfix to check. 
diff --git a/src/rose/validate_shufti.h b/src/rose/validate_shufti.h new file mode 100644 index 00000000..49d2c2fe --- /dev/null +++ b/src/rose/validate_shufti.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef VALIDATE_SHUFTI_H +#define VALIDATE_SHUFTI_H + +#include "ue2common.h" +#include "util/simd_utils.h" + +#if defined(DEBUG) +static +void dumpMask(const void *mask, int len) { + const u8 *c = (const u8 *)mask; + for (int i = 0; i < len; i++) { + printf("%02x", c[i]); + } + printf("\n"); +} +#endif + +static really_inline +int validateShuftiMask16x16(const m256 data, const m256 hi_mask, + const m256 lo_mask, const m256 and_mask, + const u32 neg_mask, const u16 valid_data_mask) { + m256 low4bits = set32x8(0xf); + m256 c_lo = vpshufb(lo_mask, and256(data, low4bits)); + m256 c_hi = vpshufb(hi_mask, rshift64_m256(andnot256(low4bits, data), 4)); + m256 t = and256(c_lo, c_hi); + u32 nresult = movemask256(eq256(and256(t, and_mask), zeroes256())); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 32); + DEBUG_PRINTF("hi_mask\n"); + dumpMask(&hi_mask, 32); + DEBUG_PRINTF("lo_mask\n"); + dumpMask(&lo_mask, 32); + DEBUG_PRINTF("c_lo\n"); + dumpMask(&c_lo, 32); + DEBUG_PRINTF("c_hi\n"); + dumpMask(&c_hi, 32); + DEBUG_PRINTF("and_mask\n"); + dumpMask(&and_mask, 32); + DEBUG_PRINTF("nresult %x\n", nresult); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); +#endif + u32 cmp_result = (((nresult >> 16) & nresult) ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +static really_inline +int validateShuftiMask16x8(const m128 data, const m256 nib_mask, + const m128 and_mask, const u32 neg_mask, + const u16 valid_data_mask) { + m256 data_m256 = combine2x128(rshift64_m128(data, 4), data); + m256 low4bits = set32x8(0xf); + m256 c_nib = vpshufb(nib_mask, and256(data_m256, low4bits)); + m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib)); + m128 nresult = eq128(and128(t, and_mask), zeroes128()); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data_m256, 32); + DEBUG_PRINTF("nib_mask\n"); + dumpMask(&nib_mask, 32); + DEBUG_PRINTF("c_nib\n"); + dumpMask(&c_nib, 32); + DEBUG_PRINTF("nresult\n"); + dumpMask(&nresult, 16); + DEBUG_PRINTF("valid_data_mask 
%x\n", valid_data_mask); +#endif + u32 cmp_result = (movemask128(nresult) ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +static really_inline +int validateShuftiMask32x8(const m256 data, const m256 hi_mask, + const m256 lo_mask, const m256 and_mask, + const u32 neg_mask, const u32 valid_data_mask) { + m256 low4bits = set32x8(0xf); + m256 c_lo = vpshufb(lo_mask, and256(data, low4bits)); + m256 c_hi = vpshufb(hi_mask, rshift64_m256(andnot256(low4bits, data), 4)); + m256 t = and256(c_lo, c_hi); + m256 nresult = eq256(and256(t, and_mask), zeroes256()); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 32); + DEBUG_PRINTF("hi_mask\n"); + dumpMask(&hi_mask, 32); + DEBUG_PRINTF("lo_mask\n"); + dumpMask(&lo_mask, 32); + DEBUG_PRINTF("c_lo\n"); + dumpMask(&c_lo, 32); + DEBUG_PRINTF("c_hi\n"); + dumpMask(&c_hi, 32); + DEBUG_PRINTF("nresult\n"); + dumpMask(&nresult, 32); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); +#endif + u32 cmp_result = (movemask256(nresult) ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +static really_inline +int validateShuftiMask32x16(const m256 data, + const m256 hi_mask_1, const m256 hi_mask_2, + const m256 lo_mask_1, const m256 lo_mask_2, + const m256 bucket_mask_hi, + const m256 bucket_mask_lo, const u32 neg_mask, + const u32 valid_data_mask) { + m256 low4bits = set32x8(0xf); + m256 data_lo = and256(data, low4bits); + m256 data_hi = and256(rshift64_m256(data, 4), low4bits); + m256 c_lo_1 = vpshufb(lo_mask_1, data_lo); + m256 c_lo_2 = vpshufb(lo_mask_2, data_lo); + m256 c_hi_1 = vpshufb(hi_mask_1, data_hi); + m256 c_hi_2 = vpshufb(hi_mask_2, data_hi); + m256 t1 = and256(c_lo_1, c_hi_1); + m256 t2 = and256(c_lo_2, c_hi_2); + m256 result = or256(and256(t1, bucket_mask_lo), and256(t2, bucket_mask_hi)); + u32 nresult = movemask256(eq256(result, zeroes256())); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 32); + DEBUG_PRINTF("data_lo\n"); + dumpMask(&data_lo, 32); + DEBUG_PRINTF("data_hi\n"); + 
dumpMask(&data_hi, 32);
+    DEBUG_PRINTF("hi_mask_1\n");
+    dumpMask(&hi_mask_1, 16);
+    DEBUG_PRINTF("hi_mask_2\n");
+    dumpMask(&hi_mask_2, 16);
+    DEBUG_PRINTF("lo_mask_1\n");
+    dumpMask(&lo_mask_1, 16);
+    DEBUG_PRINTF("lo_mask_2\n");
+    dumpMask(&lo_mask_2, 16);
+    DEBUG_PRINTF("c_lo_1\n");
+    dumpMask(&c_lo_1, 32);
+    DEBUG_PRINTF("c_lo_2\n");
+    dumpMask(&c_lo_2, 32);
+    DEBUG_PRINTF("c_hi_1\n");
+    dumpMask(&c_hi_1, 32);
+    DEBUG_PRINTF("c_hi_2\n");
+    dumpMask(&c_hi_2, 32);
+    DEBUG_PRINTF("result\n");
+    dumpMask(&result, 32);
+    DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask);
+#endif
+    u32 cmp_result = (nresult ^ neg_mask) & valid_data_mask;
+    return !cmp_result;
+}
+#endif
diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h
index b7cb1c0f..4bb055df 100644
--- a/src/util/simd_utils.h
+++ b/src/util/simd_utils.h
@@ -384,6 +384,11 @@ u32 movemask256(m256 a) {
     return lo_mask | (hi_mask << 16);
 }
 
+static really_inline
+m256 set2x128(m128 a) {
+    m256 rv = {a, a};
+    return rv;
+}
 #endif
 
 static really_inline m256 zeroes256(void) {
@@ -534,6 +539,10 @@ static really_inline m256 load2x128(const void *ptr) {
 #endif
 }
 
+static really_inline m256 loadu2x128(const void *ptr) {
+    return set2x128(loadu128(ptr));
+}
+
 // aligned store
 static really_inline void store256(void *ptr, m256 a) {
     assert(ISALIGNED_N(ptr, alignof(m256)));
@@ -632,6 +641,22 @@ char testbit256(m256 val, unsigned int n) {
     return testbit128(sub, n);
 }
 
+static really_really_inline
+m128 movdq_hi(m256 x) {
+    return x.hi;
+}
+
+static really_really_inline
+m128 movdq_lo(m256 x) {
+    return x.lo;
+}
+
+static really_inline
+m256 combine2x128(m128 hi, m128 lo) {
+    m256 rv = {lo, hi};
+    return rv;
+}
+
 #else // AVX2
 
 // switches on bit N in the given vector.
@@ -676,6 +701,14 @@ m128 movdq_lo(m256 x) {
 #define interleave256lo(a, b)   _mm256_unpacklo_epi8(a, b);
 #define vpalignr(r, l, offset)  _mm256_alignr_epi8(r, l, offset)
 
+static really_inline
+m256 combine2x128(m128 hi, m128 lo) {
+#if defined(_mm256_set_m128i)
+    return _mm256_set_m128i(hi, lo);
+#else
+    return insert128to256(cast128to256(lo), hi, 1);
+#endif
+}
 #endif //AVX2
 
 /****