diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c index d01e30e8..a574052a 100644 --- a/src/rose/program_runtime.c +++ b/src/rose/program_runtime.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -767,10 +767,10 @@ int roseCheckMask32(const struct core_info *ci, const u8 *and_mask, c_shift = c_len - ci->len; c_len = ci->len; } - copy_upto_32_bytes((u8 *)&data - offset, ci->buf, c_len); + copy_upto_64_bytes((u8 *)&data - offset, ci->buf, c_len); } assert(h_shift + h_len + c_len + c_shift == 32); - copy_upto_32_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len); + copy_upto_64_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len); } else { if (offset + 32 > (s64a)ci->len) { if (offset >= (s64a)ci->len) { @@ -779,7 +779,7 @@ int roseCheckMask32(const struct core_info *ci, const u8 *and_mask, } c_len = ci->len - offset; c_shift = 32 - c_len; - copy_upto_32_bytes((u8 *)&data, ci->buf + offset, c_len); + copy_upto_64_bytes((u8 *)&data, ci->buf + offset, c_len); } else { data = loadu256(ci->buf + offset); } @@ -800,12 +800,90 @@ int roseCheckMask32(const struct core_info *ci, const u8 *and_mask, return 0; } -// get 128/256 bits data from history and current buffer. +#ifdef HAVE_AVX512 +static rose_inline +int roseCheckMask64(const struct core_info *ci, const u8 *and_mask, + const u8 *cmp_mask, const u64a neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + m512 data = zeroes512(); // consists of the following four parts. + s32 c_shift = 0; // blank bytes after current. + s32 h_shift = 0; // blank bytes before history. + s32 h_len = 64; // number of bytes from history buffer. + s32 c_len = 0; // number of bytes from current buffer. + /* h_shift + h_len + c_len + c_shift = 64 need to be hold.*/ + + if (offset < 0) { + s32 h_offset = 0; // the start offset in history buffer. + if (offset < -(s64a)ci->hlen) { + if (offset + 64 <= -(s64a)ci->hlen) { + DEBUG_PRINTF("all before history\n"); + return 1; + } + h_shift = -(offset + (s64a)ci->hlen); + h_len = 64 - h_shift; + } else { + h_offset = ci->hlen + offset; + } + if (offset + 64 > 0) { + // part in current buffer. + c_len = offset + 64; + h_len = -(offset + h_shift); + if (c_len > (s64a)ci->len) { + // out of current buffer. + c_shift = c_len - ci->len; + c_len = ci->len; + } + copy_upto_64_bytes((u8 *)&data - offset, ci->buf, c_len); + } + assert(h_shift + h_len + c_len + c_shift == 64); + copy_upto_64_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len); + } else { + if (offset + 64 > (s64a)ci->len) { + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("all in the future.\n"); + return 1; + } + c_len = ci->len - offset; + c_shift = 64 - c_len; + copy_upto_64_bytes((u8 *)&data, ci->buf + offset, c_len); + } else { + data = loadu512(ci->buf + offset); + } + } + DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift); + DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len); + // we use valid_data_mask to blind bytes before history/in the future. + u64a valid_data_mask; + valid_data_mask = (~0ULL) << (h_shift + c_shift) >> (c_shift); + + m512 and_mask_m512 = loadu512(and_mask); + m512 cmp_mask_m512 = loadu512(cmp_mask); + + if (validateMask64(data, valid_data_mask, and_mask_m512, + cmp_mask_m512, neg_mask)) { + DEBUG_PRINTF("Mask64 passed\n"); + return 1; + } + return 0; +} +#endif + +// get 128/256/512 bits data from history and current buffer. // return data and valid_data_mask. static rose_inline -u32 getBufferDataComplex(const struct core_info *ci, const s64a loc, +u64a getBufferDataComplex(const struct core_info *ci, const s64a loc, u8 *data, const u32 data_len) { - assert(data_len == 16 || data_len == 32); + assert(data_len == 16 || data_len == 32 || data_len == 64); s32 c_shift = 0; // blank bytes after current. s32 h_shift = 0; // blank bytes before history. s32 h_len = data_len; // number of bytes from history buffer. @@ -831,10 +909,10 @@ u32 getBufferDataComplex(const struct core_info *ci, const s64a loc, c_shift = c_len - ci->len; c_len = ci->len; } - copy_upto_32_bytes(data - loc, ci->buf, c_len); + copy_upto_64_bytes(data - loc, ci->buf, c_len); } assert(h_shift + h_len + c_len + c_shift == (s32)data_len); - copy_upto_32_bytes(data + h_shift, ci->hbuf + h_offset, h_len); + copy_upto_64_bytes(data + h_shift, ci->hbuf + h_offset, h_len); } else { if (loc + data_len > (s64a)ci->len) { if (loc >= (s64a)ci->len) { @@ -843,8 +921,14 @@ u32 getBufferDataComplex(const struct core_info *ci, const s64a loc, } c_len = ci->len - loc; c_shift = data_len - c_len; - copy_upto_32_bytes(data, ci->buf + loc, c_len); + copy_upto_64_bytes(data, ci->buf + loc, c_len); } else { +#ifdef HAVE_AVX512 + if (data_len == 64) { + storeu512(data, loadu512(ci->buf + loc)); + return ~0ULL; + } +#endif if (data_len == 16) { storeu128(data, loadu128(ci->buf + loc)); return 0xffff; @@ -857,6 +941,11 @@ u32 getBufferDataComplex(const struct core_info *ci, const s64a loc, DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift); DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len); +#ifdef HAVE_AVX512 + if (data_len == 64) { + return (~0ULL) << (h_shift + c_shift) >> c_shift; + } +#endif if (data_len == 16) { return (u16)(0xffff << (h_shift + c_shift)) >> c_shift; } else { @@ -886,6 +975,19 @@ m256 getData256(const struct core_info *ci, s64a offset, u32 *valid_data_mask) { return *(m256 *)data; } +#ifdef HAVE_AVX512 +static rose_inline +m512 getData512(const struct core_info *ci, s64a offset, u64a *valid_data_mask) { + if (offset > 0 && offset + sizeof(m512) <= ci->len) { + *valid_data_mask = ~0ULL; + return loadu512(ci->buf + offset); + } + ALIGN_CL_DIRECTIVE u8 data[sizeof(m512)]; + *valid_data_mask = getBufferDataComplex(ci, offset, data, 64); + return *(m512 *)data; +} +#endif + static rose_inline int roseCheckShufti16x8(const struct core_info *ci, const u8 *nib_mask, const u8 *bucket_select_mask, u32 neg_mask, @@ -1025,6 +1127,83 @@ int roseCheckShufti32x16(const struct core_info *ci, const u8 *hi_mask, } } +#ifdef HAVE_AVX512 +static rose_inline +int roseCheckShufti64x8(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask, + u64a neg_mask, s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u64a valid_data_mask = 0; + m512 data = getData512(ci, offset, &valid_data_mask); + + if (unlikely(!valid_data_mask)) { + return 1; + } + + m512 hi_mask_m512 = loadu512(hi_mask); + m512 lo_mask_m512 = loadu512(lo_mask); + m512 bucket_select_mask_m512 = loadu512(bucket_select_mask); + if (validateShuftiMask64x8(data, hi_mask_m512, lo_mask_m512, + bucket_select_mask_m512, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 64x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti64x16(const struct core_info *ci, const u8 *hi_mask_1, + const u8 *hi_mask_2, const u8 *lo_mask_1, + const u8 *lo_mask_2, const u8 *bucket_select_mask_hi, + const u8 *bucket_select_mask_lo, u64a neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u64a valid_data_mask = 0; + m512 data = getData512(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m512 hi_mask_1_m512 = loadu512(hi_mask_1); + m512 hi_mask_2_m512 = loadu512(hi_mask_2); + m512 lo_mask_1_m512 = loadu512(lo_mask_1); + m512 lo_mask_2_m512 = loadu512(lo_mask_2); + + m512 bucket_select_mask_hi_m512 = loadu512(bucket_select_mask_hi); + m512 bucket_select_mask_lo_m512 = loadu512(bucket_select_mask_lo); + if (validateShuftiMask64x16(data, hi_mask_1_m512, hi_mask_2_m512, + lo_mask_1_m512, lo_mask_2_m512, + bucket_select_mask_hi_m512, + bucket_select_mask_lo_m512, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 64x16 successfully\n"); + return 1; + } else { + return 0; + } +} +#endif + static rose_inline int roseCheckSingleLookaround(const struct RoseEngine *t, const struct hs_scratch *scratch, @@ -2068,6 +2247,12 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, &&LABEL_ROSE_INSTR_FLUSH_COMBINATION, &&LABEL_ROSE_INSTR_SET_EXHAUST, &&LABEL_ROSE_INSTR_LAST_FLUSH_COMBINATION +#ifdef HAVE_AVX512 + , + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_64x8, //!< Check 64-byte data by 8-bucket shufti. + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_64x16, //!< Check 64-byte data by 16-bucket shufti. + &&LABEL_ROSE_INSTR_CHECK_MASK_64 //!< 64-bytes and/cmp/neg mask check. +#endif }; #endif @@ -2258,6 +2443,45 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, } PROGRAM_NEXT_INSTRUCTION +#ifdef HAVE_AVX512 + PROGRAM_CASE(CHECK_MASK_64) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask64(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_64x8) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti64x8(ci, ri->hi_mask, ri->lo_mask, + ri->bucket_select_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_64x16) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti64x16(ci, ri->hi_mask_1, ri->hi_mask_2, + ri->lo_mask_1, ri->lo_mask_2, + ri->bucket_select_mask_hi, + ri->bucket_select_mask_lo, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP; + } + } + PROGRAM_NEXT_INSTRUCTION +#endif + PROGRAM_CASE(CHECK_INFIX) { if (!roseTestInfix(t, scratch, ri->queue, ri->lag, ri->report, end)) { @@ -2945,6 +3169,19 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION +#ifdef HAVE_AVX512 + L_PROGRAM_CASE(CHECK_MASK_64) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask64(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION +#endif + L_PROGRAM_CASE(CHECK_BYTE) { const struct core_info *ci = &scratch->core_info; if (!roseCheckByte(ci, ri->and_mask, ri->cmp_mask, diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp index 8999daef..dbc938a5 100644 --- a/src/rose/rose_build_dump.cpp +++ b/src/rose/rose_build_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -757,13 +757,12 @@ CharReach shufti2cr(const u8 *lo, const u8 *hi, u8 bucket_mask) { static void dumpLookaroundShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi, - const u8 *bucket_mask, u32 neg_mask, s32 offset) { - assert(len == 16 || len == 32); + const u8 *bucket_mask, u64a neg_mask, s32 offset) { + assert(len == 16 || len == 32 || len == 64); os << " contents:" << endl; for (u32 idx = 0; idx < len; idx++) { CharReach cr = shufti2cr(lo, hi, bucket_mask[idx]); - - if (neg_mask & (1U << idx)) { + if (neg_mask & (1ULL << idx)) { cr.flip(); } @@ -779,14 +778,13 @@ void dumpLookaroundShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi, static void dumpLookaroundShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi, const u8 *lo_2, const u8 *hi_2, const u8 *bucket_mask, - const u8 *bucket_mask_2, u32 neg_mask, s32 offset) { - assert(len == 16 || len == 32); + const u8 *bucket_mask_2, u64a neg_mask, s32 offset) { + assert(len == 16 || len == 32 || len == 64); os << " contents:" << endl; for (u32 idx = 0; idx < len; idx++) { CharReach cr = shufti2cr(lo, hi, bucket_mask[idx]); cr |= shufti2cr(lo_2, hi_2, bucket_mask_2[idx]); - - if (neg_mask & (1U << idx)) { + if (neg_mask & (1ULL << idx)) { cr.flip(); } @@ -970,6 +968,20 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_MASK_64) { + os << " and_mask " + << dumpStrMask(ri->and_mask, sizeof(ri->and_mask)) + << endl; + os << " cmp_mask " + << dumpStrMask(ri->cmp_mask, sizeof(ri->cmp_mask)) + << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_BYTE) { os << " and_mask 0x" << std::hex << std::setw(2) << std::setfill('0') << u32{ri->and_mask} << std::dec @@ -1072,6 +1084,60 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_SHUFTI_64x8) { + os << " hi_mask " + << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask)) + << endl; + os << " lo_mask " + << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask)) + << endl; + os << " bucket_select_mask " + << dumpStrMask(ri->bucket_select_mask, + sizeof(ri->bucket_select_mask)) + << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + dumpLookaroundShufti(os, 64, ri->lo_mask, ri->hi_mask, + ri->bucket_select_mask, ri->neg_mask, + ri->offset); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_64x16) { + os << " hi_mask_1 " + << dumpStrMask(ri->hi_mask_1, sizeof(ri->hi_mask_1)) + << endl; + os << " hi_mask_2 " + << dumpStrMask(ri->hi_mask_2, sizeof(ri->hi_mask_2)) + << endl; + os << " lo_mask_1 " + << dumpStrMask(ri->lo_mask_1, sizeof(ri->lo_mask_1)) + << endl; + os << " lo_mask_2 " + << dumpStrMask(ri->lo_mask_2, sizeof(ri->lo_mask_2)) + << endl; + os << " bucket_select_mask_hi " + << dumpStrMask(ri->bucket_select_mask_hi, + sizeof(ri->bucket_select_mask_hi)) + << endl; + os << " bucket_select_mask_lo " + << dumpStrMask(ri->bucket_select_mask_lo, + sizeof(ri->bucket_select_mask_lo)) + << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + dumpLookaroundShufti(os, 64, ri->lo_mask_1, ri->hi_mask_1, + ri->lo_mask_2, ri->hi_mask_2, + ri->bucket_select_mask_lo, + ri->bucket_select_mask_hi, + ri->neg_mask, ri->offset); + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_INFIX) { os << " queue " << ri->queue << endl; os << " lag " << ri->lag << endl; diff --git a/src/rose/rose_build_instructions.cpp b/src/rose/rose_build_instructions.cpp index c503f731..f96221b2 100644 --- a/src/rose/rose_build_instructions.cpp +++ b/src/rose/rose_build_instructions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019, Intel Corporation + * Copyright (c) 2017-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -162,6 +162,17 @@ void RoseInstrCheckMask32::write(void *dest, RoseEngineBlob &blob, inst->fail_jump = calc_jump(offset_map, this, target); } +void RoseInstrCheckMask64::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(and_mask), end(and_mask), inst->and_mask); + copy(begin(cmp_mask), end(cmp_mask), inst->cmp_mask); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + void RoseInstrCheckByte::write(void *dest, RoseEngineBlob &blob, const OffsetMap &offset_map) const { RoseInstrBase::write(dest, blob, offset_map); @@ -227,6 +238,36 @@ void RoseInstrCheckShufti32x16::write(void *dest, RoseEngineBlob &blob, inst->fail_jump = calc_jump(offset_map, this, target); } +void RoseInstrCheckShufti64x8::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask), end(hi_mask), inst->hi_mask); + copy(begin(lo_mask), end(lo_mask), inst->lo_mask); + copy(begin(bucket_select_mask), end(bucket_select_mask), + inst->bucket_select_mask); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckShufti64x16::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask_1), end(hi_mask_1), inst->hi_mask_1); + copy(begin(hi_mask_2), end(hi_mask_2), inst->hi_mask_2); + copy(begin(lo_mask_1), end(lo_mask_1), inst->lo_mask_1); + copy(begin(lo_mask_2), end(lo_mask_2), inst->lo_mask_2); + copy(begin(bucket_select_mask_hi), end(bucket_select_mask_hi), + inst->bucket_select_mask_hi); + copy(begin(bucket_select_mask_lo), end(bucket_select_mask_lo), + inst->bucket_select_mask_lo); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + void RoseInstrCheckInfix::write(void *dest, RoseEngineBlob &blob, const OffsetMap &offset_map) const { RoseInstrBase::write(dest, blob, offset_map); diff --git a/src/rose/rose_build_instructions.h b/src/rose/rose_build_instructions.h index 306a4166..f18f4a47 100644 --- a/src/rose/rose_build_instructions.h +++ b/src/rose/rose_build_instructions.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019, Intel Corporation + * Copyright (c) 2017-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -519,6 +519,43 @@ public: } }; +class RoseInstrCheckMask64 + : public RoseInstrBaseOneTarget { +public: + std::array and_mask; + std::array cmp_mask; + u64a neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckMask64(std::array and_mask_in, + std::array cmp_mask_in, u64a neg_mask_in, + s32 offset_in, const RoseInstruction *target_in) + : and_mask(std::move(and_mask_in)), cmp_mask(std::move(cmp_mask_in)), + neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + bool operator==(const RoseInstrCheckMask64 &ri) const { + return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { + return hash_all(opcode, and_mask, cmp_mask, neg_mask, offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckMask64 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + class RoseInstrCheckByte : public RoseInstrBaseOneTarget { +public: + std::array hi_mask; + std::array lo_mask; + std::array bucket_select_mask; + u64a neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckShufti64x8(std::array hi_mask_in, + std::array lo_mask_in, + std::array bucket_select_mask_in, + u64a neg_mask_in, s32 offset_in, + const RoseInstruction *target_in) + : hi_mask(std::move(hi_mask_in)), lo_mask(std::move(lo_mask_in)), + bucket_select_mask(std::move(bucket_select_mask_in)), + neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckShufti64x8 &ri) const { + return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { + return hash_all(opcode, hi_mask, lo_mask, bucket_select_mask, neg_mask, + offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckShufti64x8 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckShufti64x16 + : public RoseInstrBaseOneTarget { +public: + std::array hi_mask_1; + std::array hi_mask_2; + std::array lo_mask_1; + std::array lo_mask_2; + std::array bucket_select_mask_hi; + std::array bucket_select_mask_lo; + u64a neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckShufti64x16(std::array hi_mask_1_in, + std::array hi_mask_2_in, + std::array lo_mask_1_in, + std::array lo_mask_2_in, + std::array bucket_select_mask_hi_in, + std::array bucket_select_mask_lo_in, + u64a neg_mask_in, s32 offset_in, + const RoseInstruction *target_in) + : hi_mask_1(std::move(hi_mask_1_in)), hi_mask_2(std::move(hi_mask_2_in)), + lo_mask_1(std::move(lo_mask_1_in)), lo_mask_2(std::move(lo_mask_2_in)), + bucket_select_mask_hi(std::move(bucket_select_mask_hi_in)), + bucket_select_mask_lo(std::move(bucket_select_mask_lo_in)), + neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckShufti64x16 &ri) const { + return hi_mask_1 == ri.hi_mask_1 && hi_mask_2 == ri.hi_mask_2 && + lo_mask_1 == ri.lo_mask_1 && lo_mask_2 == ri.lo_mask_2 && + bucket_select_mask_hi == ri.bucket_select_mask_hi && + bucket_select_mask_lo == ri.bucket_select_mask_lo && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { + return hash_all(opcode, hi_mask_1, hi_mask_2, lo_mask_1, lo_mask_2, + bucket_select_mask_hi, bucket_select_mask_lo, neg_mask, + offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckShufti64x16 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return hi_mask_1 == ri.hi_mask_1 && hi_mask_2 == ri.hi_mask_2 && + lo_mask_1 == ri.lo_mask_1 && lo_mask_2 == ri.lo_mask_2 && + bucket_select_mask_hi == ri.bucket_select_mask_hi && + bucket_select_mask_lo == ri.bucket_select_mask_lo && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + class RoseInstrCheckInfix : public RoseInstrBaseOneTarget &look, return true; } +static +bool makeRoleMask64(const vector &look, + RoseProgram &program, const target_t &target) { + if (!target.has_avx512()) { + return false; + } + + if (look.back().offset >= look.front().offset + 64) { + return false; + } + s32 base_offset = verify_s32(look.front().offset); + array and_mask, cmp_mask; + and_mask.fill(0); + cmp_mask.fill(0); + u64a neg_mask = 0; + for (const auto &entry : look) { + u8 andmask_u8, cmpmask_u8, flip; + if (!checkReachWithFlip(entry.reach, andmask_u8, cmpmask_u8, flip)) { + return false; + } + u32 shift = entry.offset - base_offset; + assert(shift < 64); + and_mask[shift] = andmask_u8; + cmp_mask[shift] = cmpmask_u8; + if (flip) { + neg_mask |= 1ULL << shift; + } + } + + DEBUG_PRINTF("and_mask %s\n", + convertMaskstoString(and_mask.data(), 64).c_str()); + DEBUG_PRINTF("cmp_mask %s\n", + convertMaskstoString(cmp_mask.data(), 64).c_str()); + DEBUG_PRINTF("neg_mask %llx\n", neg_mask); + DEBUG_PRINTF("base_offset %d\n", base_offset); + + const auto *end_inst = program.end_instruction(); + auto ri = make_unique(and_mask, cmp_mask, neg_mask, + base_offset, end_inst); + program.add_before_end(move(ri)); + return true; +} + // Sorting by the size of every bucket. // Used in map, cmpNibble>. struct cmpNibble { @@ -1084,6 +1127,7 @@ void getAllBuckets(const vector &look, } else { neg_mask ^= 1ULL << (entry.offset - base_offset); } + map lo2hi; // We treat Ascii Table as a 16x16 grid. // Push every row in cr into lo2hi and mark the row number. @@ -1237,6 +1281,7 @@ makeCheckShufti16x16(u32 offset_range, u8 bucket_idx, (hi_mask, lo_mask, bucket_select_mask_32, neg_mask & 0xffff, base_offset, end_inst); } + static unique_ptr makeCheckShufti32x16(u32 offset_range, u8 bucket_idx, @@ -1255,10 +1300,83 @@ makeCheckShufti32x16(u32 offset_range, u8 bucket_idx, } static -bool makeRoleShufti(const vector &look, RoseProgram &program) { +unique_ptr +makeCheckShufti64x8(u32 offset_range, u8 bucket_idx, + const array &hi_mask, const array &lo_mask, + const array &bucket_select_mask, + u64a neg_mask, s32 base_offset, + const RoseInstruction *end_inst) { + if (offset_range > 64 || bucket_idx > 8) { + return nullptr; + } + array hi_mask_64; + array lo_mask_64; + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_64.begin()); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_64.begin() + 16); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_64.begin() + 32); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_64.begin() + 48); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_64.begin()); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_64.begin() + 16); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_64.begin() + 32); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_64.begin() + 48); + + return make_unique + (hi_mask_64, lo_mask_64, bucket_select_mask, + neg_mask, base_offset, end_inst); +} + +static +unique_ptr +makeCheckShufti64x16(u32 offset_range, u8 bucket_idx, + const array &hi_mask, const array &lo_mask, + const array &bucket_select_mask_lo, + const array &bucket_select_mask_hi, + u64a neg_mask, s32 base_offset, + const RoseInstruction *end_inst) { + if (offset_range > 64 || bucket_idx > 16) { + return nullptr; + } + + array hi_mask_1; + array hi_mask_2; + array lo_mask_1; + array lo_mask_2; + + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin()); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin() + 16); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin() + 32); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin() + 48); + copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin()); + copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin() + 16); + copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin() + 32); + copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin() + 48); + + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_1.begin()); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_1.begin() + 16); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_1.begin() + 32); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_1.begin() + 48); + copy(lo_mask.begin() + 16, lo_mask.begin() + 32, lo_mask_2.begin()); + copy(lo_mask.begin() + 16, lo_mask.begin() + 32, lo_mask_2.begin() + 16); + copy(lo_mask.begin() + 16, lo_mask.begin() + 32, lo_mask_2.begin() + 32); + copy(lo_mask.begin() + 16, lo_mask.begin() + 32, lo_mask_2.begin() + 48); + + return make_unique + (hi_mask_1, hi_mask_2, lo_mask_1, lo_mask_2, bucket_select_mask_hi, + bucket_select_mask_lo, neg_mask, base_offset, end_inst); +} + +static +bool makeRoleShufti(const vector &look, RoseProgram &program, + const target_t &target) { + s32 offset_limit; + if (target.has_avx512()) { + offset_limit = 64; + } else { + offset_limit = 32; + } s32 base_offset = verify_s32(look.front().offset); - if (look.back().offset >= base_offset + 32) { + if (look.back().offset >= base_offset + offset_limit) { return false; } @@ -1266,17 +1384,40 @@ bool makeRoleShufti(const vector &look, RoseProgram &program) { u64a neg_mask_64; array hi_mask; array lo_mask; + array bucket_select_hi_64; // for AVX512 + array bucket_select_lo_64; // for AVX512 array bucket_select_hi; array bucket_select_lo; hi_mask.fill(0); lo_mask.fill(0); + bucket_select_hi_64.fill(0); + bucket_select_lo_64.fill(0); bucket_select_hi.fill(0); // will not be used in 16x8 and 32x8. bucket_select_lo.fill(0); - if (!getShuftiMasks(look, hi_mask, lo_mask, bucket_select_hi.data(), - bucket_select_lo.data(), neg_mask_64, bucket_idx, 32)) { - return false; + if (target.has_avx512()) { + if (!getShuftiMasks(look, hi_mask, lo_mask, bucket_select_hi_64.data(), + bucket_select_lo_64.data(), neg_mask_64, bucket_idx, + 32)) { + return false; + } + copy(bucket_select_hi_64.begin(), bucket_select_hi_64.begin() + 32, + bucket_select_hi.begin()); + copy(bucket_select_lo_64.begin(), bucket_select_lo_64.begin() + 32, + bucket_select_lo.begin()); + + DEBUG_PRINTF("bucket_select_hi_64 %s\n", + convertMaskstoString(bucket_select_hi_64.data(), 64).c_str()); + DEBUG_PRINTF("bucket_select_lo_64 %s\n", + convertMaskstoString(bucket_select_lo_64.data(), 64).c_str()); + } else { + if (!getShuftiMasks(look, hi_mask, lo_mask, bucket_select_hi.data(), + bucket_select_lo.data(), neg_mask_64, bucket_idx, + 32)) { + return false; + } } + u32 neg_mask = (u32)neg_mask_64; DEBUG_PRINTF("hi_mask %s\n", @@ -1299,6 +1440,13 @@ bool makeRoleShufti(const vector &look, RoseProgram &program) { bucket_select_lo, neg_mask, base_offset, end_inst); } + if (target.has_avx512()) { + if (!ri) { + ri = makeCheckShufti64x8(offset_range, bucket_idx, hi_mask, lo_mask, + bucket_select_lo_64, neg_mask_64, + base_offset, end_inst); + } + } if (!ri) { ri = makeCheckShufti16x16(offset_range, bucket_idx, hi_mask, lo_mask, bucket_select_lo, bucket_select_hi, @@ -1309,6 +1457,13 @@ bool makeRoleShufti(const vector &look, RoseProgram &program) { bucket_select_lo, bucket_select_hi, neg_mask, base_offset, end_inst); } + if (target.has_avx512()) { + if (!ri) { + ri = makeCheckShufti64x16(offset_range, bucket_idx, hi_mask, lo_mask, + bucket_select_lo_64, bucket_select_hi_64, + neg_mask_64, base_offset, end_inst); + } + } assert(ri); program.add_before_end(move(ri)); @@ -1321,7 +1476,7 @@ bool makeRoleShufti(const vector &look, RoseProgram &program) { */ static void makeLookaroundInstruction(const vector &look, - RoseProgram &program) { + RoseProgram &program, const target_t &target) { assert(!look.empty()); if (makeRoleByte(look, program)) { @@ -1345,7 +1500,11 @@ void makeLookaroundInstruction(const vector &look, return; } - if (makeRoleShufti(look, program)) { + if (makeRoleMask64(look, program, target)) { + return; + } + + if (makeRoleShufti(look, program, target)) { return; } @@ -1386,7 +1545,7 @@ void makeCheckLitMaskInstruction(const RoseBuildImpl &build, u32 lit_id, return; // all caseful chars handled by HWLM mask. } - makeLookaroundInstruction(look, program); + makeLookaroundInstruction(look, program, build.cc.target_info); } static @@ -1730,7 +1889,7 @@ void makeRoleLookaround(const RoseBuildImpl &build, findLookaroundMasks(build, v, look_more); mergeLookaround(look, look_more); if (!look.empty()) { - makeLookaroundInstruction(look, program); + makeLookaroundInstruction(look, program, build.cc.target_info); } return; } diff --git a/src/rose/rose_program.h b/src/rose/rose_program.h index e5485476..7e21303c 100644 --- a/src/rose/rose_program.h +++ b/src/rose/rose_program.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -208,7 +208,11 @@ enum RoseInstructionCode { */ ROSE_INSTR_LAST_FLUSH_COMBINATION, - LAST_ROSE_INSTRUCTION = ROSE_INSTR_LAST_FLUSH_COMBINATION //!< Sentinel. + ROSE_INSTR_CHECK_SHUFTI_64x8, //!< Check 64-byte data by 8-bucket shufti. + ROSE_INSTR_CHECK_SHUFTI_64x16, //!< Check 64-byte data by 16-bucket shufti. + ROSE_INSTR_CHECK_MASK_64, //!< 64-bytes and/cmp/neg mask check. + + LAST_ROSE_INSTRUCTION = ROSE_INSTR_CHECK_MASK_64 //!< Sentinel. }; struct ROSE_STRUCT_END { @@ -285,6 +289,15 @@ struct ROSE_STRUCT_CHECK_MASK_32 { u32 fail_jump; //!< Jump forward this many bytes on failure. }; +struct ROSE_STRUCT_CHECK_MASK_64 { + u8 code; //!< From enum RoseInstructionCode. + u8 and_mask[64]; //!< 64-byte and mask. + u8 cmp_mask[64]; //!< 64-byte cmp mask. + u64a neg_mask; //!< negation mask with 32 bits. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + struct ROSE_STRUCT_CHECK_BYTE { u8 code; //!< From enum RoseInstructionCode. u8 and_mask; //!< 8-bits and mask. @@ -336,6 +349,29 @@ struct ROSE_STRUCT_CHECK_SHUFTI_32x16 { u32 fail_jump; //!< Jump forward this many bytes on failure. }; +struct ROSE_STRUCT_CHECK_SHUFTI_64x8 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask[64]; //!< High nibble mask in shufti. + u8 lo_mask[64]; //!< Low nibble mask in shufti. + u8 bucket_select_mask[64]; //!< Mask for bucket assigning. + u64a neg_mask; //!< 64 bits negation mask. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_SHUFTI_64x16 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask_1[64]; //!< 4 copies of 0-15 High nibble mask. + u8 hi_mask_2[64]; //!< 4 copies of 16-32 High nibble mask. + u8 lo_mask_1[64]; //!< 4 copies of 0-15 Low nibble mask. + u8 lo_mask_2[64]; //!< 4 copies of 16-32 Low nibble mask. + u8 bucket_select_mask_hi[64]; //!< Bucket mask for high 8 buckets. + u8 bucket_select_mask_lo[64]; //!< Bucket mask for low 8 buckets. + u64a neg_mask; //!< 64 bits negation mask. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + struct ROSE_STRUCT_CHECK_INFIX { u8 code; //!< From enum RoseInstructionCode. u32 queue; //!< Queue of leftfix to check. diff --git a/src/rose/stream_long_lit.h b/src/rose/stream_long_lit.h index 34867608..df9b57f4 100644 --- a/src/rose/stream_long_lit.h +++ b/src/rose/stream_long_lit.h @@ -201,12 +201,12 @@ const u8 *prepScanBuffer(const struct core_info *ci, } else { // Copy: first chunk from history buffer. assert(overhang <= ci->hlen); - copy_upto_32_bytes(tempbuf, ci->hbuf + ci->hlen - overhang, + copy_upto_64_bytes(tempbuf, ci->hbuf + ci->hlen - overhang, overhang); // Copy: second chunk from current buffer. size_t copy_buf_len = LONG_LIT_HASH_LEN - overhang; assert(copy_buf_len <= ci->len); - copy_upto_32_bytes(tempbuf + overhang, ci->buf, copy_buf_len); + copy_upto_64_bytes(tempbuf + overhang, ci->buf, copy_buf_len); // Read from our temporary buffer for the hash. base = tempbuf; } diff --git a/src/rose/validate_mask.h b/src/rose/validate_mask.h index ac8cc312..8191db52 100644 --- a/src/rose/validate_mask.h +++ b/src/rose/validate_mask.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,6 +41,17 @@ void validateMask32Print(const u8 *mask) { } printf("\n"); } + +#ifdef HAVE_AVX512 +static +void validateMask64Print(const u8 *mask) { + int i; + for (i = 0; i < 64; i++) { + printf("%02x ", mask[i]); + } + printf("\n"); +} +#endif #endif // check positive bytes in cmp_result. @@ -115,4 +126,29 @@ int validateMask32(const m256 data, const u32 valid_data_mask, } } +#ifdef HAVE_AVX512 +static really_inline +int validateMask64(const m512 data, const u64a valid_data_mask, + const m512 and_mask, const m512 cmp_mask, + const u64a neg_mask) { + u64a cmp_result = ~eq512mask(and512(data, and_mask), cmp_mask); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + validateMask64Print((const u8 *)&data); + DEBUG_PRINTF("cmp_result\n"); + validateMask64Print((const u8 *)&cmp_result); +#endif + DEBUG_PRINTF("cmp_result %016llx neg_mask %016llx\n", cmp_result, neg_mask); + DEBUG_PRINTF("valid_data_mask %016llx\n", valid_data_mask); + + if ((cmp_result & valid_data_mask) == (neg_mask & valid_data_mask)) { + DEBUG_PRINTF("checkCompareResult64 passed\n"); + return 1; + } else { + DEBUG_PRINTF("checkCompareResult64 failed\n"); + return 0; + } +} +#endif + #endif diff --git a/src/rose/validate_shufti.h b/src/rose/validate_shufti.h index 3b91f091..88427027 100644 --- a/src/rose/validate_shufti.h +++ b/src/rose/validate_shufti.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -175,6 +175,84 @@ int validateShuftiMask32x16(const m256 data, return !cmp_result; } +#ifdef HAVE_AVX512 +static really_inline +int validateShuftiMask64x8(const m512 data, const m512 hi_mask, + const m512 lo_mask, const m512 and_mask, + const u64a neg_mask, const u64a valid_data_mask) { + m512 low4bits = set64x8(0xf); + m512 c_lo = pshufb_m512(lo_mask, and512(data, low4bits)); + m512 c_hi = pshufb_m512(hi_mask, + rshift64_m512(andnot512(low4bits, data), 4)); + m512 t = and512(c_lo, c_hi); + u64a nresult = eq512mask(and512(t, and_mask), zeroes512()); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 64); + DEBUG_PRINTF("hi_mask\n"); + dumpMask(&hi_mask, 64); + DEBUG_PRINTF("lo_mask\n"); + dumpMask(&lo_mask, 64); + DEBUG_PRINTF("c_lo\n"); + dumpMask(&c_lo, 64); + DEBUG_PRINTF("c_hi\n"); + dumpMask(&c_hi, 64); + DEBUG_PRINTF("nresult %llx\n", nresult); + DEBUG_PRINTF("valid_data_mask %llx\n", valid_data_mask); +#endif + u64a cmp_result = (nresult ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +static really_inline +int validateShuftiMask64x16(const m512 data, + const m512 hi_mask_1, const m512 hi_mask_2, + const m512 lo_mask_1, const m512 lo_mask_2, + const m512 and_mask_hi, const m512 and_mask_lo, + const u64a neg_mask, const u64a valid_data_mask) { + m512 low4bits = set64x8(0xf); + m512 data_lo = and512(data, low4bits); + m512 data_hi = and512(rshift64_m512(data, 4), low4bits); + m512 c_lo_1 = pshufb_m512(lo_mask_1, data_lo); + m512 c_lo_2 = pshufb_m512(lo_mask_2, data_lo); + m512 c_hi_1 = pshufb_m512(hi_mask_1, data_hi); + m512 c_hi_2 = pshufb_m512(hi_mask_2, data_hi); + m512 t1 = and512(c_lo_1, c_hi_1); + m512 t2 = and512(c_lo_2, c_hi_2); + m512 result = or512(and512(t1, and_mask_lo), and512(t2, and_mask_hi)); + u64a nresult = eq512mask(result, zeroes512()); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 64); + DEBUG_PRINTF("data_lo\n"); + dumpMask(&data_lo, 64); + DEBUG_PRINTF("data_hi\n"); + dumpMask(&data_hi, 64); + DEBUG_PRINTF("hi_mask_1\n"); + dumpMask(&hi_mask_1, 64); + DEBUG_PRINTF("hi_mask_2\n"); + dumpMask(&hi_mask_2, 64); + DEBUG_PRINTF("lo_mask_1\n"); + dumpMask(&lo_mask_1, 64); + DEBUG_PRINTF("lo_mask_2\n"); + dumpMask(&lo_mask_2, 64); + DEBUG_PRINTF("c_lo_1\n"); + dumpMask(&c_lo_1, 64); + DEBUG_PRINTF("c_lo_2\n"); + dumpMask(&c_lo_2, 64); + DEBUG_PRINTF("c_hi_1\n"); + dumpMask(&c_hi_1, 64); + DEBUG_PRINTF("c_hi_2\n"); + dumpMask(&c_hi_2, 64); + DEBUG_PRINTF("result\n"); + dumpMask(&result, 64); + DEBUG_PRINTF("valid_data_mask %llx\n", valid_data_mask); +#endif + u64a cmp_result = (nresult ^ neg_mask) & valid_data_mask; + return !cmp_result; +} +#endif + static really_inline int checkMultipath32(u32 data, u32 hi_bits, u32 lo_bits) { u32 t = ~(data | hi_bits); diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index 5270808a..81816cf1 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -424,6 +424,11 @@ static really_inline m256 loadu256(const void *ptr) { return _mm256_loadu_si256((const m256 *)ptr); } +static really_inline +m256 loadu_maskz_m256(__mmask32 k, const void *ptr) { + return _mm256_maskz_loadu_epi8(k, ptr); +} + // unaligned store static really_inline void storeu256(void *ptr, m256 a) { _mm256_storeu_si256((m256 *)ptr, a); @@ -712,6 +717,22 @@ m512 loadu512(const void *ptr) { return _mm512_loadu_si512(ptr); } +// unaligned store +static really_inline +void storeu512(void *ptr, m512 a) { +#if defined(HAVE_AVX512) + _mm512_storeu_si512((m512 *)ptr, a); +#elif defined(HAVE_AVX2) + storeu256(ptr, a.lo); + storeu256((char *)ptr + 32, a.hi); +#else + storeu128(ptr, a.lo.lo); + storeu128((char *)ptr + 16, a.lo.hi); + storeu128((char *)ptr + 32, a.hi.lo); + storeu128((char *)ptr + 48, a.hi.hi); +#endif +} + static really_inline m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { return _mm512_maskz_loadu_epi8(k, ptr); @@ -722,6 +743,11 @@ m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { return _mm512_mask_loadu_epi8(src, k, ptr); } +static really_inline +void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) { + _mm512_mask_storeu_epi8(ptr, k, a); +} + static really_inline m512 set_mask_m512(__mmask64 k) { return _mm512_movm_epi8(k); diff --git a/src/util/copybytes.h b/src/util/copybytes.h index 872b8d28..7f37d96b 100644 --- a/src/util/copybytes.h +++ b/src/util/copybytes.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,7 +33,7 @@ #include "simd_utils.h" static really_inline -void copy_upto_32_bytes(u8 *dst, const u8 *src, unsigned int len) { +void copy_upto_64_bytes(u8 *dst, const u8 *src, unsigned int len) { switch (len) { case 0: break; @@ -72,14 +72,41 @@ void copy_upto_32_bytes(u8 *dst, const u8 *src, unsigned int len) { case 16: storeu128(dst, loadu128(src)); break; - case 32: - storeu256(dst, loadu256(src)); - break; - default: - assert(len < 32); + case 17: + case 18: + case 19: + case 20: + case 21: + case 22: + case 23: + case 24: + case 25: + case 26: + case 27: + case 28: + case 29: + case 30: + case 31: storeu128(dst + len - 16, loadu128(src + len - 16)); storeu128(dst, loadu128(src)); break; + case 32: + storeu256(dst, loadu256(src)); + break; +#ifdef HAVE_AVX512 + case 64: + storebytes512(dst, loadu512(src), 64); + break; + default: + assert(len < 64); + u64a k = (1ULL << len) - 1; + storeu_mask_m512(dst, k, loadu_maskz_m512(k, src)); + break; +#else + default: + assert(0); + break; +#endif } }