// Patch summary (review preamble; placed ahead of the first file header, the
// patch text below is unchanged).
//
// Purpose: switch the layout of the FDR bytecode from 16-byte rounding to
// ROUNDUP_CL rounding, so that the header, the table, the confirm structures
// and the flood control structures each start on an ISALIGNED_CL boundary.
//
// src/fdr/fdr.c, fdr_engine_exec():
//   - adds assert(ISALIGNED_CL(fdr)) on entry;
//   - locates the table `ft` at fdr + ROUNDUP_CL(sizeof(struct FDR)) instead
//     of fdr + ROUNDUP_16(sizeof(struct FDR));
//   - locates `confBase` at ft + ROUNDUP_CL(fdr->tabSize) instead of
//     ft + fdr->tabSize;
//   - asserts that both `ft` and `confBase` are ISALIGNED_CL.
//
// src/fdr/fdr_compile.cpp, FDRCompiler::setupFDR():
//   - the local tabSize becomes ROUNDUP_CL(eng.getTabSizeBytes());
//   - floodControlTmp / confirmTmp are renamed floodTable / confirmTable;
//   - the three ISALIGNED_16 asserts are removed;
//   - total size becomes headerSize + tabSize +
//     ROUNDUP_CL(confirmTable.size()) + floodTable.size(), with no rounding
//     of the final sum;
//   - header fields (domain, domainMask, tabSize, stride) and
//     createInitialState() are now written before the table is copied;
//   - the write pointer advances by ROUNDUP_CL(confirmTable.size()) after
//     the confirm copy, and fdr->floodOffset is taken from that position.
//
// src/fdr/fdr_confirm_compile.cpp, setupFullConfs():
//   - the second argument of make_zeroed_bytecode_ptr() changes from 16 to
//     64; totalSize is still computed with ROUNDUP_16.
//
// NOTE(review): fdr->tabSize is stored as
// (1 << eng.bits) * (eng.schemeWidth / 8), unrounded, while the compile-side
// layout uses ROUNDUP_CL(eng.getTabSizeBytes()) and the runtime uses
// ROUNDUP_CL(fdr->tabSize). These agree only if getTabSizeBytes() returns the
// same value as that expression -- not visible here, TODO confirm.
// NOTE(review): ROUNDUP_CL / ISALIGNED_CL are defined outside this patch;
// presumably a 64-byte cache line, given the 64 passed to
// make_zeroed_bytecode_ptr() -- confirm.
// NOTE(review): line breaks inside the hunks appear to have been collapsed,
// and template argument lists look stripped (bare `bytecode_ptr`, `vector`,
// `make_zeroed_bytecode_ptr`); verify against the original patch before
// applying.
diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 92e75aaa..aa5f1804 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -725,13 +725,18 @@ static never_inline hwlm_error_t fdr_engine_exec(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control) { + assert(ISALIGNED_CL(fdr)); + u32 floodBackoff = FLOOD_BACKOFF_START; u32 last_match_id = INVALID_MATCH_ID; u32 domain_mask_flipped = ~fdr->domainMask; u8 stride = fdr->stride; const u64a *ft = - (const u64a *)((const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR))); - const u32 *confBase = (const u32 *)((const u8 *)ft + fdr->tabSize); + (const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR))); + assert(ISALIGNED_CL(ft)); + const u32 *confBase = + (const u32 *)((const u8 *)ft + ROUNDUP_CL(fdr->tabSize)); + assert(ISALIGNED_CL(confBase)); struct zone zones[ZONE_MAX]; assert(fdr->domain > 8 && fdr->domain < 16); diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index c4ea50f2..987379dd 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -144,50 +144,59 @@ void FDRCompiler::createInitialState(FDR *fdr) { } } +/** + * \brief Lay out FDR structures in bytecode. + * + * Note that each major structure (header, table, confirm, flood control) is + * cacheline-aligned. 
+ */ bytecode_ptr FDRCompiler::setupFDR() { - size_t tabSize = eng.getTabSizeBytes(); + size_t tabSize = ROUNDUP_CL(eng.getTabSizeBytes()); - auto floodControlTmp = setupFDRFloodControl(lits, eng, grey); - auto confirmTmp = setupFullConfs(lits, eng, bucketToLits, make_small); + auto floodTable = setupFDRFloodControl(lits, eng, grey); + auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small); - assert(ISALIGNED_16(tabSize)); - assert(ISALIGNED_16(confirmTmp.size())); - assert(ISALIGNED_16(floodControlTmp.size())); - size_t headerSize = ROUNDUP_16(sizeof(FDR)); - size_t size = ROUNDUP_16(headerSize + tabSize + confirmTmp.size() + - floodControlTmp.size()); + size_t headerSize = ROUNDUP_CL(sizeof(FDR)); + size_t size = headerSize + tabSize + ROUNDUP_CL(confirmTable.size()) + + floodTable.size(); DEBUG_PRINTF("sizes base=%zu tabSize=%zu confirm=%zu floodControl=%zu " "total=%zu\n", - headerSize, tabSize, confirmTmp.size(), floodControlTmp.size(), + headerSize, tabSize, confirmTable.size(), floodTable.size(), size); auto fdr = make_zeroed_bytecode_ptr(size, 64); assert(fdr); // otherwise would have thrown std::bad_alloc + u8 *fdr_base = (u8 *)fdr.get(); + + // Write header. 
 fdr->size = size; fdr->engineID = eng.getID(); fdr->maxStringLen = verify_u32(maxLen(lits)); - createInitialState(fdr.get()); - - u8 *fdr_base = (u8 *)fdr.get(); - u8 *ptr = fdr_base + ROUNDUP_16(sizeof(FDR)); - copy(tab.begin(), tab.end(), ptr); - ptr += tabSize; - - memcpy(ptr, confirmTmp.get(), confirmTmp.size()); - ptr += confirmTmp.size(); - - fdr->floodOffset = verify_u32(ptr - fdr_base); - memcpy(ptr, floodControlTmp.get(), floodControlTmp.size()); - ptr += floodControlTmp.size(); - - /* we are allowing domains 9 to 15 only */ - assert(eng.bits > 8 && eng.bits < 16); + assert(eng.bits > 8 && eng.bits < 16); // we allow domains 9 to 15 only fdr->domain = eng.bits; fdr->domainMask = (1 << eng.bits) - 1; fdr->tabSize = (1 << eng.bits) * (eng.schemeWidth / 8); fdr->stride = eng.stride; + createInitialState(fdr.get()); + + // Write table. + u8 *ptr = fdr_base + ROUNDUP_CL(sizeof(FDR)); + assert(ISALIGNED_CL(ptr)); + copy(tab.begin(), tab.end(), ptr); + ptr += tabSize; + + // Write confirm structures. + assert(ISALIGNED_CL(ptr)); + memcpy(ptr, confirmTable.get(), confirmTable.size()); + ptr += ROUNDUP_CL(confirmTable.size()); + + // Write flood control structures. + assert(ISALIGNED_CL(ptr)); + fdr->floodOffset = verify_u32(ptr - fdr_base); + memcpy(ptr, floodTable.get(), floodTable.size()); + ptr += floodTable.size(); // last write, no need to round up return fdr; } diff --git a/src/fdr/fdr_confirm_compile.cpp b/src/fdr/fdr_confirm_compile.cpp index 7eacb495..fa064bf3 100644 --- a/src/fdr/fdr_confirm_compile.cpp +++ b/src/fdr/fdr_confirm_compile.cpp @@ -367,7 +367,7 @@ setupFullConfs(const vector &lits, u32 totalConfSwitchSize = nBuckets * sizeof(u32); u32 totalSize = ROUNDUP_16(totalConfSwitchSize + totalConfirmSize); - auto buf = make_zeroed_bytecode_ptr(totalSize, 16); + auto buf = make_zeroed_bytecode_ptr(totalSize, 64); assert(buf); // otherwise would have thrown std::bad_alloc u32 *confBase = (u32 *)buf.get();