From de61b32e98ce272ff7f8d3b1aa0ef90515c925ad Mon Sep 17 00:00:00 2001
From: Justin Viiret <justin.viiret@intel.com>
Date: Thu, 7 Jan 2016 11:56:57 +1100
Subject: [PATCH] Use fatbit for anch log, delay slots in scratch

Since these structures are in scratch, they do not have to be as small
as possible and we can use fatbit instead of multibit to improve
performance.
---
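Notes (editorial, not part of the commit): the fatbit_* calls mirror the
mmbit_* calls they replace, so the conversion is largely mechanical. A
minimal sketch of one log row's lifecycle, using only the signatures
visible in this patch; the heap allocation is a hypothetical stand-in for
the scratch carving done in alloc_scratch() below:

    #include <stdlib.h>
    #include "ue2common.h"     /* u32 */
    #include "util/fatbit.h"   /* fatbit_size/clear/set/iterate */
    #include "util/multibit.h" /* MMB_INVALID iteration sentinel */

    static void demo_row(u32 n_keys) { /* assumes n_keys > 3 */
        /* fatbit_size() sizes the row where mmbit_size() was used
         * before; fatbit trades that extra space for faster operations.
         * malloc's alignment is assumed to satisfy the ISALIGNED()
         * asserts this patch adds to the fatbit operations. */
        struct fatbit *row = malloc(fatbit_size(n_keys));
        if (!row) {
            return;
        }
        fatbit_clear(row); /* no size argument, unlike mmbit_clear */
        fatbit_set(row, n_keys, 3);
        /* Iteration keeps the multibit idiom: MMB_INVALID both seeds
         * and terminates the walk; keys return in increasing order. */
        for (u32 it = fatbit_iterate(row, n_keys, MMB_INVALID);
             it != MMB_INVALID; it = fatbit_iterate(row, n_keys, it)) {
            /* handle key 'it' */
        }
        free(row);
    }
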
 src/rose/catchup.c               | 12 ++++-----
 src/rose/match.c                 | 43 +++++++++++++++-----------------
 src/rose/match.h                 |  9 ++++---
 src/rose/program_runtime.h       |  8 +++---
 src/rose/rose_build_bytecode.cpp |  1 -
 src/rose/rose_dump.cpp           |  1 -
 src/rose/rose_internal.h         |  1 -
 src/scratch.c                    | 42 ++++++++++++++++++-------------
 src/scratch.h                    | 19 +++++++-------
 src/util/fatbit.h                |  6 ++++-
 10 files changed, 74 insertions(+), 68 deletions(-)

diff --git a/src/rose/catchup.c b/src/rose/catchup.c
index d1ef41ff..6893df0e 100644
--- a/src/rose/catchup.c
+++ b/src/rose/catchup.c
@@ -105,13 +105,13 @@ void nextAnchoredMatch(const struct RoseEngine *t, struct RoseContext *tctxt,
     assert(tctxt->curr_anchored_loc != MMB_INVALID);
 
     struct hs_scratch *scratch = tctxtToScratch(tctxt);
-    u8 **anchoredRows = getAnchoredLog(scratch);
+    struct fatbit **anchoredRows = getAnchoredLog(scratch);
 
     u32 region_width = t->anchoredMatches;
-    u8 *curr_row = anchoredRows[tctxt->curr_anchored_loc];
+    struct fatbit *curr_row = anchoredRows[tctxt->curr_anchored_loc];
 
-    tctxt->curr_row_offset = mmbit_iterate(curr_row, region_width,
-                                           tctxt->curr_row_offset);
+    tctxt->curr_row_offset = fatbit_iterate(curr_row, region_width,
+                                            tctxt->curr_row_offset);
     DEBUG_PRINTF("next %u [idx = %u] @%llu\n", *reportId,
                  tctxt->curr_row_offset, *end);
     if (tctxt->curr_row_offset != MMB_INVALID) {
@@ -132,8 +132,8 @@ void nextAnchoredMatch(const struct RoseEngine *t, struct RoseContext *tctxt,
     assert(tctxt->curr_anchored_loc < scratch->anchored_region_len);
     curr_row = anchoredRows[tctxt->curr_anchored_loc];
 
-    tctxt->curr_row_offset = mmbit_iterate(curr_row, region_width,
-                                           MMB_INVALID);
+    tctxt->curr_row_offset = fatbit_iterate(curr_row, region_width,
+                                            MMB_INVALID);
     assert(tctxt->curr_row_offset != MMB_INVALID);
 
     *end = tctxt->curr_anchored_loc + t->maxSafeAnchoredDROffset + 1;
diff --git a/src/rose/match.c b/src/rose/match.c
index f614423b..6397b90e 100644
--- a/src/rose/match.c
+++ b/src/rose/match.c
@@ -125,7 +125,7 @@ void recordAnchoredMatch(struct RoseContext *tctxt, ReportID reportId,
                          u64a end) {
     struct hs_scratch *scratch = tctxtToScratch(tctxt);
     const struct RoseEngine *t = scratch->core_info.rose;
-    u8 **anchoredRows = getAnchoredLog(scratch);
+    struct fatbit **anchoredRows = getAnchoredLog(scratch);
 
     DEBUG_PRINTF("record %u @ %llu\n", reportId, end);
     assert(end - t->maxSafeAnchoredDROffset >= 1);
@@ -135,13 +135,13 @@ void recordAnchoredMatch(struct RoseContext *tctxt, ReportID reportId,
 
     if (!bf64_set(&scratch->am_log_sum, adj_end)) {
         // first time, clear row
-        mmbit_clear(anchoredRows[adj_end], t->anchoredMatches);
+        fatbit_clear(anchoredRows[adj_end]);
     }
 
     u32 idx = getAnchoredInverseMap(t)[reportId];
     DEBUG_PRINTF("record %u @ %llu index %u\n", reportId, end, idx);
     assert(idx < t->anchoredMatches);
-    mmbit_set(anchoredRows[adj_end], t->anchoredMatches, idx);
+    fatbit_set(anchoredRows[adj_end], t->anchoredMatches, idx);
 }
 
 static rose_inline
@@ -150,21 +150,21 @@ void recordAnchoredLiteralMatch(struct RoseContext *tctxt, u32 literal_id,
     assert(end);
     struct hs_scratch *scratch = tctxtToScratch(tctxt);
     const struct RoseEngine *t = scratch->core_info.rose;
-    u8 **anchoredLiteralRows = getAnchoredLiteralLog(scratch);
+    struct fatbit **anchoredLiteralRows = getAnchoredLiteralLog(scratch);
 
     DEBUG_PRINTF("record %u @ %llu\n", literal_id, end);
 
     if (!bf64_set(&scratch->al_log_sum, end - 1)) {
         // first time, clear row
         DEBUG_PRINTF("clearing %llu/%u\n", end - 1, t->anchored_count);
-        mmbit_clear(anchoredLiteralRows[end - 1], t->anchored_count);
+        fatbit_clear(anchoredLiteralRows[end - 1]);
     }
 
     u32 rel_idx = literal_id - t->anchored_base_id;
     DEBUG_PRINTF("record %u @ %llu index %u/%u\n", literal_id, end, rel_idx,
                  t->anchored_count);
     assert(rel_idx < t->anchored_count);
-    mmbit_set(anchoredLiteralRows[end - 1], t->anchored_count, rel_idx);
+    fatbit_set(anchoredLiteralRows[end - 1], t->anchored_count, rel_idx);
 }
 
 hwlmcb_rv_t roseHandleChainMatch(const struct RoseEngine *t, ReportID r,
@@ -447,11 +447,11 @@ hwlmcb_rv_t roseProcessMainMatch(const struct RoseEngine *t, u64a end,
 
 static rose_inline
 hwlmcb_rv_t playDelaySlot(const struct RoseEngine *t, struct RoseContext *tctxt,
-                          const u8 *delaySlotBase, size_t delaySlotSize,
-                          u32 vicIndex, u64a offset) {
+                          struct fatbit **delaySlots, u32 vicIndex,
+                          u64a offset) {
     /* assert(!tctxt->in_anchored); */
     assert(vicIndex < DELAY_SLOT_COUNT);
-    const u8 *vicSlot = delaySlotBase + delaySlotSize * vicIndex;
+    const struct fatbit *vicSlot = delaySlots[vicIndex];
     u32 delay_count = t->delay_count;
 
     if (offset < t->floatingMinLiteralMatchOffset) {
@@ -463,8 +463,8 @@ hwlmcb_rv_t playDelaySlot(const struct RoseEngine *t, struct RoseContext *tctxt,
     roseFlushLastByteHistory(t, scratch->core_info.state, offset, tctxt);
     tctxt->lastEndOffset = offset;
 
-    for (u32 it = mmbit_iterate(vicSlot, delay_count, MMB_INVALID);
-         it != MMB_INVALID; it = mmbit_iterate(vicSlot, delay_count, it)) {
+    for (u32 it = fatbit_iterate(vicSlot, delay_count, MMB_INVALID);
+         it != MMB_INVALID; it = fatbit_iterate(vicSlot, delay_count, it)) {
         u32 literal_id = t->delay_base_id + it;
 
         UNUSED rose_group old_groups = tctxt->groups;
@@ -490,12 +490,13 @@ hwlmcb_rv_t playDelaySlot(const struct RoseEngine *t, struct RoseContext *tctxt,
 static really_inline
 hwlmcb_rv_t flushAnchoredLiteralAtLoc(const struct RoseEngine *t,
                                       struct RoseContext *tctxt, u32 curr_loc) {
-    u8 *curr_row = getAnchoredLiteralLog(tctxtToScratch(tctxt))[curr_loc - 1];
+    struct hs_scratch *scratch = tctxtToScratch(tctxt);
+    struct fatbit *curr_row = getAnchoredLiteralLog(scratch)[curr_loc - 1];
     u32 region_width = t->anchored_count;
     DEBUG_PRINTF("report matches at curr loc\n");
 
-    for (u32 it = mmbit_iterate(curr_row, region_width, MMB_INVALID);
-         it != MMB_INVALID; it = mmbit_iterate(curr_row, region_width, it)) {
+    for (u32 it = fatbit_iterate(curr_row, region_width, MMB_INVALID);
+         it != MMB_INVALID; it = fatbit_iterate(curr_row, region_width, it)) {
         DEBUG_PRINTF("it = %u/%u\n", it, region_width);
 
         u32 literal_id = t->anchored_base_id + it;
@@ -519,7 +520,6 @@ hwlmcb_rv_t flushAnchoredLiteralAtLoc(const struct RoseEngine *t,
     }
 
     /* clear row; does not invalidate iteration */
-    struct hs_scratch *scratch = tctxtToScratch(tctxt);
     bf64_unset(&scratch->al_log_sum, curr_loc - 1);
 
     return HWLM_CONTINUE_MATCHING;
@@ -566,7 +566,7 @@ hwlmcb_rv_t flushAnchoredLiterals(const struct RoseEngine *t,
 static really_inline
 hwlmcb_rv_t playVictims(const struct RoseEngine *t, struct RoseContext *tctxt,
                         u32 *anchored_it, u64a lastEnd, u64a victimDelaySlots,
-                        u8 *delaySlotBase, size_t delaySlotSize) {
+                        struct fatbit **delaySlots) {
     /* assert (!tctxt->in_anchored); */
 
     while (victimDelaySlots) {
@@ -579,9 +579,8 @@ hwlmcb_rv_t playVictims(const struct RoseEngine *t, struct RoseContext *tctxt,
             return HWLM_TERMINATE_MATCHING;
         }
 
-        if (playDelaySlot(t, tctxt, delaySlotBase, delaySlotSize,
-                          vic % DELAY_SLOT_COUNT, vicOffset)
-            == HWLM_TERMINATE_MATCHING) {
+        if (playDelaySlot(t, tctxt, delaySlots, vic % DELAY_SLOT_COUNT,
+                          vicOffset) == HWLM_TERMINATE_MATCHING) {
             return HWLM_TERMINATE_MATCHING;
         }
     }
@@ -609,8 +608,7 @@ hwlmcb_rv_t flushQueuedLiterals_i(struct RoseContext *tctxt, u64a currEnd) {
     }
 
     {
-        u8 *delaySlotBase = getDelaySlots(scratch);
-        size_t delaySlotSize = t->delay_slot_size;
+        struct fatbit **delaySlots = getDelaySlots(tctxtToScratch(tctxt));
 
         u32 lastIndex = lastEnd & DELAY_MASK;
         u32 currIndex = currEnd & DELAY_MASK;
@@ -664,8 +662,7 @@ hwlmcb_rv_t flushQueuedLiterals_i(struct RoseContext *tctxt, u64a currEnd) {
         }
 
         if (playVictims(t, tctxt, &anchored_it, lastEnd, victimDelaySlots,
-                        delaySlotBase, delaySlotSize)
-            == HWLM_TERMINATE_MATCHING) {
+                        delaySlots) == HWLM_TERMINATE_MATCHING) {
             return HWLM_TERMINATE_MATCHING;
         }
     }
diff --git a/src/rose/match.h b/src/rose/match.h
index f3b8fe73..2b6dfb5d 100644
--- a/src/rose/match.h
+++ b/src/rose/match.h
@@ -40,6 +40,7 @@
 #include "nfa/nfa_api_util.h"
 #include "som/som_runtime.h"
 #include "util/bitutils.h"
+#include "util/fatbit.h"
 #include "util/internal_report.h"
 #include "util/multibit.h"
 
@@ -60,16 +61,16 @@ int roseAnchoredCallback(u64a end, u32 id, void *ctx);
 
 static rose_inline
 void resetAnchoredLog(const struct RoseEngine *t, struct hs_scratch *scratch) {
-    u8 **anchoredRows = getAnchoredLog(scratch);
+    struct fatbit **anchoredRows = getAnchoredLog(scratch);
     u32 region_width = t->anchoredMatches;
     struct RoseContext *tctxt = &scratch->tctxt;
 
     tctxt->curr_anchored_loc = bf64_iterate(scratch->am_log_sum, MMB_INVALID);
     if (tctxt->curr_anchored_loc != MMB_INVALID) {
         assert(tctxt->curr_anchored_loc < scratch->anchored_region_len);
-        u8 *curr_row = anchoredRows[tctxt->curr_anchored_loc];
-        tctxt->curr_row_offset = mmbit_iterate(curr_row, region_width,
-                                               MMB_INVALID);
+        struct fatbit *curr_row = anchoredRows[tctxt->curr_anchored_loc];
+        tctxt->curr_row_offset = fatbit_iterate(curr_row, region_width,
+                                                MMB_INVALID);
         assert(tctxt->curr_row_offset != MMB_INVALID);
     }
     DEBUG_PRINTF("AL reset --> %u, %u\n", tctxt->curr_anchored_loc,
diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h
index e8e60c7f..309fee5b 100644
--- a/src/rose/program_runtime.h
+++ b/src/rose/program_runtime.h
@@ -127,16 +127,16 @@ void rosePushDelayedMatch(const struct RoseEngine *t, u32 delay,
     }
 
     const u32 delay_count = t->delay_count;
-    u8 *slot = getDelaySlots(tctxtToScratch(tctxt)) +
-               (t->delay_slot_size * slot_index);
+    struct fatbit **delaySlots = getDelaySlots(tctxtToScratch(tctxt));
+    struct fatbit *slot = delaySlots[slot_index];
 
     DEBUG_PRINTF("pushing tab %u into slot %u\n", delay_index, slot_index);
     if (!(tctxt->filledDelayedSlots & (1U << slot_index))) {
         tctxt->filledDelayedSlots |= 1U << slot_index;
-        mmbit_clear(slot, delay_count);
+        fatbit_clear(slot);
     }
 
-    mmbit_set(slot, delay_count, delay_index);
+    fatbit_set(slot, delay_count, delay_index);
 }
 
 static rose_inline
diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index 45af3bb7..c640f091 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -4311,7 +4311,6 @@ aligned_unique_ptr<RoseEngine> RoseBuildImpl::buildFinalEngine(u32 minWidth) {
     u32 delay_count = verify_u32(final_id_to_literal.size() - delay_base_id);
 
     engine->delay_count = delay_count;
-    engine->delay_slot_size = mmbit_size(delay_count);
     engine->delay_base_id = delay_base_id;
     engine->anchored_base_id = anchored_base_id;
     engine->anchored_count = delay_base_id - anchored_base_id;
diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp
index cd70c734..25ec7bae 100644
--- a/src/rose/rose_dump.cpp
+++ b/src/rose/rose_dump.cpp
@@ -884,7 +884,6 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) {
     DUMP_U32(t, size);
     DUMP_U32(t, anchoredMatches);
     DUMP_U32(t, delay_count);
-    DUMP_U32(t, delay_slot_size);
     DUMP_U32(t, delay_base_id);
     DUMP_U32(t, anchored_count);
     DUMP_U32(t, anchored_base_id);
diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h
index c9025600..a1f91cd3 100644
--- a/src/rose/rose_internal.h
+++ b/src/rose/rose_internal.h
@@ -447,7 +447,6 @@ struct RoseEngine {
     u32 size; // (bytes)
     u32 anchoredMatches; /* number of anchored roles generating matches */
     u32 delay_count; /* number of delayed literal ids. */
-    u32 delay_slot_size; /* size of delay slot mmbit. */
    u32 delay_base_id; /* literal id of the first delayed literal.
                         * delayed literal ids are contiguous */
     u32 anchored_count; /* number of anchored literal ids */
diff --git a/src/scratch.c b/src/scratch.c
index 30241ab4..eff2289a 100644
--- a/src/scratch.c
+++ b/src/scratch.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -74,14 +74,16 @@ hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) {
     assert(anchored_literal_region_len < 8 * sizeof(s->am_log_sum));
 
     size_t anchored_region_size = anchored_region_len
-        * (mmbit_size(anchored_region_width) + sizeof(u8 *));
+        * (fatbit_size(anchored_region_width) + sizeof(struct fatbit *));
     anchored_region_size = ROUNDUP_N(anchored_region_size, 8);
 
     size_t anchored_literal_region_size = anchored_literal_region_len
-        * (mmbit_size(anchored_literal_region_width) + sizeof(u8 *));
+        * (fatbit_size(anchored_literal_region_width) + sizeof(struct fatbit *));
     anchored_literal_region_size = ROUNDUP_N(anchored_literal_region_size, 8);
 
-    size_t delay_size = mmbit_size(proto->delay_count) * DELAY_SLOT_COUNT;
+    size_t delay_region_size = DELAY_SLOT_COUNT *
+        (fatbit_size(proto->delay_count) + sizeof(struct fatbit *));
+    delay_region_size = ROUNDUP_N(delay_region_size, 8);
 
     size_t nfa_context_size = 2 * sizeof(struct NFAContext512) + 127;
 
@@ -96,7 +98,8 @@ hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) {
                  + 2 * fatbit_size(deduperCount) /* ditto som logs */
                  + 2 * sizeof(u64a) * deduperCount /* start offsets for som */
                  + anchored_region_size
-                 + anchored_literal_region_size + qmpq_size + delay_size
+                 + anchored_literal_region_size + qmpq_size
+                 + delay_region_size
                  + som_store_size
                  + som_now_size
                  + som_attempted_size
@@ -140,23 +143,28 @@ hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) {
     s->som_attempted_store = (u64a *)current;
     current += som_attempted_store_size;
 
-    s->delay_slots = (u8 *)current;
-    current += delay_size;
-
     current = ROUNDUP_PTR(current, 8);
-    s->am_log = (u8 **)current;
-    current += sizeof(u8 *) * anchored_region_len;
-    for (u32 i = 0; i < anchored_region_len; i++) {
-        s->am_log[i] = (u8 *)current;
-        current += mmbit_size(anchored_region_width);
+    s->delay_slots = (struct fatbit **)current;
+    current += sizeof(struct fatbit *) * DELAY_SLOT_COUNT;
+    for (u32 i = 0; i < DELAY_SLOT_COUNT; i++) {
+        s->delay_slots[i] = (struct fatbit *)current;
+        current += fatbit_size(proto->delay_count);
     }
 
     current = ROUNDUP_PTR(current, 8);
-    s->al_log = (u8 **)current;
-    current += sizeof(u8 *) * anchored_literal_region_len;
+    s->am_log = (struct fatbit **)current;
+    current += sizeof(struct fatbit *) * anchored_region_len;
+    for (u32 i = 0; i < anchored_region_len; i++) {
+        s->am_log[i] = (struct fatbit *)current;
+        current += fatbit_size(anchored_region_width);
+    }
+
+    current = ROUNDUP_PTR(current, 8);
+    s->al_log = (struct fatbit **)current;
+    current += sizeof(struct fatbit *) * anchored_literal_region_len;
     for (u32 i = 0; i < anchored_literal_region_len; i++) {
-        s->al_log[i] = (u8 *)current;
-        current += mmbit_size(anchored_literal_region_width);
+        s->al_log[i] = (struct fatbit *)current;
+        current += fatbit_size(anchored_literal_region_width);
     }
 
     current = ROUNDUP_PTR(current, 8);
diff --git a/src/scratch.h b/src/scratch.h
index f23ff5dc..fa112a56 100644
--- a/src/scratch.h
+++ b/src/scratch.h
@@ -37,7 +37,6 @@
 #define SCRATCH_H_DA6D4FC06FF410
 
 #include "ue2common.h"
-#include "util/multibit_internal.h"
 #include "rose/rose_types.h"
 
 #ifdef __cplusplus
@@ -133,7 +132,7 @@ struct RoseContext {
 
 struct match_deduper {
     struct fatbit *log[2]; /**< even, odd logs */
-    struct fatbit *som_log[2]; /**< even, odd mmbit logs for som */
+    struct fatbit *som_log[2]; /**< even, odd fatbit logs for som */
     u64a *som_start_log[2]; /**< even, odd start offset logs for som */
     u32 log_size;
     u64a current_report_offset;
@@ -162,9 +161,9 @@ struct ALIGN_CL_DIRECTIVE hs_scratch {
     struct mq *queues;
     struct fatbit *aqa; /**< active queue array; fatbit of queues that are valid
                          * & active */
-    u8 *delay_slots;
-    u8 **am_log;
-    u8 **al_log;
+    struct fatbit **delay_slots;
+    struct fatbit **am_log;
+    struct fatbit **al_log;
     u64a am_log_sum;
     u64a al_log_sum;
     struct catchup_pq catchup_pq;
@@ -178,7 +177,7 @@ struct ALIGN_CL_DIRECTIVE hs_scratch {
     u32 scratchSize;
     u8 ALIGN_DIRECTIVE fdr_temp_buf[FDR_TEMP_BUF_SIZE];
     u32 handledKeyCount;
-    struct fatbit *handled_roles; /**< mmbit of ROLES (not states) already
+    struct fatbit *handled_roles; /**< fatbit of ROLES (not states) already
                                    * handled by this literal */
     u64a *som_store; /**< array of som locations */
     u64a *som_attempted_store; /**< array of som locations for fail stores */
@@ -198,18 +197,18 @@ struct hs_scratch *tctxtToScratch(struct RoseContext *tctxt) {
 }
 
 static really_inline
-u8 **getAnchoredLog(struct hs_scratch *scratch) { /* array of mmbit ptr */
+struct fatbit **getAnchoredLog(struct hs_scratch *scratch) {
     return scratch->am_log;
 }
 
-/* array of mmbit ptr; TODO: why not an array of mmbits? */
+/* array of fatbit ptr; TODO: why not an array of fatbits? */
 static really_inline
-u8 **getAnchoredLiteralLog(struct hs_scratch *scratch) {
+struct fatbit **getAnchoredLiteralLog(struct hs_scratch *scratch) {
     return scratch->al_log;
 }
 
 static really_inline
-u8 *getDelaySlots(struct hs_scratch *scratch) {
+struct fatbit **getDelaySlots(struct hs_scratch *scratch) {
     return scratch->delay_slots;
 }
 
diff --git a/src/util/fatbit.h b/src/util/fatbit.h
index cf906269..ad607638 100644
--- a/src/util/fatbit.h
+++ b/src/util/fatbit.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -58,21 +58,25 @@ void fatbit_clear(struct fatbit *bits) {
 
 static really_inline
 char fatbit_set(struct fatbit *bits, u32 total_bits, u32 key) {
+    assert(ISALIGNED(bits));
     return mmbit_set(bits->fb_int.raw, total_bits, key);
 }
 
 static really_inline
 void fatbit_unset(struct fatbit *bits, u32 total_bits, u32 key) {
+    assert(ISALIGNED(bits));
     mmbit_unset(bits->fb_int.raw, total_bits, key);
 }
 
 static really_inline
 char fatbit_isset(const struct fatbit *bits, u32 total_bits, u32 key) {
+    assert(ISALIGNED(bits));
     return mmbit_isset(bits->fb_int.raw, total_bits, key);
 }
 
 static really_inline
 u32 fatbit_iterate(const struct fatbit *bits, u32 total_bits, u32 it_in) {
+    assert(ISALIGNED(bits));
     /* TODO: iterate_flat could be specialised as we don't have to worry about
      * partial blocks. */
     return mmbit_iterate(bits->fb_int.raw, total_bits, it_in);
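
Note (not part of the patch): the scratch.c layout hunk above repeats one
carving idiom three times (delay slots, am_log, al_log) — align the cursor,
lay down a table of row pointers, then place the fatbit rows behind it. A
hypothetical helper showing that idiom in isolation; carve_log() does not
exist in the tree, while ROUNDUP_PTR() and fatbit_size() are the real ones
used above:

    /* Carve a log of 'len' fatbit rows of 'width' keys each out of a
     * pre-sized arena, returning the advanced cursor. The arena must
     * reserve len * (fatbit_size(width) + sizeof(struct fatbit *))
     * bytes plus rounding, matching the sizing arithmetic above. */
    static char *carve_log(char *current, struct fatbit ***log_out,
                           u32 len, u32 width) {
        current = ROUNDUP_PTR(current, 8); /* rows need 8-byte alignment */
        struct fatbit **log = (struct fatbit **)current;
        current += sizeof(struct fatbit *) * len;
        for (u32 i = 0; i < len; i++) {
            log[i] = (struct fatbit *)current;
            current += fatbit_size(width);
        }
        *log_out = log;
        return current;
    }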