Use fatbit for anch log, delay slots in scratch

Since these structures are in scratch, they do not have to be as small
as possible, so we can use fatbit instead of multibit to improve
performance.
Justin Viiret 2016-01-07 11:56:57 +11:00 committed by Matthew Barr
parent 1c2fca8840
commit de61b32e98
10 changed files with 74 additions and 68 deletions
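
The API difference is small but visible throughout the diff below: a fatbit's capacity is fixed when its backing storage is sized with fatbit_size(), so clearing no longer needs the live bit count, while set/iterate keep mmbit-style arguments. A minimal sketch of the before/after call pattern, assuming Hyperscan's internal util/multibit.h and util/fatbit.h headers (the log_match_* helpers are hypothetical, not from this commit):

    #include "util/multibit.h"
    #include "util/fatbit.h"

    /* Before: a row is a raw multibit buffer; every call re-supplies the size. */
    static void log_match_multibit(u8 *row, u32 num_keys, u32 key) {
        mmbit_clear(row, num_keys);
        mmbit_set(row, num_keys, key);
    }

    /* After: a row is a fatbit whose capacity was fixed at allocation via
     * fatbit_size(num_keys), so clear takes no size argument. */
    static void log_match_fatbit(struct fatbit *row, u32 num_keys, u32 key) {
        fatbit_clear(row);
        fatbit_set(row, num_keys, key);
    }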

View File

@@ -105,13 +105,13 @@ void nextAnchoredMatch(const struct RoseEngine *t, struct RoseContext *tctxt,
     assert(tctxt->curr_anchored_loc != MMB_INVALID);
     struct hs_scratch *scratch = tctxtToScratch(tctxt);
-    u8 **anchoredRows = getAnchoredLog(scratch);
+    struct fatbit **anchoredRows = getAnchoredLog(scratch);
     u32 region_width = t->anchoredMatches;
-    u8 *curr_row = anchoredRows[tctxt->curr_anchored_loc];
+    struct fatbit *curr_row = anchoredRows[tctxt->curr_anchored_loc];
-    tctxt->curr_row_offset = mmbit_iterate(curr_row, region_width,
-                                           tctxt->curr_row_offset);
+    tctxt->curr_row_offset = fatbit_iterate(curr_row, region_width,
+                                            tctxt->curr_row_offset);
     DEBUG_PRINTF("next %u [idx = %u] @%llu\n", *reportId,
                  tctxt->curr_row_offset, *end);
     if (tctxt->curr_row_offset != MMB_INVALID) {
@@ -132,8 +132,8 @@ void nextAnchoredMatch(const struct RoseEngine *t, struct RoseContext *tctxt,
     assert(tctxt->curr_anchored_loc < scratch->anchored_region_len);
     curr_row = anchoredRows[tctxt->curr_anchored_loc];
-    tctxt->curr_row_offset = mmbit_iterate(curr_row, region_width,
-                                           MMB_INVALID);
+    tctxt->curr_row_offset = fatbit_iterate(curr_row, region_width,
+                                            MMB_INVALID);
     assert(tctxt->curr_row_offset != MMB_INVALID);
     *end = tctxt->curr_anchored_loc + t->maxSafeAnchoredDROffset + 1;

View File

@@ -125,7 +125,7 @@ void recordAnchoredMatch(struct RoseContext *tctxt, ReportID reportId,
                          u64a end) {
     struct hs_scratch *scratch = tctxtToScratch(tctxt);
     const struct RoseEngine *t = scratch->core_info.rose;
-    u8 **anchoredRows = getAnchoredLog(scratch);
+    struct fatbit **anchoredRows = getAnchoredLog(scratch);
     DEBUG_PRINTF("record %u @ %llu\n", reportId, end);
     assert(end - t->maxSafeAnchoredDROffset >= 1);
@@ -135,13 +135,13 @@ void recordAnchoredMatch(struct RoseContext *tctxt, ReportID reportId,
     if (!bf64_set(&scratch->am_log_sum, adj_end)) {
         // first time, clear row
-        mmbit_clear(anchoredRows[adj_end], t->anchoredMatches);
+        fatbit_clear(anchoredRows[adj_end]);
     }
     u32 idx = getAnchoredInverseMap(t)[reportId];
     DEBUG_PRINTF("record %u @ %llu index %u\n", reportId, end, idx);
     assert(idx < t->anchoredMatches);
-    mmbit_set(anchoredRows[adj_end], t->anchoredMatches, idx);
+    fatbit_set(anchoredRows[adj_end], t->anchoredMatches, idx);
 }
 static rose_inline
@@ -150,21 +150,21 @@ void recordAnchoredLiteralMatch(struct RoseContext *tctxt, u32 literal_id,
     assert(end);
     struct hs_scratch *scratch = tctxtToScratch(tctxt);
     const struct RoseEngine *t = scratch->core_info.rose;
-    u8 **anchoredLiteralRows = getAnchoredLiteralLog(scratch);
+    struct fatbit **anchoredLiteralRows = getAnchoredLiteralLog(scratch);
     DEBUG_PRINTF("record %u @ %llu\n", literal_id, end);
     if (!bf64_set(&scratch->al_log_sum, end - 1)) {
         // first time, clear row
         DEBUG_PRINTF("clearing %llu/%u\n", end - 1, t->anchored_count);
-        mmbit_clear(anchoredLiteralRows[end - 1], t->anchored_count);
+        fatbit_clear(anchoredLiteralRows[end - 1]);
     }
     u32 rel_idx = literal_id - t->anchored_base_id;
     DEBUG_PRINTF("record %u @ %llu index %u/%u\n", literal_id, end, rel_idx,
                  t->anchored_count);
     assert(rel_idx < t->anchored_count);
-    mmbit_set(anchoredLiteralRows[end - 1], t->anchored_count, rel_idx);
+    fatbit_set(anchoredLiteralRows[end - 1], t->anchored_count, rel_idx);
 }
 hwlmcb_rv_t roseHandleChainMatch(const struct RoseEngine *t, ReportID r,
@@ -447,11 +447,11 @@ hwlmcb_rv_t roseProcessMainMatch(const struct RoseEngine *t, u64a end,
 static rose_inline
 hwlmcb_rv_t playDelaySlot(const struct RoseEngine *t, struct RoseContext *tctxt,
-                          const u8 *delaySlotBase, size_t delaySlotSize,
-                          u32 vicIndex, u64a offset) {
+                          struct fatbit **delaySlots, u32 vicIndex,
+                          u64a offset) {
     /* assert(!tctxt->in_anchored); */
     assert(vicIndex < DELAY_SLOT_COUNT);
-    const u8 *vicSlot = delaySlotBase + delaySlotSize * vicIndex;
+    const struct fatbit *vicSlot = delaySlots[vicIndex];
     u32 delay_count = t->delay_count;
     if (offset < t->floatingMinLiteralMatchOffset) {
@@ -463,8 +463,8 @@ hwlmcb_rv_t playDelaySlot(const struct RoseEngine *t, struct RoseContext *tctxt,
     roseFlushLastByteHistory(t, scratch->core_info.state, offset, tctxt);
     tctxt->lastEndOffset = offset;
-    for (u32 it = mmbit_iterate(vicSlot, delay_count, MMB_INVALID);
-         it != MMB_INVALID; it = mmbit_iterate(vicSlot, delay_count, it)) {
+    for (u32 it = fatbit_iterate(vicSlot, delay_count, MMB_INVALID);
+         it != MMB_INVALID; it = fatbit_iterate(vicSlot, delay_count, it)) {
         u32 literal_id = t->delay_base_id + it;
         UNUSED rose_group old_groups = tctxt->groups;
@@ -490,12 +490,13 @@ hwlmcb_rv_t playDelaySlot(const struct RoseEngine *t, struct RoseContext *tctxt,
 static really_inline
 hwlmcb_rv_t flushAnchoredLiteralAtLoc(const struct RoseEngine *t,
                                       struct RoseContext *tctxt, u32 curr_loc) {
-    u8 *curr_row = getAnchoredLiteralLog(tctxtToScratch(tctxt))[curr_loc - 1];
+    struct hs_scratch *scratch = tctxtToScratch(tctxt);
+    struct fatbit *curr_row = getAnchoredLiteralLog(scratch)[curr_loc - 1];
     u32 region_width = t->anchored_count;
     DEBUG_PRINTF("report matches at curr loc\n");
-    for (u32 it = mmbit_iterate(curr_row, region_width, MMB_INVALID);
-         it != MMB_INVALID; it = mmbit_iterate(curr_row, region_width, it)) {
+    for (u32 it = fatbit_iterate(curr_row, region_width, MMB_INVALID);
+         it != MMB_INVALID; it = fatbit_iterate(curr_row, region_width, it)) {
         DEBUG_PRINTF("it = %u/%u\n", it, region_width);
         u32 literal_id = t->anchored_base_id + it;
@@ -519,7 +520,6 @@ hwlmcb_rv_t flushAnchoredLiteralAtLoc(const struct RoseEngine *t,
     }
     /* clear row; does not invalidate iteration */
-    struct hs_scratch *scratch = tctxtToScratch(tctxt);
     bf64_unset(&scratch->al_log_sum, curr_loc - 1);
     return HWLM_CONTINUE_MATCHING;
@@ -566,7 +566,7 @@ hwlmcb_rv_t flushAnchoredLiterals(const struct RoseEngine *t,
 static really_inline
 hwlmcb_rv_t playVictims(const struct RoseEngine *t, struct RoseContext *tctxt,
                         u32 *anchored_it, u64a lastEnd, u64a victimDelaySlots,
-                        u8 *delaySlotBase, size_t delaySlotSize) {
+                        struct fatbit **delaySlots) {
     /* assert (!tctxt->in_anchored); */
     while (victimDelaySlots) {
@@ -579,9 +579,8 @@ hwlmcb_rv_t playVictims(const struct RoseEngine *t, struct RoseContext *tctxt,
             return HWLM_TERMINATE_MATCHING;
         }
-        if (playDelaySlot(t, tctxt, delaySlotBase, delaySlotSize,
-                          vic % DELAY_SLOT_COUNT, vicOffset)
-            == HWLM_TERMINATE_MATCHING) {
+        if (playDelaySlot(t, tctxt, delaySlots, vic % DELAY_SLOT_COUNT,
+                          vicOffset) == HWLM_TERMINATE_MATCHING) {
             return HWLM_TERMINATE_MATCHING;
         }
     }
@@ -609,8 +608,7 @@ hwlmcb_rv_t flushQueuedLiterals_i(struct RoseContext *tctxt, u64a currEnd) {
     }
     {
-        u8 *delaySlotBase = getDelaySlots(scratch);
-        size_t delaySlotSize = t->delay_slot_size;
+        struct fatbit **delaySlots = getDelaySlots(tctxtToScratch(tctxt));
         u32 lastIndex = lastEnd & DELAY_MASK;
         u32 currIndex = currEnd & DELAY_MASK;
@@ -664,8 +662,7 @@ hwlmcb_rv_t flushQueuedLiterals_i(struct RoseContext *tctxt, u64a currEnd) {
         }
         if (playVictims(t, tctxt, &anchored_it, lastEnd, victimDelaySlots,
-                        delaySlotBase, delaySlotSize)
-            == HWLM_TERMINATE_MATCHING) {
+                        delaySlots) == HWLM_TERMINATE_MATCHING) {
             return HWLM_TERMINATE_MATCHING;
         }
     }

View File

@@ -40,6 +40,7 @@
 #include "nfa/nfa_api_util.h"
 #include "som/som_runtime.h"
 #include "util/bitutils.h"
+#include "util/fatbit.h"
 #include "util/internal_report.h"
 #include "util/multibit.h"
@@ -60,16 +61,16 @@ int roseAnchoredCallback(u64a end, u32 id, void *ctx);
 static rose_inline
 void resetAnchoredLog(const struct RoseEngine *t, struct hs_scratch *scratch) {
-    u8 **anchoredRows = getAnchoredLog(scratch);
+    struct fatbit **anchoredRows = getAnchoredLog(scratch);
     u32 region_width = t->anchoredMatches;
     struct RoseContext *tctxt = &scratch->tctxt;
     tctxt->curr_anchored_loc = bf64_iterate(scratch->am_log_sum, MMB_INVALID);
     if (tctxt->curr_anchored_loc != MMB_INVALID) {
         assert(tctxt->curr_anchored_loc < scratch->anchored_region_len);
-        u8 *curr_row = anchoredRows[tctxt->curr_anchored_loc];
-        tctxt->curr_row_offset = mmbit_iterate(curr_row, region_width,
-                                               MMB_INVALID);
+        struct fatbit *curr_row = anchoredRows[tctxt->curr_anchored_loc];
+        tctxt->curr_row_offset = fatbit_iterate(curr_row, region_width,
+                                                MMB_INVALID);
         assert(tctxt->curr_row_offset != MMB_INVALID);
     }
     DEBUG_PRINTF("AL reset --> %u, %u\n", tctxt->curr_anchored_loc,

View File

@@ -127,16 +127,16 @@ void rosePushDelayedMatch(const struct RoseEngine *t, u32 delay,
     }
     const u32 delay_count = t->delay_count;
-    u8 *slot = getDelaySlots(tctxtToScratch(tctxt)) +
-               (t->delay_slot_size * slot_index);
+    struct fatbit **delaySlots = getDelaySlots(tctxtToScratch(tctxt));
+    struct fatbit *slot = delaySlots[slot_index];
     DEBUG_PRINTF("pushing tab %u into slot %u\n", delay_index, slot_index);
     if (!(tctxt->filledDelayedSlots & (1U << slot_index))) {
         tctxt->filledDelayedSlots |= 1U << slot_index;
-        mmbit_clear(slot, delay_count);
+        fatbit_clear(slot);
     }
-    mmbit_set(slot, delay_count, delay_index);
+    fatbit_set(slot, delay_count, delay_index);
 }
 static rose_inline

View File

@@ -4311,7 +4311,6 @@ aligned_unique_ptr<RoseEngine> RoseBuildImpl::buildFinalEngine(u32 minWidth) {
     u32 delay_count = verify_u32(final_id_to_literal.size() - delay_base_id);
     engine->delay_count = delay_count;
-    engine->delay_slot_size = mmbit_size(delay_count);
     engine->delay_base_id = delay_base_id;
     engine->anchored_base_id = anchored_base_id;
     engine->anchored_count = delay_base_id - anchored_base_id;

View File

@@ -884,7 +884,6 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) {
     DUMP_U32(t, size);
     DUMP_U32(t, anchoredMatches);
     DUMP_U32(t, delay_count);
-    DUMP_U32(t, delay_slot_size);
     DUMP_U32(t, delay_base_id);
     DUMP_U32(t, anchored_count);
     DUMP_U32(t, anchored_base_id);

View File

@@ -447,7 +447,6 @@ struct RoseEngine {
     u32 size; // (bytes)
     u32 anchoredMatches; /* number of anchored roles generating matches */
     u32 delay_count; /* number of delayed literal ids. */
-    u32 delay_slot_size; /* size of delay slot mmbit. */
     u32 delay_base_id; /* literal id of the first delayed literal.
                         * delayed literal ids are contiguous */
     u32 anchored_count; /* number of anchored literal ids */

View File

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -74,14 +74,16 @@ hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) {
     assert(anchored_literal_region_len < 8 * sizeof(s->am_log_sum));
     size_t anchored_region_size = anchored_region_len
-        * (mmbit_size(anchored_region_width) + sizeof(u8 *));
+        * (fatbit_size(anchored_region_width) + sizeof(struct fatbit *));
     anchored_region_size = ROUNDUP_N(anchored_region_size, 8);
     size_t anchored_literal_region_size = anchored_literal_region_len
-        * (mmbit_size(anchored_literal_region_width) + sizeof(u8 *));
+        * (fatbit_size(anchored_literal_region_width) + sizeof(struct fatbit *));
     anchored_literal_region_size = ROUNDUP_N(anchored_literal_region_size, 8);
-    size_t delay_size = mmbit_size(proto->delay_count) * DELAY_SLOT_COUNT;
+    size_t delay_region_size = DELAY_SLOT_COUNT *
+        (fatbit_size(proto->delay_count) + sizeof(struct fatbit *));
+    delay_region_size = ROUNDUP_N(delay_region_size, 8);
     size_t nfa_context_size = 2 * sizeof(struct NFAContext512) + 127;
@@ -96,7 +98,8 @@ hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) {
         + 2 * fatbit_size(deduperCount) /* ditto som logs */
         + 2 * sizeof(u64a) * deduperCount /* start offsets for som */
         + anchored_region_size
-        + anchored_literal_region_size + qmpq_size + delay_size
+        + anchored_literal_region_size + qmpq_size
+        + delay_region_size
         + som_store_size
         + som_now_size
         + som_attempted_size
@@ -140,23 +143,28 @@ hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) {
     s->som_attempted_store = (u64a *)current;
     current += som_attempted_store_size;
-    s->delay_slots = (u8 *)current;
-    current += delay_size;
     current = ROUNDUP_PTR(current, 8);
-    s->am_log = (u8 **)current;
-    current += sizeof(u8 *) * anchored_region_len;
-    for (u32 i = 0; i < anchored_region_len; i++) {
-        s->am_log[i] = (u8 *)current;
-        current += mmbit_size(anchored_region_width);
+    s->delay_slots = (struct fatbit **)current;
+    current += sizeof(struct fatbit *) * DELAY_SLOT_COUNT;
+    for (u32 i = 0; i < DELAY_SLOT_COUNT; i++) {
+        s->delay_slots[i] = (struct fatbit *)current;
+        current += fatbit_size(proto->delay_count);
     }
     current = ROUNDUP_PTR(current, 8);
-    s->al_log = (u8 **)current;
-    current += sizeof(u8 *) * anchored_literal_region_len;
+    s->am_log = (struct fatbit **)current;
+    current += sizeof(struct fatbit *) * anchored_region_len;
+    for (u32 i = 0; i < anchored_region_len; i++) {
+        s->am_log[i] = (struct fatbit *)current;
+        current += fatbit_size(anchored_region_width);
+    }
+    current = ROUNDUP_PTR(current, 8);
+    s->al_log = (struct fatbit **)current;
+    current += sizeof(struct fatbit *) * anchored_literal_region_len;
     for (u32 i = 0; i < anchored_literal_region_len; i++) {
-        s->al_log[i] = (u8 *)current;
-        current += mmbit_size(anchored_literal_region_width);
+        s->al_log[i] = (struct fatbit *)current;
+        current += fatbit_size(anchored_literal_region_width);
     }
     current = ROUNDUP_PTR(current, 8);
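
All three scratch regions above are carved out with the same pattern: align to 8 bytes, lay down a table of struct fatbit pointers, then place one fatbit_size()-sized row per entry. A hypothetical helper (not part of this commit) capturing that pattern, reusing the ROUNDUP_PTR and fatbit_size calls shown above:

    /* Hypothetical helper: carve `n` fatbits of `width` bits each out of a
     * bump allocator, pointer table first, one row per pointer. */
    static char *carve_fatbit_array(char *current, struct fatbit ***log_out,
                                    u32 n, u32 width) {
        current = ROUNDUP_PTR(current, 8);
        struct fatbit **log = (struct fatbit **)current;
        current += sizeof(struct fatbit *) * n;
        for (u32 i = 0; i < n; i++) {
            log[i] = (struct fatbit *)current;
            current += fatbit_size(width);
        }
        *log_out = log;
        return current;
    }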

View File

@@ -37,7 +37,6 @@
 #define SCRATCH_H_DA6D4FC06FF410
 #include "ue2common.h"
-#include "util/multibit_internal.h"
 #include "rose/rose_types.h"
#ifdef __cplusplus
@@ -133,7 +132,7 @@ struct RoseContext {
 struct match_deduper {
     struct fatbit *log[2]; /**< even, odd logs */
-    struct fatbit *som_log[2]; /**< even, odd mmbit logs for som */
+    struct fatbit *som_log[2]; /**< even, odd fatbit logs for som */
     u64a *som_start_log[2]; /**< even, odd start offset logs for som */
     u32 log_size;
     u64a current_report_offset;
@@ -162,9 +161,9 @@ struct ALIGN_CL_DIRECTIVE hs_scratch {
     struct mq *queues;
     struct fatbit *aqa; /**< active queue array; fatbit of queues that are valid
                          * & active */
-    u8 *delay_slots;
-    u8 **am_log;
-    u8 **al_log;
+    struct fatbit **delay_slots;
+    struct fatbit **am_log;
+    struct fatbit **al_log;
     u64a am_log_sum;
     u64a al_log_sum;
     struct catchup_pq catchup_pq;
@@ -178,7 +177,7 @@ struct ALIGN_CL_DIRECTIVE hs_scratch {
     u32 scratchSize;
     u8 ALIGN_DIRECTIVE fdr_temp_buf[FDR_TEMP_BUF_SIZE];
     u32 handledKeyCount;
-    struct fatbit *handled_roles; /**< mmbit of ROLES (not states) already
+    struct fatbit *handled_roles; /**< fatbit of ROLES (not states) already
                                    * handled by this literal */
     u64a *som_store; /**< array of som locations */
     u64a *som_attempted_store; /**< array of som locations for fail stores */
@@ -198,18 +197,18 @@ struct hs_scratch *tctxtToScratch(struct RoseContext *tctxt) {
 }
 static really_inline
-u8 **getAnchoredLog(struct hs_scratch *scratch) { /* array of mmbit ptr */
+struct fatbit **getAnchoredLog(struct hs_scratch *scratch) {
     return scratch->am_log;
 }
-/* array of mmbit ptr; TODO: why not an array of mmbits? */
+/* array of fatbit ptr; TODO: why not an array of fatbits? */
 static really_inline
-u8 **getAnchoredLiteralLog(struct hs_scratch *scratch) {
+struct fatbit **getAnchoredLiteralLog(struct hs_scratch *scratch) {
     return scratch->al_log;
 }
 static really_inline
-u8 *getDelaySlots(struct hs_scratch *scratch) {
+struct fatbit **getDelaySlots(struct hs_scratch *scratch) {
     return scratch->delay_slots;
 }

View File

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -58,21 +58,25 @@ void fatbit_clear(struct fatbit *bits) {
 static really_inline
 char fatbit_set(struct fatbit *bits, u32 total_bits, u32 key) {
+    assert(ISALIGNED(bits));
     return mmbit_set(bits->fb_int.raw, total_bits, key);
 }
 static really_inline
 void fatbit_unset(struct fatbit *bits, u32 total_bits, u32 key) {
+    assert(ISALIGNED(bits));
     mmbit_unset(bits->fb_int.raw, total_bits, key);
 }
 static really_inline
 char fatbit_isset(const struct fatbit *bits, u32 total_bits, u32 key) {
+    assert(ISALIGNED(bits));
     return mmbit_isset(bits->fb_int.raw, total_bits, key);
 }
 static really_inline
 u32 fatbit_iterate(const struct fatbit *bits, u32 total_bits, u32 it_in) {
+    assert(ISALIGNED(bits));
     /* TODO: iterate_flat could be specialised as we don't have to worry about
      * partial blocks. */
     return mmbit_iterate(bits->fb_int.raw, total_bits, it_in);
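
As the wrappers above show, a fatbit is just a multibit kept in an aligned, fixed-capacity container: total_bits is still passed to set/isset/iterate, but the storage was sized once with fatbit_size(), trading a little scratch memory for cheaper operations. A minimal usage sketch mirroring the iteration loops in the Rose code above (row and width are placeholder names):

    /* Walk every key currently set in a fatbit row; fatbit_iterate returns
     * MMB_INVALID when exhausted, just like mmbit_iterate. */
    for (u32 it = fatbit_iterate(row, width, MMB_INVALID); it != MMB_INVALID;
         it = fatbit_iterate(row, width, it)) {
        DEBUG_PRINTF("key %u is set\n", it);
    }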