mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
440 lines
14 KiB
C++
440 lines
14 KiB
C++
/*
|
|
* Copyright (c) 2015, Intel Corporation
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of Intel Corporation nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "multiaccel_compilehelper.h"
|
|
|
|
using namespace std;
|
|
using namespace ue2;
|
|
|
|
#ifdef DEBUG
|
|
static const char* state_to_str[] = {
|
|
"FIRST_RUN",
|
|
"SECOND_RUN",
|
|
"WAITING_FOR_GRAB",
|
|
"FIRST_TAIL",
|
|
"SECOND_TAIL",
|
|
"STOPPED",
|
|
"INVALID"
|
|
};
|
|
static const char* type_to_str[] = {
|
|
"SHIFT",
|
|
"SHIFTGRAB",
|
|
"DOUBLESHIFT",
|
|
"DOUBLESHIFTGRAB",
|
|
"LONG",
|
|
"LONGGRAB",
|
|
"NONE"
|
|
};
|
|
|
|
static
|
|
void dumpMultiaccelState(const accel_data &d) {
|
|
DEBUG_PRINTF("type: %s state: %s len1: %u tlen1: %u len2: %u tlen2: %u\n",
|
|
type_to_str[(unsigned) d.type],
|
|
state_to_str[(unsigned) d.state],
|
|
d.len1, d.tlen1, d.len2, d.tlen2);
|
|
}
|
|
#endif
|
|
|
|
/* stop all the matching. this may render most schemes invalid. */
|
|
static
|
|
void stop(accel_data &d) {
|
|
switch (d.state) {
|
|
case STATE_STOPPED:
|
|
case STATE_INVALID:
|
|
break;
|
|
case STATE_FIRST_TAIL:
|
|
case STATE_SECOND_RUN:
|
|
/*
|
|
* Shift matchers are special case, because they have "tails".
|
|
* When shift matcher reaches a mid/endpoint, tail mode is
|
|
* activated, which looks for more matches to extend the match.
|
|
*
|
|
* For example, consider pattern /a{5}ba{3}/. Under normal circumstances,
|
|
* long-grab matcher will be picked for this pattern (matching a run of a's,
|
|
* followed by a not-a), because doubleshift matcher would be confused by
|
|
* consecutive a's and would parse the pattern as a.{0}a.{0}a (two shifts
|
|
* by 1) and throw out the rest of the pattern.
|
|
*
|
|
* With tails, we defer ending the run until we actually run out of
|
|
* matching characters, so the above pattern will now be parsed by
|
|
* doubleshift matcher as /a.{3}a.{3}a/ (two shifts by 4).
|
|
*
|
|
* So if we are stopping shift matchers, we should check if we aren't in
|
|
* the process of matching first tail or second run. If we are, we can't
|
|
* finish the second run as we are stopping, but we can try and split
|
|
* the first tail instead to obtain a valid second run.
|
|
*/
|
|
if ((d.type == MultibyteAccelInfo::MAT_DSHIFT ||
|
|
d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) && d.tlen1 == 0) {
|
|
// can't split an empty void...
|
|
d.state = STATE_INVALID;
|
|
break;
|
|
}
|
|
d.len2 = 0;
|
|
d.state = STATE_STOPPED;
|
|
break;
|
|
case STATE_SECOND_TAIL:
|
|
d.state = STATE_STOPPED;
|
|
break;
|
|
case STATE_WAITING_FOR_GRAB:
|
|
case STATE_FIRST_RUN:
|
|
if (d.type == MultibyteAccelInfo::MAT_LONG) {
|
|
d.state = STATE_STOPPED;
|
|
} else {
|
|
d.state = STATE_INVALID;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
static
|
|
void validate(accel_data &d, unsigned max_len) {
|
|
// try and fit in all our tails
|
|
if (d.len1 + d.tlen1 + d.len2 + d.tlen2 < max_len && d.len2 > 0) {
|
|
// case 1: everything fits in
|
|
d.len1 += d.tlen1;
|
|
d.len2 += d.tlen2;
|
|
d.tlen1 = 0;
|
|
d.tlen2 = 0;
|
|
} else if (d.len1 + d.tlen1 + d.len2 < max_len && d.len2 > 0) {
|
|
// case 2: everything but the second tail fits in
|
|
d.len1 += d.tlen1;
|
|
d.tlen1 = 0;
|
|
// try going for a partial tail
|
|
if (d.tlen2 != 0) {
|
|
int new_tlen2 = max_len - 1 - d.len1 - d.len2;
|
|
if (new_tlen2 > 0) {
|
|
d.len2 += new_tlen2;
|
|
}
|
|
d.tlen2 = 0;
|
|
}
|
|
} else if (d.len1 + d.tlen1 < max_len) {
|
|
// case 3: first run and its tail fits in
|
|
if (d.type == MultibyteAccelInfo::MAT_DSHIFT ||
|
|
d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) {
|
|
// split the tail into a second run
|
|
d.len2 = d.tlen1;
|
|
} else {
|
|
d.len1 += d.tlen1;
|
|
d.len2 = 0;
|
|
}
|
|
d.tlen1 = 0;
|
|
d.tlen2 = 0;
|
|
} else if (d.len1 < max_len) {
|
|
// case 4: nothing but the first run fits in
|
|
// try going for a partial tail
|
|
if (d.tlen1 != 0) {
|
|
int new_tlen1 = max_len - 1 - d.len1;
|
|
if (new_tlen1 > 0) {
|
|
d.len1 += new_tlen1;
|
|
}
|
|
d.tlen1 = 0;
|
|
}
|
|
d.len2 = 0;
|
|
d.tlen2 = 0;
|
|
}
|
|
// if we removed our second run, doubleshift matchers are no longer valid
|
|
if ((d.type == MultibyteAccelInfo::MAT_DSHIFT ||
|
|
d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) && d.len2 == 0) {
|
|
d.state = STATE_INVALID;
|
|
} else if ((d.type == MultibyteAccelInfo::MAT_LONG) && d.len1 >= max_len) {
|
|
// long matchers can just stop whenever they want to
|
|
d.len1 = max_len - 1;
|
|
}
|
|
|
|
// now, general sanity checks
|
|
if ((d.len1 + d.tlen1 + d.len2 + d.tlen2) >= max_len) {
|
|
d.state = STATE_INVALID;
|
|
}
|
|
if ((d.len1 + d.tlen1 + d.len2 + d.tlen2) < MULTIACCEL_MIN_LEN) {
|
|
d.state = STATE_INVALID;
|
|
}
|
|
}
|
|
|
|
static
|
|
void match(accel_data &d, const CharReach &ref_cr, const CharReach &cur_cr) {
|
|
switch (d.type) {
|
|
case MultibyteAccelInfo::MAT_LONG:
|
|
{
|
|
/*
|
|
* For long matcher, we want lots of consecutive same-or-subset
|
|
* char-reaches
|
|
*/
|
|
if ((ref_cr & cur_cr) == cur_cr) {
|
|
d.len1++;
|
|
} else {
|
|
d.state = STATE_STOPPED;
|
|
}
|
|
}
|
|
break;
|
|
|
|
case MultibyteAccelInfo::MAT_LONGGRAB:
|
|
{
|
|
/*
|
|
* For long-grab matcher, we want lots of consecutive same-or-subset
|
|
* char-reaches with a negative match in the end.
|
|
*/
|
|
if ((ref_cr & cur_cr) == cur_cr) {
|
|
d.len1++;
|
|
} else if (!(ref_cr & cur_cr).any()) {
|
|
/* we grabbed, stop immediately */
|
|
d.state = STATE_STOPPED;
|
|
} else {
|
|
/* our run-n-grab was interrupted; mark as invalid */
|
|
d.state = STATE_INVALID;
|
|
}
|
|
}
|
|
break;
|
|
|
|
case MultibyteAccelInfo::MAT_SHIFTGRAB:
|
|
{
|
|
/*
|
|
* For shift-grab matcher, we want two matches separated by anything;
|
|
* however the second vertex *must* be a negative (non-overlapping) match.
|
|
*
|
|
* Shiftgrab matcher is identical to shift except for presence of grab.
|
|
*/
|
|
if (d.state == STATE_WAITING_FOR_GRAB) {
|
|
if ((ref_cr & cur_cr).any()) {
|
|
d.state = STATE_INVALID;
|
|
} else {
|
|
d.state = STATE_FIRST_RUN;
|
|
d.len1++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
/* no break, falling through */
|
|
case MultibyteAccelInfo::MAT_SHIFT:
|
|
{
|
|
/*
|
|
* For shift-matcher, we want two matches separated by anything.
|
|
*/
|
|
if (ref_cr == cur_cr) {
|
|
// keep matching tail
|
|
switch (d.state) {
|
|
case STATE_FIRST_RUN:
|
|
d.state = STATE_FIRST_TAIL;
|
|
break;
|
|
case STATE_FIRST_TAIL:
|
|
d.tlen1++;
|
|
break;
|
|
default:
|
|
// shouldn't happen
|
|
assert(0);
|
|
}
|
|
} else {
|
|
switch (d.state) {
|
|
case STATE_FIRST_RUN:
|
|
// simply advance
|
|
d.len1++;
|
|
break;
|
|
case STATE_FIRST_TAIL:
|
|
// we found a non-matching char after tail, so stop
|
|
d.state = STATE_STOPPED;
|
|
break;
|
|
default:
|
|
// shouldn't happen
|
|
assert(0);
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
|
|
case MultibyteAccelInfo::MAT_DSHIFTGRAB:
|
|
{
|
|
/*
|
|
* For double shift-grab matcher, we want two matches separated by
|
|
* either negative matches or dots; however the second vertex *must*
|
|
* be a negative match.
|
|
*
|
|
* Doubleshiftgrab matcher is identical to doubleshift except for
|
|
* presence of grab.
|
|
*/
|
|
if (d.state == STATE_WAITING_FOR_GRAB) {
|
|
if ((ref_cr & cur_cr).any()) {
|
|
d.state = STATE_INVALID;
|
|
} else {
|
|
d.state = STATE_FIRST_RUN;
|
|
d.len1++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
/* no break, falling through */
|
|
case MultibyteAccelInfo::MAT_DSHIFT:
|
|
{
|
|
/*
|
|
* For double shift matcher, we want three matches, each separated
|
|
* by a lot of anything.
|
|
*
|
|
* Doubleshift matcher is complicated by presence of tails.
|
|
*/
|
|
if (ref_cr == cur_cr) {
|
|
// decide if we are activating second shift or matching tails
|
|
switch (d.state) {
|
|
case STATE_FIRST_RUN:
|
|
d.state = STATE_FIRST_TAIL;
|
|
d.len2 = 1; // we're now ready for our second run
|
|
break;
|
|
case STATE_FIRST_TAIL:
|
|
d.tlen1++;
|
|
break;
|
|
case STATE_SECOND_RUN:
|
|
d.state = STATE_SECOND_TAIL;
|
|
break;
|
|
case STATE_SECOND_TAIL:
|
|
d.tlen2++;
|
|
break;
|
|
default:
|
|
// shouldn't happen
|
|
assert(0);
|
|
}
|
|
} else {
|
|
switch (d.state) {
|
|
case STATE_FIRST_RUN:
|
|
d.len1++;
|
|
break;
|
|
case STATE_FIRST_TAIL:
|
|
// start second run
|
|
d.state = STATE_SECOND_RUN;
|
|
d.len2++;
|
|
break;
|
|
case STATE_SECOND_RUN:
|
|
d.len2++;
|
|
break;
|
|
case STATE_SECOND_TAIL:
|
|
// stop
|
|
d.state = STATE_STOPPED;
|
|
break;
|
|
default:
|
|
// shouldn't happen
|
|
assert(0);
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
|
|
default:
|
|
// shouldn't happen
|
|
assert(0);
|
|
break;
|
|
}
|
|
}
|
|
|
|
MultiaccelCompileHelper::MultiaccelCompileHelper(const CharReach &ref_cr, u32 off,
|
|
unsigned max_len) :
|
|
cr(ref_cr), offset(off), max_len(max_len) {
|
|
int accel_num = (int) MultibyteAccelInfo::MAT_MAX;
|
|
accels.resize(accel_num);
|
|
|
|
// mark everything as valid
|
|
for (int i = 0; i < accel_num; i++) {
|
|
accel_data &ad = accels[i];
|
|
ad.len1 = 1;
|
|
ad.type = (MultibyteAccelInfo::multiaccel_type) i;
|
|
|
|
/* for shift-grab matchers, we are waiting for the grab right at the start */
|
|
if (ad.type == MultibyteAccelInfo::MAT_SHIFTGRAB
|
|
|| ad.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) {
|
|
ad.state = STATE_WAITING_FOR_GRAB;
|
|
} else {
|
|
ad.state = STATE_FIRST_RUN;
|
|
}
|
|
}
|
|
}
|
|
|
|
bool MultiaccelCompileHelper::canAdvance() {
|
|
for (const accel_data &ad : accels) {
|
|
if (ad.state != STATE_STOPPED && ad.state != STATE_INVALID) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void MultiaccelCompileHelper::advance(const CharReach &cur_cr) {
|
|
for (accel_data &ad : accels) {
|
|
if (ad.state == STATE_STOPPED || ad.state == STATE_INVALID) {
|
|
continue;
|
|
}
|
|
match(ad, cr, cur_cr);
|
|
#ifdef DEBUG
|
|
dumpMultiaccelState(ad);
|
|
#endif
|
|
}
|
|
}
|
|
|
|
MultibyteAccelInfo MultiaccelCompileHelper::getBestScheme() {
|
|
int best_len = 0;
|
|
accel_data best;
|
|
|
|
DEBUG_PRINTF("Stopping multiaccel compile\n");
|
|
|
|
for (accel_data &ad : accels) {
|
|
// stop our matching
|
|
stop(ad);
|
|
validate(ad, max_len);
|
|
|
|
#ifdef DEBUG
|
|
dumpMultiaccelState(ad);
|
|
#endif
|
|
|
|
// skip invalid schemes
|
|
if (ad.state == STATE_INVALID) {
|
|
continue;
|
|
}
|
|
DEBUG_PRINTF("Marking as viable\n");
|
|
|
|
// TODO: relative strengths of accel schemes? maybe e.g. a shorter
|
|
// long match would in some cases be preferable to a longer
|
|
// double shift match (for example, depending on length)?
|
|
int as_len = ad.len1 + ad.len2;
|
|
if (as_len >= best_len) {
|
|
DEBUG_PRINTF("Marking as best\n");
|
|
best_len = as_len;
|
|
best = ad;
|
|
}
|
|
}
|
|
// if we found at least one accel scheme, return it
|
|
if (best.state != STATE_INVALID) {
|
|
#ifdef DEBUG
|
|
DEBUG_PRINTF("Picked best multiaccel state:\n");
|
|
dumpMultiaccelState(best);
|
|
#endif
|
|
MultibyteAccelInfo info;
|
|
info.cr = cr;
|
|
info.offset = offset;
|
|
info.len1 = best.len1;
|
|
info.len2 = best.len2;
|
|
info.type = best.type;
|
|
return info;
|
|
}
|
|
return MultibyteAccelInfo();
|
|
}
|