mirror of
https://github.com/openappsec/openappsec.git
synced 2025-06-28 16:41:02 +03:00
2230 lines
84 KiB
C++
2230 lines
84 KiB
C++
// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved.
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include <memory>
|
|
#include "pm_adaptor.h"
|
|
#include "kiss_hash.h"
|
|
#include "kiss_thin_nfa_impl.h"
|
|
#include "kiss_patterns.h"
|
|
|
|
// Flag for a Thin NFA state
|
|
typedef enum {
|
|
THIN_NFA_STATE_FULL = 0x01, // We want a full state table for this state
|
|
THIN_NFA_STATE_MATCH = 0x02, // A matching state
|
|
THIN_NFA_STATE_ROOT = 0x04, // The root or anchored root state
|
|
THIN_NFA_STATE_MAX_IDENTICAL_CHAR = 0x08, // Maximal sequence of identical characters
|
|
THIN_NFA_STATE_ANCHORED = 0x10, // A part of the anchored tree
|
|
THIN_NFA_STATE_BUILT_TABLE = 0x20, // Already built the BNFA transition table
|
|
THIN_NFA_STATE_REACH_FROM_FULL = 0x40, // The state is reachable from full state
|
|
} nfa_thin_state_flags_t;
|
|
|
|
// A Thin NFA state, or a node in the trie, during compilation time
|
|
struct kiss_thin_nfa_state_s {
|
|
u_int state_id; // Sequencial number, starting from 0
|
|
nfa_thin_state_flags_t flags;
|
|
u_int depth; // Level in the trie
|
|
kiss_thin_nfa_pattern_list_t *ids; // For finite state, patterns associated with it
|
|
struct kiss_thin_nfa_state_s *bfs_q; // Use for a BFS iteration on the trie
|
|
struct thin_nfa_comp_s *comp; // Saves passing this pointer around
|
|
// Outgoing transitions
|
|
struct kiss_thin_nfa_state_s *child; // First child of this state
|
|
u_int num_trans; // Number of transitions
|
|
struct kiss_thin_nfa_state_s *fail_state;
|
|
// Incoming transition
|
|
struct kiss_thin_nfa_state_s *sibling; // Next child of this state's father
|
|
u_char tran_char; // The character that takes us to this state
|
|
// BNFA offset
|
|
kiss_bnfa_offset_t bnfa_offset; // Where the real state is
|
|
kiss_bnfa_offset_t bnfa_incoming_off; // Where incoming transitions should jump (possibly a match state)
|
|
// DEBUG ONLY
|
|
const u_char *pattern_text; // Points into the user's pattern list. Not null terminated
|
|
};
|
|
typedef struct kiss_thin_nfa_state_s kiss_thin_nfa_state_t;
|
|
|
|
|
|
// Blocks to hold states . A pretty simple pool mechanism.
|
|
// Not very much needed. We currently use it to iterate states by ID order, and for state pointer validation.
|
|
#define MAX_THIN_NFA_STATES_BLOCKS 1000
|
|
#define KISS_NFA_MAX_STATES_PER_BLOCK 1000
|
|
#define KISS_NFA_MAX_STATES_BLOCK_SIZE (KISS_NFA_MAX_STATES_PER_BLOCK * sizeof(kiss_thin_nfa_state_t))
|
|
|
|
|
|
// When do we want a full state? In the first X tiers (root included) and if more than Y transitions.
|
|
u_int kiss_thin_nfa_full_tiers_num_small = 2; // Old values, for PMs which must remain small
|
|
u_int kiss_thin_nfa_full_tiers_num_medium = 3; // Used for VSX / 32bit kernel, where memory is expensive
|
|
u_int kiss_thin_nfa_full_tiers_num = 7; // New value
|
|
u_int kiss_thin_nfa_max_partial_trans = 15; // Can't exceed KISS_BNFA_MAX_TRANS_NUM anyway
|
|
u_int kiss_thin_nfa_optimize_contig_chars = 1;
|
|
|
|
|
|
// Character translation table for caseless/digitless comparisons.
|
|
//
|
|
// The idea:
|
|
// Each character has a canonic character. This can be itself, or another.
|
|
// In a caseless Thin NFA, 'a' and '7' are canonic themselves, 'B' has canonic character 'b'.
|
|
// In a digitless Thin NFA, '7' is not canonic - its canonic character is '0'.
|
|
// Each character is also a member of a group, containing all characters with the same canonic character.
|
|
// In a caseless Thin NFA, 'a' and 'A' are in one group.
|
|
// In a digitless Thin NFA, all digits are in one group.
|
|
// Notice that a single Thin NFA can be caseless, digitless, neither or both.
|
|
//
|
|
// The data structure:
|
|
// tab - Translates each character into its canonic characer (possibly itself).
|
|
// rev - A linked list of characters belonging to the same group. The character itself
|
|
// is used instead of a pointer. The last character in the group points to the first.
|
|
// Example: For a caseless Thin NFA, rev['a']='A' and rev['A']='a'.
|
|
struct thin_nfa_char_trans_tab_s {
|
|
u_char tab[KISS_PM_ALPHABET_SIZE];
|
|
u_char rev[KISS_PM_ALPHABET_SIZE];
|
|
};
|
|
|
|
|
|
// Flags for an entire Thin NFA during compilation
|
|
typedef enum {
|
|
THIN_NFA_FAIL_STATES_CALCULATED = 0x01, // Once we set this, we expect all states to have fail states.
|
|
THIN_NFA_ENABLE_ANCHOR_OPT = 0x02, // Enable optimization for anochored states
|
|
THIN_NFA_USE_RECURSIVE_COMPILE = 0x04, // Build full states recursively. Faster, unsuitable for kernel
|
|
} thin_nfa_comp_flags_e;
|
|
|
|
|
|
// A Thin NFA which is under construction. The compiled BNFA is constructed from this later.
|
|
struct thin_nfa_comp_s {
|
|
kiss_thin_nfa_state_t *root_state; // The root state (somewhere inside state_blocks)
|
|
kiss_thin_nfa_state_t *anchored_root_state; // The root for anchored patterns
|
|
u_int full_state_tier_num; // How many tiers would be full states?
|
|
u_int state_num; // How many states do we have so far
|
|
u_int match_state_num; // How many matching states do we have?
|
|
u_int full_state_num; // How many full states do we have?
|
|
KissPMError *error; // Error to be returned to the user.
|
|
thin_nfa_comp_flags_e flags;
|
|
struct thin_nfa_char_trans_tab_s *xlation_tab; // Caseless/digitless translation table
|
|
kiss_thin_nfa_state_t *state_blocks[MAX_THIN_NFA_STATES_BLOCKS]; // Dynamically allocated memory for states
|
|
std::unique_ptr<KissThinNFA> runtime_nfa; // The final NFA we're building
|
|
kiss_hash_t patterns_hash; // Pattern array to offset mapping
|
|
kiss_bnfa_offset_t min_bnfa_off, max_bnfa_off;
|
|
};
|
|
|
|
|
|
#if defined(DEBUG)
|
|
#define KISS_THIN_NFA_DO_VERIFICATIONS
|
|
#endif
|
|
|
|
|
|
#define MAX_STATE_NAME_LEN 100
|
|
#define MAX_STATE_NAME_BUFS 4
|
|
|
|
|
|
static kiss_thin_nfa_state_t *
|
|
kiss_thin_nfa_get_state_by_id(struct thin_nfa_comp_s *nfa_comp, u_int state_id, const char *caller)
|
|
{
|
|
u_int block_index;
|
|
u_int index_in_block;
|
|
kiss_thin_nfa_state_t *block;
|
|
|
|
// Find the block and the place in the block
|
|
block_index = state_id / KISS_NFA_MAX_STATES_PER_BLOCK;
|
|
index_in_block = state_id % KISS_NFA_MAX_STATES_PER_BLOCK;
|
|
if (block_index >= MAX_THIN_NFA_STATES_BLOCKS) {
|
|
thinnfa_debug_critical(("%s: State %d - invalid block index %d (max %d)\n", caller,
|
|
state_id, block_index, MAX_THIN_NFA_STATES_BLOCKS));
|
|
return NULL;
|
|
}
|
|
block = nfa_comp->state_blocks[block_index];
|
|
if (block == NULL) {
|
|
thinnfa_debug_critical((
|
|
"%s: State %d - block index %d is not allocated yet\n",
|
|
caller,
|
|
state_id,
|
|
block_index
|
|
));
|
|
return NULL;
|
|
}
|
|
|
|
return &block[index_in_block];
|
|
}
|
|
|
|
|
|
// DEBUG FUNCTION - return a printable name for the state, in a static buffer.
|
|
// Accepts a NULL state.
|
|
static const char *
|
|
state_name(const kiss_thin_nfa_state_t *state)
|
|
{
|
|
static char buffers[MAX_STATE_NAME_BUFS][MAX_STATE_NAME_LEN];
|
|
static u_int next_buf = 0;
|
|
u_int cur_buf;
|
|
char *name, *p;
|
|
|
|
if (!state) {
|
|
// Happens when printing the root's fail state
|
|
return "NULL/-1";
|
|
}
|
|
|
|
// What's a state's name?
|
|
// Each state represents a prefix of one or more patterns. This prefix is the natural name for the state.
|
|
// We have the pattern text on the state. Its depth tells us how much of it do we need.
|
|
// We add the state ID as a suffix, to prevent ambiguituies (particularly for unprintable characters).
|
|
|
|
//Choose a buffer to use. Allows calling several times under a single debug message.
|
|
cur_buf = next_buf;
|
|
if (cur_buf >= MAX_STATE_NAME_BUFS) cur_buf = 0;
|
|
next_buf = cur_buf + 1;
|
|
name = buffers[cur_buf];
|
|
p = name;
|
|
|
|
if (state->flags & THIN_NFA_STATE_ANCHORED) {
|
|
// Prefix for anchored states
|
|
*p = '^';
|
|
p++;
|
|
}
|
|
|
|
// Fill in the state name. Not null-terminated meanwhile.
|
|
if (state->pattern_text == NULL) {
|
|
const char *state_name;
|
|
// Only the root makes sense. But deal with a missing pattern text anyway
|
|
state_name = (state->flags & THIN_NFA_STATE_ROOT) ? "ROOT" : "INVALID";
|
|
strcpy(p, state_name);
|
|
p += strlen(state_name);
|
|
} else {
|
|
u_int i;
|
|
// Normal state - use the relevant prefix of the pattern text
|
|
for (i=0; (i<state->depth) && (p<name+MAX_STATE_NAME_LEN-10); i++) {
|
|
*p = isprint(state->pattern_text[i]) ? state->pattern_text[i] : '.';
|
|
p++;
|
|
}
|
|
}
|
|
|
|
// Append the state ID. Removes ambituities (e.g. for unprintable characters)
|
|
snprintf(p, MAX_STATE_NAME_LEN-(p-name), "/%u", state->state_id);
|
|
name[MAX_STATE_NAME_LEN-1] = '\0';
|
|
|
|
return name;
|
|
}
|
|
|
|
|
|
#if defined(KISS_THIN_NFA_DO_VERIFICATIONS)
|
|
|
|
// DEBUG FUNCTION - Verify that a state pointer points to a valid state
|
|
static BOOL
|
|
is_valid_state_ptr(struct thin_nfa_comp_s *nfa_comp, kiss_thin_nfa_state_t *state, const char *caller)
|
|
{
|
|
kiss_thin_nfa_state_t *state_by_id;
|
|
|
|
if (!state) {
|
|
thinnfa_debug_critical(("%s: Null state pointer\n", caller));
|
|
return FALSE;
|
|
}
|
|
|
|
state_by_id = kiss_thin_nfa_get_state_by_id(nfa_comp, state->state_id, caller);
|
|
if (!state_by_id) {
|
|
return FALSE;
|
|
}
|
|
|
|
// Is the state where we expect it to be in the block?
|
|
if (state != state_by_id) {
|
|
thinnfa_debug_critical(("%s: State %p ID %d is invalid - should be at %p\n", caller, state,
|
|
state->state_id, state_by_id));
|
|
return FALSE;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
|
|
// DEBUG FUNCTION - Verify a state's transition table
|
|
static void
|
|
verify_state_ex(struct thin_nfa_comp_s *nfa_comp, kiss_thin_nfa_state_t *state, const char *caller)
|
|
{
|
|
kiss_thin_nfa_state_t *child, *prev_child;
|
|
u_int actual_tran_num;
|
|
|
|
// Is the pointer itself OK?
|
|
KISS_ASSERT(is_valid_state_ptr(nfa_comp, state, caller),
|
|
("%s: Invalid state pointer %p\n", caller, state));
|
|
|
|
// Go over the transition table
|
|
actual_tran_num = 0;
|
|
prev_child = NULL;
|
|
for (child = state->child; child != NULL; child = child->sibling) {
|
|
// Valid pointer?
|
|
KISS_ASSERT(is_valid_state_ptr(nfa_comp, child, caller),
|
|
("%s: State %s(%p) contains an invalid child %p after %02x\n", caller, state_name(state), state,
|
|
child, prev_child ? prev_child->tran_char : 0));
|
|
|
|
// Sorted in ascending order?
|
|
KISS_ASSERT(!prev_child || prev_child->tran_char < child->tran_char,
|
|
("%s: State %s(%p) transition %02x -> %s after %02x -> %s\n", caller,
|
|
state_name(state), state,
|
|
child->tran_char, state_name(child),
|
|
prev_child->tran_char, state_name(prev_child)));
|
|
|
|
actual_tran_num++;
|
|
if (actual_tran_num > state->num_trans) {
|
|
// We may be looping
|
|
break;
|
|
}
|
|
prev_child = child;
|
|
}
|
|
|
|
// Counter matches list?
|
|
KISS_ASSERT(actual_tran_num == state->num_trans,
|
|
("%s: State %s(%p) has %d transitions, but it should have %d\n", caller, state_name(state), state,
|
|
actual_tran_num, state->num_trans));
|
|
|
|
// Fail state?
|
|
if (nfa_comp->flags & THIN_NFA_FAIL_STATES_CALCULATED) {
|
|
if (state->fail_state == NULL) {
|
|
KISS_ASSERT(state == nfa_comp->root_state, ("%s: State %s has no fail state, but it is not root",
|
|
caller, state_name(state)));
|
|
} else {
|
|
KISS_ASSERT(
|
|
is_valid_state_ptr(nfa_comp, state->fail_state, caller),
|
|
"%s: State %s has an invalid fail state %p\n",
|
|
caller,
|
|
state_name(state),
|
|
state->fail_state
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// Use this for sanity test on a state
|
|
#define verify_state(nfa_comp, state) verify_state_ex(nfa_comp, state, FILE_LINE)
|
|
|
|
#else // KISS_THIN_NFA_DO_VERIFICATIONS
|
|
|
|
// Verifications disabled
|
|
#define verify_state(nfa_comp, state)
|
|
|
|
#endif // KISS_THIN_NFA_DO_VERIFICATIONS
|
|
|
|
|
|
// Mark that a state needs to be full
|
|
static void
|
|
make_state_full(kiss_thin_nfa_state_t *state)
|
|
{
|
|
if (state->flags & THIN_NFA_STATE_FULL) return;
|
|
ENUM_SET_FLAG(state->flags, THIN_NFA_STATE_FULL);
|
|
state->comp->full_state_num++;
|
|
}
|
|
|
|
|
|
// Mark that a state is matching
|
|
static void
|
|
make_state_matching(kiss_thin_nfa_state_t *state)
|
|
{
|
|
if (state->flags & THIN_NFA_STATE_MATCH) return;
|
|
ENUM_SET_FLAG(state->flags, THIN_NFA_STATE_MATCH);
|
|
state->comp->match_state_num++;
|
|
}
|
|
|
|
|
|
// Allocate an empty state on an NFA.
|
|
// Initializes all fields to defaults.
|
|
static kiss_thin_nfa_state_t *
|
|
kiss_thin_nfa_state_create(
|
|
struct thin_nfa_comp_s *nfa_comp,
|
|
u_int depth,
|
|
const u_char *pattern_text,
|
|
nfa_thin_state_flags_t flags
|
|
)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_state_create";
|
|
u_int state_id;
|
|
u_int block_index;
|
|
u_int index_in_block;
|
|
kiss_thin_nfa_state_t *block;
|
|
kiss_thin_nfa_state_t *state;
|
|
|
|
// Find the next ID and the block it should be in
|
|
state_id = nfa_comp->state_num;
|
|
block_index = state_id / KISS_NFA_MAX_STATES_PER_BLOCK;
|
|
index_in_block = state_id % KISS_NFA_MAX_STATES_PER_BLOCK;
|
|
|
|
thinnfa_debug_extended(("%s: Adding state %d depth %d\n", rname, state_id, depth));
|
|
|
|
// No more possible blocks?
|
|
if (block_index >= MAX_THIN_NFA_STATES_BLOCKS) {
|
|
thinnfa_debug_err(("%s: State %d in block %d exceeds the limit %d\n", rname,
|
|
state_id, block_index, MAX_THIN_NFA_STATES_BLOCKS));
|
|
return NULL;
|
|
}
|
|
|
|
// Allocate the block if needed (first state in the block)
|
|
block = nfa_comp->state_blocks[block_index];
|
|
if (block == NULL) {
|
|
block = (kiss_thin_nfa_state_t *)fw_kmalloc_ex(KISS_NFA_MAX_STATES_BLOCK_SIZE, rname, FW_KMEM_SLEEP);
|
|
if (block == NULL) {
|
|
thinnfa_debug_err(("%s: Failed to allocate a state block size %lu for the state %u\n", rname,
|
|
KISS_NFA_MAX_STATES_BLOCK_SIZE, state_id));
|
|
return NULL;
|
|
}
|
|
nfa_comp->state_blocks[block_index] = block;
|
|
}
|
|
|
|
// Initialize the state
|
|
state = &block[index_in_block];
|
|
|
|
state->state_id = state_id;
|
|
state->flags = flags;
|
|
state->ids = NULL;
|
|
state->bfs_q = NULL;
|
|
state->child = NULL;
|
|
state->num_trans = 0;
|
|
state->fail_state = NULL;
|
|
state->sibling = NULL;
|
|
state->tran_char = '\0'; // Will be modified, except for the root
|
|
state->pattern_text = pattern_text;
|
|
state->depth = depth;
|
|
state->comp = nfa_comp;
|
|
state->bnfa_offset = KISS_BNFA_OFFSET_INVALID;
|
|
state->bnfa_incoming_off = KISS_BNFA_OFFSET_INVALID;
|
|
|
|
// Do we want a full state? kiss_thin_nfa_full_tiers_num=2 means tiers 0 and 1, i.e. the root plus one, are full.
|
|
if (state->flags & THIN_NFA_STATE_ROOT) {
|
|
// The root must be full, because it has no fail state.
|
|
// The anchored root (if exists) is the first state, and must be full, for the bnfa_full_state_size
|
|
// condition to work.
|
|
make_state_full(state);
|
|
} else if (depth < nfa_comp->full_state_tier_num && !(state->flags & THIN_NFA_STATE_ANCHORED)) {
|
|
make_state_full(state);
|
|
}
|
|
|
|
// Advance the counter
|
|
nfa_comp->state_num++;
|
|
|
|
return state;
|
|
}
|
|
|
|
|
|
// Release all resources on a state structure.
|
|
// Doesn't release the states, because it's part of a state block.
|
|
static void
|
|
kiss_thin_nfa_state_free(kiss_thin_nfa_state_t *state)
|
|
{
|
|
// Clean up the pattern list
|
|
if (state->ids) {
|
|
kiss_thin_nfa_free_pattern_ids(state->ids);
|
|
state->ids = NULL;
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
// Returns the following state, by ID order.
|
|
// With prev==NULL, returns the first state.
|
|
// With prev!=NULL, returns the next.
|
|
// If prev is the last state, returns NULL.
|
|
static kiss_thin_nfa_state_t *
|
|
kiss_thin_nfa_get_subsequent_state(struct thin_nfa_comp_s *nfa_comp, kiss_thin_nfa_state_t *prev)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_get_subsequent_state";
|
|
u_int state_id;
|
|
|
|
// Find the next state's ID
|
|
state_id = prev ? prev->state_id + 1 : 0;
|
|
if (state_id >= nfa_comp->state_num) {
|
|
// prev was the last state.
|
|
return NULL;
|
|
}
|
|
|
|
// Get the state pointer
|
|
return kiss_thin_nfa_get_state_by_id(nfa_comp, state_id, rname);
|
|
}
|
|
|
|
|
|
// Find the transition for a given character from a given state.
|
|
// If no transition found, returns NULL and does not check the fail state.
|
|
static kiss_thin_nfa_state_t *
|
|
kiss_thin_nfa_comp_get_next_state(kiss_thin_nfa_state_t *state, u_char ch)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_comp_get_next_state";
|
|
kiss_thin_nfa_state_t *child;
|
|
|
|
verify_state(state->comp, state);
|
|
|
|
// Find the child in the list
|
|
for (child = state->child; child != NULL; child = child->sibling) {
|
|
u_char tran_ch = child->tran_char;
|
|
|
|
if (tran_ch == ch) {
|
|
thinnfa_debug_extended((
|
|
"%s: Found transition from the state %s by 0x%02x to %s\n",
|
|
rname,
|
|
state_name(state),
|
|
ch,
|
|
state_name(child)
|
|
));
|
|
return child;
|
|
}
|
|
|
|
// The list is sorted, so we don't need to look beyond the character.
|
|
if (tran_ch > ch) break;
|
|
}
|
|
|
|
thinnfa_debug_extended(("%s: No transition from the state %s by 0x%02x\n", rname, state_name(state), ch));
|
|
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// Mark a state as finite and accepting a given kiss_thin_nfa_pattern_t pattern
|
|
static kiss_ret_val
|
|
kiss_thin_nfa_state_set_match(kiss_thin_nfa_state_t *state, const kiss_thin_nfa_pattern_t *pat_info)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_state_set_match";
|
|
|
|
verify_state(state->comp, state);
|
|
|
|
// Add the pattern to this state's pattern list
|
|
if (kiss_thin_nfa_add_pattern_id(&(state->ids), pat_info) != KISS_OK) {
|
|
thinnfa_debug_err((
|
|
"%s: Could not add the 'pattern_id' %d to the final state %s\n",
|
|
rname,
|
|
pat_info->id,
|
|
state_name(state)
|
|
));
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
thinnfa_debug((
|
|
"Setting state %s as the matching state for the 'pattern_id' %d\n",
|
|
state_name(state),
|
|
pat_info->id
|
|
));
|
|
make_state_matching(state);
|
|
|
|
return KISS_OK;
|
|
}
|
|
|
|
|
|
// Mark a state as finite, and accepting a given kiss_pmglob_string pattern
|
|
static kiss_ret_val
|
|
kiss_thin_nfa_state_set_match_pattern(kiss_thin_nfa_state_t *state, const kiss_pmglob_string_s *pattern)
|
|
{
|
|
kiss_thin_nfa_pattern_t pat_info;
|
|
|
|
pat_info.id = kiss_pmglob_string_get_id(pattern);
|
|
pat_info.pattern_id_flags = kiss_pmglob_string_get_flags(pattern);
|
|
pat_info.len = kiss_pmglob_string_get_size(pattern);
|
|
|
|
return kiss_thin_nfa_state_set_match(state, &pat_info);
|
|
}
|
|
|
|
|
|
// Copy the list of accepted patterns from one state to another.
|
|
// The destination state can already have patterns, and the lists would be concatenated.
|
|
static kiss_ret_val
|
|
kiss_thin_nfa_state_copy_match_ids(kiss_thin_nfa_state_t *dst, kiss_thin_nfa_state_t *src)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_state_copy_match_ids";
|
|
kiss_thin_nfa_pattern_list_t *curr_id;
|
|
|
|
verify_state(src->comp, src);
|
|
verify_state(dst->comp, dst);
|
|
|
|
thinnfa_debug(("%s: Copying the match IDs from %s to %s\n", rname, state_name(src), state_name(dst)));
|
|
|
|
// traversing on the state_src 'ids' adding each one to 'state_dst' list
|
|
for(curr_id = src->ids; curr_id; curr_id = curr_id->next) {
|
|
if (kiss_thin_nfa_state_set_match(dst, &curr_id->pattern) != KISS_OK) {
|
|
thinnfa_debug_err((
|
|
"%s: Failed to set the ID %d on the state %s\n",
|
|
rname,
|
|
curr_id->pattern.id,
|
|
state_name(dst)
|
|
));
|
|
|
|
// NOTE: We don't release the IDs we have added. Compilation will fail and clean up anyway.
|
|
return KISS_ERROR;
|
|
}
|
|
}
|
|
|
|
return KISS_OK;
|
|
}
|
|
|
|
|
|
// Destroy the NFA we're compiling
|
|
static void
|
|
kiss_thin_nfa_comp_destroy(struct thin_nfa_comp_s *nfa_comp)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_comp_destroy";
|
|
u_int i;
|
|
kiss_thin_nfa_state_t *state;
|
|
|
|
thinnfa_debug_major(("%s: Destroying the compilation information structure\n", rname));
|
|
|
|
// Cleanup whatever data we have on the states.
|
|
for (state = nfa_comp->root_state; state != NULL; state = kiss_thin_nfa_get_subsequent_state(nfa_comp, state)) {
|
|
kiss_thin_nfa_state_free(state);
|
|
}
|
|
|
|
// Free the state blocks and transition blocks
|
|
for (i = 0; i < MAX_THIN_NFA_STATES_BLOCKS; i++) {
|
|
if (nfa_comp->state_blocks[i] != NULL) {
|
|
fw_kfree(nfa_comp->state_blocks[i], KISS_NFA_MAX_STATES_BLOCK_SIZE, rname);
|
|
nfa_comp->state_blocks[i] = NULL;
|
|
}
|
|
}
|
|
|
|
if (nfa_comp->xlation_tab!= NULL) {
|
|
fw_kfree(nfa_comp->xlation_tab, sizeof(*(nfa_comp->xlation_tab)), rname);
|
|
nfa_comp->xlation_tab= NULL;
|
|
}
|
|
|
|
nfa_comp->runtime_nfa.reset(nullptr);
|
|
|
|
if (nfa_comp->patterns_hash) {
|
|
kiss_hash_destroy(nfa_comp->patterns_hash);
|
|
nfa_comp->patterns_hash = NULL;
|
|
}
|
|
|
|
fw_kfree(nfa_comp, sizeof(*nfa_comp), rname);
|
|
}
|
|
|
|
|
|
// Allocate an empty thin NFA compilation data structure.
|
|
static struct thin_nfa_comp_s *
|
|
kiss_thin_nfa_comp_create(KissPMError *error)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_comp_create";
|
|
struct thin_nfa_comp_s *nfa_comp = NULL;
|
|
|
|
thinnfa_debug_major(("%s: Allocating the compilation information structure\n", rname));
|
|
|
|
// Allocate and initialize the compilation temporary structure
|
|
nfa_comp = (struct thin_nfa_comp_s *)fw_kmalloc(sizeof(*nfa_comp), rname);
|
|
if (!nfa_comp) {
|
|
thinnfa_debug_err(("%s: Failed to allocate 'nfa_comp'\n", rname));
|
|
goto failure;
|
|
}
|
|
bzero((void *)nfa_comp, sizeof(*nfa_comp));
|
|
|
|
nfa_comp->error = error;
|
|
|
|
// Build the root state
|
|
nfa_comp->root_state = kiss_thin_nfa_state_create(nfa_comp, 0, NULL, THIN_NFA_STATE_ROOT);
|
|
if (nfa_comp->root_state == NULL) {
|
|
thinnfa_debug_err(("%s: Failed to create the root state\n", rname));
|
|
goto failure;
|
|
}
|
|
|
|
return nfa_comp;
|
|
|
|
failure:
|
|
|
|
if (nfa_comp != NULL) {
|
|
kiss_thin_nfa_comp_destroy(nfa_comp);
|
|
}
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
// Specify the error for failed Thin NFA compilation
|
|
static void
|
|
kiss_thin_nfa_set_comp_error(struct thin_nfa_comp_s *nfa_comp, const char *err_text)
|
|
{
|
|
// We always use "internal", which is appropriate for both logical errors and resource shortage.
|
|
// We don't specify a pattern, because nothing is really pattern specific.
|
|
kiss_pm_error_set_details(nfa_comp->error, KISS_PM_ERROR_INTERNAL, err_text);
|
|
}
|
|
|
|
|
|
// Initialize a translation table for caseless/digitless comparison.
|
|
// According to compilation flags, builds a table to translate each character.
|
|
static kiss_ret_val
|
|
kiss_thin_nfa_create_xlation_tab(struct thin_nfa_comp_s *nfa_comp, int pm_comp_flags)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_create_xlation_tab";
|
|
enum kiss_pmglob_char_xlation_flags_e xlation_flags;
|
|
|
|
// Figure out which translations we need
|
|
xlation_flags = KISS_PMGLOB_CHAR_XLATION_NONE;
|
|
if (pm_comp_flags & KISS_PM_COMP_CASELESS) {
|
|
ENUM_SET_FLAG(xlation_flags, KISS_PMGLOB_CHAR_XLATION_CASE);
|
|
}
|
|
if (pm_comp_flags & KISS_PM_COMP_DIGITLESS) {
|
|
ENUM_SET_FLAG(xlation_flags, KISS_PMGLOB_CHAR_XLATION_DIGITS);
|
|
}
|
|
if (xlation_flags == KISS_PMGLOB_CHAR_XLATION_NONE) {
|
|
// No translation needed
|
|
nfa_comp->xlation_tab = NULL;
|
|
return KISS_OK;
|
|
}
|
|
|
|
thinnfa_debug_major(("%s: Using%s%s translation table\n", rname,
|
|
(xlation_flags&KISS_PMGLOB_CHAR_XLATION_CASE) ? " caseless" : "",
|
|
(xlation_flags&KISS_PMGLOB_CHAR_XLATION_DIGITS) ? " digitless" : ""));
|
|
|
|
// Allocate a translation table
|
|
nfa_comp->xlation_tab = (struct thin_nfa_char_trans_tab_s *)fw_kmalloc(sizeof(*(nfa_comp->xlation_tab)), rname);
|
|
if (!nfa_comp->xlation_tab) {
|
|
thinnfa_debug_err(("%s: Failed to allocate the translation table\n", rname));
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
// Build the mapping - normal and reverse
|
|
kiss_pmglob_char_xlation_build(xlation_flags, nfa_comp->xlation_tab->tab);
|
|
kiss_pmglob_char_xlation_build_reverse(nfa_comp->xlation_tab->tab, nfa_comp->xlation_tab->rev);
|
|
|
|
return KISS_OK;
|
|
}
|
|
|
|
|
|
// Translate a character to canonic form, if a translation table is defined.
|
|
static CP_INLINE u_char
|
|
kiss_thin_nfa_xlate_char(struct thin_nfa_comp_s *nfa_comp, u_char ch)
|
|
{
|
|
if (!nfa_comp->xlation_tab) return ch;
|
|
return nfa_comp->xlation_tab->tab[ch];
|
|
}
|
|
|
|
|
|
#if defined(KISS_THIN_NFA_DO_VERIFICATIONS) && !defined(KERNEL)
|
|
|
|
// DEBUG FUNCTION - uses a simple&slow algorithm to verify the result of kiss_thin_nfa_are_trans_contained.
|
|
// Can't run in the kernel because of the large stack consumption.
|
|
static void
|
|
verify_trans_contains_(
|
|
kiss_thin_nfa_state_t *state_contains,
|
|
kiss_thin_nfa_state_t *state_included,
|
|
BOOL should_contain
|
|
)
|
|
{
|
|
kiss_thin_nfa_state_t *trans_contains[KISS_PM_ALPHABET_SIZE];
|
|
kiss_thin_nfa_state_t *trans_included[KISS_PM_ALPHABET_SIZE];
|
|
kiss_thin_nfa_state_t *child;
|
|
u_int i;
|
|
int mismatch_pos;
|
|
|
|
// Fill in both transition tables
|
|
bzero(trans_contains, sizeof(trans_contains));
|
|
for (child = state_contains->child; child != NULL; child = child->sibling) {
|
|
trans_contains[child->tran_char] = child;
|
|
}
|
|
bzero(trans_included, sizeof(trans_included));
|
|
for (child = state_included->child; child != NULL; child = child->sibling) {
|
|
trans_included[child->tran_char] = child;
|
|
}
|
|
|
|
// Go over the table, looking for a character that's in "included" but not in "contains".
|
|
mismatch_pos = -1;
|
|
for (i=0; i<KISS_PM_ALPHABET_SIZE; i++) {
|
|
if (trans_included[i] != NULL && trans_contains[i] == NULL) {
|
|
mismatch_pos = i;
|
|
break;
|
|
}
|
|
}
|
|
if (mismatch_pos < 0) {
|
|
// No mismatch - really contains
|
|
KISS_ASSERT(
|
|
should_contain,
|
|
("State %s contains %s, but the kiss_thin_nfa_are_trans_contained says it does not",
|
|
state_name(state_contains),
|
|
state_name(state_contains))
|
|
);
|
|
} else {
|
|
// Mismatch - doesn't contain
|
|
KISS_ASSERT(
|
|
!should_contain,
|
|
("State %s does not contain %s (%02x -> %s), but the kiss_thin_nfa_are_trans_contained says it does",
|
|
state_name(state_contains),
|
|
state_name(state_included),
|
|
(u_char)mismatch_pos,
|
|
state_name(trans_included[i]))
|
|
);
|
|
}
|
|
}
|
|
|
|
|
|
#define verify_trans_contains(state_contains, state_included, expected) \
|
|
verify_trans_contains_(state_contains, state_included, expected)
|
|
|
|
#else // KISS_THIN_NFA_DO_VERIFICATIONS
|
|
|
|
#define verify_trans_contains(state_contains, state_included, expected)
|
|
|
|
#endif // KISS_THIN_NFA_DO_VERIFICATIONS
|
|
|
|
|
|
// Do all transactions of "included" also exist in "contains"?
|
|
static BOOL
|
|
kiss_thin_nfa_are_trans_contained(kiss_thin_nfa_state_t *state_contains, kiss_thin_nfa_state_t *state_included)
|
|
{
|
|
kiss_thin_nfa_state_t *included_child, *contains_child;
|
|
|
|
verify_state(state_contains->comp, state_contains);
|
|
verify_state(state_included->comp, state_included);
|
|
|
|
if (state_contains->num_trans < state_included->num_trans) {
|
|
// "contains" has fewer states - it can't include all "included"
|
|
verify_trans_contains(state_contains, state_included, FALSE);
|
|
return FALSE;
|
|
}
|
|
|
|
// Advance both included_child and contains_child, to iterate both transition tables.
|
|
// Keep them in sync - included_child passes children one by one, and contains_child is advanced
|
|
// to the same transition character at each step.
|
|
contains_child = state_contains->child;
|
|
|
|
// Go over the transitions in "included", see if they're in "contained"
|
|
for (included_child = state_included->child; included_child != NULL; included_child = included_child->sibling) {
|
|
// Advance "tran_contains" until we reach the character we want
|
|
for (; contains_child != NULL; contains_child = contains_child->sibling) {
|
|
if (contains_child->tran_char >= included_child->tran_char) break;
|
|
}
|
|
|
|
// Do we have this character in "contains"?
|
|
if (contains_child == NULL || contains_child->tran_char != included_child->tran_char) {
|
|
// This character doesn't exist in state_contains
|
|
verify_trans_contains(state_contains, state_included, FALSE);
|
|
return FALSE;
|
|
}
|
|
}
|
|
|
|
verify_trans_contains(state_contains, state_included, TRUE);
|
|
return TRUE;
|
|
}
|
|
|
|
|
|
// Get the root state, or the anchored root state, as appropriate for the pattern.
|
|
static kiss_thin_nfa_state_t *
|
|
kiss_thin_nfa_get_root_state(struct thin_nfa_comp_s *nfa_comp, int anchored)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_get_root_state";
|
|
|
|
if (!anchored || !(nfa_comp->flags & THIN_NFA_ENABLE_ANCHOR_OPT)) {
|
|
thinnfa_debug(("%s: Using normal root: %s, feature %s\n", rname,
|
|
anchored?"anchored":"not anchored", (nfa_comp->flags & THIN_NFA_ENABLE_ANCHOR_OPT)?"enabled":"disabled"));
|
|
return nfa_comp->root_state;
|
|
}
|
|
|
|
if (!nfa_comp->anchored_root_state) {
|
|
// Lazy creation of the anchored root state
|
|
nfa_thin_state_flags_t flags = THIN_NFA_STATE_ROOT;
|
|
ENUM_SET_FLAG(flags, THIN_NFA_STATE_ANCHORED);
|
|
thinnfa_debug(("%s: Creating a new anchored root\n", rname));
|
|
nfa_comp->anchored_root_state = kiss_thin_nfa_state_create(nfa_comp, 0, NULL, flags);
|
|
if (nfa_comp->anchored_root_state == NULL) {
|
|
thinnfa_debug_err(("%s: Failed to create the anchored root state\n", rname));
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
thinnfa_debug(("%s: Returning the anchored root (%d)\n", rname, nfa_comp->anchored_root_state->state_id));
|
|
return nfa_comp->anchored_root_state;
|
|
}
|
|
|
|
|
|
// Find the state in the trie, which represents the longest prefix of a given string.
|
|
static kiss_thin_nfa_state_t *
|
|
kiss_thin_nfa_find_longest_prefix(struct thin_nfa_comp_s *nfa_comp, const u_char *text, u_int len, int anchored)
|
|
{
|
|
u_int offset;
|
|
kiss_thin_nfa_state_t *state;
|
|
|
|
// Following the path labeled by chars in 'pattern' (skip the states which already exist)
|
|
state = kiss_thin_nfa_get_root_state(nfa_comp, anchored);
|
|
if (!state) return NULL;
|
|
for (offset = 0; offset < len; offset++) {
|
|
kiss_thin_nfa_state_t *next_state;
|
|
u_char ch = kiss_thin_nfa_xlate_char(nfa_comp, text[offset]);
|
|
|
|
verify_state(nfa_comp, state);
|
|
|
|
// Do we have a node for the next character?
|
|
next_state = kiss_thin_nfa_comp_get_next_state(state, ch);
|
|
|
|
if (next_state == NULL) {
|
|
// No next state - this is as far as we go
|
|
break;
|
|
} else {
|
|
state = next_state;
|
|
}
|
|
}
|
|
|
|
return state;
|
|
}
|
|
|
|
|
|
// Add a newly allocated state to the trie. Keep the transition list sorted.
|
|
static void
|
|
kiss_thin_nfa_add_transition(kiss_thin_nfa_state_t *parent, u_char tran_char, kiss_thin_nfa_state_t *new_child)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_add_transition";
|
|
kiss_thin_nfa_state_t **child_p;
|
|
|
|
// Go over existing children and find the place to add the transition
|
|
for (child_p = &parent->child; *child_p != NULL; child_p = &(*child_p)->sibling) {
|
|
kiss_thin_nfa_state_t *child = *child_p;
|
|
if (child->tran_char > tran_char) {
|
|
// Add before this one
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Add the transition
|
|
new_child->sibling = *child_p;
|
|
*child_p = new_child;
|
|
new_child->tran_char = tran_char;
|
|
parent->num_trans++;
|
|
|
|
thinnfa_debug_extended(("%s: Added transition from %s by 0x%2x to %s\n", rname,
|
|
state_name(parent), tran_char, state_name(new_child)));
|
|
|
|
if (parent->num_trans > MIN(kiss_thin_nfa_max_partial_trans, KISS_BNFA_MAX_TRANS_NUM)) {
|
|
thinnfa_debug((
|
|
"%s: State %s has %d transitions - making it full\n",
|
|
rname,
|
|
state_name(parent),
|
|
parent->num_trans
|
|
));
|
|
make_state_full(parent);
|
|
}
|
|
|
|
// Track states which represent a maximal sequence of identical characters
|
|
if ((parent->flags & THIN_NFA_STATE_ROOT) && !(parent->flags & THIN_NFA_STATE_ANCHORED)) {
|
|
// Single character - all characters are identical
|
|
ENUM_SET_FLAG(new_child->flags, THIN_NFA_STATE_MAX_IDENTICAL_CHAR);
|
|
} else if ((parent->flags & THIN_NFA_STATE_MAX_IDENTICAL_CHAR) && (parent->tran_char == tran_char)) {
|
|
// The child, not the parent, is now the longest
|
|
ENUM_UNSET_FLAG(parent->flags, THIN_NFA_STATE_MAX_IDENTICAL_CHAR);
|
|
ENUM_SET_FLAG(new_child->flags, THIN_NFA_STATE_MAX_IDENTICAL_CHAR);
|
|
}
|
|
}
|
|
|
|
|
|
// Add a pattern to the trie, which would generate the Thin NFA.
|
|
// Upon failure, doesn't clean up states it may have created. Will be cleaned up when destroying nfa_comp.
|
|
static kiss_ret_val
|
|
kiss_thin_nfa_add_pattern_to_trie(struct thin_nfa_comp_s *nfa_comp, const kiss_pmglob_string_s *sm_cur_pattern)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_add_pattern_to_trie";
|
|
const u_char *pattern_text;
|
|
u_int pattern_len;
|
|
u_int i;
|
|
kiss_thin_nfa_state_t *current_state;
|
|
int anchored_pattern;
|
|
|
|
pattern_text = kiss_pmglob_string_get_pattern(sm_cur_pattern);
|
|
pattern_len = kiss_pmglob_string_get_size(sm_cur_pattern);
|
|
anchored_pattern = kiss_pmglob_string_get_flags(sm_cur_pattern) & KISS_PM_LSS_AT_BUF_START;
|
|
|
|
thinnfa_debug(("%s: Adding the pattern: %s flags=%x\n", rname, kiss_pmglob_string_to_debug_charp(sm_cur_pattern),
|
|
kiss_pmglob_string_get_flags(sm_cur_pattern)));
|
|
|
|
// How much of this pattern do we already have in the tree?
|
|
current_state = kiss_thin_nfa_find_longest_prefix(nfa_comp, pattern_text, pattern_len, anchored_pattern);
|
|
if (!current_state) return KISS_ERROR; // Messages printed inside
|
|
|
|
thinnfa_debug(("%s: State %s (flags %x) represents the longest prefix at the offset %d/%d\n", rname,
|
|
state_name(current_state), current_state->flags, current_state->depth, pattern_len));
|
|
|
|
// Go over the remaining bytes (if any) and add more states
|
|
for (i = current_state->depth; i < pattern_len; i++) {
|
|
kiss_thin_nfa_state_t *new_state;
|
|
u_char ch;
|
|
|
|
// Create a new state. Depth i+1, because the first character (i=0) is at depth 1.
|
|
new_state = kiss_thin_nfa_state_create(nfa_comp, i+1, pattern_text,
|
|
(nfa_thin_state_flags_t)(current_state->flags & THIN_NFA_STATE_ANCHORED));
|
|
if (!new_state) {
|
|
thinnfa_debug_err(("%s: Failed to allocate a new state\n", rname));
|
|
kiss_thin_nfa_set_comp_error(nfa_comp, "Failed to allocate a new state");
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
// Add a transition into the new state
|
|
ch = kiss_thin_nfa_xlate_char(nfa_comp, pattern_text[i]);
|
|
kiss_thin_nfa_add_transition(current_state, ch, new_state);
|
|
|
|
thinnfa_debug(("%s: Added new state+transition %s -> %s by 0x%02x offset %d\n", rname,
|
|
state_name(current_state), state_name(new_state), ch, i));
|
|
|
|
verify_state(nfa_comp, current_state);
|
|
|
|
// Add the following states after this one
|
|
current_state = new_state;
|
|
}
|
|
|
|
// Set state as finite and add the pattern ID to the list of patterns which this state accepts.
|
|
// Note: It's OK if the state isn't one we just added. E.g. the new pattern is a prefix of an existing one.
|
|
if (kiss_thin_nfa_state_set_match_pattern(current_state, sm_cur_pattern) != KISS_OK) {
|
|
thinnfa_debug_err((
|
|
"%s: Failed to save the pattern information for the state %s\n",
|
|
rname,
|
|
state_name(current_state)
|
|
));
|
|
kiss_thin_nfa_set_comp_error(nfa_comp, "Failed to save the pattern information for the state");
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
return KISS_OK;
|
|
}
|
|
|
|
|
|
// Find the transition from a state by a character, considering fail states.
|
|
// The state should alrady have its fail state calculated.
|
|
//
|
|
// Note: kiss_bnfa_build_full_trans_table may pass from_state=NULL. The result is returning the root, which is OK.
|
|
static kiss_thin_nfa_state_t *
|
|
kiss_thin_nfa_calc_transition(struct thin_nfa_comp_s *nfa_comp, kiss_thin_nfa_state_t *from_state, u_char tran_char)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_calc_transition";
|
|
kiss_thin_nfa_state_t *state;
|
|
|
|
// Go down the fail state chain, until we find a transition.
|
|
for (state = from_state; state != NULL; state = state->fail_state) {
|
|
kiss_thin_nfa_state_t *next_state;
|
|
|
|
// Look up in this state's transition table
|
|
next_state = kiss_thin_nfa_comp_get_next_state(state, tran_char);
|
|
if (next_state != NULL) {
|
|
if (state == from_state) {
|
|
thinnfa_debug_extended(("%s: Found transition from %s by 0x%02x to %s\n", rname,
|
|
state_name(from_state), tran_char, state_name(next_state)));
|
|
} else {
|
|
thinnfa_debug_extended((
|
|
"%s: Found transition from %s by 0x%02x to %s using the fail state %s\n",
|
|
rname,
|
|
state_name(from_state),
|
|
tran_char,
|
|
state_name(next_state),
|
|
state_name(state)
|
|
));
|
|
}
|
|
return next_state;
|
|
}
|
|
}
|
|
|
|
// We've gone down to the root, and found nothing - so the next state is the root.
|
|
thinnfa_debug_extended(("%s: No transition from %s by 0x%02x - going to root\n", rname,
|
|
state_name(from_state), tran_char));
|
|
return nfa_comp->root_state;
|
|
}
|
|
|
|
|
|
// A callback function prototype for kiss_thin_nfa_iterate_trans
|
|
typedef kiss_ret_val (*kiss_thin_nfa_iterate_trans_cb)(kiss_thin_nfa_state_t *from_state,
|
|
u_char tran_char, kiss_thin_nfa_state_t *to_state);
|
|
|
|
|
|
// Iterate all the transitions in the trie, in BFS order.
|
|
// Note: The callback will be called once per transition, i.e. once per state, except for the initial state.
|
|
static kiss_ret_val
|
|
kiss_thin_nfa_iterate_trans_bfs(struct thin_nfa_comp_s *nfa_comp, kiss_thin_nfa_iterate_trans_cb iter_cb)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_iterate_trans_bfs";
|
|
kiss_thin_nfa_state_t *bfs_q_head, *bfs_q_tail;
|
|
|
|
thinnfa_debug(("%s: Starting BFS iteration, %d states\n", rname, nfa_comp->state_num));
|
|
|
|
// This queue contains states, whose children we want to iterate.
|
|
// We start with the root state followed by the anchored root state.
|
|
bfs_q_head = nfa_comp->root_state;
|
|
bfs_q_head->bfs_q = NULL;
|
|
bfs_q_tail = bfs_q_head;
|
|
if (nfa_comp->anchored_root_state) {
|
|
bfs_q_tail->bfs_q = nfa_comp->anchored_root_state;
|
|
nfa_comp->anchored_root_state->bfs_q = NULL;
|
|
bfs_q_tail = nfa_comp->anchored_root_state;
|
|
}
|
|
|
|
// Dequeue each of the states, call the iterator for each transition and enqueue the children
|
|
while (bfs_q_head != NULL) {
|
|
kiss_thin_nfa_state_t *from_state;
|
|
kiss_thin_nfa_state_t *to_state;
|
|
|
|
// Dequeue a state from the head
|
|
from_state = bfs_q_head;
|
|
bfs_q_head = from_state->bfs_q;
|
|
if (bfs_q_head == NULL) bfs_q_tail = NULL;
|
|
|
|
thinnfa_debug_extended((
|
|
"%s: Got the state %s with %d children\n",
|
|
rname,
|
|
state_name(from_state),
|
|
from_state->num_trans
|
|
));
|
|
|
|
// Go over the state's transitions
|
|
for (to_state = from_state->child; to_state != NULL; to_state = to_state->sibling) {
|
|
thinnfa_debug_extended((
|
|
"%s: Got the child state %s at the depth %d\n",
|
|
rname,
|
|
state_name(to_state),
|
|
to_state->depth
|
|
));
|
|
|
|
// Call the iterator function
|
|
if (iter_cb(from_state, to_state->tran_char, to_state) != KISS_OK) {
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
// No need to enqueue states with no children
|
|
if (to_state->num_trans == 0) continue;
|
|
|
|
// Enqueue the next state, so we'd iterate its transitions too
|
|
to_state->bfs_q = NULL;
|
|
if (bfs_q_tail != NULL) {
|
|
bfs_q_tail->bfs_q = to_state;
|
|
} else {
|
|
bfs_q_head = to_state;
|
|
}
|
|
bfs_q_tail = to_state;
|
|
}
|
|
}
|
|
|
|
return KISS_OK;
|
|
}
|
|
|
|
|
|
// Set a state's fail state.
|
|
// To calculate this, we need the state's parent, and the character that takes us from the parent to the current.
|
|
// The parent's fail state must be calculated already.
|
|
static kiss_ret_val
|
|
kiss_thin_nfa_set_fail_state(kiss_thin_nfa_state_t *parent, u_char tran_char, kiss_thin_nfa_state_t *state)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_set_fail_state";
|
|
kiss_thin_nfa_state_t *fail_state;
|
|
|
|
// Calculate the fail state.
|
|
// The same character that takes us from parent to state would take us from parent->fail_state to state->fail_state
|
|
fail_state = kiss_thin_nfa_calc_transition(state->comp, parent->fail_state, tran_char);
|
|
state->fail_state = fail_state;
|
|
|
|
thinnfa_debug(("%s: The fail state of %s is %s (parent %s, parent->fail_state %s, char %02x)\n", rname,
|
|
state_name(state), state_name(fail_state), state_name(parent),
|
|
state_name(parent->fail_state), tran_char));
|
|
|
|
|
|
// If a state's fail state is finite, so is the state itself.
|
|
// This is because the fail state represents a suffix of the state, which is included in
|
|
// the suffix the state represents. If the shorter suffix is a match, so is the longer one.
|
|
// Example - The fail state of "abc" is "bc" (if it exists). If "bc" is a match, then so is "abc".
|
|
if (fail_state->flags & THIN_NFA_STATE_MATCH) {
|
|
thinnfa_debug(("%s: Fail state %s is finite - so is %s\n", rname,
|
|
state_name(fail_state), state_name(state)));
|
|
if (kiss_thin_nfa_state_copy_match_ids(state, fail_state)) {
|
|
thinnfa_debug_err((
|
|
"%s: Failed to copy the pattern IDs from %s to %s\n",
|
|
rname,
|
|
state_name(fail_state),
|
|
state_name(state)
|
|
));
|
|
kiss_thin_nfa_set_comp_error(state->comp, "Failed to copy the pattern IDs");
|
|
return KISS_ERROR;
|
|
}
|
|
}
|
|
|
|
// This isn't related to calculating fail states. It should be done after the trie was built, but before
|
|
// starting BNFA construction.
|
|
if (kiss_thin_nfa_optimize_contig_chars && (state->flags & THIN_NFA_STATE_MAX_IDENTICAL_CHAR)) {
|
|
// Optimization for identical character sequences. States which represent a maximal sequence of the same
|
|
// characters will be full. So for a long sequence of a single character, we'll always be in a full state.
|
|
// Great for the performance lab.
|
|
thinnfa_debug((
|
|
"%s: State %s is a maximal identical character sequence - making it full\n",
|
|
rname,
|
|
state_name(state)
|
|
));
|
|
make_state_full(state);
|
|
}
|
|
|
|
return KISS_OK;
|
|
}
|
|
|
|
|
|
// See if we can find a better fail state for a state.
|
|
// If the fail state contains only transitions the original state has anyway, we can use its fail state instead.
|
|
static kiss_thin_nfa_state_t *
|
|
kiss_thin_nfa_find_better_fail_state(kiss_thin_nfa_state_t *state)
|
|
{
|
|
kiss_thin_nfa_state_t *fail_state;
|
|
|
|
if (!state->fail_state) return NULL;
|
|
|
|
// Go down the fail state chain.
|
|
// Keep going as long as the states contain only transitions the current state has anyway.
|
|
for (fail_state = state->fail_state; fail_state->fail_state != NULL; fail_state = fail_state->fail_state) {
|
|
|
|
verify_state(state->comp, fail_state);
|
|
|
|
if (fail_state->flags & THIN_NFA_STATE_FULL) {
|
|
// Full state - failing to it will always give us the answer.
|
|
break;
|
|
}
|
|
|
|
if (!kiss_thin_nfa_are_trans_contained(state, fail_state)) {
|
|
// This state has transitions that the current state doesn't - we must fail to it,
|
|
// not lower.
|
|
break;
|
|
}
|
|
}
|
|
|
|
return fail_state;
|
|
}
|
|
|
|
|
|
// Change fail states to go faster up the tree, if possible.
|
|
// Normally, a fail state points one level upward. But sometimes it can be more upward.
|
|
//
|
|
// Note: This must be done after kiss_thin_nfa_set_fail_state was called for all states. This is because
|
|
// kiss_thin_nfa_set_fail_state uses the parent's fail state to calculate the child's. If the parent's fail stae
|
|
// was "reduced", we'll get the wrong fail state for the child.
|
|
static void
|
|
kiss_thin_nfa_reduce_fail_states(struct thin_nfa_comp_s *nfa_comp)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_reduce_fail_states";
|
|
kiss_thin_nfa_state_t *state;
|
|
|
|
for (state = nfa_comp->root_state; state != NULL; state = kiss_thin_nfa_get_subsequent_state(nfa_comp, state)) {
|
|
kiss_thin_nfa_state_t *fail_state;
|
|
|
|
if (state->flags & THIN_NFA_STATE_FULL) {
|
|
// A full state's fail state isn't interesting
|
|
continue;
|
|
}
|
|
|
|
fail_state = kiss_thin_nfa_find_better_fail_state(state);
|
|
if (fail_state != state->fail_state) {
|
|
// We have a better fail state
|
|
thinnfa_debug(("%s: Changing the fail state of %s from %s to %s\n", rname,
|
|
state_name(state), state_name(state->fail_state), state_name(fail_state)));
|
|
state->fail_state = fail_state;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// Calculate fail states for all states.
|
|
static kiss_ret_val
|
|
kiss_thin_nfa_calc_fail_states(struct thin_nfa_comp_s *nfa_comp)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_calc_fail_states";
|
|
|
|
// The root state has no fail state
|
|
nfa_comp->root_state->fail_state = NULL;
|
|
if (nfa_comp->anchored_root_state) {
|
|
// The anchored root fails to the root
|
|
nfa_comp->anchored_root_state->fail_state = nfa_comp->root_state;
|
|
}
|
|
|
|
thinnfa_debug(("%s: Calculating the fail states for all states\n", rname));
|
|
|
|
// Iterate all transitions, and calculate fail states for the target states.
|
|
// This would cover all states, except the initial (whose fail state was already set).
|
|
// BFS order assures that a parent's fail state is already calculated when we reach the child.
|
|
if (kiss_thin_nfa_iterate_trans_bfs(nfa_comp, kiss_thin_nfa_set_fail_state) != KISS_OK) {
|
|
thinnfa_debug_err(("%s: Failed to calculate the fail states\n", rname));
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
// All states now have their fail states calculated
|
|
ENUM_SET_FLAG(nfa_comp->flags, THIN_NFA_FAIL_STATES_CALCULATED);
|
|
|
|
// Optimization - reduce fail states
|
|
kiss_thin_nfa_reduce_fail_states(nfa_comp);
|
|
|
|
return KISS_OK;
|
|
}
|
|
|
|
|
|
// Set a state's BNFA offset to the size so far, and increment by the state size.
|
|
static void
|
|
set_state_offset(kiss_thin_nfa_state_t *state, kiss_bnfa_offset_t *cur_offset)
|
|
{
|
|
static const char rname[] = "set_state_offset";
|
|
u_int state_size=0, match_size=0;
|
|
|
|
verify_state(state->comp, state);
|
|
|
|
if (state->bnfa_offset == KISS_BNFA_OFFSET_INVALID) {
|
|
// Room for the actual state - negative offset for full states, positive for partial.
|
|
if ((state->flags & THIN_NFA_STATE_FULL) && (*cur_offset<0)) {
|
|
state_size = sizeof(kiss_bnfa_full_state_t);
|
|
} else if (!(state->flags & THIN_NFA_STATE_FULL) && (*cur_offset>=0)) {
|
|
state_size = kiss_bnfa_partial_state_size(state->num_trans);
|
|
}
|
|
}
|
|
|
|
if (state->bnfa_incoming_off == KISS_BNFA_OFFSET_INVALID) {
|
|
// Room for a match state - if needed, must be a positive offset.
|
|
if ((state->flags & THIN_NFA_STATE_MATCH) && (*cur_offset >= 0)) {
|
|
match_size = sizeof(kiss_bnfa_match_state_t);
|
|
if (state->flags & THIN_NFA_STATE_FULL) {
|
|
// Need a jump state too
|
|
match_size += kiss_bnfa_partial_state_size(0);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Update the state offsets
|
|
if (match_size > 0) {
|
|
thinnfa_debug_extended(("%s: State %s was given a match offset %d size %d", rname, state_name(state),
|
|
*cur_offset, match_size));
|
|
state->bnfa_incoming_off = *cur_offset;
|
|
*cur_offset += match_size;
|
|
}
|
|
if (state_size > 0) {
|
|
thinnfa_debug_extended(("%s: State %s was given a real offset %d size %d", rname, state_name(state),
|
|
*cur_offset, state_size));
|
|
state->bnfa_offset = *cur_offset;
|
|
*cur_offset += state_size;
|
|
if (!(state->flags & THIN_NFA_STATE_MATCH)) {
|
|
// Incoming transitions go directly to the state
|
|
state->bnfa_incoming_off = state->bnfa_offset;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check if compressed offset fits full state offset size
|
|
static BOOL
|
|
comp_offset_fits_short(kiss_bnfa_comp_offset_t comp_offset)
|
|
{
|
|
if ((comp_offset) != (kiss_bnfa_short_offset_t)(comp_offset)) {
|
|
return FALSE;
|
|
}
|
|
return TRUE;
|
|
}
|
|
|
|
// Mark all child of a given state as reacheable as reachable from full state
|
|
static void
|
|
kiss_bnfa_mark_childs_reach_from_full(kiss_thin_nfa_state_t *state)
|
|
{
|
|
kiss_thin_nfa_state_t *child;
|
|
|
|
for (child = state->child; child != NULL; child = child->sibling) {
|
|
ENUM_SET_FLAG(child->flags, THIN_NFA_STATE_REACH_FROM_FULL);
|
|
}
|
|
}
|
|
|
|
// Mark all states that are reachable from a given full state,
|
|
// in order to place them at lower offsets to avoid possible overflow due to offset compression.
|
|
// If a state`s fail state is of partial type, mark it`s children too
|
|
static void
|
|
kiss_bnfa_mark_reachable_from_full(kiss_thin_nfa_state_t *state) {
|
|
|
|
kiss_bnfa_mark_childs_reach_from_full(state);
|
|
for (state = state->fail_state; state && !(state->flags & THIN_NFA_STATE_FULL); state = state->fail_state) {
|
|
kiss_bnfa_mark_childs_reach_from_full(state);
|
|
}
|
|
}
|
|
|
|
// Calcultate the offset of each BNFA state, and the entire BNFA size.
|
|
// Sets nfa_comp->offset_list to an array, holding the BNFA offset for each state at [state_id].
|
|
// Sets *bnfa_size_p to the total BNFA size.
|
|
static kiss_ret_val
|
|
kiss_bnfa_calc_offsets(struct thin_nfa_comp_s *nfa_comp)
|
|
{
|
|
static const char rname[] = "kiss_bnfa_calc_offsets";
|
|
kiss_thin_nfa_state_t *state;
|
|
kiss_bnfa_offset_t cur_offset;
|
|
|
|
// Full states have negative offsets. So the first state's offset depends on the number of full states.
|
|
cur_offset = -(kiss_bnfa_offset_t)(nfa_comp->full_state_num * sizeof(kiss_bnfa_full_state_t));
|
|
nfa_comp->min_bnfa_off = cur_offset;
|
|
|
|
// Put the anchored root state first, because it's the initial state
|
|
if (nfa_comp->anchored_root_state) {
|
|
KISS_ASSERT(nfa_comp->anchored_root_state->flags & THIN_NFA_STATE_FULL,
|
|
"%s: The anchored root %s must be a full state\n", rname, state_name(nfa_comp->anchored_root_state));
|
|
set_state_offset(nfa_comp->anchored_root_state, &cur_offset);
|
|
}
|
|
|
|
// If there's no anchored root, then root must be initial. If there is, validation expects it second.
|
|
set_state_offset(nfa_comp->root_state, &cur_offset);
|
|
|
|
// in this loop we add only the full states, which have negative offsets
|
|
for (state = nfa_comp->root_state; state != NULL; state = kiss_thin_nfa_get_subsequent_state(nfa_comp, state)) {
|
|
if (state->flags & THIN_NFA_STATE_FULL) {
|
|
kiss_bnfa_mark_reachable_from_full(state); // Mark child states so they'll get low offsets
|
|
set_state_offset(state, &cur_offset);
|
|
}
|
|
}
|
|
// We added all full states and moving to partials - we must be at offset 0.
|
|
KISS_ASSERT(cur_offset==0,
|
|
"%s: Offset %d != 0 after adding %d full states\n", rname, cur_offset, nfa_comp->full_state_num);
|
|
|
|
// in this loop we add states that are reachable from full states. We want them at low offsets to avoid
|
|
// possible overflow due to offset compression
|
|
for (state = nfa_comp->root_state; state != NULL; state = kiss_thin_nfa_get_subsequent_state(nfa_comp, state)) {
|
|
if (state->flags & THIN_NFA_STATE_REACH_FROM_FULL){
|
|
set_state_offset(state, &cur_offset);
|
|
}
|
|
}
|
|
|
|
// Make sure we have not exceede the limit of offsets that can be compressed to 16bit
|
|
// Note: the test is a little too strict - we check the first state that is not reachable from a full state
|
|
// instead of the last state that is reachable
|
|
if (!comp_offset_fits_short(kiss_bnfa_offset_compress(cur_offset))) {
|
|
thinnfa_debug_err(("%s: Current offset is %d, not reachable from the full state\n", rname, cur_offset));
|
|
kiss_thin_nfa_set_comp_error(nfa_comp, "Exceeded the limit of reachable states");
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
// in this loop we add the partial and mathing states, which weren't handled in the loop above.
|
|
for (state = nfa_comp->root_state; state != NULL; state = kiss_thin_nfa_get_subsequent_state(nfa_comp, state)) {
|
|
set_state_offset(state, &cur_offset);
|
|
}
|
|
// The current offset is the size of partial states. Add the full state size to get the total size.
|
|
nfa_comp->max_bnfa_off = cur_offset;
|
|
|
|
thinnfa_debug_major(("%s: BNFA size - %u full states, %u partial states, total %u bytes\n", rname,
|
|
nfa_comp->full_state_num,
|
|
nfa_comp->state_num-nfa_comp->full_state_num,
|
|
nfa_comp->max_bnfa_off - nfa_comp->min_bnfa_off));
|
|
|
|
return KISS_OK;
|
|
}
|
|
|
|
|
|
// Get a state's BNFA offset.
|
|
// skip_match makes a difference for matching states:
|
|
// TRUE - Get the actual state, where the transition table is.
|
|
// FALSE - Get the match state, where incoming transitions should go.
|
|
static kiss_bnfa_offset_t
|
|
state_bnfa_offset(kiss_thin_nfa_state_t *state, BOOL skip_match)
|
|
{
|
|
return skip_match ? state->bnfa_offset : state->bnfa_incoming_off;
|
|
}
|
|
|
|
|
|
// Convert a BNFA offset to a BNFA state pointer
|
|
static kiss_bnfa_state_t *
|
|
comp_bnfa_offset_to_state(struct thin_nfa_comp_s *nfa_comp, kiss_bnfa_offset_t bnfa_offset)
|
|
{
|
|
return kiss_bnfa_offset_to_state_write(nfa_comp->runtime_nfa->bnfa, bnfa_offset);
|
|
}
|
|
|
|
|
|
// Get a pointer to a state in the BNFA.
|
|
// skip_match makes a difference for matching states:
|
|
// TRUE - Get the actual state, where the transition table is.
|
|
// FALSE - Get the match state, where incoming transitions should go.
|
|
static kiss_bnfa_state_t *
|
|
comp_to_bnfa_state(kiss_thin_nfa_state_t *state, BOOL skip_match)
|
|
{
|
|
return comp_bnfa_offset_to_state(state->comp, state_bnfa_offset(state, skip_match));
|
|
}
|
|
|
|
// Move next to state_bnfa_offset. assert inside.
|
|
static kiss_bnfa_short_offset_t
|
|
state_bnfa_short_offset(kiss_thin_nfa_state_t *state)
|
|
{
|
|
static const char rname[] = "state_bnfa_short_offset";
|
|
kiss_bnfa_comp_offset_t comp_offset = kiss_bnfa_offset_compress(state_bnfa_offset(state, FALSE));
|
|
|
|
KISS_ASSERT(comp_offset_fits_short(comp_offset),
|
|
"%s: Compressed offset %d exceeds the allowed size\n", rname, comp_offset);
|
|
|
|
return (kiss_bnfa_short_offset_t) comp_offset;
|
|
}
|
|
|
|
|
|
// If character translation is enabled, duplicate ch's transition to all equivalents
|
|
static void
|
|
add_equivalent_transitions(struct thin_nfa_comp_s *nfa_comp, kiss_bnfa_full_state_t *bnfa_state, u_char ch)
|
|
{
|
|
static const char rname[] = "add_equivalent_transitions";
|
|
u_char other_ch;
|
|
u_int group_size;
|
|
|
|
if (!nfa_comp->xlation_tab) return;
|
|
|
|
// Go over all characters within the same group
|
|
group_size = 0;
|
|
for (other_ch = nfa_comp->xlation_tab->rev[ch]; other_ch != ch; other_ch = nfa_comp->xlation_tab->rev[other_ch]) {
|
|
thinnfa_debug_extended(("%s: Setting translated transition by %02x - same as %02x\n", rname, other_ch, ch));
|
|
|
|
bnfa_state->transitions[other_ch] = bnfa_state->transitions[ch];
|
|
|
|
// Prevent looping in case the table is corrupt
|
|
group_size++;
|
|
KISS_ASSERT_CRASH(group_size <= KISS_PM_ALPHABET_SIZE,
|
|
"%s: Too many characters to translate into %02x\n", rname, ch);
|
|
}
|
|
}
|
|
|
|
|
|
// Add a transition to a full transition table.
|
|
// If there's a translation table, add trnasitions for all equivalent characters.
|
|
static void
|
|
add_full_transition(
|
|
struct thin_nfa_comp_s *nfa_comp,
|
|
kiss_bnfa_full_state_t *bnfa_state,
|
|
kiss_thin_nfa_state_t *next_state
|
|
)
|
|
{
|
|
static const char rname[] = "add_full_transition";
|
|
u_char ch = next_state->tran_char;
|
|
|
|
thinnfa_debug_extended(("%s: Setting the transition by %02x to %s\n", rname,
|
|
next_state->tran_char, state_name(next_state)));
|
|
|
|
// Set the transition, for ch and equivalent characters
|
|
bnfa_state->transitions[ch] = state_bnfa_short_offset(next_state);
|
|
add_equivalent_transitions(nfa_comp, bnfa_state, ch);
|
|
}
|
|
|
|
#if !defined(KERNEL)
|
|
|
|
// A recursive algorithm to build full state tables.
|
|
// Much faster than the previous algorithm, but shouldn't be used in the kernel.
|
|
// Allow mutual recursion between these two functions:
|
|
static void build_full_trans_table(kiss_thin_nfa_state_t *comp_state);
|
|
static void get_full_trans_table(kiss_thin_nfa_state_t *target_state, kiss_thin_nfa_state_t *source_state);
|
|
|
|
|
|
// Get the transition table of source_state and write it in target_state's.
|
|
// source_state is somewhere in the fail state chain of target_state.
|
|
static void
|
|
get_full_trans_table(kiss_thin_nfa_state_t *target_state, kiss_thin_nfa_state_t *source_state)
|
|
{
|
|
kiss_thin_nfa_state_t *child;
|
|
kiss_bnfa_state_t *target_bnfa = comp_to_bnfa_state(target_state, TRUE);
|
|
|
|
if (source_state != target_state && (source_state->flags & THIN_NFA_STATE_FULL)) {
|
|
// We've reached a full state - just copy its transition table (build it first, if needed)
|
|
build_full_trans_table(source_state);
|
|
bcopy(comp_to_bnfa_state(source_state, TRUE)->full.transitions,
|
|
target_bnfa->full.transitions,
|
|
sizeof(target_bnfa->full.transitions));
|
|
return;
|
|
}
|
|
|
|
// Start with our fail state's state table
|
|
if (source_state->fail_state) {
|
|
get_full_trans_table(target_state, source_state->fail_state);
|
|
} else {
|
|
int i;
|
|
kiss_bnfa_short_offset_t root_bnfa_comp_offset = state_bnfa_short_offset(source_state);
|
|
|
|
// Reached the root - fill with transitions to root
|
|
for (i=0; i<KISS_PM_ALPHABET_SIZE; i++) {
|
|
target_bnfa->full.transitions[i] = root_bnfa_comp_offset;
|
|
}
|
|
}
|
|
|
|
// Override transitions which exist in this state
|
|
for (child = source_state->child; child != NULL; child = child->sibling) {
|
|
add_full_transition(target_state->comp, &target_bnfa->full, child);
|
|
}
|
|
}
|
|
|
|
|
|
// Recursive function for building a full state's state table.
|
|
// target_bnfa_state is the state who's table we're building.
|
|
// source_state changes when recursing over the tail state chain
|
|
static void
|
|
build_full_trans_table(kiss_thin_nfa_state_t *comp_state)
|
|
{
|
|
if (comp_state->flags & THIN_NFA_STATE_BUILT_TABLE) return;
|
|
|
|
get_full_trans_table(comp_state, comp_state);
|
|
|
|
ENUM_SET_FLAG(comp_state->flags, THIN_NFA_STATE_BUILT_TABLE);
|
|
}
|
|
|
|
#endif // KERNEL
|
|
|
|
static CP_INLINE kiss_ret_val
|
|
verify_add_state(kiss_thin_nfa_state_t *comp_state, kiss_bnfa_state_t *bnfa_state, u_int state_size,
|
|
const char *caller, const char *type)
|
|
{
|
|
const KissThinNFA *nfa_h = comp_state->comp->runtime_nfa.get();
|
|
kiss_bnfa_offset_t bnfa_offset = (char *)bnfa_state - (char *)(nfa_h->bnfa);
|
|
u_int state_alignment = (bnfa_offset < 0) ? sizeof(kiss_bnfa_full_state_t) : KISS_BNFA_STATE_ALIGNMENT;
|
|
|
|
if ((bnfa_offset < nfa_h->min_bnfa_offset) || (bnfa_offset+(int)state_size > nfa_h->max_bnfa_offset)) {
|
|
thinnfa_debug_err(("%s: Cannot add the %s state %s at the offset %d:%d - out of range %d:%d\n", caller, type,
|
|
state_name(comp_state),
|
|
bnfa_offset, bnfa_offset+state_size,
|
|
nfa_h->min_bnfa_offset, nfa_h->max_bnfa_offset));
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
if ((bnfa_offset % state_alignment) != 0) {
|
|
thinnfa_debug_err((
|
|
"%s: Cannot add the %s state %s at the offset %d:%d - not aligned on %d bytes\n",
|
|
caller,
|
|
type,
|
|
state_name(comp_state),
|
|
bnfa_offset,
|
|
bnfa_offset + state_size,
|
|
state_alignment
|
|
));
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
thinnfa_debug(("%s: Adding the %s state %s, offsets %d:%d\n", caller, type,
|
|
state_name(comp_state),
|
|
bnfa_offset, bnfa_offset+state_size));
|
|
|
|
return KISS_OK;
|
|
}
|
|
|
|
|
|
// Old, non-recursive and slow version on build_full_trans_table.
|
|
static void
|
|
build_full_trans_table_no_recursion(kiss_thin_nfa_state_t *comp_state)
|
|
{
|
|
static const char rname[] = "build_full_trans_table_no_recursion";
|
|
struct thin_nfa_comp_s *nfa_comp = comp_state->comp;
|
|
kiss_bnfa_state_t *bnfa_state = comp_to_bnfa_state(comp_state, TRUE);
|
|
kiss_thin_nfa_state_t *child;
|
|
u_int i;
|
|
|
|
// Go over all characters. Maintain a pointer to the next transition in the list.
|
|
// We rely on the list being sorted.
|
|
// We could simply call kiss_thin_nfa_calc_transition for each character. But it would look up again and
|
|
// again in the current state.
|
|
child = comp_state->child;
|
|
for (i = 0; i < KISS_PM_ALPHABET_SIZE; i++) {
|
|
u_char ch = (u_char)i;
|
|
kiss_thin_nfa_state_t *next_state;
|
|
|
|
// Check if it's a canonic character (e.g. lowercase when we're case insensitive)
|
|
if (kiss_thin_nfa_xlate_char(nfa_comp, ch) != ch) {
|
|
// We'll fill this in when we reach the canonic character.
|
|
continue;
|
|
}
|
|
|
|
if (child != NULL && child->tran_char == ch) {
|
|
// Use the explicit transition
|
|
next_state = child;
|
|
|
|
// Go forward in the transition table
|
|
child = child->sibling;
|
|
|
|
thinnfa_debug_extended(("%s: Setting the explicit transition by %02x to %s\n", rname,
|
|
ch, state_name(next_state)));
|
|
} else {
|
|
// Note: if comp_state is the initial, we pass from_state=NULL.
|
|
// This works as desired (returning the initial state).
|
|
next_state = kiss_thin_nfa_calc_transition(nfa_comp, comp_state->fail_state, ch);
|
|
|
|
thinnfa_debug_extended(("%s: Setting the fail-state transition by %02x to %s\n", rname,
|
|
ch, state_name(next_state)));
|
|
}
|
|
|
|
// Set the transition for this character and equivalents
|
|
bnfa_state->full.transitions[ch] = state_bnfa_short_offset(next_state);
|
|
add_equivalent_transitions(nfa_comp, &bnfa_state->full, ch);
|
|
}
|
|
ENUM_SET_FLAG(comp_state->flags, THIN_NFA_STATE_BUILT_TABLE);
|
|
}
|
|
|
|
|
|
// Build a full state's transition table in the BNFA.
|
|
// Either uses the explicit transition, or calculates using fail states.
|
|
static kiss_ret_val
|
|
kiss_bnfa_build_full_state(kiss_thin_nfa_state_t *comp_state)
|
|
{
|
|
static const char rname[] = "kiss_bnfa_build_full_state";
|
|
kiss_bnfa_state_t *bnfa_state = comp_to_bnfa_state(comp_state, TRUE);
|
|
|
|
if (verify_add_state(comp_state, bnfa_state, sizeof(kiss_bnfa_full_state_t), rname, "full") != KISS_OK) {
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
#if !defined(KERNEL)
|
|
if (comp_state->comp->flags & THIN_NFA_USE_RECURSIVE_COMPILE) {
|
|
build_full_trans_table(comp_state);
|
|
return KISS_OK;
|
|
}
|
|
#endif // KERNEL
|
|
|
|
build_full_trans_table_no_recursion(comp_state);
|
|
|
|
return KISS_OK;
|
|
}
|
|
|
|
|
|
static void
|
|
kiss_bnfa_build_partial_state_header(
|
|
kiss_bnfa_partial_state_t *bnfa_state,
|
|
u_int trans_num,
|
|
kiss_bnfa_offset_t fail_offset
|
|
)
|
|
{
|
|
bnfa_state->type = KISS_BNFA_STATE_PARTIAL;
|
|
bnfa_state->trans_num = trans_num;
|
|
bnfa_state->fail_state_offset = kiss_bnfa_offset_compress(fail_offset);
|
|
}
|
|
|
|
|
|
// Build a partial state's transition table in the BNFA.
|
|
// Temporary encoding - sets the state ID instead of the BNFA offset (which is yet unknown).
|
|
static kiss_ret_val
|
|
kiss_bnfa_build_partial_state(kiss_thin_nfa_state_t *comp_state)
|
|
{
|
|
static const char rname[] = "kiss_bnfa_build_partial_state";
|
|
kiss_bnfa_state_t *bnfa_state = comp_to_bnfa_state(comp_state, TRUE);
|
|
kiss_thin_nfa_state_t *child;
|
|
u_int trans_num;
|
|
|
|
if (verify_add_state(
|
|
comp_state,
|
|
bnfa_state,
|
|
kiss_bnfa_partial_state_size(comp_state->num_trans),
|
|
rname,
|
|
"partial"
|
|
) != KISS_OK) {
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
// Fill in the transition number and fail state
|
|
kiss_bnfa_build_partial_state_header(&bnfa_state->partial, comp_state->num_trans,
|
|
state_bnfa_offset(comp_state->fail_state, TRUE));
|
|
thinnfa_debug_extended(("%s: The fail state is %s\n", rname, state_name(comp_state->fail_state)));
|
|
|
|
// Build a transition for each existing character
|
|
trans_num = 0;
|
|
for (child = comp_state->child; child != NULL; child = child->sibling) {
|
|
thinnfa_debug_extended(("%s: Setting the transition by %02x to %s\n", rname,
|
|
child->tran_char, state_name(child)));
|
|
bnfa_state->partial.transitions[trans_num].tran_char = child->tran_char;
|
|
bnfa_state->partial.transitions[trans_num].next_state_offset =
|
|
kiss_bnfa_offset_compress(state_bnfa_offset(child, FALSE));
|
|
trans_num++;
|
|
}
|
|
KISS_ASSERT(trans_num == comp_state->num_trans, "%s: State %s should have %d transitions, but it has %d",
|
|
rname, state_name(comp_state), comp_state->num_trans, trans_num);
|
|
ENUM_SET_FLAG(comp_state->flags, THIN_NFA_STATE_BUILT_TABLE);
|
|
|
|
return KISS_OK;
|
|
}
|
|
|
|
|
|
// Build a match state.
|
|
static kiss_ret_val
|
|
kiss_bnfa_build_match_state(kiss_thin_nfa_state_t *comp_state, u_int match_id)
|
|
{
|
|
static const char rname[] = "kiss_bnfa_build_match_state";
|
|
kiss_bnfa_offset_t match_bnfa_offset = state_bnfa_offset(comp_state, FALSE);
|
|
kiss_bnfa_state_t *match_state = comp_bnfa_offset_to_state(comp_state->comp, match_bnfa_offset);
|
|
kiss_bnfa_offset_t following_state_offset, real_state_offset;
|
|
|
|
if (verify_add_state(comp_state, match_state, sizeof(kiss_bnfa_match_state_t), rname, "match") != KISS_OK) {
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
// Fill in the match state
|
|
match_state->match.type = KISS_BNFA_STATE_MATCH;
|
|
match_state->match.unused = 0;
|
|
match_state->match.match_id = match_id;
|
|
|
|
// Add a jump state if the real state isn't directly following the match state (i.e. for full-matching states).
|
|
real_state_offset = state_bnfa_offset(comp_state, TRUE);
|
|
following_state_offset = match_bnfa_offset + sizeof(kiss_bnfa_match_state_t);
|
|
if (following_state_offset != real_state_offset) {
|
|
kiss_bnfa_state_t *jump_state = comp_bnfa_offset_to_state(comp_state->comp, following_state_offset);
|
|
|
|
// Add a jump state (a 0-transition partial state) to the real state
|
|
if (verify_add_state(comp_state, jump_state, kiss_bnfa_partial_state_size(0), rname, "jump") != KISS_OK) {
|
|
return KISS_ERROR;
|
|
}
|
|
kiss_bnfa_build_partial_state_header(&jump_state->partial, 0, real_state_offset);
|
|
}
|
|
return KISS_OK;
|
|
}
|
|
|
|
|
|
// Encode a state in binary NFA form.
|
|
static kiss_ret_val
|
|
kiss_bnfa_add_state(kiss_thin_nfa_state_t *comp_state, u_int offset_in_pat_match_array)
|
|
{
|
|
if (comp_state->flags & THIN_NFA_STATE_MATCH) {
|
|
// Build a match state (a jump state too if needed)
|
|
if (kiss_bnfa_build_match_state(comp_state, offset_in_pat_match_array) != KISS_OK) return KISS_ERROR;
|
|
}
|
|
|
|
// Add the state
|
|
if (comp_state->flags & THIN_NFA_STATE_FULL) {
|
|
if (kiss_bnfa_build_full_state(comp_state) != KISS_OK) return KISS_ERROR;
|
|
} else {
|
|
if (kiss_bnfa_build_partial_state(comp_state) != KISS_OK) return KISS_ERROR;
|
|
}
|
|
|
|
return KISS_OK;
|
|
}
|
|
|
|
static uintptr_t
|
|
pat_key_hash_func(const void *key, CP_MAYBE_UNUSED void *info)
|
|
{
|
|
const kiss_thin_nfa_pattern_array_t *pat_arr = (const kiss_thin_nfa_pattern_array_t *)key;
|
|
const char* buf = (const char *)key;
|
|
const char *buf_end;
|
|
uintptr_t val = 0;
|
|
|
|
buf_end = buf + kiss_thin_nfa_pattern_array_size(pat_arr->n_patterns);
|
|
|
|
for ( ; buf != buf_end; buf++) {
|
|
val = ((val >> 3) ^ (val<<5)) + *buf;
|
|
}
|
|
return val;
|
|
}
|
|
|
|
static int
|
|
pat_key_cmp_func(const void *key1, const void *key2, CP_MAYBE_UNUSED void *info)
|
|
{
|
|
const kiss_thin_nfa_pattern_array_t *pat1 = (const kiss_thin_nfa_pattern_array_t *)key1;
|
|
const kiss_thin_nfa_pattern_array_t *pat2 = (const kiss_thin_nfa_pattern_array_t *)key2;
|
|
|
|
if (pat1->n_patterns != pat2->n_patterns) {
|
|
return 1; // No match
|
|
}
|
|
|
|
return memcmp(pat1, pat2, kiss_thin_nfa_pattern_array_size(pat1->n_patterns));
|
|
}
|
|
|
|
static u_int
|
|
pattern_list_len(const kiss_thin_nfa_pattern_list_t *pat_list)
|
|
{
|
|
const kiss_thin_nfa_pattern_list_t *pat;
|
|
u_int n = 0;
|
|
for (pat = pat_list; pat != NULL; pat = pat->next) {
|
|
n++;
|
|
}
|
|
return n;
|
|
}
|
|
|
|
static kiss_ret_val
|
|
kiss_bnfa_match_patterns_prepare(struct thin_nfa_comp_s *nfa_comp, KissThinNFA *nfa)
|
|
{
|
|
static const char rname[] = "kiss_bnfa_match_patterns_prepare";
|
|
kiss_thin_nfa_pattern_array_t *pat_arr;
|
|
kiss_thin_nfa_state_t *comp_state;
|
|
u_int total_size_for_patterns;
|
|
|
|
total_size_for_patterns = 0;
|
|
for (comp_state = nfa_comp->root_state;
|
|
comp_state != NULL;
|
|
comp_state = kiss_thin_nfa_get_subsequent_state(nfa_comp, comp_state)) {
|
|
if (comp_state->flags & THIN_NFA_STATE_MATCH) {
|
|
if (!comp_state->ids) {
|
|
thinnfa_debug_critical((
|
|
"%s: State %s is finite, but its IDs are null\n",
|
|
rname,
|
|
state_name(comp_state)
|
|
));
|
|
kiss_thin_nfa_set_comp_error(nfa_comp, "The state is finite, but its IDs are null");
|
|
return KISS_ERROR;
|
|
}
|
|
total_size_for_patterns += kiss_thin_nfa_pattern_array_size(pattern_list_len(comp_state->ids));
|
|
}
|
|
}
|
|
|
|
if (total_size_for_patterns == 0) {
|
|
thinnfa_debug_critical(("%s: no finite states?!\n", rname));
|
|
kiss_thin_nfa_set_comp_error(nfa_comp, "no finite states?!");
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
// We allocate according to maximum possible size.
|
|
// We might reduce it at the end, if duplicates exist.
|
|
thinnfa_debug(("%s: alocating %u bytes for a pattern array\n", rname, total_size_for_patterns));
|
|
pat_arr = (kiss_thin_nfa_pattern_array_t *)kiss_pmglob_memory_kmalloc_ex(
|
|
total_size_for_patterns,
|
|
rname,
|
|
FW_KMEM_SLEEP
|
|
);
|
|
if (!pat_arr) {
|
|
thinnfa_debug_critical((
|
|
"%s: failed to allocate %d bytes for a complete pattern array\n",
|
|
rname,
|
|
total_size_for_patterns
|
|
));
|
|
kiss_thin_nfa_set_comp_error(nfa_comp, "Failed to allocate memory for a complete pattern array");
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
nfa->pattern_arrays = pat_arr;
|
|
nfa->pattern_arrays_size = total_size_for_patterns;
|
|
|
|
nfa_comp->patterns_hash = kiss_hash_create_with_ksleep(
|
|
nfa->match_state_num,
|
|
pat_key_hash_func,
|
|
pat_key_cmp_func,
|
|
NULL
|
|
);
|
|
if (!nfa_comp->patterns_hash) {
|
|
thinnfa_debug((
|
|
"%s: failed to create patterns hash table for %u finite states\n",
|
|
rname,
|
|
nfa->match_state_num
|
|
));
|
|
kiss_thin_nfa_set_comp_error(nfa_comp, "Failed to create patterns hash table for finite states");
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
return KISS_OK;
|
|
}
|
|
|
|
static kiss_ret_val
|
|
kiss_bnfa_match_patterns_finalize(struct thin_nfa_comp_s *nfa_comp, KissThinNFA *nfa, u_int new_size)
|
|
{
|
|
static const char rname[] = "kiss_bnfa_match_patterns_finalize";
|
|
kiss_thin_nfa_pattern_array_t *new_pat_arr;
|
|
|
|
// Compact the match patter array, if needed
|
|
if (new_size == nfa->pattern_arrays_size) {
|
|
thinnfa_debug(("%s: no size change - the pattern array size is %u bytes\n", rname, new_size));
|
|
return KISS_OK;
|
|
}
|
|
|
|
if (new_size > nfa->pattern_arrays_size) {
|
|
thinnfa_debug_critical((
|
|
"%s: new pattern array size (%u) is greater than the current size (%u). This should not happen.\n",
|
|
rname,
|
|
new_size,
|
|
nfa->pattern_arrays_size
|
|
));
|
|
kiss_thin_nfa_set_comp_error(nfa_comp, "Failed to allocate a complete pattern array");
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
new_pat_arr = (kiss_thin_nfa_pattern_array_t *)kiss_pmglob_memory_kmalloc_ex(new_size, rname, FW_KMEM_SLEEP);
|
|
if (!new_pat_arr) {
|
|
thinnfa_debug_critical(("%s: failed to allocate %d bytes for a complete pattern array\n", rname, new_size));
|
|
kiss_thin_nfa_set_comp_error(nfa_comp, "Failed to allocate a complete pattern array");
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
thinnfa_debug(("%s: reducing the size from %u to %u\n", rname, nfa->pattern_arrays_size, new_size));
|
|
bcopy(nfa->pattern_arrays, new_pat_arr, new_size);
|
|
kiss_pmglob_memory_kfree(nfa->pattern_arrays, nfa->pattern_arrays_size, rname);
|
|
nfa->pattern_arrays = new_pat_arr;
|
|
nfa->pattern_arrays_size = new_size;
|
|
return KISS_OK;
|
|
}
|
|
|
|
|
|
static kiss_ret_val
|
|
kiss_bnfa_copy_pat_list(
|
|
const KissThinNFA *nfa,
|
|
kiss_hash_t patterns_hash,
|
|
kiss_thin_nfa_state_t *comp_state,
|
|
u_int *last_used_offset_in_pat_match_array,
|
|
u_int *offset_for_cur_state)
|
|
{
|
|
static const char rname[] = "kiss_bnfa_copy_pat_list";
|
|
|
|
if (comp_state->flags & THIN_NFA_STATE_MATCH) {
|
|
kiss_thin_nfa_pattern_list_t *pat_list_ent;
|
|
kiss_thin_nfa_pattern_array_t *pat_arr;
|
|
kiss_thin_nfa_pattern_array_t *cached_pat_arr;
|
|
u_int pat_arr_size;
|
|
u_int n_patterns;
|
|
u_int i;
|
|
|
|
n_patterns = pattern_list_len(comp_state->ids);
|
|
pat_arr_size = kiss_thin_nfa_pattern_array_size(n_patterns);
|
|
|
|
if ((*last_used_offset_in_pat_match_array + pat_arr_size) > nfa->pattern_arrays_size) {
|
|
thinnfa_debug_critical(("%s: offset (%u) + required size (%u) exceeds the total array size (%u)\n",
|
|
rname, *last_used_offset_in_pat_match_array, pat_arr_size, nfa->pattern_arrays_size));
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
pat_arr = kiss_thin_nfa_offset_to_pat_array_ptr(nfa, *last_used_offset_in_pat_match_array);
|
|
pat_arr->n_patterns = n_patterns;
|
|
for (i = 0, pat_list_ent = comp_state->ids; i < pat_arr->n_patterns; pat_list_ent = pat_list_ent->next, i++) {
|
|
bcopy(&(pat_list_ent->pattern), &(pat_arr->pattern[i]), sizeof(pat_list_ent->pattern));
|
|
}
|
|
|
|
kiss_thin_nfa_free_pattern_ids(comp_state->ids);
|
|
comp_state->ids = NULL; // Prevent release when the state is cleaned up
|
|
|
|
cached_pat_arr = (kiss_thin_nfa_pattern_array_t *)kiss_hash_lookkey(patterns_hash, pat_arr);
|
|
if (cached_pat_arr) {
|
|
u_int cached_offset;
|
|
cached_offset = kiss_thin_nfa_pat_array_ptr_to_offset(nfa, cached_pat_arr);
|
|
// No need to move the last_used_offset
|
|
*offset_for_cur_state = cached_offset;
|
|
thinnfa_debug((
|
|
"%s: returning cached offset of %u for the state ID %u. "
|
|
"%u patterns %u bytes. The offset stays at %u.\n",
|
|
rname,
|
|
*offset_for_cur_state,
|
|
comp_state->state_id,
|
|
n_patterns, pat_arr_size,
|
|
*last_used_offset_in_pat_match_array
|
|
));
|
|
} else {
|
|
*offset_for_cur_state = *last_used_offset_in_pat_match_array;
|
|
if (!kiss_hash_insert(patterns_hash, pat_arr, NULL)) {
|
|
thinnfa_debug(("%s: failed to insert a pattern into a hash (non-critical error)\n", rname));
|
|
}
|
|
*last_used_offset_in_pat_match_array += pat_arr_size;
|
|
thinnfa_debug((
|
|
"%s: returning the offset of %u for the state ID %u. %u patterns, %u bytes. The offset moved to %u.\n",
|
|
rname,
|
|
*offset_for_cur_state,
|
|
comp_state->state_id,
|
|
n_patterns, pat_arr_size,
|
|
*last_used_offset_in_pat_match_array
|
|
));
|
|
}
|
|
}
|
|
|
|
return KISS_OK;
|
|
}
|
|
|
|
|
|
static void
|
|
kiss_bnfa_update_state_depth(kiss_thin_nfa_state_t *comp_state)
|
|
{
|
|
struct kiss_thin_nfa_depth_map_s *map = &comp_state->comp->runtime_nfa->depth_map;
|
|
u_char depth = MIN(comp_state->depth, KISS_THIN_NFA_MAX_ENCODABLE_DEPTH);
|
|
kiss_bnfa_offset_t off;
|
|
|
|
// Update depth at the state's offset
|
|
off = comp_state->bnfa_offset;
|
|
map->offset0[kiss_bnfa_offset_compress(off)] = depth;
|
|
|
|
// Matching state? Update at the match state offset too.
|
|
off = comp_state->bnfa_incoming_off;
|
|
if (off == comp_state->bnfa_offset) return;
|
|
map->offset0[kiss_bnfa_offset_compress(off)] = depth;
|
|
|
|
// Full-matching state? Update at the jump state offset too.
|
|
off += sizeof(kiss_bnfa_match_state_t);
|
|
if (off == comp_state->bnfa_offset) return;
|
|
map->offset0[kiss_bnfa_offset_compress(off)] = depth;
|
|
}
|
|
|
|
|
|
// Based on thin_nfa_comp_s structure we have built, create a binary Thin NFA.
|
|
// Parameter:
|
|
// nfa_comp - the NFA's compilation data structure.
|
|
//
|
|
// Performance notes:
|
|
// This function takes most of the CPU time in the compilation process (in my tests, at least).
|
|
// Within it, time is divided about equally between full and partial states.
|
|
// Full states take about 40 times more time, but there are about 40 times more partial states.
|
|
// Overall, compilation time isn't bad, but there are surely optimization options.
|
|
// Idea - when constructing a full state, start by bcopy() of its fail state transitions. This would require
|
|
// filling the states in BFS order, which isn't done today.
|
|
static kiss_ret_val
|
|
kiss_bnfa_fill_states(struct thin_nfa_comp_s *nfa_comp)
|
|
{
|
|
static const char rname[] = "kiss_bnfa_fill_states";
|
|
KissThinNFA *nfa = nfa_comp->runtime_nfa.get();
|
|
kiss_thin_nfa_state_t *comp_state;
|
|
u_int last_used_offset_in_pat_match_array = 0;
|
|
|
|
thinnfa_debug(("%s: Filling BNFA %p size %d with %d states\n", rname,
|
|
nfa->bnfa_start, nfa->max_bnfa_offset-nfa->min_bnfa_offset, nfa_comp->state_num));
|
|
|
|
if (kiss_bnfa_match_patterns_prepare(nfa_comp, nfa) != KISS_OK) {
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
// Go over the states and build the BNFA representation
|
|
for (comp_state = nfa_comp->root_state;
|
|
comp_state != NULL;
|
|
comp_state = kiss_thin_nfa_get_subsequent_state(nfa_comp, comp_state)) {
|
|
u_int state_id = comp_state->state_id;
|
|
u_int offset_for_cur_state = (u_int)-1;
|
|
|
|
if (kiss_bnfa_copy_pat_list(
|
|
nfa, nfa_comp->patterns_hash, comp_state,
|
|
&last_used_offset_in_pat_match_array,
|
|
&offset_for_cur_state
|
|
) != KISS_OK) {
|
|
thinnfa_debug_critical((
|
|
"%s: kiss_bnfa_copy_pat_list() failed for the state %s\n",
|
|
rname,
|
|
state_name(comp_state)
|
|
));
|
|
kiss_thin_nfa_set_comp_error(nfa_comp, "kiss_bnfa_copy_pat_list() failed");
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
// Update the maximum pattern length (length = state depth)
|
|
if (comp_state->depth > nfa->max_pat_len) {
|
|
nfa->max_pat_len = comp_state->depth;
|
|
}
|
|
|
|
// Build the state
|
|
if (kiss_bnfa_add_state(comp_state, offset_for_cur_state) != KISS_OK) {
|
|
thinnfa_debug_critical(("%s: Failed to add the state %d\n", rname, state_id));
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
// Update the depth map
|
|
kiss_bnfa_update_state_depth(comp_state);
|
|
}
|
|
|
|
if (kiss_bnfa_match_patterns_finalize(nfa_comp, nfa, last_used_offset_in_pat_match_array) != KISS_OK) {
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
return KISS_OK;
|
|
}
|
|
|
|
|
|
static void
|
|
kiss_thin_nfa_fill_stats(struct thin_nfa_comp_s *nfa_comp)
|
|
{
|
|
struct kiss_thin_nfa_specific_stats_s *stats = &nfa_comp->runtime_nfa->stats.specific;
|
|
|
|
stats->num_of_states = nfa_comp->state_num;
|
|
stats->num_of_final_states = nfa_comp->match_state_num;
|
|
}
|
|
|
|
|
|
// Get the nfa_comp structure and build, according to it, the runtime Thin NFA structure.
|
|
static kiss_ret_val
|
|
kiss_thin_nfa_build_bnfa(struct thin_nfa_comp_s *nfa_comp, CP_MAYBE_UNUSED u_int compile_flags)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_build_bnfa";
|
|
|
|
thinnfa_debug_major(("%s: Converting the compiled Thin NFA to the binary form\n", rname));
|
|
|
|
// Get the list of all BNFA offsets
|
|
if (kiss_bnfa_calc_offsets(nfa_comp) != KISS_OK) {
|
|
thinnfa_debug_err(("%s: Error allocating the offset list\n", rname));
|
|
kiss_thin_nfa_set_comp_error(nfa_comp, "Failed to allocate the offset list");
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
// Allocate the runtime Thin NFA structure
|
|
nfa_comp->runtime_nfa = kiss_thin_nfa_create(
|
|
nfa_comp->match_state_num,
|
|
nfa_comp->min_bnfa_off,
|
|
nfa_comp->max_bnfa_off
|
|
);
|
|
if (!nfa_comp->runtime_nfa) {
|
|
thinnfa_debug_err(("%s: Error creating the NFA\n", rname));
|
|
kiss_thin_nfa_set_comp_error(nfa_comp, "Failed to allocate BNFA");
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
if (nfa_comp->anchored_root_state) {
|
|
ENUM_SET_FLAG(nfa_comp->runtime_nfa->flags, KISS_THIN_NFA_HAS_ANCHOR);
|
|
}
|
|
|
|
// Build the BNFA we'll use on runtime
|
|
if (kiss_bnfa_fill_states(nfa_comp) != KISS_OK) {
|
|
thinnfa_debug_err(("%s: kiss_bnfa_fill_states() failed\n", rname));
|
|
return KISS_ERROR;
|
|
}
|
|
|
|
// Copy the character translation table
|
|
if (nfa_comp->xlation_tab) {
|
|
bcopy(
|
|
nfa_comp->xlation_tab->tab,
|
|
nfa_comp->runtime_nfa->xlation_tab,
|
|
sizeof(nfa_comp->runtime_nfa->xlation_tab)
|
|
);
|
|
ENUM_SET_FLAG(nfa_comp->runtime_nfa->flags, KISS_THIN_NFA_USE_CHAR_XLATION);
|
|
}
|
|
|
|
kiss_thin_nfa_fill_stats(nfa_comp);
|
|
|
|
thinnfa_debug_major(("%s: Created the binary Thin NFA %p\n", rname, nfa_comp->runtime_nfa.get()));
|
|
return KISS_OK;
|
|
}
|
|
|
|
static void
|
|
kiss_thin_nfa_select_options(struct thin_nfa_comp_s *nfa_comp, CP_MAYBE_UNUSED u_int compile_flags)
|
|
{
|
|
ENUM_SET_FLAG(nfa_comp->flags, THIN_NFA_ENABLE_ANCHOR_OPT);
|
|
nfa_comp->full_state_tier_num = kiss_thin_nfa_full_tiers_num;
|
|
ENUM_SET_FLAG(nfa_comp->flags, THIN_NFA_USE_RECURSIVE_COMPILE);
|
|
return;
|
|
}
|
|
|
|
|
|
// Compiling the SM according to Aho-Corasick algorithm.
|
|
//
|
|
// The DFA has two types of states:
|
|
// 1. Full states - have a transition for each possible character.
|
|
// 2. Partial states - only have transitions for characters that take us forward in some string.
|
|
// For all other characters, a "fail state" is defined, and the transition is what that state would have done.
|
|
//
|
|
// Paraemters:
|
|
// patterns - a set of string patterns which the resulting automaton would search for.
|
|
// compile_flags - flags with the KISS_PM_COMP_ prefix.
|
|
// error - output - on failure, would be set to indicate the reason.
|
|
// Retuns NULL on error, pointer to a newly allocated handle on success.
|
|
std::unique_ptr<KissThinNFA>
|
|
kiss_thin_nfa_compile(const std::list<kiss_pmglob_string_s> &patterns, u_int compile_flags, KissPMError *error)
|
|
{
|
|
static const char rname[] = "kiss_thin_nfa_compile";
|
|
struct thin_nfa_comp_s *nfa_comp = NULL;
|
|
std::unique_ptr<KissThinNFA> nfa;
|
|
|
|
thinnfa_debug_major(("%s: Compiling a Thin NFA, flags=%x\n", rname, compile_flags));
|
|
|
|
// Creates a new kiss_thin_dfa_handle with initial state allocated
|
|
nfa_comp = kiss_thin_nfa_comp_create(error);
|
|
if (nfa_comp == NULL) {
|
|
thinnfa_debug_err(("%s: Failed to create a compile time structure\n", rname));
|
|
kiss_pm_error_set_details(error, KISS_PM_ERROR_INTERNAL, "Failed to allocate the compilation information");
|
|
goto finish;
|
|
}
|
|
|
|
// Enable some optimization flags as needed
|
|
kiss_thin_nfa_select_options(nfa_comp, compile_flags);
|
|
|
|
// Handle character translation - instead of converting to lowercase, build a translation
|
|
// tabel and use it when adding patterns to the trie and building transition tables.
|
|
if (kiss_thin_nfa_create_xlation_tab(nfa_comp, compile_flags) != KISS_OK) {
|
|
thinnfa_debug_err(("%s: Function kiss_thin_nfa_create_xlation_tab() failed\n", rname));
|
|
goto finish;
|
|
}
|
|
|
|
// Build a trie which contains all the pattern texts.
|
|
for (auto &pattern : patterns) {
|
|
// Adding each pattern to the the Thin NFA - Aho-Corasick first phase
|
|
if (kiss_thin_nfa_add_pattern_to_trie(nfa_comp, &pattern) != KISS_OK) {
|
|
thinnfa_debug_err(("%s: Function kiss_thin_nfa_add_pattern_to_trie() failed\n", rname));
|
|
goto finish;
|
|
}
|
|
}
|
|
|
|
// Calculate fail states for all NFA states
|
|
if (kiss_thin_nfa_calc_fail_states(nfa_comp) != KISS_OK) {
|
|
thinnfa_debug_err(("%s: Function kiss_thin_nfa_calc_fail_states() failed\n", rname));
|
|
goto finish;
|
|
}
|
|
|
|
// Convert the compilation data structure to the runtime structure
|
|
if (kiss_thin_nfa_build_bnfa(nfa_comp, compile_flags) != KISS_OK) {
|
|
thinnfa_debug_err(("%s: Function kiss_thin_nfa_build_bnfa() failed\n", rname));
|
|
goto finish;
|
|
}
|
|
|
|
if (!kiss_thin_nfa_is_valid(nfa_comp->runtime_nfa.get())) {
|
|
thinnfa_debug_err(("%s: Function kiss_thin_nfa_is_valid() failed\n", rname));
|
|
goto finish;
|
|
}
|
|
|
|
// Get the resulting NFA (set NULL to protect from free)
|
|
nfa = std::move(nfa_comp->runtime_nfa);
|
|
thinnfa_debug_major(("%s: Successfully compiled the Thin NFA %p\n", rname, nfa.get()));
|
|
|
|
finish:
|
|
if (nfa_comp != NULL) {
|
|
// We destroy the compilation data structure, whether we succeed or fail.
|
|
kiss_thin_nfa_comp_destroy(nfa_comp);
|
|
}
|
|
return nfa;
|
|
}
|