openappsec/components/utils/pm/kiss_thin_nfa_compile.cc
2023-01-17 12:17:27 +02:00

2230 lines
84 KiB
C++

// Copyright (C) 2022 Check Point Software Technologies Ltd. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include "pm_adaptor.h"
#include "kiss_hash.h"
#include "kiss_thin_nfa_impl.h"
#include "kiss_patterns.h"
// Flag for a Thin NFA state
typedef enum {
THIN_NFA_STATE_FULL = 0x01, // We want a full state table for this state
THIN_NFA_STATE_MATCH = 0x02, // A matching state
THIN_NFA_STATE_ROOT = 0x04, // The root or anchored root state
THIN_NFA_STATE_MAX_IDENTICAL_CHAR = 0x08, // Maximal sequence of identical characters
THIN_NFA_STATE_ANCHORED = 0x10, // A part of the anchored tree
THIN_NFA_STATE_BUILT_TABLE = 0x20, // Already built the BNFA transition table
THIN_NFA_STATE_REACH_FROM_FULL = 0x40, // The state is reachable from full state
} nfa_thin_state_flags_t;
// A Thin NFA state, or a node in the trie, during compilation time
struct kiss_thin_nfa_state_s {
u_int state_id; // Sequencial number, starting from 0
nfa_thin_state_flags_t flags;
u_int depth; // Level in the trie
kiss_thin_nfa_pattern_list_t *ids; // For finite state, patterns associated with it
struct kiss_thin_nfa_state_s *bfs_q; // Use for a BFS iteration on the trie
struct thin_nfa_comp_s *comp; // Saves passing this pointer around
// Outgoing transitions
struct kiss_thin_nfa_state_s *child; // First child of this state
u_int num_trans; // Number of transitions
struct kiss_thin_nfa_state_s *fail_state;
// Incoming transition
struct kiss_thin_nfa_state_s *sibling; // Next child of this state's father
u_char tran_char; // The character that takes us to this state
// BNFA offset
kiss_bnfa_offset_t bnfa_offset; // Where the real state is
kiss_bnfa_offset_t bnfa_incoming_off; // Where incoming transitions should jump (possibly a match state)
// DEBUG ONLY
const u_char *pattern_text; // Points into the user's pattern list. Not null terminated
};
typedef struct kiss_thin_nfa_state_s kiss_thin_nfa_state_t;
// Blocks to hold states . A pretty simple pool mechanism.
// Not very much needed. We currently use it to iterate states by ID order, and for state pointer validation.
#define MAX_THIN_NFA_STATES_BLOCKS 1000
#define KISS_NFA_MAX_STATES_PER_BLOCK 1000
#define KISS_NFA_MAX_STATES_BLOCK_SIZE (KISS_NFA_MAX_STATES_PER_BLOCK * sizeof(kiss_thin_nfa_state_t))
// When do we want a full state? In the first X tiers (root included) and if more than Y transitions.
u_int kiss_thin_nfa_full_tiers_num_small = 2; // Old values, for PMs which must remain small
u_int kiss_thin_nfa_full_tiers_num_medium = 3; // Used for VSX / 32bit kernel, where memory is expensive
u_int kiss_thin_nfa_full_tiers_num = 7; // New value
u_int kiss_thin_nfa_max_partial_trans = 15; // Can't exceed KISS_BNFA_MAX_TRANS_NUM anyway
u_int kiss_thin_nfa_optimize_contig_chars = 1;
// Character translation table for caseless/digitless comparisons.
//
// The idea:
// Each character has a canonic character. This can be itself, or another.
// In a caseless Thin NFA, 'a' and '7' are canonic themselves, 'B' has canonic character 'b'.
// In a digitless Thin NFA, '7' is not canonic - its canonic character is '0'.
// Each character is also a member of a group, containing all characters with the same canonic character.
// In a caseless Thin NFA, 'a' and 'A' are in one group.
// In a digitless Thin NFA, all digits are in one group.
// Notice that a single Thin NFA can be caseless, digitless, neither or both.
//
// The data structure:
// tab - Translates each character into its canonic characer (possibly itself).
// rev - A linked list of characters belonging to the same group. The character itself
// is used instead of a pointer. The last character in the group points to the first.
// Example: For a caseless Thin NFA, rev['a']='A' and rev['A']='a'.
struct thin_nfa_char_trans_tab_s {
u_char tab[KISS_PM_ALPHABET_SIZE];
u_char rev[KISS_PM_ALPHABET_SIZE];
};
// Flags for an entire Thin NFA during compilation
typedef enum {
THIN_NFA_FAIL_STATES_CALCULATED = 0x01, // Once we set this, we expect all states to have fail states.
THIN_NFA_ENABLE_ANCHOR_OPT = 0x02, // Enable optimization for anochored states
THIN_NFA_USE_RECURSIVE_COMPILE = 0x04, // Build full states recursively. Faster, unsuitable for kernel
} thin_nfa_comp_flags_e;
// A Thin NFA which is under construction. The compiled BNFA is constructed from this later.
struct thin_nfa_comp_s {
kiss_thin_nfa_state_t *root_state; // The root state (somewhere inside state_blocks)
kiss_thin_nfa_state_t *anchored_root_state; // The root for anchored patterns
u_int full_state_tier_num; // How many tiers would be full states?
u_int state_num; // How many states do we have so far
u_int match_state_num; // How many matching states do we have?
u_int full_state_num; // How many full states do we have?
KissPMError *error; // Error to be returned to the user.
thin_nfa_comp_flags_e flags;
struct thin_nfa_char_trans_tab_s *xlation_tab; // Caseless/digitless translation table
kiss_thin_nfa_state_t *state_blocks[MAX_THIN_NFA_STATES_BLOCKS]; // Dynamically allocated memory for states
std::unique_ptr<KissThinNFA> runtime_nfa; // The final NFA we're building
kiss_hash_t patterns_hash; // Pattern array to offset mapping
kiss_bnfa_offset_t min_bnfa_off, max_bnfa_off;
};
#if defined(DEBUG)
#define KISS_THIN_NFA_DO_VERIFICATIONS
#endif
#define MAX_STATE_NAME_LEN 100
#define MAX_STATE_NAME_BUFS 4
static kiss_thin_nfa_state_t *
kiss_thin_nfa_get_state_by_id(struct thin_nfa_comp_s *nfa_comp, u_int state_id, const char *caller)
{
u_int block_index;
u_int index_in_block;
kiss_thin_nfa_state_t *block;
// Find the block and the place in the block
block_index = state_id / KISS_NFA_MAX_STATES_PER_BLOCK;
index_in_block = state_id % KISS_NFA_MAX_STATES_PER_BLOCK;
if (block_index >= MAX_THIN_NFA_STATES_BLOCKS) {
thinnfa_debug_critical(("%s: State %d - invalid block index %d (max %d)\n", caller,
state_id, block_index, MAX_THIN_NFA_STATES_BLOCKS));
return NULL;
}
block = nfa_comp->state_blocks[block_index];
if (block == NULL) {
thinnfa_debug_critical((
"%s: State %d - block index %d is not allocated yet\n",
caller,
state_id,
block_index
));
return NULL;
}
return &block[index_in_block];
}
// DEBUG FUNCTION - return a printable name for the state, in a static buffer.
// Accepts a NULL state.
static const char *
state_name(const kiss_thin_nfa_state_t *state)
{
static char buffers[MAX_STATE_NAME_BUFS][MAX_STATE_NAME_LEN];
static u_int next_buf = 0;
u_int cur_buf;
char *name, *p;
if (!state) {
// Happens when printing the root's fail state
return "NULL/-1";
}
// What's a state's name?
// Each state represents a prefix of one or more patterns. This prefix is the natural name for the state.
// We have the pattern text on the state. Its depth tells us how much of it do we need.
// We add the state ID as a suffix, to prevent ambiguituies (particularly for unprintable characters).
//Choose a buffer to use. Allows calling several times under a single debug message.
cur_buf = next_buf;
if (cur_buf >= MAX_STATE_NAME_BUFS) cur_buf = 0;
next_buf = cur_buf + 1;
name = buffers[cur_buf];
p = name;
if (state->flags & THIN_NFA_STATE_ANCHORED) {
// Prefix for anchored states
*p = '^';
p++;
}
// Fill in the state name. Not null-terminated meanwhile.
if (state->pattern_text == NULL) {
const char *state_name;
// Only the root makes sense. But deal with a missing pattern text anyway
state_name = (state->flags & THIN_NFA_STATE_ROOT) ? "ROOT" : "INVALID";
strcpy(p, state_name);
p += strlen(state_name);
} else {
u_int i;
// Normal state - use the relevant prefix of the pattern text
for (i=0; (i<state->depth) && (p<name+MAX_STATE_NAME_LEN-10); i++) {
*p = isprint(state->pattern_text[i]) ? state->pattern_text[i] : '.';
p++;
}
}
// Append the state ID. Removes ambituities (e.g. for unprintable characters)
snprintf(p, MAX_STATE_NAME_LEN-(p-name), "/%u", state->state_id);
name[MAX_STATE_NAME_LEN-1] = '\0';
return name;
}
#if defined(KISS_THIN_NFA_DO_VERIFICATIONS)
// DEBUG FUNCTION - Verify that a state pointer points to a valid state
static BOOL
is_valid_state_ptr(struct thin_nfa_comp_s *nfa_comp, kiss_thin_nfa_state_t *state, const char *caller)
{
kiss_thin_nfa_state_t *state_by_id;
if (!state) {
thinnfa_debug_critical(("%s: Null state pointer\n", caller));
return FALSE;
}
state_by_id = kiss_thin_nfa_get_state_by_id(nfa_comp, state->state_id, caller);
if (!state_by_id) {
return FALSE;
}
// Is the state where we expect it to be in the block?
if (state != state_by_id) {
thinnfa_debug_critical(("%s: State %p ID %d is invalid - should be at %p\n", caller, state,
state->state_id, state_by_id));
return FALSE;
}
return TRUE;
}
// DEBUG FUNCTION - Verify a state's transition table
static void
verify_state_ex(struct thin_nfa_comp_s *nfa_comp, kiss_thin_nfa_state_t *state, const char *caller)
{
kiss_thin_nfa_state_t *child, *prev_child;
u_int actual_tran_num;
// Is the pointer itself OK?
KISS_ASSERT(is_valid_state_ptr(nfa_comp, state, caller),
("%s: Invalid state pointer %p\n", caller, state));
// Go over the transition table
actual_tran_num = 0;
prev_child = NULL;
for (child = state->child; child != NULL; child = child->sibling) {
// Valid pointer?
KISS_ASSERT(is_valid_state_ptr(nfa_comp, child, caller),
("%s: State %s(%p) contains an invalid child %p after %02x\n", caller, state_name(state), state,
child, prev_child ? prev_child->tran_char : 0));
// Sorted in ascending order?
KISS_ASSERT(!prev_child || prev_child->tran_char < child->tran_char,
("%s: State %s(%p) transition %02x -> %s after %02x -> %s\n", caller,
state_name(state), state,
child->tran_char, state_name(child),
prev_child->tran_char, state_name(prev_child)));
actual_tran_num++;
if (actual_tran_num > state->num_trans) {
// We may be looping
break;
}
prev_child = child;
}
// Counter matches list?
KISS_ASSERT(actual_tran_num == state->num_trans,
("%s: State %s(%p) has %d transitions, but it should have %d\n", caller, state_name(state), state,
actual_tran_num, state->num_trans));
// Fail state?
if (nfa_comp->flags & THIN_NFA_FAIL_STATES_CALCULATED) {
if (state->fail_state == NULL) {
KISS_ASSERT(state == nfa_comp->root_state, ("%s: State %s has no fail state, but it is not root",
caller, state_name(state)));
} else {
KISS_ASSERT(
is_valid_state_ptr(nfa_comp, state->fail_state, caller),
"%s: State %s has an invalid fail state %p\n",
caller,
state_name(state),
state->fail_state
);
}
}
}
// Use this for sanity test on a state
#define verify_state(nfa_comp, state) verify_state_ex(nfa_comp, state, FILE_LINE)
#else // KISS_THIN_NFA_DO_VERIFICATIONS
// Verifications disabled
#define verify_state(nfa_comp, state)
#endif // KISS_THIN_NFA_DO_VERIFICATIONS
// Mark that a state needs to be full
static void
make_state_full(kiss_thin_nfa_state_t *state)
{
if (state->flags & THIN_NFA_STATE_FULL) return;
ENUM_SET_FLAG(state->flags, THIN_NFA_STATE_FULL);
state->comp->full_state_num++;
}
// Mark that a state is matching
static void
make_state_matching(kiss_thin_nfa_state_t *state)
{
if (state->flags & THIN_NFA_STATE_MATCH) return;
ENUM_SET_FLAG(state->flags, THIN_NFA_STATE_MATCH);
state->comp->match_state_num++;
}
// Allocate an empty state on an NFA.
// Initializes all fields to defaults.
static kiss_thin_nfa_state_t *
kiss_thin_nfa_state_create(
struct thin_nfa_comp_s *nfa_comp,
u_int depth,
const u_char *pattern_text,
nfa_thin_state_flags_t flags
)
{
static const char rname[] = "kiss_thin_nfa_state_create";
u_int state_id;
u_int block_index;
u_int index_in_block;
kiss_thin_nfa_state_t *block;
kiss_thin_nfa_state_t *state;
// Find the next ID and the block it should be in
state_id = nfa_comp->state_num;
block_index = state_id / KISS_NFA_MAX_STATES_PER_BLOCK;
index_in_block = state_id % KISS_NFA_MAX_STATES_PER_BLOCK;
thinnfa_debug_extended(("%s: Adding state %d depth %d\n", rname, state_id, depth));
// No more possible blocks?
if (block_index >= MAX_THIN_NFA_STATES_BLOCKS) {
thinnfa_debug_err(("%s: State %d in block %d exceeds the limit %d\n", rname,
state_id, block_index, MAX_THIN_NFA_STATES_BLOCKS));
return NULL;
}
// Allocate the block if needed (first state in the block)
block = nfa_comp->state_blocks[block_index];
if (block == NULL) {
block = (kiss_thin_nfa_state_t *)fw_kmalloc_ex(KISS_NFA_MAX_STATES_BLOCK_SIZE, rname, FW_KMEM_SLEEP);
if (block == NULL) {
thinnfa_debug_err(("%s: Failed to allocate a state block size %lu for the state %u\n", rname,
KISS_NFA_MAX_STATES_BLOCK_SIZE, state_id));
return NULL;
}
nfa_comp->state_blocks[block_index] = block;
}
// Initialize the state
state = &block[index_in_block];
state->state_id = state_id;
state->flags = flags;
state->ids = NULL;
state->bfs_q = NULL;
state->child = NULL;
state->num_trans = 0;
state->fail_state = NULL;
state->sibling = NULL;
state->tran_char = '\0'; // Will be modified, except for the root
state->pattern_text = pattern_text;
state->depth = depth;
state->comp = nfa_comp;
state->bnfa_offset = KISS_BNFA_OFFSET_INVALID;
state->bnfa_incoming_off = KISS_BNFA_OFFSET_INVALID;
// Do we want a full state? kiss_thin_nfa_full_tiers_num=2 means tiers 0 and 1, i.e. the root plus one, are full.
if (state->flags & THIN_NFA_STATE_ROOT) {
// The root must be full, because it has no fail state.
// The anchored root (if exists) is the first state, and must be full, for the bnfa_full_state_size
// condition to work.
make_state_full(state);
} else if (depth < nfa_comp->full_state_tier_num && !(state->flags & THIN_NFA_STATE_ANCHORED)) {
make_state_full(state);
}
// Advance the counter
nfa_comp->state_num++;
return state;
}
// Release all resources on a state structure.
// Doesn't release the states, because it's part of a state block.
static void
kiss_thin_nfa_state_free(kiss_thin_nfa_state_t *state)
{
// Clean up the pattern list
if (state->ids) {
kiss_thin_nfa_free_pattern_ids(state->ids);
state->ids = NULL;
}
return;
}
// Returns the following state, by ID order.
// With prev==NULL, returns the first state.
// With prev!=NULL, returns the next.
// If prev is the last state, returns NULL.
static kiss_thin_nfa_state_t *
kiss_thin_nfa_get_subsequent_state(struct thin_nfa_comp_s *nfa_comp, kiss_thin_nfa_state_t *prev)
{
static const char rname[] = "kiss_thin_nfa_get_subsequent_state";
u_int state_id;
// Find the next state's ID
state_id = prev ? prev->state_id + 1 : 0;
if (state_id >= nfa_comp->state_num) {
// prev was the last state.
return NULL;
}
// Get the state pointer
return kiss_thin_nfa_get_state_by_id(nfa_comp, state_id, rname);
}
// Find the transition for a given character from a given state.
// If no transition found, returns NULL and does not check the fail state.
static kiss_thin_nfa_state_t *
kiss_thin_nfa_comp_get_next_state(kiss_thin_nfa_state_t *state, u_char ch)
{
static const char rname[] = "kiss_thin_nfa_comp_get_next_state";
kiss_thin_nfa_state_t *child;
verify_state(state->comp, state);
// Find the child in the list
for (child = state->child; child != NULL; child = child->sibling) {
u_char tran_ch = child->tran_char;
if (tran_ch == ch) {
thinnfa_debug_extended((
"%s: Found transition from the state %s by 0x%02x to %s\n",
rname,
state_name(state),
ch,
state_name(child)
));
return child;
}
// The list is sorted, so we don't need to look beyond the character.
if (tran_ch > ch) break;
}
thinnfa_debug_extended(("%s: No transition from the state %s by 0x%02x\n", rname, state_name(state), ch));
return NULL;
}
// Mark a state as finite and accepting a given kiss_thin_nfa_pattern_t pattern
static kiss_ret_val
kiss_thin_nfa_state_set_match(kiss_thin_nfa_state_t *state, const kiss_thin_nfa_pattern_t *pat_info)
{
static const char rname[] = "kiss_thin_nfa_state_set_match";
verify_state(state->comp, state);
// Add the pattern to this state's pattern list
if (kiss_thin_nfa_add_pattern_id(&(state->ids), pat_info) != KISS_OK) {
thinnfa_debug_err((
"%s: Could not add the 'pattern_id' %d to the final state %s\n",
rname,
pat_info->id,
state_name(state)
));
return KISS_ERROR;
}
thinnfa_debug((
"Setting state %s as the matching state for the 'pattern_id' %d\n",
state_name(state),
pat_info->id
));
make_state_matching(state);
return KISS_OK;
}
// Mark a state as finite, and accepting a given kiss_pmglob_string pattern
static kiss_ret_val
kiss_thin_nfa_state_set_match_pattern(kiss_thin_nfa_state_t *state, const kiss_pmglob_string_s *pattern)
{
kiss_thin_nfa_pattern_t pat_info;
pat_info.id = kiss_pmglob_string_get_id(pattern);
pat_info.pattern_id_flags = kiss_pmglob_string_get_flags(pattern);
pat_info.len = kiss_pmglob_string_get_size(pattern);
return kiss_thin_nfa_state_set_match(state, &pat_info);
}
// Copy the list of accepted patterns from one state to another.
// The destination state can already have patterns, and the lists would be concatenated.
static kiss_ret_val
kiss_thin_nfa_state_copy_match_ids(kiss_thin_nfa_state_t *dst, kiss_thin_nfa_state_t *src)
{
static const char rname[] = "kiss_thin_nfa_state_copy_match_ids";
kiss_thin_nfa_pattern_list_t *curr_id;
verify_state(src->comp, src);
verify_state(dst->comp, dst);
thinnfa_debug(("%s: Copying the match IDs from %s to %s\n", rname, state_name(src), state_name(dst)));
// traversing on the state_src 'ids' adding each one to 'state_dst' list
for(curr_id = src->ids; curr_id; curr_id = curr_id->next) {
if (kiss_thin_nfa_state_set_match(dst, &curr_id->pattern) != KISS_OK) {
thinnfa_debug_err((
"%s: Failed to set the ID %d on the state %s\n",
rname,
curr_id->pattern.id,
state_name(dst)
));
// NOTE: We don't release the IDs we have added. Compilation will fail and clean up anyway.
return KISS_ERROR;
}
}
return KISS_OK;
}
// Destroy the NFA we're compiling
static void
kiss_thin_nfa_comp_destroy(struct thin_nfa_comp_s *nfa_comp)
{
static const char rname[] = "kiss_thin_nfa_comp_destroy";
u_int i;
kiss_thin_nfa_state_t *state;
thinnfa_debug_major(("%s: Destroying the compilation information structure\n", rname));
// Cleanup whatever data we have on the states.
for (state = nfa_comp->root_state; state != NULL; state = kiss_thin_nfa_get_subsequent_state(nfa_comp, state)) {
kiss_thin_nfa_state_free(state);
}
// Free the state blocks and transition blocks
for (i = 0; i < MAX_THIN_NFA_STATES_BLOCKS; i++) {
if (nfa_comp->state_blocks[i] != NULL) {
fw_kfree(nfa_comp->state_blocks[i], KISS_NFA_MAX_STATES_BLOCK_SIZE, rname);
nfa_comp->state_blocks[i] = NULL;
}
}
if (nfa_comp->xlation_tab!= NULL) {
fw_kfree(nfa_comp->xlation_tab, sizeof(*(nfa_comp->xlation_tab)), rname);
nfa_comp->xlation_tab= NULL;
}
nfa_comp->runtime_nfa.reset(nullptr);
if (nfa_comp->patterns_hash) {
kiss_hash_destroy(nfa_comp->patterns_hash);
nfa_comp->patterns_hash = NULL;
}
fw_kfree(nfa_comp, sizeof(*nfa_comp), rname);
}
// Allocate an empty thin NFA compilation data structure.
static struct thin_nfa_comp_s *
kiss_thin_nfa_comp_create(KissPMError *error)
{
static const char rname[] = "kiss_thin_nfa_comp_create";
struct thin_nfa_comp_s *nfa_comp = NULL;
thinnfa_debug_major(("%s: Allocating the compilation information structure\n", rname));
// Allocate and initialize the compilation temporary structure
nfa_comp = (struct thin_nfa_comp_s *)fw_kmalloc(sizeof(*nfa_comp), rname);
if (!nfa_comp) {
thinnfa_debug_err(("%s: Failed to allocate 'nfa_comp'\n", rname));
goto failure;
}
bzero((void *)nfa_comp, sizeof(*nfa_comp));
nfa_comp->error = error;
// Build the root state
nfa_comp->root_state = kiss_thin_nfa_state_create(nfa_comp, 0, NULL, THIN_NFA_STATE_ROOT);
if (nfa_comp->root_state == NULL) {
thinnfa_debug_err(("%s: Failed to create the root state\n", rname));
goto failure;
}
return nfa_comp;
failure:
if (nfa_comp != NULL) {
kiss_thin_nfa_comp_destroy(nfa_comp);
}
return NULL;
}
// Specify the error for failed Thin NFA compilation
static void
kiss_thin_nfa_set_comp_error(struct thin_nfa_comp_s *nfa_comp, const char *err_text)
{
// We always use "internal", which is appropriate for both logical errors and resource shortage.
// We don't specify a pattern, because nothing is really pattern specific.
kiss_pm_error_set_details(nfa_comp->error, KISS_PM_ERROR_INTERNAL, err_text);
}
// Initialize a translation table for caseless/digitless comparison.
// According to compilation flags, builds a table to translate each character.
static kiss_ret_val
kiss_thin_nfa_create_xlation_tab(struct thin_nfa_comp_s *nfa_comp, int pm_comp_flags)
{
static const char rname[] = "kiss_thin_nfa_create_xlation_tab";
enum kiss_pmglob_char_xlation_flags_e xlation_flags;
// Figure out which translations we need
xlation_flags = KISS_PMGLOB_CHAR_XLATION_NONE;
if (pm_comp_flags & KISS_PM_COMP_CASELESS) {
ENUM_SET_FLAG(xlation_flags, KISS_PMGLOB_CHAR_XLATION_CASE);
}
if (pm_comp_flags & KISS_PM_COMP_DIGITLESS) {
ENUM_SET_FLAG(xlation_flags, KISS_PMGLOB_CHAR_XLATION_DIGITS);
}
if (xlation_flags == KISS_PMGLOB_CHAR_XLATION_NONE) {
// No translation needed
nfa_comp->xlation_tab = NULL;
return KISS_OK;
}
thinnfa_debug_major(("%s: Using%s%s translation table\n", rname,
(xlation_flags&KISS_PMGLOB_CHAR_XLATION_CASE) ? " caseless" : "",
(xlation_flags&KISS_PMGLOB_CHAR_XLATION_DIGITS) ? " digitless" : ""));
// Allocate a translation table
nfa_comp->xlation_tab = (struct thin_nfa_char_trans_tab_s *)fw_kmalloc(sizeof(*(nfa_comp->xlation_tab)), rname);
if (!nfa_comp->xlation_tab) {
thinnfa_debug_err(("%s: Failed to allocate the translation table\n", rname));
return KISS_ERROR;
}
// Build the mapping - normal and reverse
kiss_pmglob_char_xlation_build(xlation_flags, nfa_comp->xlation_tab->tab);
kiss_pmglob_char_xlation_build_reverse(nfa_comp->xlation_tab->tab, nfa_comp->xlation_tab->rev);
return KISS_OK;
}
// Translate a character to canonic form, if a translation table is defined.
static CP_INLINE u_char
kiss_thin_nfa_xlate_char(struct thin_nfa_comp_s *nfa_comp, u_char ch)
{
if (!nfa_comp->xlation_tab) return ch;
return nfa_comp->xlation_tab->tab[ch];
}
#if defined(KISS_THIN_NFA_DO_VERIFICATIONS) && !defined(KERNEL)
// DEBUG FUNCTION - uses a simple&slow algorithm to verify the result of kiss_thin_nfa_are_trans_contained.
// Can't run in the kernel because of the large stack consumption.
static void
verify_trans_contains_(
kiss_thin_nfa_state_t *state_contains,
kiss_thin_nfa_state_t *state_included,
BOOL should_contain
)
{
kiss_thin_nfa_state_t *trans_contains[KISS_PM_ALPHABET_SIZE];
kiss_thin_nfa_state_t *trans_included[KISS_PM_ALPHABET_SIZE];
kiss_thin_nfa_state_t *child;
u_int i;
int mismatch_pos;
// Fill in both transition tables
bzero(trans_contains, sizeof(trans_contains));
for (child = state_contains->child; child != NULL; child = child->sibling) {
trans_contains[child->tran_char] = child;
}
bzero(trans_included, sizeof(trans_included));
for (child = state_included->child; child != NULL; child = child->sibling) {
trans_included[child->tran_char] = child;
}
// Go over the table, looking for a character that's in "included" but not in "contains".
mismatch_pos = -1;
for (i=0; i<KISS_PM_ALPHABET_SIZE; i++) {
if (trans_included[i] != NULL && trans_contains[i] == NULL) {
mismatch_pos = i;
break;
}
}
if (mismatch_pos < 0) {
// No mismatch - really contains
KISS_ASSERT(
should_contain,
("State %s contains %s, but the kiss_thin_nfa_are_trans_contained says it does not",
state_name(state_contains),
state_name(state_contains))
);
} else {
// Mismatch - doesn't contain
KISS_ASSERT(
!should_contain,
("State %s does not contain %s (%02x -> %s), but the kiss_thin_nfa_are_trans_contained says it does",
state_name(state_contains),
state_name(state_included),
(u_char)mismatch_pos,
state_name(trans_included[i]))
);
}
}
#define verify_trans_contains(state_contains, state_included, expected) \
verify_trans_contains_(state_contains, state_included, expected)
#else // KISS_THIN_NFA_DO_VERIFICATIONS
#define verify_trans_contains(state_contains, state_included, expected)
#endif // KISS_THIN_NFA_DO_VERIFICATIONS
// Do all transactions of "included" also exist in "contains"?
static BOOL
kiss_thin_nfa_are_trans_contained(kiss_thin_nfa_state_t *state_contains, kiss_thin_nfa_state_t *state_included)
{
kiss_thin_nfa_state_t *included_child, *contains_child;
verify_state(state_contains->comp, state_contains);
verify_state(state_included->comp, state_included);
if (state_contains->num_trans < state_included->num_trans) {
// "contains" has fewer states - it can't include all "included"
verify_trans_contains(state_contains, state_included, FALSE);
return FALSE;
}
// Advance both included_child and contains_child, to iterate both transition tables.
// Keep them in sync - included_child passes children one by one, and contains_child is advanced
// to the same transition character at each step.
contains_child = state_contains->child;
// Go over the transitions in "included", see if they're in "contained"
for (included_child = state_included->child; included_child != NULL; included_child = included_child->sibling) {
// Advance "tran_contains" until we reach the character we want
for (; contains_child != NULL; contains_child = contains_child->sibling) {
if (contains_child->tran_char >= included_child->tran_char) break;
}
// Do we have this character in "contains"?
if (contains_child == NULL || contains_child->tran_char != included_child->tran_char) {
// This character doesn't exist in state_contains
verify_trans_contains(state_contains, state_included, FALSE);
return FALSE;
}
}
verify_trans_contains(state_contains, state_included, TRUE);
return TRUE;
}
// Get the root state, or the anchored root state, as appropriate for the pattern.
static kiss_thin_nfa_state_t *
kiss_thin_nfa_get_root_state(struct thin_nfa_comp_s *nfa_comp, int anchored)
{
static const char rname[] = "kiss_thin_nfa_get_root_state";
if (!anchored || !(nfa_comp->flags & THIN_NFA_ENABLE_ANCHOR_OPT)) {
thinnfa_debug(("%s: Using normal root: %s, feature %s\n", rname,
anchored?"anchored":"not anchored", (nfa_comp->flags & THIN_NFA_ENABLE_ANCHOR_OPT)?"enabled":"disabled"));
return nfa_comp->root_state;
}
if (!nfa_comp->anchored_root_state) {
// Lazy creation of the anchored root state
nfa_thin_state_flags_t flags = THIN_NFA_STATE_ROOT;
ENUM_SET_FLAG(flags, THIN_NFA_STATE_ANCHORED);
thinnfa_debug(("%s: Creating a new anchored root\n", rname));
nfa_comp->anchored_root_state = kiss_thin_nfa_state_create(nfa_comp, 0, NULL, flags);
if (nfa_comp->anchored_root_state == NULL) {
thinnfa_debug_err(("%s: Failed to create the anchored root state\n", rname));
return NULL;
}
}
thinnfa_debug(("%s: Returning the anchored root (%d)\n", rname, nfa_comp->anchored_root_state->state_id));
return nfa_comp->anchored_root_state;
}
// Find the state in the trie, which represents the longest prefix of a given string.
static kiss_thin_nfa_state_t *
kiss_thin_nfa_find_longest_prefix(struct thin_nfa_comp_s *nfa_comp, const u_char *text, u_int len, int anchored)
{
u_int offset;
kiss_thin_nfa_state_t *state;
// Following the path labeled by chars in 'pattern' (skip the states which already exist)
state = kiss_thin_nfa_get_root_state(nfa_comp, anchored);
if (!state) return NULL;
for (offset = 0; offset < len; offset++) {
kiss_thin_nfa_state_t *next_state;
u_char ch = kiss_thin_nfa_xlate_char(nfa_comp, text[offset]);
verify_state(nfa_comp, state);
// Do we have a node for the next character?
next_state = kiss_thin_nfa_comp_get_next_state(state, ch);
if (next_state == NULL) {
// No next state - this is as far as we go
break;
} else {
state = next_state;
}
}
return state;
}
// Add a newly allocated state to the trie. Keep the transition list sorted.
static void
kiss_thin_nfa_add_transition(kiss_thin_nfa_state_t *parent, u_char tran_char, kiss_thin_nfa_state_t *new_child)
{
static const char rname[] = "kiss_thin_nfa_add_transition";
kiss_thin_nfa_state_t **child_p;
// Go over existing children and find the place to add the transition
for (child_p = &parent->child; *child_p != NULL; child_p = &(*child_p)->sibling) {
kiss_thin_nfa_state_t *child = *child_p;
if (child->tran_char > tran_char) {
// Add before this one
break;
}
}
// Add the transition
new_child->sibling = *child_p;
*child_p = new_child;
new_child->tran_char = tran_char;
parent->num_trans++;
thinnfa_debug_extended(("%s: Added transition from %s by 0x%2x to %s\n", rname,
state_name(parent), tran_char, state_name(new_child)));
if (parent->num_trans > MIN(kiss_thin_nfa_max_partial_trans, KISS_BNFA_MAX_TRANS_NUM)) {
thinnfa_debug((
"%s: State %s has %d transitions - making it full\n",
rname,
state_name(parent),
parent->num_trans
));
make_state_full(parent);
}
// Track states which represent a maximal sequence of identical characters
if ((parent->flags & THIN_NFA_STATE_ROOT) && !(parent->flags & THIN_NFA_STATE_ANCHORED)) {
// Single character - all characters are identical
ENUM_SET_FLAG(new_child->flags, THIN_NFA_STATE_MAX_IDENTICAL_CHAR);
} else if ((parent->flags & THIN_NFA_STATE_MAX_IDENTICAL_CHAR) && (parent->tran_char == tran_char)) {
// The child, not the parent, is now the longest
ENUM_UNSET_FLAG(parent->flags, THIN_NFA_STATE_MAX_IDENTICAL_CHAR);
ENUM_SET_FLAG(new_child->flags, THIN_NFA_STATE_MAX_IDENTICAL_CHAR);
}
}
// Add a pattern to the trie, which would generate the Thin NFA.
// Upon failure, doesn't clean up states it may have created. Will be cleaned up when destroying nfa_comp.
static kiss_ret_val
kiss_thin_nfa_add_pattern_to_trie(struct thin_nfa_comp_s *nfa_comp, const kiss_pmglob_string_s *sm_cur_pattern)
{
static const char rname[] = "kiss_thin_nfa_add_pattern_to_trie";
const u_char *pattern_text;
u_int pattern_len;
u_int i;
kiss_thin_nfa_state_t *current_state;
int anchored_pattern;
pattern_text = kiss_pmglob_string_get_pattern(sm_cur_pattern);
pattern_len = kiss_pmglob_string_get_size(sm_cur_pattern);
anchored_pattern = kiss_pmglob_string_get_flags(sm_cur_pattern) & KISS_PM_LSS_AT_BUF_START;
thinnfa_debug(("%s: Adding the pattern: %s flags=%x\n", rname, kiss_pmglob_string_to_debug_charp(sm_cur_pattern),
kiss_pmglob_string_get_flags(sm_cur_pattern)));
// How much of this pattern do we already have in the tree?
current_state = kiss_thin_nfa_find_longest_prefix(nfa_comp, pattern_text, pattern_len, anchored_pattern);
if (!current_state) return KISS_ERROR; // Messages printed inside
thinnfa_debug(("%s: State %s (flags %x) represents the longest prefix at the offset %d/%d\n", rname,
state_name(current_state), current_state->flags, current_state->depth, pattern_len));
// Go over the remaining bytes (if any) and add more states
for (i = current_state->depth; i < pattern_len; i++) {
kiss_thin_nfa_state_t *new_state;
u_char ch;
// Create a new state. Depth i+1, because the first character (i=0) is at depth 1.
new_state = kiss_thin_nfa_state_create(nfa_comp, i+1, pattern_text,
(nfa_thin_state_flags_t)(current_state->flags & THIN_NFA_STATE_ANCHORED));
if (!new_state) {
thinnfa_debug_err(("%s: Failed to allocate a new state\n", rname));
kiss_thin_nfa_set_comp_error(nfa_comp, "Failed to allocate a new state");
return KISS_ERROR;
}
// Add a transition into the new state
ch = kiss_thin_nfa_xlate_char(nfa_comp, pattern_text[i]);
kiss_thin_nfa_add_transition(current_state, ch, new_state);
thinnfa_debug(("%s: Added new state+transition %s -> %s by 0x%02x offset %d\n", rname,
state_name(current_state), state_name(new_state), ch, i));
verify_state(nfa_comp, current_state);
// Add the following states after this one
current_state = new_state;
}
// Set state as finite and add the pattern ID to the list of patterns which this state accepts.
// Note: It's OK if the state isn't one we just added. E.g. the new pattern is a prefix of an existing one.
if (kiss_thin_nfa_state_set_match_pattern(current_state, sm_cur_pattern) != KISS_OK) {
thinnfa_debug_err((
"%s: Failed to save the pattern information for the state %s\n",
rname,
state_name(current_state)
));
kiss_thin_nfa_set_comp_error(nfa_comp, "Failed to save the pattern information for the state");
return KISS_ERROR;
}
return KISS_OK;
}
// Find the transition from a state by a character, considering fail states.
// The state should alrady have its fail state calculated.
//
// Note: kiss_bnfa_build_full_trans_table may pass from_state=NULL. The result is returning the root, which is OK.
static kiss_thin_nfa_state_t *
kiss_thin_nfa_calc_transition(struct thin_nfa_comp_s *nfa_comp, kiss_thin_nfa_state_t *from_state, u_char tran_char)
{
static const char rname[] = "kiss_thin_nfa_calc_transition";
kiss_thin_nfa_state_t *state;
// Go down the fail state chain, until we find a transition.
for (state = from_state; state != NULL; state = state->fail_state) {
kiss_thin_nfa_state_t *next_state;
// Look up in this state's transition table
next_state = kiss_thin_nfa_comp_get_next_state(state, tran_char);
if (next_state != NULL) {
if (state == from_state) {
thinnfa_debug_extended(("%s: Found transition from %s by 0x%02x to %s\n", rname,
state_name(from_state), tran_char, state_name(next_state)));
} else {
thinnfa_debug_extended((
"%s: Found transition from %s by 0x%02x to %s using the fail state %s\n",
rname,
state_name(from_state),
tran_char,
state_name(next_state),
state_name(state)
));
}
return next_state;
}
}
// We've gone down to the root, and found nothing - so the next state is the root.
thinnfa_debug_extended(("%s: No transition from %s by 0x%02x - going to root\n", rname,
state_name(from_state), tran_char));
return nfa_comp->root_state;
}
// A callback function prototype for kiss_thin_nfa_iterate_trans
typedef kiss_ret_val (*kiss_thin_nfa_iterate_trans_cb)(kiss_thin_nfa_state_t *from_state,
u_char tran_char, kiss_thin_nfa_state_t *to_state);
// Iterate all the transitions in the trie, in BFS order.
// Note: The callback will be called once per transition, i.e. once per state, except for the initial state.
static kiss_ret_val
kiss_thin_nfa_iterate_trans_bfs(struct thin_nfa_comp_s *nfa_comp, kiss_thin_nfa_iterate_trans_cb iter_cb)
{
static const char rname[] = "kiss_thin_nfa_iterate_trans_bfs";
kiss_thin_nfa_state_t *bfs_q_head, *bfs_q_tail;
thinnfa_debug(("%s: Starting BFS iteration, %d states\n", rname, nfa_comp->state_num));
// This queue contains states, whose children we want to iterate.
// We start with the root state followed by the anchored root state.
bfs_q_head = nfa_comp->root_state;
bfs_q_head->bfs_q = NULL;
bfs_q_tail = bfs_q_head;
if (nfa_comp->anchored_root_state) {
bfs_q_tail->bfs_q = nfa_comp->anchored_root_state;
nfa_comp->anchored_root_state->bfs_q = NULL;
bfs_q_tail = nfa_comp->anchored_root_state;
}
// Dequeue each of the states, call the iterator for each transition and enqueue the children
while (bfs_q_head != NULL) {
kiss_thin_nfa_state_t *from_state;
kiss_thin_nfa_state_t *to_state;
// Dequeue a state from the head
from_state = bfs_q_head;
bfs_q_head = from_state->bfs_q;
if (bfs_q_head == NULL) bfs_q_tail = NULL;
thinnfa_debug_extended((
"%s: Got the state %s with %d children\n",
rname,
state_name(from_state),
from_state->num_trans
));
// Go over the state's transitions
for (to_state = from_state->child; to_state != NULL; to_state = to_state->sibling) {
thinnfa_debug_extended((
"%s: Got the child state %s at the depth %d\n",
rname,
state_name(to_state),
to_state->depth
));
// Call the iterator function
if (iter_cb(from_state, to_state->tran_char, to_state) != KISS_OK) {
return KISS_ERROR;
}
// No need to enqueue states with no children
if (to_state->num_trans == 0) continue;
// Enqueue the next state, so we'd iterate its transitions too
to_state->bfs_q = NULL;
if (bfs_q_tail != NULL) {
bfs_q_tail->bfs_q = to_state;
} else {
bfs_q_head = to_state;
}
bfs_q_tail = to_state;
}
}
return KISS_OK;
}
// Set a state's fail state.
// To calculate this, we need the state's parent, and the character that takes us from the parent to the current.
// The parent's fail state must be calculated already.
static kiss_ret_val
kiss_thin_nfa_set_fail_state(kiss_thin_nfa_state_t *parent, u_char tran_char, kiss_thin_nfa_state_t *state)
{
static const char rname[] = "kiss_thin_nfa_set_fail_state";
kiss_thin_nfa_state_t *fail_state;
// Calculate the fail state.
// The same character that takes us from parent to state would take us from parent->fail_state to state->fail_state
fail_state = kiss_thin_nfa_calc_transition(state->comp, parent->fail_state, tran_char);
state->fail_state = fail_state;
thinnfa_debug(("%s: The fail state of %s is %s (parent %s, parent->fail_state %s, char %02x)\n", rname,
state_name(state), state_name(fail_state), state_name(parent),
state_name(parent->fail_state), tran_char));
// If a state's fail state is finite, so is the state itself.
// This is because the fail state represents a suffix of the state, which is included in
// the suffix the state represents. If the shorter suffix is a match, so is the longer one.
// Example - The fail state of "abc" is "bc" (if it exists). If "bc" is a match, then so is "abc".
if (fail_state->flags & THIN_NFA_STATE_MATCH) {
thinnfa_debug(("%s: Fail state %s is finite - so is %s\n", rname,
state_name(fail_state), state_name(state)));
if (kiss_thin_nfa_state_copy_match_ids(state, fail_state)) {
thinnfa_debug_err((
"%s: Failed to copy the pattern IDs from %s to %s\n",
rname,
state_name(fail_state),
state_name(state)
));
kiss_thin_nfa_set_comp_error(state->comp, "Failed to copy the pattern IDs");
return KISS_ERROR;
}
}
// This isn't related to calculating fail states. It should be done after the trie was built, but before
// starting BNFA construction.
if (kiss_thin_nfa_optimize_contig_chars && (state->flags & THIN_NFA_STATE_MAX_IDENTICAL_CHAR)) {
// Optimization for identical character sequences. States which represent a maximal sequence of the same
// characters will be full. So for a long sequence of a single character, we'll always be in a full state.
// Great for the performance lab.
thinnfa_debug((
"%s: State %s is a maximal identical character sequence - making it full\n",
rname,
state_name(state)
));
make_state_full(state);
}
return KISS_OK;
}
// See if we can find a better fail state for a state.
// If the fail state contains only transitions the original state has anyway, we can use its fail state instead.
static kiss_thin_nfa_state_t *
kiss_thin_nfa_find_better_fail_state(kiss_thin_nfa_state_t *state)
{
kiss_thin_nfa_state_t *fail_state;
if (!state->fail_state) return NULL;
// Go down the fail state chain.
// Keep going as long as the states contain only transitions the current state has anyway.
for (fail_state = state->fail_state; fail_state->fail_state != NULL; fail_state = fail_state->fail_state) {
verify_state(state->comp, fail_state);
if (fail_state->flags & THIN_NFA_STATE_FULL) {
// Full state - failing to it will always give us the answer.
break;
}
if (!kiss_thin_nfa_are_trans_contained(state, fail_state)) {
// This state has transitions that the current state doesn't - we must fail to it,
// not lower.
break;
}
}
return fail_state;
}
// Change fail states to go faster up the tree, if possible.
// Normally, a fail state points one level upward. But sometimes it can be more upward.
//
// Note: This must be done after kiss_thin_nfa_set_fail_state was called for all states. This is because
// kiss_thin_nfa_set_fail_state uses the parent's fail state to calculate the child's. If the parent's fail stae
// was "reduced", we'll get the wrong fail state for the child.
static void
kiss_thin_nfa_reduce_fail_states(struct thin_nfa_comp_s *nfa_comp)
{
static const char rname[] = "kiss_thin_nfa_reduce_fail_states";
kiss_thin_nfa_state_t *state;
for (state = nfa_comp->root_state; state != NULL; state = kiss_thin_nfa_get_subsequent_state(nfa_comp, state)) {
kiss_thin_nfa_state_t *fail_state;
if (state->flags & THIN_NFA_STATE_FULL) {
// A full state's fail state isn't interesting
continue;
}
fail_state = kiss_thin_nfa_find_better_fail_state(state);
if (fail_state != state->fail_state) {
// We have a better fail state
thinnfa_debug(("%s: Changing the fail state of %s from %s to %s\n", rname,
state_name(state), state_name(state->fail_state), state_name(fail_state)));
state->fail_state = fail_state;
}
}
}
// Calculate fail states for all states.
static kiss_ret_val
kiss_thin_nfa_calc_fail_states(struct thin_nfa_comp_s *nfa_comp)
{
static const char rname[] = "kiss_thin_nfa_calc_fail_states";
// The root state has no fail state
nfa_comp->root_state->fail_state = NULL;
if (nfa_comp->anchored_root_state) {
// The anchored root fails to the root
nfa_comp->anchored_root_state->fail_state = nfa_comp->root_state;
}
thinnfa_debug(("%s: Calculating the fail states for all states\n", rname));
// Iterate all transitions, and calculate fail states for the target states.
// This would cover all states, except the initial (whose fail state was already set).
// BFS order assures that a parent's fail state is already calculated when we reach the child.
if (kiss_thin_nfa_iterate_trans_bfs(nfa_comp, kiss_thin_nfa_set_fail_state) != KISS_OK) {
thinnfa_debug_err(("%s: Failed to calculate the fail states\n", rname));
return KISS_ERROR;
}
// All states now have their fail states calculated
ENUM_SET_FLAG(nfa_comp->flags, THIN_NFA_FAIL_STATES_CALCULATED);
// Optimization - reduce fail states
kiss_thin_nfa_reduce_fail_states(nfa_comp);
return KISS_OK;
}
// Set a state's BNFA offset to the size so far, and increment by the state size.
static void
set_state_offset(kiss_thin_nfa_state_t *state, kiss_bnfa_offset_t *cur_offset)
{
static const char rname[] = "set_state_offset";
u_int state_size=0, match_size=0;
verify_state(state->comp, state);
if (state->bnfa_offset == KISS_BNFA_OFFSET_INVALID) {
// Room for the actual state - negative offset for full states, positive for partial.
if ((state->flags & THIN_NFA_STATE_FULL) && (*cur_offset<0)) {
state_size = sizeof(kiss_bnfa_full_state_t);
} else if (!(state->flags & THIN_NFA_STATE_FULL) && (*cur_offset>=0)) {
state_size = kiss_bnfa_partial_state_size(state->num_trans);
}
}
if (state->bnfa_incoming_off == KISS_BNFA_OFFSET_INVALID) {
// Room for a match state - if needed, must be a positive offset.
if ((state->flags & THIN_NFA_STATE_MATCH) && (*cur_offset >= 0)) {
match_size = sizeof(kiss_bnfa_match_state_t);
if (state->flags & THIN_NFA_STATE_FULL) {
// Need a jump state too
match_size += kiss_bnfa_partial_state_size(0);
}
}
}
// Update the state offsets
if (match_size > 0) {
thinnfa_debug_extended(("%s: State %s was given a match offset %d size %d", rname, state_name(state),
*cur_offset, match_size));
state->bnfa_incoming_off = *cur_offset;
*cur_offset += match_size;
}
if (state_size > 0) {
thinnfa_debug_extended(("%s: State %s was given a real offset %d size %d", rname, state_name(state),
*cur_offset, state_size));
state->bnfa_offset = *cur_offset;
*cur_offset += state_size;
if (!(state->flags & THIN_NFA_STATE_MATCH)) {
// Incoming transitions go directly to the state
state->bnfa_incoming_off = state->bnfa_offset;
}
}
}
// Check if compressed offset fits full state offset size
static BOOL
comp_offset_fits_short(kiss_bnfa_comp_offset_t comp_offset)
{
if ((comp_offset) != (kiss_bnfa_short_offset_t)(comp_offset)) {
return FALSE;
}
return TRUE;
}
// Mark all child of a given state as reacheable as reachable from full state
static void
kiss_bnfa_mark_childs_reach_from_full(kiss_thin_nfa_state_t *state)
{
kiss_thin_nfa_state_t *child;
for (child = state->child; child != NULL; child = child->sibling) {
ENUM_SET_FLAG(child->flags, THIN_NFA_STATE_REACH_FROM_FULL);
}
}
// Mark all states that are reachable from a given full state,
// in order to place them at lower offsets to avoid possible overflow due to offset compression.
// If a state`s fail state is of partial type, mark it`s children too
static void
kiss_bnfa_mark_reachable_from_full(kiss_thin_nfa_state_t *state) {
kiss_bnfa_mark_childs_reach_from_full(state);
for (state = state->fail_state; state && !(state->flags & THIN_NFA_STATE_FULL); state = state->fail_state) {
kiss_bnfa_mark_childs_reach_from_full(state);
}
}
// Calcultate the offset of each BNFA state, and the entire BNFA size.
// Sets nfa_comp->offset_list to an array, holding the BNFA offset for each state at [state_id].
// Sets *bnfa_size_p to the total BNFA size.
static kiss_ret_val
kiss_bnfa_calc_offsets(struct thin_nfa_comp_s *nfa_comp)
{
static const char rname[] = "kiss_bnfa_calc_offsets";
kiss_thin_nfa_state_t *state;
kiss_bnfa_offset_t cur_offset;
// Full states have negative offsets. So the first state's offset depends on the number of full states.
cur_offset = -(kiss_bnfa_offset_t)(nfa_comp->full_state_num * sizeof(kiss_bnfa_full_state_t));
nfa_comp->min_bnfa_off = cur_offset;
// Put the anchored root state first, because it's the initial state
if (nfa_comp->anchored_root_state) {
KISS_ASSERT(nfa_comp->anchored_root_state->flags & THIN_NFA_STATE_FULL,
"%s: The anchored root %s must be a full state\n", rname, state_name(nfa_comp->anchored_root_state));
set_state_offset(nfa_comp->anchored_root_state, &cur_offset);
}
// If there's no anchored root, then root must be initial. If there is, validation expects it second.
set_state_offset(nfa_comp->root_state, &cur_offset);
// in this loop we add only the full states, which have negative offsets
for (state = nfa_comp->root_state; state != NULL; state = kiss_thin_nfa_get_subsequent_state(nfa_comp, state)) {
if (state->flags & THIN_NFA_STATE_FULL) {
kiss_bnfa_mark_reachable_from_full(state); // Mark child states so they'll get low offsets
set_state_offset(state, &cur_offset);
}
}
// We added all full states and moving to partials - we must be at offset 0.
KISS_ASSERT(cur_offset==0,
"%s: Offset %d != 0 after adding %d full states\n", rname, cur_offset, nfa_comp->full_state_num);
// in this loop we add states that are reachable from full states. We want them at low offsets to avoid
// possible overflow due to offset compression
for (state = nfa_comp->root_state; state != NULL; state = kiss_thin_nfa_get_subsequent_state(nfa_comp, state)) {
if (state->flags & THIN_NFA_STATE_REACH_FROM_FULL){
set_state_offset(state, &cur_offset);
}
}
// Make sure we have not exceede the limit of offsets that can be compressed to 16bit
// Note: the test is a little too strict - we check the first state that is not reachable from a full state
// instead of the last state that is reachable
if (!comp_offset_fits_short(kiss_bnfa_offset_compress(cur_offset))) {
thinnfa_debug_err(("%s: Current offset is %d, not reachable from the full state\n", rname, cur_offset));
kiss_thin_nfa_set_comp_error(nfa_comp, "Exceeded the limit of reachable states");
return KISS_ERROR;
}
// in this loop we add the partial and mathing states, which weren't handled in the loop above.
for (state = nfa_comp->root_state; state != NULL; state = kiss_thin_nfa_get_subsequent_state(nfa_comp, state)) {
set_state_offset(state, &cur_offset);
}
// The current offset is the size of partial states. Add the full state size to get the total size.
nfa_comp->max_bnfa_off = cur_offset;
thinnfa_debug_major(("%s: BNFA size - %u full states, %u partial states, total %u bytes\n", rname,
nfa_comp->full_state_num,
nfa_comp->state_num-nfa_comp->full_state_num,
nfa_comp->max_bnfa_off - nfa_comp->min_bnfa_off));
return KISS_OK;
}
// Get a state's BNFA offset.
// skip_match makes a difference for matching states:
// TRUE - Get the actual state, where the transition table is.
// FALSE - Get the match state, where incoming transitions should go.
static kiss_bnfa_offset_t
state_bnfa_offset(kiss_thin_nfa_state_t *state, BOOL skip_match)
{
return skip_match ? state->bnfa_offset : state->bnfa_incoming_off;
}
// Convert a BNFA offset to a BNFA state pointer
static kiss_bnfa_state_t *
comp_bnfa_offset_to_state(struct thin_nfa_comp_s *nfa_comp, kiss_bnfa_offset_t bnfa_offset)
{
return kiss_bnfa_offset_to_state_write(nfa_comp->runtime_nfa->bnfa, bnfa_offset);
}
// Get a pointer to a state in the BNFA.
// skip_match makes a difference for matching states:
// TRUE - Get the actual state, where the transition table is.
// FALSE - Get the match state, where incoming transitions should go.
static kiss_bnfa_state_t *
comp_to_bnfa_state(kiss_thin_nfa_state_t *state, BOOL skip_match)
{
return comp_bnfa_offset_to_state(state->comp, state_bnfa_offset(state, skip_match));
}
// Move next to state_bnfa_offset. assert inside.
static kiss_bnfa_short_offset_t
state_bnfa_short_offset(kiss_thin_nfa_state_t *state)
{
static const char rname[] = "state_bnfa_short_offset";
kiss_bnfa_comp_offset_t comp_offset = kiss_bnfa_offset_compress(state_bnfa_offset(state, FALSE));
KISS_ASSERT(comp_offset_fits_short(comp_offset),
"%s: Compressed offset %d exceeds the allowed size\n", rname, comp_offset);
return (kiss_bnfa_short_offset_t) comp_offset;
}
// If character translation is enabled, duplicate ch's transition to all equivalents
static void
add_equivalent_transitions(struct thin_nfa_comp_s *nfa_comp, kiss_bnfa_full_state_t *bnfa_state, u_char ch)
{
static const char rname[] = "add_equivalent_transitions";
u_char other_ch;
u_int group_size;
if (!nfa_comp->xlation_tab) return;
// Go over all characters within the same group
group_size = 0;
for (other_ch = nfa_comp->xlation_tab->rev[ch]; other_ch != ch; other_ch = nfa_comp->xlation_tab->rev[other_ch]) {
thinnfa_debug_extended(("%s: Setting translated transition by %02x - same as %02x\n", rname, other_ch, ch));
bnfa_state->transitions[other_ch] = bnfa_state->transitions[ch];
// Prevent looping in case the table is corrupt
group_size++;
KISS_ASSERT_CRASH(group_size <= KISS_PM_ALPHABET_SIZE,
"%s: Too many characters to translate into %02x\n", rname, ch);
}
}
// Add a transition to a full transition table.
// If there's a translation table, add trnasitions for all equivalent characters.
static void
add_full_transition(
struct thin_nfa_comp_s *nfa_comp,
kiss_bnfa_full_state_t *bnfa_state,
kiss_thin_nfa_state_t *next_state
)
{
static const char rname[] = "add_full_transition";
u_char ch = next_state->tran_char;
thinnfa_debug_extended(("%s: Setting the transition by %02x to %s\n", rname,
next_state->tran_char, state_name(next_state)));
// Set the transition, for ch and equivalent characters
bnfa_state->transitions[ch] = state_bnfa_short_offset(next_state);
add_equivalent_transitions(nfa_comp, bnfa_state, ch);
}
#if !defined(KERNEL)
// A recursive algorithm to build full state tables.
// Much faster than the previous algorithm, but shouldn't be used in the kernel.
// Allow mutual recursion between these two functions:
static void build_full_trans_table(kiss_thin_nfa_state_t *comp_state);
static void get_full_trans_table(kiss_thin_nfa_state_t *target_state, kiss_thin_nfa_state_t *source_state);
// Get the transition table of source_state and write it in target_state's.
// source_state is somewhere in the fail state chain of target_state.
static void
get_full_trans_table(kiss_thin_nfa_state_t *target_state, kiss_thin_nfa_state_t *source_state)
{
kiss_thin_nfa_state_t *child;
kiss_bnfa_state_t *target_bnfa = comp_to_bnfa_state(target_state, TRUE);
if (source_state != target_state && (source_state->flags & THIN_NFA_STATE_FULL)) {
// We've reached a full state - just copy its transition table (build it first, if needed)
build_full_trans_table(source_state);
bcopy(comp_to_bnfa_state(source_state, TRUE)->full.transitions,
target_bnfa->full.transitions,
sizeof(target_bnfa->full.transitions));
return;
}
// Start with our fail state's state table
if (source_state->fail_state) {
get_full_trans_table(target_state, source_state->fail_state);
} else {
int i;
kiss_bnfa_short_offset_t root_bnfa_comp_offset = state_bnfa_short_offset(source_state);
// Reached the root - fill with transitions to root
for (i=0; i<KISS_PM_ALPHABET_SIZE; i++) {
target_bnfa->full.transitions[i] = root_bnfa_comp_offset;
}
}
// Override transitions which exist in this state
for (child = source_state->child; child != NULL; child = child->sibling) {
add_full_transition(target_state->comp, &target_bnfa->full, child);
}
}
// Recursive function for building a full state's state table.
// target_bnfa_state is the state who's table we're building.
// source_state changes when recursing over the tail state chain
static void
build_full_trans_table(kiss_thin_nfa_state_t *comp_state)
{
if (comp_state->flags & THIN_NFA_STATE_BUILT_TABLE) return;
get_full_trans_table(comp_state, comp_state);
ENUM_SET_FLAG(comp_state->flags, THIN_NFA_STATE_BUILT_TABLE);
}
#endif // KERNEL
static CP_INLINE kiss_ret_val
verify_add_state(kiss_thin_nfa_state_t *comp_state, kiss_bnfa_state_t *bnfa_state, u_int state_size,
const char *caller, const char *type)
{
const KissThinNFA *nfa_h = comp_state->comp->runtime_nfa.get();
kiss_bnfa_offset_t bnfa_offset = (char *)bnfa_state - (char *)(nfa_h->bnfa);
u_int state_alignment = (bnfa_offset < 0) ? sizeof(kiss_bnfa_full_state_t) : KISS_BNFA_STATE_ALIGNMENT;
if ((bnfa_offset < nfa_h->min_bnfa_offset) || (bnfa_offset+(int)state_size > nfa_h->max_bnfa_offset)) {
thinnfa_debug_err(("%s: Cannot add the %s state %s at the offset %d:%d - out of range %d:%d\n", caller, type,
state_name(comp_state),
bnfa_offset, bnfa_offset+state_size,
nfa_h->min_bnfa_offset, nfa_h->max_bnfa_offset));
return KISS_ERROR;
}
if ((bnfa_offset % state_alignment) != 0) {
thinnfa_debug_err((
"%s: Cannot add the %s state %s at the offset %d:%d - not aligned on %d bytes\n",
caller,
type,
state_name(comp_state),
bnfa_offset,
bnfa_offset + state_size,
state_alignment
));
return KISS_ERROR;
}
thinnfa_debug(("%s: Adding the %s state %s, offsets %d:%d\n", caller, type,
state_name(comp_state),
bnfa_offset, bnfa_offset+state_size));
return KISS_OK;
}
// Old, non-recursive and slow version on build_full_trans_table.
static void
build_full_trans_table_no_recursion(kiss_thin_nfa_state_t *comp_state)
{
static const char rname[] = "build_full_trans_table_no_recursion";
struct thin_nfa_comp_s *nfa_comp = comp_state->comp;
kiss_bnfa_state_t *bnfa_state = comp_to_bnfa_state(comp_state, TRUE);
kiss_thin_nfa_state_t *child;
u_int i;
// Go over all characters. Maintain a pointer to the next transition in the list.
// We rely on the list being sorted.
// We could simply call kiss_thin_nfa_calc_transition for each character. But it would look up again and
// again in the current state.
child = comp_state->child;
for (i = 0; i < KISS_PM_ALPHABET_SIZE; i++) {
u_char ch = (u_char)i;
kiss_thin_nfa_state_t *next_state;
// Check if it's a canonic character (e.g. lowercase when we're case insensitive)
if (kiss_thin_nfa_xlate_char(nfa_comp, ch) != ch) {
// We'll fill this in when we reach the canonic character.
continue;
}
if (child != NULL && child->tran_char == ch) {
// Use the explicit transition
next_state = child;
// Go forward in the transition table
child = child->sibling;
thinnfa_debug_extended(("%s: Setting the explicit transition by %02x to %s\n", rname,
ch, state_name(next_state)));
} else {
// Note: if comp_state is the initial, we pass from_state=NULL.
// This works as desired (returning the initial state).
next_state = kiss_thin_nfa_calc_transition(nfa_comp, comp_state->fail_state, ch);
thinnfa_debug_extended(("%s: Setting the fail-state transition by %02x to %s\n", rname,
ch, state_name(next_state)));
}
// Set the transition for this character and equivalents
bnfa_state->full.transitions[ch] = state_bnfa_short_offset(next_state);
add_equivalent_transitions(nfa_comp, &bnfa_state->full, ch);
}
ENUM_SET_FLAG(comp_state->flags, THIN_NFA_STATE_BUILT_TABLE);
}
// Build a full state's transition table in the BNFA.
// Either uses the explicit transition, or calculates using fail states.
static kiss_ret_val
kiss_bnfa_build_full_state(kiss_thin_nfa_state_t *comp_state)
{
static const char rname[] = "kiss_bnfa_build_full_state";
kiss_bnfa_state_t *bnfa_state = comp_to_bnfa_state(comp_state, TRUE);
if (verify_add_state(comp_state, bnfa_state, sizeof(kiss_bnfa_full_state_t), rname, "full") != KISS_OK) {
return KISS_ERROR;
}
#if !defined(KERNEL)
if (comp_state->comp->flags & THIN_NFA_USE_RECURSIVE_COMPILE) {
build_full_trans_table(comp_state);
return KISS_OK;
}
#endif // KERNEL
build_full_trans_table_no_recursion(comp_state);
return KISS_OK;
}
static void
kiss_bnfa_build_partial_state_header(
kiss_bnfa_partial_state_t *bnfa_state,
u_int trans_num,
kiss_bnfa_offset_t fail_offset
)
{
bnfa_state->type = KISS_BNFA_STATE_PARTIAL;
bnfa_state->trans_num = trans_num;
bnfa_state->fail_state_offset = kiss_bnfa_offset_compress(fail_offset);
}
// Build a partial state's transition table in the BNFA.
// Temporary encoding - sets the state ID instead of the BNFA offset (which is yet unknown).
static kiss_ret_val
kiss_bnfa_build_partial_state(kiss_thin_nfa_state_t *comp_state)
{
static const char rname[] = "kiss_bnfa_build_partial_state";
kiss_bnfa_state_t *bnfa_state = comp_to_bnfa_state(comp_state, TRUE);
kiss_thin_nfa_state_t *child;
u_int trans_num;
if (verify_add_state(
comp_state,
bnfa_state,
kiss_bnfa_partial_state_size(comp_state->num_trans),
rname,
"partial"
) != KISS_OK) {
return KISS_ERROR;
}
// Fill in the transition number and fail state
kiss_bnfa_build_partial_state_header(&bnfa_state->partial, comp_state->num_trans,
state_bnfa_offset(comp_state->fail_state, TRUE));
thinnfa_debug_extended(("%s: The fail state is %s\n", rname, state_name(comp_state->fail_state)));
// Build a transition for each existing character
trans_num = 0;
for (child = comp_state->child; child != NULL; child = child->sibling) {
thinnfa_debug_extended(("%s: Setting the transition by %02x to %s\n", rname,
child->tran_char, state_name(child)));
bnfa_state->partial.transitions[trans_num].tran_char = child->tran_char;
bnfa_state->partial.transitions[trans_num].next_state_offset =
kiss_bnfa_offset_compress(state_bnfa_offset(child, FALSE));
trans_num++;
}
KISS_ASSERT(trans_num == comp_state->num_trans, "%s: State %s should have %d transitions, but it has %d",
rname, state_name(comp_state), comp_state->num_trans, trans_num);
ENUM_SET_FLAG(comp_state->flags, THIN_NFA_STATE_BUILT_TABLE);
return KISS_OK;
}
// Build a match state.
static kiss_ret_val
kiss_bnfa_build_match_state(kiss_thin_nfa_state_t *comp_state, u_int match_id)
{
static const char rname[] = "kiss_bnfa_build_match_state";
kiss_bnfa_offset_t match_bnfa_offset = state_bnfa_offset(comp_state, FALSE);
kiss_bnfa_state_t *match_state = comp_bnfa_offset_to_state(comp_state->comp, match_bnfa_offset);
kiss_bnfa_offset_t following_state_offset, real_state_offset;
if (verify_add_state(comp_state, match_state, sizeof(kiss_bnfa_match_state_t), rname, "match") != KISS_OK) {
return KISS_ERROR;
}
// Fill in the match state
match_state->match.type = KISS_BNFA_STATE_MATCH;
match_state->match.unused = 0;
match_state->match.match_id = match_id;
// Add a jump state if the real state isn't directly following the match state (i.e. for full-matching states).
real_state_offset = state_bnfa_offset(comp_state, TRUE);
following_state_offset = match_bnfa_offset + sizeof(kiss_bnfa_match_state_t);
if (following_state_offset != real_state_offset) {
kiss_bnfa_state_t *jump_state = comp_bnfa_offset_to_state(comp_state->comp, following_state_offset);
// Add a jump state (a 0-transition partial state) to the real state
if (verify_add_state(comp_state, jump_state, kiss_bnfa_partial_state_size(0), rname, "jump") != KISS_OK) {
return KISS_ERROR;
}
kiss_bnfa_build_partial_state_header(&jump_state->partial, 0, real_state_offset);
}
return KISS_OK;
}
// Encode a state in binary NFA form.
static kiss_ret_val
kiss_bnfa_add_state(kiss_thin_nfa_state_t *comp_state, u_int offset_in_pat_match_array)
{
if (comp_state->flags & THIN_NFA_STATE_MATCH) {
// Build a match state (a jump state too if needed)
if (kiss_bnfa_build_match_state(comp_state, offset_in_pat_match_array) != KISS_OK) return KISS_ERROR;
}
// Add the state
if (comp_state->flags & THIN_NFA_STATE_FULL) {
if (kiss_bnfa_build_full_state(comp_state) != KISS_OK) return KISS_ERROR;
} else {
if (kiss_bnfa_build_partial_state(comp_state) != KISS_OK) return KISS_ERROR;
}
return KISS_OK;
}
static uintptr_t
pat_key_hash_func(const void *key, CP_MAYBE_UNUSED void *info)
{
const kiss_thin_nfa_pattern_array_t *pat_arr = (const kiss_thin_nfa_pattern_array_t *)key;
const char* buf = (const char *)key;
const char *buf_end;
uintptr_t val = 0;
buf_end = buf + kiss_thin_nfa_pattern_array_size(pat_arr->n_patterns);
for ( ; buf != buf_end; buf++) {
val = ((val >> 3) ^ (val<<5)) + *buf;
}
return val;
}
static int
pat_key_cmp_func(const void *key1, const void *key2, CP_MAYBE_UNUSED void *info)
{
const kiss_thin_nfa_pattern_array_t *pat1 = (const kiss_thin_nfa_pattern_array_t *)key1;
const kiss_thin_nfa_pattern_array_t *pat2 = (const kiss_thin_nfa_pattern_array_t *)key2;
if (pat1->n_patterns != pat2->n_patterns) {
return 1; // No match
}
return memcmp(pat1, pat2, kiss_thin_nfa_pattern_array_size(pat1->n_patterns));
}
static u_int
pattern_list_len(const kiss_thin_nfa_pattern_list_t *pat_list)
{
const kiss_thin_nfa_pattern_list_t *pat;
u_int n = 0;
for (pat = pat_list; pat != NULL; pat = pat->next) {
n++;
}
return n;
}
static kiss_ret_val
kiss_bnfa_match_patterns_prepare(struct thin_nfa_comp_s *nfa_comp, KissThinNFA *nfa)
{
static const char rname[] = "kiss_bnfa_match_patterns_prepare";
kiss_thin_nfa_pattern_array_t *pat_arr;
kiss_thin_nfa_state_t *comp_state;
u_int total_size_for_patterns;
total_size_for_patterns = 0;
for (comp_state = nfa_comp->root_state;
comp_state != NULL;
comp_state = kiss_thin_nfa_get_subsequent_state(nfa_comp, comp_state)) {
if (comp_state->flags & THIN_NFA_STATE_MATCH) {
if (!comp_state->ids) {
thinnfa_debug_critical((
"%s: State %s is finite, but its IDs are null\n",
rname,
state_name(comp_state)
));
kiss_thin_nfa_set_comp_error(nfa_comp, "The state is finite, but its IDs are null");
return KISS_ERROR;
}
total_size_for_patterns += kiss_thin_nfa_pattern_array_size(pattern_list_len(comp_state->ids));
}
}
if (total_size_for_patterns == 0) {
thinnfa_debug_critical(("%s: no finite states?!\n", rname));
kiss_thin_nfa_set_comp_error(nfa_comp, "no finite states?!");
return KISS_ERROR;
}
// We allocate according to maximum possible size.
// We might reduce it at the end, if duplicates exist.
thinnfa_debug(("%s: alocating %u bytes for a pattern array\n", rname, total_size_for_patterns));
pat_arr = (kiss_thin_nfa_pattern_array_t *)kiss_pmglob_memory_kmalloc_ex(
total_size_for_patterns,
rname,
FW_KMEM_SLEEP
);
if (!pat_arr) {
thinnfa_debug_critical((
"%s: failed to allocate %d bytes for a complete pattern array\n",
rname,
total_size_for_patterns
));
kiss_thin_nfa_set_comp_error(nfa_comp, "Failed to allocate memory for a complete pattern array");
return KISS_ERROR;
}
nfa->pattern_arrays = pat_arr;
nfa->pattern_arrays_size = total_size_for_patterns;
nfa_comp->patterns_hash = kiss_hash_create_with_ksleep(
nfa->match_state_num,
pat_key_hash_func,
pat_key_cmp_func,
NULL
);
if (!nfa_comp->patterns_hash) {
thinnfa_debug((
"%s: failed to create patterns hash table for %u finite states\n",
rname,
nfa->match_state_num
));
kiss_thin_nfa_set_comp_error(nfa_comp, "Failed to create patterns hash table for finite states");
return KISS_ERROR;
}
return KISS_OK;
}
static kiss_ret_val
kiss_bnfa_match_patterns_finalize(struct thin_nfa_comp_s *nfa_comp, KissThinNFA *nfa, u_int new_size)
{
static const char rname[] = "kiss_bnfa_match_patterns_finalize";
kiss_thin_nfa_pattern_array_t *new_pat_arr;
// Compact the match patter array, if needed
if (new_size == nfa->pattern_arrays_size) {
thinnfa_debug(("%s: no size change - the pattern array size is %u bytes\n", rname, new_size));
return KISS_OK;
}
if (new_size > nfa->pattern_arrays_size) {
thinnfa_debug_critical((
"%s: new pattern array size (%u) is greater than the current size (%u). This should not happen.\n",
rname,
new_size,
nfa->pattern_arrays_size
));
kiss_thin_nfa_set_comp_error(nfa_comp, "Failed to allocate a complete pattern array");
return KISS_ERROR;
}
new_pat_arr = (kiss_thin_nfa_pattern_array_t *)kiss_pmglob_memory_kmalloc_ex(new_size, rname, FW_KMEM_SLEEP);
if (!new_pat_arr) {
thinnfa_debug_critical(("%s: failed to allocate %d bytes for a complete pattern array\n", rname, new_size));
kiss_thin_nfa_set_comp_error(nfa_comp, "Failed to allocate a complete pattern array");
return KISS_ERROR;
}
thinnfa_debug(("%s: reducing the size from %u to %u\n", rname, nfa->pattern_arrays_size, new_size));
bcopy(nfa->pattern_arrays, new_pat_arr, new_size);
kiss_pmglob_memory_kfree(nfa->pattern_arrays, nfa->pattern_arrays_size, rname);
nfa->pattern_arrays = new_pat_arr;
nfa->pattern_arrays_size = new_size;
return KISS_OK;
}
static kiss_ret_val
kiss_bnfa_copy_pat_list(
const KissThinNFA *nfa,
kiss_hash_t patterns_hash,
kiss_thin_nfa_state_t *comp_state,
u_int *last_used_offset_in_pat_match_array,
u_int *offset_for_cur_state)
{
static const char rname[] = "kiss_bnfa_copy_pat_list";
if (comp_state->flags & THIN_NFA_STATE_MATCH) {
kiss_thin_nfa_pattern_list_t *pat_list_ent;
kiss_thin_nfa_pattern_array_t *pat_arr;
kiss_thin_nfa_pattern_array_t *cached_pat_arr;
u_int pat_arr_size;
u_int n_patterns;
u_int i;
n_patterns = pattern_list_len(comp_state->ids);
pat_arr_size = kiss_thin_nfa_pattern_array_size(n_patterns);
if ((*last_used_offset_in_pat_match_array + pat_arr_size) > nfa->pattern_arrays_size) {
thinnfa_debug_critical(("%s: offset (%u) + required size (%u) exceeds the total array size (%u)\n",
rname, *last_used_offset_in_pat_match_array, pat_arr_size, nfa->pattern_arrays_size));
return KISS_ERROR;
}
pat_arr = kiss_thin_nfa_offset_to_pat_array_ptr(nfa, *last_used_offset_in_pat_match_array);
pat_arr->n_patterns = n_patterns;
for (i = 0, pat_list_ent = comp_state->ids; i < pat_arr->n_patterns; pat_list_ent = pat_list_ent->next, i++) {
bcopy(&(pat_list_ent->pattern), &(pat_arr->pattern[i]), sizeof(pat_list_ent->pattern));
}
kiss_thin_nfa_free_pattern_ids(comp_state->ids);
comp_state->ids = NULL; // Prevent release when the state is cleaned up
cached_pat_arr = (kiss_thin_nfa_pattern_array_t *)kiss_hash_lookkey(patterns_hash, pat_arr);
if (cached_pat_arr) {
u_int cached_offset;
cached_offset = kiss_thin_nfa_pat_array_ptr_to_offset(nfa, cached_pat_arr);
// No need to move the last_used_offset
*offset_for_cur_state = cached_offset;
thinnfa_debug((
"%s: returning cached offset of %u for the state ID %u. "
"%u patterns %u bytes. The offset stays at %u.\n",
rname,
*offset_for_cur_state,
comp_state->state_id,
n_patterns, pat_arr_size,
*last_used_offset_in_pat_match_array
));
} else {
*offset_for_cur_state = *last_used_offset_in_pat_match_array;
if (!kiss_hash_insert(patterns_hash, pat_arr, NULL)) {
thinnfa_debug(("%s: failed to insert a pattern into a hash (non-critical error)\n", rname));
}
*last_used_offset_in_pat_match_array += pat_arr_size;
thinnfa_debug((
"%s: returning the offset of %u for the state ID %u. %u patterns, %u bytes. The offset moved to %u.\n",
rname,
*offset_for_cur_state,
comp_state->state_id,
n_patterns, pat_arr_size,
*last_used_offset_in_pat_match_array
));
}
}
return KISS_OK;
}
static void
kiss_bnfa_update_state_depth(kiss_thin_nfa_state_t *comp_state)
{
struct kiss_thin_nfa_depth_map_s *map = &comp_state->comp->runtime_nfa->depth_map;
u_char depth = MIN(comp_state->depth, KISS_THIN_NFA_MAX_ENCODABLE_DEPTH);
kiss_bnfa_offset_t off;
// Update depth at the state's offset
off = comp_state->bnfa_offset;
map->offset0[kiss_bnfa_offset_compress(off)] = depth;
// Matching state? Update at the match state offset too.
off = comp_state->bnfa_incoming_off;
if (off == comp_state->bnfa_offset) return;
map->offset0[kiss_bnfa_offset_compress(off)] = depth;
// Full-matching state? Update at the jump state offset too.
off += sizeof(kiss_bnfa_match_state_t);
if (off == comp_state->bnfa_offset) return;
map->offset0[kiss_bnfa_offset_compress(off)] = depth;
}
// Based on thin_nfa_comp_s structure we have built, create a binary Thin NFA.
// Parameter:
// nfa_comp - the NFA's compilation data structure.
//
// Performance notes:
// This function takes most of the CPU time in the compilation process (in my tests, at least).
// Within it, time is divided about equally between full and partial states.
// Full states take about 40 times more time, but there are about 40 times more partial states.
// Overall, compilation time isn't bad, but there are surely optimization options.
// Idea - when constructing a full state, start by bcopy() of its fail state transitions. This would require
// filling the states in BFS order, which isn't done today.
static kiss_ret_val
kiss_bnfa_fill_states(struct thin_nfa_comp_s *nfa_comp)
{
static const char rname[] = "kiss_bnfa_fill_states";
KissThinNFA *nfa = nfa_comp->runtime_nfa.get();
kiss_thin_nfa_state_t *comp_state;
u_int last_used_offset_in_pat_match_array = 0;
thinnfa_debug(("%s: Filling BNFA %p size %d with %d states\n", rname,
nfa->bnfa_start, nfa->max_bnfa_offset-nfa->min_bnfa_offset, nfa_comp->state_num));
if (kiss_bnfa_match_patterns_prepare(nfa_comp, nfa) != KISS_OK) {
return KISS_ERROR;
}
// Go over the states and build the BNFA representation
for (comp_state = nfa_comp->root_state;
comp_state != NULL;
comp_state = kiss_thin_nfa_get_subsequent_state(nfa_comp, comp_state)) {
u_int state_id = comp_state->state_id;
u_int offset_for_cur_state = (u_int)-1;
if (kiss_bnfa_copy_pat_list(
nfa, nfa_comp->patterns_hash, comp_state,
&last_used_offset_in_pat_match_array,
&offset_for_cur_state
) != KISS_OK) {
thinnfa_debug_critical((
"%s: kiss_bnfa_copy_pat_list() failed for the state %s\n",
rname,
state_name(comp_state)
));
kiss_thin_nfa_set_comp_error(nfa_comp, "kiss_bnfa_copy_pat_list() failed");
return KISS_ERROR;
}
// Update the maximum pattern length (length = state depth)
if (comp_state->depth > nfa->max_pat_len) {
nfa->max_pat_len = comp_state->depth;
}
// Build the state
if (kiss_bnfa_add_state(comp_state, offset_for_cur_state) != KISS_OK) {
thinnfa_debug_critical(("%s: Failed to add the state %d\n", rname, state_id));
return KISS_ERROR;
}
// Update the depth map
kiss_bnfa_update_state_depth(comp_state);
}
if (kiss_bnfa_match_patterns_finalize(nfa_comp, nfa, last_used_offset_in_pat_match_array) != KISS_OK) {
return KISS_ERROR;
}
return KISS_OK;
}
static void
kiss_thin_nfa_fill_stats(struct thin_nfa_comp_s *nfa_comp)
{
struct kiss_thin_nfa_specific_stats_s *stats = &nfa_comp->runtime_nfa->stats.specific;
stats->num_of_states = nfa_comp->state_num;
stats->num_of_final_states = nfa_comp->match_state_num;
}
// Get the nfa_comp structure and build, according to it, the runtime Thin NFA structure.
static kiss_ret_val
kiss_thin_nfa_build_bnfa(struct thin_nfa_comp_s *nfa_comp, CP_MAYBE_UNUSED u_int compile_flags)
{
static const char rname[] = "kiss_thin_nfa_build_bnfa";
thinnfa_debug_major(("%s: Converting the compiled Thin NFA to the binary form\n", rname));
// Get the list of all BNFA offsets
if (kiss_bnfa_calc_offsets(nfa_comp) != KISS_OK) {
thinnfa_debug_err(("%s: Error allocating the offset list\n", rname));
kiss_thin_nfa_set_comp_error(nfa_comp, "Failed to allocate the offset list");
return KISS_ERROR;
}
// Allocate the runtime Thin NFA structure
nfa_comp->runtime_nfa = kiss_thin_nfa_create(
nfa_comp->match_state_num,
nfa_comp->min_bnfa_off,
nfa_comp->max_bnfa_off
);
if (!nfa_comp->runtime_nfa) {
thinnfa_debug_err(("%s: Error creating the NFA\n", rname));
kiss_thin_nfa_set_comp_error(nfa_comp, "Failed to allocate BNFA");
return KISS_ERROR;
}
if (nfa_comp->anchored_root_state) {
ENUM_SET_FLAG(nfa_comp->runtime_nfa->flags, KISS_THIN_NFA_HAS_ANCHOR);
}
// Build the BNFA we'll use on runtime
if (kiss_bnfa_fill_states(nfa_comp) != KISS_OK) {
thinnfa_debug_err(("%s: kiss_bnfa_fill_states() failed\n", rname));
return KISS_ERROR;
}
// Copy the character translation table
if (nfa_comp->xlation_tab) {
bcopy(
nfa_comp->xlation_tab->tab,
nfa_comp->runtime_nfa->xlation_tab,
sizeof(nfa_comp->runtime_nfa->xlation_tab)
);
ENUM_SET_FLAG(nfa_comp->runtime_nfa->flags, KISS_THIN_NFA_USE_CHAR_XLATION);
}
kiss_thin_nfa_fill_stats(nfa_comp);
thinnfa_debug_major(("%s: Created the binary Thin NFA %p\n", rname, nfa_comp->runtime_nfa.get()));
return KISS_OK;
}
static void
kiss_thin_nfa_select_options(struct thin_nfa_comp_s *nfa_comp, CP_MAYBE_UNUSED u_int compile_flags)
{
ENUM_SET_FLAG(nfa_comp->flags, THIN_NFA_ENABLE_ANCHOR_OPT);
nfa_comp->full_state_tier_num = kiss_thin_nfa_full_tiers_num;
ENUM_SET_FLAG(nfa_comp->flags, THIN_NFA_USE_RECURSIVE_COMPILE);
return;
}
// Compiling the SM according to Aho-Corasick algorithm.
//
// The DFA has two types of states:
// 1. Full states - have a transition for each possible character.
// 2. Partial states - only have transitions for characters that take us forward in some string.
// For all other characters, a "fail state" is defined, and the transition is what that state would have done.
//
// Paraemters:
// patterns - a set of string patterns which the resulting automaton would search for.
// compile_flags - flags with the KISS_PM_COMP_ prefix.
// error - output - on failure, would be set to indicate the reason.
// Retuns NULL on error, pointer to a newly allocated handle on success.
std::unique_ptr<KissThinNFA>
kiss_thin_nfa_compile(const std::list<kiss_pmglob_string_s> &patterns, u_int compile_flags, KissPMError *error)
{
static const char rname[] = "kiss_thin_nfa_compile";
struct thin_nfa_comp_s *nfa_comp = NULL;
std::unique_ptr<KissThinNFA> nfa;
thinnfa_debug_major(("%s: Compiling a Thin NFA, flags=%x\n", rname, compile_flags));
// Creates a new kiss_thin_dfa_handle with initial state allocated
nfa_comp = kiss_thin_nfa_comp_create(error);
if (nfa_comp == NULL) {
thinnfa_debug_err(("%s: Failed to create a compile time structure\n", rname));
kiss_pm_error_set_details(error, KISS_PM_ERROR_INTERNAL, "Failed to allocate the compilation information");
goto finish;
}
// Enable some optimization flags as needed
kiss_thin_nfa_select_options(nfa_comp, compile_flags);
// Handle character translation - instead of converting to lowercase, build a translation
// tabel and use it when adding patterns to the trie and building transition tables.
if (kiss_thin_nfa_create_xlation_tab(nfa_comp, compile_flags) != KISS_OK) {
thinnfa_debug_err(("%s: Function kiss_thin_nfa_create_xlation_tab() failed\n", rname));
goto finish;
}
// Build a trie which contains all the pattern texts.
for (auto &pattern : patterns) {
// Adding each pattern to the the Thin NFA - Aho-Corasick first phase
if (kiss_thin_nfa_add_pattern_to_trie(nfa_comp, &pattern) != KISS_OK) {
thinnfa_debug_err(("%s: Function kiss_thin_nfa_add_pattern_to_trie() failed\n", rname));
goto finish;
}
}
// Calculate fail states for all NFA states
if (kiss_thin_nfa_calc_fail_states(nfa_comp) != KISS_OK) {
thinnfa_debug_err(("%s: Function kiss_thin_nfa_calc_fail_states() failed\n", rname));
goto finish;
}
// Convert the compilation data structure to the runtime structure
if (kiss_thin_nfa_build_bnfa(nfa_comp, compile_flags) != KISS_OK) {
thinnfa_debug_err(("%s: Function kiss_thin_nfa_build_bnfa() failed\n", rname));
goto finish;
}
if (!kiss_thin_nfa_is_valid(nfa_comp->runtime_nfa.get())) {
thinnfa_debug_err(("%s: Function kiss_thin_nfa_is_valid() failed\n", rname));
goto finish;
}
// Get the resulting NFA (set NULL to protect from free)
nfa = std::move(nfa_comp->runtime_nfa);
thinnfa_debug_major(("%s: Successfully compiled the Thin NFA %p\n", rname, nfa.get()));
finish:
if (nfa_comp != NULL) {
// We destroy the compilation data structure, whether we succeed or fail.
kiss_thin_nfa_comp_destroy(nfa_comp);
}
return nfa;
}