mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
Implement Direct API
The API now provides searches for: - short literal (up to 8 char) - long literal - (long) literals set - single char - char set - single pair - pair set Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
This commit is contained in:
parent
c057c7f0f0
commit
e91c75f139
@ -304,6 +304,7 @@ set (hs_exec_SRCS
|
||||
src/crc32.h
|
||||
src/report.h
|
||||
src/runtime.c
|
||||
src/hs_direct_search.cpp
|
||||
src/stream_compress.c
|
||||
src/stream_compress.h
|
||||
src/stream_compress_impl.h
|
||||
@ -484,6 +485,7 @@ SET (hs_compile_SRCS
|
||||
src/hs.cpp
|
||||
src/hs_internal.h
|
||||
src/hs_version.h.in
|
||||
src/hs_direct_search_compile.cpp
|
||||
src/scratch.h
|
||||
src/state.h
|
||||
src/ue2common.h
|
||||
|
21
hs.def
21
hs.def
@ -41,3 +41,24 @@ EXPORTS
|
||||
hs_stream_size
|
||||
hs_valid_platform
|
||||
hs_version
|
||||
hs_short_literal_search
|
||||
hs_long_literal_search
|
||||
hs_multi_literal_search
|
||||
hs_single_char_search
|
||||
hs_char_set_search
|
||||
hs_single_char_pair_search
|
||||
hs_char_pair_set_search
|
||||
hs_compile_short_literal_search
|
||||
hs_compile_long_literal_search
|
||||
hs_compile_multi_literal_search
|
||||
hs_compile_single_char_search
|
||||
hs_compile_char_set_search
|
||||
hs_compile_single_char_pair_search
|
||||
hs_compile_char_pair_set_search
|
||||
hs_free_short_literal_pattern
|
||||
hs_free_long_literal_pattern
|
||||
hs_free_multi_literal_pattern
|
||||
hs_free_single_char_pattern
|
||||
hs_free_char_set_pattern
|
||||
hs_free_single_char_pair_pattern
|
||||
hs_free_char_pair_set_pattern
|
@ -33,4 +33,11 @@ EXPORTS
|
||||
hs_set_stream_allocator
|
||||
hs_stream_size
|
||||
hs_valid_platform
|
||||
hs_version
|
||||
hs_version
|
||||
hs_short_literal_search
|
||||
hs_long_literal_search
|
||||
hs_multi_literal_search
|
||||
hs_single_char_search
|
||||
hs_char_set_search
|
||||
hs_single_char_pair_search
|
||||
hs_char_pair_set_search
|
@ -1,6 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2016-2020, Intel Corporation
|
||||
* Copyright (c) 2024, VectorCamp PC
|
||||
* Copyright (c) 2025, Arm ltd
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -352,6 +353,99 @@ CONNECT_DISPATCH_2(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_strea
|
||||
CONNECT_ARGS_3(hs_error_t, hs_reset_and_expand_stream, to_stream,
|
||||
buf, buf_size, scratch, onEvent, context);
|
||||
|
||||
/** DIRECT API **/
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_short_literal_search,
|
||||
const hs_short_literal_compiled_pattern_t *database,
|
||||
const char *data, size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_short_literal_search, database, data, length,
|
||||
onEvent, context);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_short_literal_search,
|
||||
const hs_short_literal_compiled_pattern_t *database,
|
||||
const char *data, size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_short_literal_search, database, data, length,
|
||||
onEvent, context);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_long_literal_search,
|
||||
const hs_long_literal_compiled_pattern_t *database,
|
||||
const char *data, size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_long_literal_search, database, data, length,
|
||||
onEvent, context);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_long_literal_search,
|
||||
const hs_long_literal_compiled_pattern_t *database,
|
||||
const char *data, size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_long_literal_search, database, data, length,
|
||||
onEvent, context);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_multi_literal_search,
|
||||
const hs_multi_literal_compiled_pattern_t *database,
|
||||
const char *data, size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_multi_literal_search, database, data, length,
|
||||
onEvent, context);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_multi_literal_search,
|
||||
const hs_multi_literal_compiled_pattern_t *database,
|
||||
const char *data, size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_multi_literal_search, database, data, length,
|
||||
onEvent, context);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_single_char_search,
|
||||
const hs_single_char_compiled_pattern_t *database,
|
||||
const char *data, size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_single_char_search, database, data, length,
|
||||
onEvent, context);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_single_char_search,
|
||||
const hs_single_char_compiled_pattern_t *database,
|
||||
const char *data, size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_single_char_search, database, data, length,
|
||||
onEvent, context);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_char_set_search,
|
||||
const hs_char_set_compiled_pattern_t *database,
|
||||
const char *data, size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_char_set_search, database, data, length, onEvent,
|
||||
context);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_char_set_search,
|
||||
const hs_char_set_compiled_pattern_t *database,
|
||||
const char *data, size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_char_set_search, database, data, length, onEvent,
|
||||
context);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_single_char_pair_search,
|
||||
const hs_single_char_pair_compiled_pattern_t *database,
|
||||
const char *data, size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_single_char_pair_search, database, data, length,
|
||||
onEvent, context);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_single_char_pair_search,
|
||||
const hs_single_char_pair_compiled_pattern_t *database,
|
||||
const char *data, size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_single_char_pair_search, database, data, length,
|
||||
onEvent, context);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_char_pair_set_search,
|
||||
const hs_char_pair_set_compiled_pattern_t *database,
|
||||
const char *data, size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_char_pair_set_search, database, data, length,
|
||||
onEvent, context);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_char_pair_set_search,
|
||||
const hs_char_pair_set_compiled_pattern_t *database,
|
||||
const char *data, size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_char_pair_set_search, database, data, length,
|
||||
onEvent, context);
|
||||
|
||||
/** INTERNALS **/
|
||||
|
||||
CREATE_DISPATCH(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen);
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2019, Intel Corporation
|
||||
* Copyright (c) 2024-2025, Arm ltd
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -585,6 +586,90 @@ hs_error_t HS_CDECL hs_valid_platform(void);
|
||||
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
* The following functions are part of the extended API.
|
||||
* This extension offers direct access to search algorithms
|
||||
* allowing the user to minimise calling overhead for simple
|
||||
* search use cases where type of the search is known.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @defgroup DIRECT_API_COMMON
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* The size threshold after which a pattern is considered long and must be fed
|
||||
* to @ref hs_compile_long_literal_search(). Patterns up to this length may be
|
||||
* fed to hs_compile_short_literal_search() instead.
|
||||
*/
|
||||
#define HS_SHORT_PATTERN_THRESHOLD 8
|
||||
|
||||
/**
|
||||
* The compiled pattern type for searching for short literals
|
||||
*
|
||||
* Generated by @ref hs_compile_short_literal_search() and to be freed with @ref
|
||||
* hs_free_short_literal_pattern
|
||||
*/
|
||||
typedef struct hs_short_literal_compiled_pattern
|
||||
hs_short_literal_compiled_pattern_t;
|
||||
|
||||
/**
|
||||
* The compiled pattern type for searching for long literals
|
||||
*
|
||||
* Generated by @ref hs_compile_long_literal_search() and to be freed with @ref
|
||||
* hs_free_long_literal_pattern
|
||||
*/
|
||||
typedef struct hs_long_literal_compiled_pattern
|
||||
hs_long_literal_compiled_pattern_t;
|
||||
|
||||
/**
|
||||
* The compiled pattern type for searching for several literals at once
|
||||
*
|
||||
* Generated by @ref hs_compile_multi_literal_search() and to be freed with @ref
|
||||
* hs_free_multi_literal_pattern
|
||||
*/
|
||||
typedef struct hs_multi_literal_compiled_pattern
|
||||
hs_multi_literal_compiled_pattern_t;
|
||||
|
||||
/**
|
||||
* The compiled pattern type for searching for a single character
|
||||
*
|
||||
* Generated by @ref hs_compile_single_char_search() and to be freed with @ref
|
||||
* hs_free_single_char_pattern
|
||||
*/
|
||||
typedef struct hs_single_char_compiled_pattern
|
||||
hs_single_char_compiled_pattern_t;
|
||||
|
||||
/**
|
||||
* The compiled pattern type for searching for a character set
|
||||
*
|
||||
* Generated by @ref hs_compile_char_set_search() and to be freed with @ref
|
||||
* hs_free_char_set_pattern
|
||||
*/
|
||||
typedef struct hs_char_set_compiled_pattern hs_char_set_compiled_pattern_t;
|
||||
|
||||
/**
|
||||
* The compiled pattern type for searching for a character pair
|
||||
*
|
||||
* Generated by @ref hs_compile_single_char_pair_search() and to be freed with @ref
|
||||
* hs_free_single_char_pair_pattern
|
||||
*/
|
||||
typedef struct hs_single_char_pair_compiled_pattern
|
||||
hs_single_char_pair_compiled_pattern_t;
|
||||
|
||||
/**
|
||||
* The compiled pattern type for searching for a set of character pairs
|
||||
*
|
||||
* Generated by @ref hs_compile_char_pair_set_search() and to be freed with
|
||||
* @ref hs_free_char_pair_set_pattern
|
||||
*/
|
||||
typedef struct hs_char_pair_set_compiled_pattern
|
||||
hs_char_pair_set_compiled_pattern_t;
|
||||
|
||||
/** @} */
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
271
src/hs_compile.h
271
src/hs_compile.h
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2021, Intel Corporation
|
||||
* Copyright (c) 2024-2025, Arm ltd
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -1211,6 +1212,276 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform);
|
||||
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
* The following functions are part of the extended API.
|
||||
* This extension offers direct access to search algorithms
|
||||
* allowing the user to minimise calling overhead for simple
|
||||
* search use cases where type of the search is known.
|
||||
*
|
||||
* All search functions handle a limited type of pattern.
|
||||
* For more generic patterns, use @ref hs_compile().
|
||||
*
|
||||
* NOTE: All search functions are considered case-sensitive.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @defgroup DIRECT_API_COMPILE
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* Compiles a short literal expression used in @ref hs_short_literal_search().
|
||||
*
|
||||
* The expression must be at most @ref HS_SHORT_PATTERN_THRESHOLD characters
|
||||
* long. For longer expressions, use @ref hs_compile_long_literal_search() and
|
||||
* @ref hs_long_literal_search() instead.
|
||||
*
|
||||
* @param expression
|
||||
* The expression to parse. Note that this string must represent ONLY the
|
||||
* pattern to be matched, with no delimiters. Null characters are accepted
|
||||
* as part of the expression.
|
||||
*
|
||||
* @param expression_length
|
||||
* The length of the expression in bytes. Up to @ref
|
||||
* HS_SHORT_PATTERN_THRESHOLD characters long.
|
||||
*
|
||||
* @param output_database
|
||||
* Returns pointer to buffer containing @ref
|
||||
* hs_short_literal_compiled_pattern_t. The buffer must be freed with
|
||||
* @ref hs_free_short_literal_pattern.
|
||||
*
|
||||
* @return
|
||||
* @ref HS_SUCCESS is returned on successful compilation; @ref
|
||||
* HS_COMPILER_ERROR otherwise.
|
||||
*/
|
||||
hs_error_t HS_CDECL hs_compile_short_literal_search(
|
||||
const char *expression, size_t expression_length,
|
||||
hs_short_literal_compiled_pattern_t **output_database);
|
||||
|
||||
/**
|
||||
* Free a short literal pattern.
|
||||
*
|
||||
* @param database
|
||||
* The @ref hs_short_literal_compiled_pattern_t pointer to be freed.
|
||||
*/
|
||||
void HS_CDECL
|
||||
hs_free_short_literal_pattern(hs_short_literal_compiled_pattern_t *database);
|
||||
|
||||
/**
|
||||
* Compiles a literal expression used in @ref hs_long_literal_search().
|
||||
*
|
||||
* There is no size limit. For expressions up to @ref
|
||||
* HS_SHORT_PATTERN_THRESHOLD characters long, @ref
|
||||
* hs_compile_short_literal_search() and @ref hs_short_literal_search() might be
|
||||
* faster
|
||||
*
|
||||
* @param expression
|
||||
* The expression to parse. Note that this string must represent ONLY the
|
||||
* pattern to be matched, with no delimiters. Null characters are accepted
|
||||
* as part of the expression.
|
||||
*
|
||||
* @param expression_length
|
||||
* The length of the expression in bytes.
|
||||
*
|
||||
* @param output_database
|
||||
* Returns pointer to buffer containing @ref
|
||||
* hs_long_literal_compiled_pattern_t. The buffer must be freed with
|
||||
* @ref hs_free_long_literal_pattern.
|
||||
*
|
||||
* @return
|
||||
* @ref HS_SUCCESS is returned on successful compilation; @ref
|
||||
* HS_COMPILER_ERROR otherwise.
|
||||
*/
|
||||
hs_error_t HS_CDECL hs_compile_long_literal_search(
|
||||
const char *expression, size_t expression_length,
|
||||
hs_long_literal_compiled_pattern_t **output_database);
|
||||
|
||||
/**
|
||||
* Free a long literal pattern.
|
||||
*
|
||||
* @param database
|
||||
* The @ref hs_long_literal_compiled_pattern_t pointer to be freed.
|
||||
*/
|
||||
void HS_CDECL
|
||||
hs_free_long_literal_pattern(hs_long_literal_compiled_pattern_t *database);
|
||||
|
||||
/**
|
||||
* Compiles several literal expressions used in @ref hs_multi_literal_search().
|
||||
*
|
||||
* There is no size limit.
|
||||
*
|
||||
* @param expression
|
||||
* The array of expressions to parse. Note that the strings must represent
|
||||
* ONLY the patterns to be matched, with no delimiters. Null characters are
|
||||
* accepted as part of the expression. The expression id in
|
||||
* @ref match_event_handler will match the order of the expression given
|
||||
* here (ie: expression[0] will be id 0).
|
||||
*
|
||||
* @param pattern_count
|
||||
* The number of expressions in the @p expression array.
|
||||
*
|
||||
* @param expression_length
|
||||
* The array of length of each expression in the @p expression array.
|
||||
* Expressed in bytes.
|
||||
*
|
||||
* @param output_database
|
||||
* Returns pointer to buffer containing @ref
|
||||
* hs_multi_literal_compiled_pattern_t. The buffer must be freed with
|
||||
* @ref hs_free_multi_literal_pattern.
|
||||
*
|
||||
* @return
|
||||
* @ref HS_SUCCESS is returned on successful compilation; @ref
|
||||
* HS_COMPILER_ERROR otherwise.
|
||||
*/
|
||||
hs_error_t HS_CDECL hs_compile_multi_literal_search(
|
||||
const char **expression, size_t pattern_count,
|
||||
const size_t *expression_length,
|
||||
hs_multi_literal_compiled_pattern_t **output_database);
|
||||
|
||||
/**
|
||||
* Free a multi literal pattern.
|
||||
*
|
||||
* @param database
|
||||
* The @ref hs_multi_literal_compiled_pattern_t pointer to be freed.
|
||||
*/
|
||||
void HS_CDECL
|
||||
hs_free_multi_literal_pattern(hs_multi_literal_compiled_pattern_t *database);
|
||||
|
||||
/**
|
||||
* Compiles a single character used in @ref hs_single_char_search().
|
||||
*
|
||||
* @param character
|
||||
* The single character to be searched. It is case sensitive.
|
||||
*
|
||||
* @param output_database
|
||||
* Returns pointer to buffer containing @ref
|
||||
* hs_single_char_compiled_pattern_t. The buffer must be freed with
|
||||
* @ref hs_free_single_char_pattern.
|
||||
*
|
||||
* @return
|
||||
* @ref HS_SUCCESS is returned on successful compilation; @ref
|
||||
* HS_COMPILER_ERROR otherwise.
|
||||
*/
|
||||
hs_error_t HS_CDECL hs_compile_single_char_search(
|
||||
const char character, hs_single_char_compiled_pattern_t **output_database);
|
||||
|
||||
/**
|
||||
* Free a single char pattern
|
||||
* @param database
|
||||
* The @ref hs_single_char_compiled_pattern_t pointer to be freed.
|
||||
*/
|
||||
void HS_CDECL
|
||||
hs_free_single_char_pattern(hs_single_char_compiled_pattern_t *database);
|
||||
|
||||
/**
|
||||
* Compiles a set of characters used in @ref hs_char_set_search().
|
||||
*
|
||||
* @param character_array
|
||||
* The string or character array containing all the characters in the set.
|
||||
* It is case sensitive. Null terminator is optional.
|
||||
*
|
||||
* @param character_count
|
||||
* The number of characters in @p character_array
|
||||
*
|
||||
* @param output_database
|
||||
* Returns pointer to buffer containing @ref
|
||||
* hs_char_set_compiled_pattern_t. The buffer must be freed with
|
||||
* @ref hs_free_char_set_pattern.
|
||||
*
|
||||
* @return
|
||||
* @ref HS_SUCCESS is returned on successful compilation; @ref
|
||||
* HS_COMPILER_ERROR otherwise.
|
||||
*/
|
||||
hs_error_t HS_CDECL hs_compile_char_set_search(
|
||||
const char *character_array, size_t character_count,
|
||||
hs_char_set_compiled_pattern_t **output_database);
|
||||
|
||||
/**
|
||||
* Free a char set pattern.
|
||||
*
|
||||
* @param database
|
||||
* The @ref hs_char_set_compiled_pattern_t pointer to be freed.
|
||||
*/
|
||||
void HS_CDECL
|
||||
hs_free_char_set_pattern(hs_char_set_compiled_pattern_t *database);
|
||||
|
||||
/**
|
||||
* Compiles a pair of characters used in @ref hs_single_char_pair_search().
|
||||
*
|
||||
* NOTE: The character order matters in the pair. "Aj" won't match "jA"
|
||||
*
|
||||
* @param pair
|
||||
* The string or character array containing the pair. Null terminator is
|
||||
* optional.
|
||||
*
|
||||
* @param output_database
|
||||
* Returns pointer to buffer containing @ref
|
||||
* hs_single_char_pair_compiled_pattern_t. The buffer must be freed with
|
||||
* @ref hs_free_single_char_pair_pattern.
|
||||
*
|
||||
* @return
|
||||
* @ref HS_SUCCESS is returned on successful compilation; @ref
|
||||
* HS_COMPILER_ERROR otherwise.
|
||||
*/
|
||||
hs_error_t HS_CDECL hs_compile_single_char_pair_search(
|
||||
const char *pair, hs_single_char_pair_compiled_pattern_t **output_database);
|
||||
|
||||
/**
|
||||
* Free a single char pair pattern.
|
||||
*
|
||||
* @param database
|
||||
* The @ref hs_single_char_pair_compiled_pattern_t pointer to be freed.
|
||||
*/
|
||||
void HS_CDECL hs_free_single_char_pair_pattern(
|
||||
hs_single_char_pair_compiled_pattern_t *database);
|
||||
|
||||
/**
|
||||
* Compiles several character pairs used in @ref hs_char_pair_set_search().
|
||||
*
|
||||
* IMPORTANT: Compilation is only guaranteed for up to 8 pairs. If you search
|
||||
* for more, internal compression may attempt to merge adjacent patterns
|
||||
* (e.g., [ab, ac, ad] becomes a[bcd]) to reduce the total to 8 pairs. If the
|
||||
* compression is insufficient, compilation will fail with
|
||||
* @ref HS_COMPILER_ERROR. In such cases, use @ref hs_multi_literal_search() instead.
|
||||
* The compression does not affect the match IDs returned by
|
||||
* @ref hs_char_pair_set_search(). For example, a[bcd] will still report "ab" as
|
||||
* ID 0, "ac" as ID 1, and "ad" as ID 2.
|
||||
*
|
||||
* NOTE: The character order matters in the pair. "Aj" won't match "jA"
|
||||
*
|
||||
* @param expression
|
||||
* The concatenation of all pairs to be parsed. If one wants to search for
|
||||
* "ab" or "Cd", then @p expression would be ['a','b','C','d']. Null
|
||||
* terminator is ignored, use @p pair_count to set the length.
|
||||
*
|
||||
* @param pair_count
|
||||
* The number of character pairs in @p expression
|
||||
*
|
||||
* @param output_database
|
||||
* Returns pointer to buffer containing @ref
|
||||
* hs_char_pair_set_compiled_pattern_t. The buffer must be freed with
|
||||
* @ref hs_free_char_pair_set_pattern.
|
||||
*
|
||||
* @return
|
||||
* @ref HS_SUCCESS is returned on successful compilation; @ref
|
||||
* HS_COMPILER_ERROR otherwise.
|
||||
*/
|
||||
hs_error_t HS_CDECL hs_compile_char_pair_set_search(
|
||||
const char *expression, size_t pair_count,
|
||||
hs_char_pair_set_compiled_pattern_t **output_database);
|
||||
|
||||
/**
|
||||
* Free a char pair set pattern.
|
||||
*
|
||||
* @param database
|
||||
* The @ref hs_char_pair_set_compiled_pattern_t pointer to be freed.
|
||||
*/
|
||||
void HS_CDECL
|
||||
hs_free_char_pair_set_pattern(hs_char_pair_set_compiled_pattern_t *database);
|
||||
|
||||
/** @} */
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
435
src/hs_direct_search.cpp
Normal file
435
src/hs_direct_search.cpp
Normal file
@ -0,0 +1,435 @@
|
||||
/*
|
||||
* Copyright (c) 2024-2025, Arm ltd
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
|
||||
#include "hs_common.h"
|
||||
#include "hs_runtime.h"
|
||||
#include "hs_direct_search.h"
|
||||
#include "hs_direct_search_types.h"
|
||||
|
||||
#include "scratch.h"
|
||||
#include "util/arch.h" // CAN_USE_WIDE_TRUFFLE
|
||||
#include "util/bitutils.h" // ctz64()
|
||||
#include "util/simd_utils.h" // load128()
|
||||
#include "util/supervector/supervector.hpp"
|
||||
|
||||
#include "fdr/fdr.h"
|
||||
#include "hwlm/noodle_engine.h"
|
||||
#include "nfa/shufti.h"
|
||||
#include "nfa/truffle.h"
|
||||
|
||||
typedef typename SuperVector<VECTORSIZE>::comparemask_type vector_mask_type;
|
||||
|
||||
static_assert((uint64_t)CB_CONTINUE_MATCHING == HWLM_CONTINUE_MATCHING,
|
||||
"CB_CONTINUE_MATCHING doesn't match HWLM_CONTINUE_MATCHING");
|
||||
static_assert((uint64_t)CB_TERMINATE_MATCHING == HWLM_TERMINATE_MATCHING,
|
||||
"CB_TERMINATE_MATCHING doesn't match HWLM_TERMINATE_MATCHING");
|
||||
|
||||
/**
 * Translate an HWLM status code into the matching hs_error_t value.
 *
 * Mapping:
 *   HWLM_SUCCESS         -> HS_SUCCESS
 *   HWLM_TERMINATED      -> HS_SCAN_TERMINATED
 *   HWLM_LITERAL_MAX_LEN -> HS_COMPILER_ERROR
 *   anything else        -> HS_UNKNOWN_ERROR (this includes HWLM_ERROR_UNKNOWN)
 *
 * NOTE(review): HWLM_LITERAL_MAX_LEN is used as a length limit elsewhere in
 * this file; confirm it is really meant to be handled as a status value here.
 */
static inline hs_error_t hwlm_to_hs_error(const hwlm_error_t error) {
    if (error == HWLM_SUCCESS) {
        return HS_SUCCESS;
    }
    if (error == HWLM_TERMINATED) {
        return HS_SCAN_TERMINATED;
    }
    if (error == HWLM_LITERAL_MAX_LEN) {
        return HS_COMPILER_ERROR;
    }
    // HWLM_ERROR_UNKNOWN and every unrecognised code end up here.
    return HS_UNKNOWN_ERROR;
}
|
||||
|
||||
// convert the callback type of Noodle
|
||||
hwlmcb_rv_t HS_CDECL noodle_to_hs_callback(size_t end, u32 id,
|
||||
struct hs_scratch *scratch) {
|
||||
struct noodle_context *storage = reinterpret_cast<struct noodle_context *>(
|
||||
scratch->core_info.userContext);
|
||||
// hwlm's end is the last char of the pattern, but hs's end is the first
|
||||
// char after the pattern
|
||||
size_t match_start = end + 1 - storage->pattern_length;
|
||||
return (hwlmcb_rv_t)(scratch->core_info.userCallback(
|
||||
id, match_start, end + 1, 0, storage->usr_context));
|
||||
}
|
||||
|
||||
// Receive the FDR callback and perform the check for longer patterns (>8 char)
|
||||
hwlmcb_rv_t HS_CDECL FDR_to_hs_callback(size_t end, u32 id,
|
||||
struct hs_scratch *scratch) {
|
||||
const struct FDR_cb_context *combined_ctx =
|
||||
reinterpret_cast<struct FDR_cb_context *>(
|
||||
scratch->core_info.userContext);
|
||||
const FDR_pattern_storage *ps = combined_ctx->patterns;
|
||||
size_t pattern_length = get_const_pattern_sizes(ps)[id];
|
||||
size_t start_offset =
|
||||
end + 1 - std::min(pattern_length, (size_t)HWLM_LITERAL_MAX_LEN);
|
||||
if (pattern_length > HWLM_LITERAL_MAX_LEN) {
|
||||
// long pattern for FDR, we need to confirm it.
|
||||
const char *pattern = get_const_pattern_ptrs(ps)[id];
|
||||
const char *buffer = combined_ctx->buffer;
|
||||
size_t buffer_length = combined_ctx->buffer_length;
|
||||
|
||||
if (start_offset + pattern_length > buffer_length) {
|
||||
// pattern too long for the remaining buffer, no match
|
||||
return HWLM_CONTINUE_MATCHING;
|
||||
}
|
||||
|
||||
const char *confirm_buffer_start =
|
||||
buffer + start_offset + HWLM_LITERAL_MAX_LEN;
|
||||
const char *confirm_pattern_start = pattern + HWLM_LITERAL_MAX_LEN;
|
||||
size_t confirm_len = pattern_length - HWLM_LITERAL_MAX_LEN;
|
||||
|
||||
if (confirm_len >= VECTORSIZE) {
|
||||
while (confirm_len > VECTORSIZE) {
|
||||
SuperVector<VECTORSIZE> buffer_vector =
|
||||
SuperVector<VECTORSIZE>::loadu(confirm_buffer_start);
|
||||
SuperVector<VECTORSIZE> pattern_vector =
|
||||
SuperVector<VECTORSIZE>::loadu(confirm_pattern_start);
|
||||
vector_mask_type mask = buffer_vector.eqmask(pattern_vector);
|
||||
if(~mask)
|
||||
// don't match the pattern, continue searching
|
||||
return HWLM_CONTINUE_MATCHING;
|
||||
confirm_buffer_start += VECTORSIZE;
|
||||
confirm_pattern_start += VECTORSIZE;
|
||||
confirm_len -= VECTORSIZE;
|
||||
}
|
||||
|
||||
// unaligned load: we cannot risk loading any extra byte, so we run
|
||||
// the vector one last time with an offset to overlap the previous
|
||||
// check, but avoid overflowing.
|
||||
size_t overlap = VECTORSIZE - confirm_len;
|
||||
SuperVector<VECTORSIZE> buffer_vector =
|
||||
SuperVector<VECTORSIZE>::loadu(confirm_buffer_start - overlap);
|
||||
SuperVector<VECTORSIZE> pattern_vector =
|
||||
SuperVector<VECTORSIZE>::loadu(confirm_pattern_start - overlap);
|
||||
vector_mask_type mask = buffer_vector.eqmask(pattern_vector);
|
||||
if(~mask)
|
||||
// don't match the pattern, continue searching
|
||||
return HWLM_CONTINUE_MATCHING;
|
||||
} else {
|
||||
size_t confirm_64 = confirm_len / 8;
|
||||
for (size_t i = 0; i < confirm_64; i++) {
|
||||
if ((reinterpret_cast<const uint64_t *>(confirm_buffer_start))[i] !=
|
||||
(reinterpret_cast<const uint64_t *>(confirm_pattern_start))[i])
|
||||
// don't match the pattern, continue searching
|
||||
return HWLM_CONTINUE_MATCHING;
|
||||
}
|
||||
confirm_len = confirm_len % 8;
|
||||
|
||||
for (size_t i = 0; i < confirm_len; i++) {
|
||||
if (confirm_buffer_start[i] != confirm_pattern_start[i])
|
||||
// don't match the pattern, continue searching
|
||||
return HWLM_CONTINUE_MATCHING;
|
||||
}
|
||||
}
|
||||
|
||||
// we have a valid match. Call the user callback
|
||||
return (hwlmcb_rv_t)(scratch->core_info.userCallback(
|
||||
id, start_offset, start_offset + pattern_length, 0,
|
||||
combined_ctx->usr_context));
|
||||
} else {
|
||||
// short pattern, no confirmation needed
|
||||
return (hwlmcb_rv_t)(scratch->core_info.userCallback(
|
||||
id, start_offset, end + 1, 0, combined_ctx->usr_context));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// --- short_literal (Noodle) ---
|
||||
|
||||
// Scan `data` (`length` bytes) for the short literal held in `database` by
// running noodExec, and report each match through `onEvent`.
//
// `context` is forwarded untouched to `onEvent` by noodle_to_hs_callback. The
// three pointer arguments are validated with assert() only. Returns the
// noodExec status converted by hwlm_to_hs_error().
HS_PUBLIC_API
hs_error_t HS_CDECL hs_short_literal_search(
    const hs_short_literal_compiled_pattern *database, const char *data,
    size_t length, match_event_handler onEvent, void *context) {
    assert(onEvent != nullptr &&
           "hs_short_literal_search called with nullptr callback");
    assert(data != nullptr &&
           "hs_short_literal_search called with nullptr buffer");
    assert(database != nullptr &&
           "hs_short_literal_search called with nullptr database");

    // Everything noodle_to_hs_callback needs to rebuild the user-facing match.
    struct noodle_context cb_state;
    cb_state.pattern_length = database->pattern_length;
    cb_state.usr_context = context;

    // Stack scratch: only the two core_info fields below are set here.
    struct hs_scratch local_scratch;
    local_scratch.core_info.userCallback = onEvent;
    local_scratch.core_info.userContext = &cb_state;

    const uint8_t *haystack = reinterpret_cast<const uint8_t *>(data);
    const hwlm_error_t status =
        noodExec(&(database->noodle_database), haystack, length, 0,
                 noodle_to_hs_callback, &local_scratch);
    return hwlm_to_hs_error(status);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// --- long_literal (FDR) ---
|
||||
|
||||
// Scan `data` (`length` bytes) for the literal held in `database` by running
// fdrExec with FDR_to_hs_callback, and report each match through `onEvent`.
//
// `context` is forwarded untouched to `onEvent`. The three pointer arguments
// are validated with assert() only. Returns the fdrExec status converted by
// hwlm_to_hs_error().
HS_PUBLIC_API
hs_error_t HS_CDECL hs_long_literal_search(
    const hs_long_literal_compiled_pattern_t *database, const char *data,
    size_t length, match_event_handler onEvent,
    void *context) {
    assert(onEvent != nullptr &&
           "hs_long_literal_search called with nullptr callback");
    assert(data != nullptr &&
           "hs_long_literal_search called with nullptr buffer");
    assert(database != nullptr &&
           "hs_long_literal_search called with nullptr database");

    // FDR_to_hs_callback needs the pattern table and the scanned buffer on top
    // of the user's context.
    struct FDR_cb_context cb_state = {
        context, database->fdr_database.patterns, data, length};

    // Stack scratch: only the fields assigned below are set here.
    struct hs_scratch local_scratch;
    local_scratch.fdr_conf = nullptr;
    local_scratch.core_info.userCallback = onEvent;
    local_scratch.core_info.userContext = &cb_state;

    const uint8_t *haystack = reinterpret_cast<const uint8_t *>(data);
    const hwlm_error_t status =
        fdrExec(database->fdr_database.database, haystack, length, 0,
                FDR_to_hs_callback, &local_scratch, HWLM_ALL_GROUPS);
    return hwlm_to_hs_error(status);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// --- multi_literal (FDR) ---
|
||||
|
||||
// Scan `data` (`length` bytes) for the literals held in `database` by running
// fdrExec with FDR_to_hs_callback, and report each match through `onEvent`.
//
// `context` is forwarded untouched to `onEvent`. The three pointer arguments
// are validated with assert() only. Returns the fdrExec status converted by
// hwlm_to_hs_error().
HS_PUBLIC_API
hs_error_t HS_CDECL hs_multi_literal_search(
    const hs_multi_literal_compiled_pattern_t *database, const char *data,
    size_t length, match_event_handler onEvent, void *context) {
    assert(onEvent != nullptr &&
           "hs_multi_literal_search called with nullptr callback");
    assert(data != nullptr &&
           "hs_multi_literal_search called with nullptr buffer");
    assert(database != nullptr &&
           "hs_multi_literal_search called with nullptr database");

    // FDR_to_hs_callback needs the pattern table and the scanned buffer on top
    // of the user's context.
    struct FDR_cb_context cb_state = {
        context, database->fdr_database.patterns, data, length};

    // Stack scratch: only the fields assigned below are set here.
    struct hs_scratch local_scratch;
    local_scratch.fdr_conf = nullptr;
    local_scratch.core_info.userCallback = onEvent;
    local_scratch.core_info.userContext = &cb_state;

    const uint8_t *haystack = reinterpret_cast<const uint8_t *>(data);
    const hwlm_error_t status =
        fdrExec(database->fdr_database.database, haystack, length, 0,
                FDR_to_hs_callback, &local_scratch, HWLM_ALL_GROUPS);
    return hwlm_to_hs_error(status);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// --- single_char (Noodle) ---
|
||||
|
||||
/**
 * Scan @p data for the single character compiled into @p database.
 *
 * The search is delegated to noodExec(); hits are routed through
 * noodle_to_hs_callback, which gets the user callback and a noodle_context
 * (user context + pattern length of 1) via the scratch prepared here.
 *
 * @param database compiled pattern (must not be null)
 * @param data     buffer to scan (must not be null)
 * @param length   number of bytes in @p data
 * @param onEvent  user match callback (must not be null)
 * @param context  opaque pointer stored in the callback context
 * @return the noodExec() status translated by hwlm_to_hs_error()
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_single_char_search(
    const hs_single_char_compiled_pattern *database, const char *data,
    size_t length, match_event_handler onEvent, void *context) {
    assert(onEvent != nullptr &&
           "hs_single_char_search called with nullptr callback");
    assert(data != nullptr &&
           "hs_single_char_search called with nullptr buffer");
    assert(database != nullptr &&
           "hs_single_char_search called with nullptr database");

    // A single character is a pattern of length one.
    struct noodle_context noodle_ctx = {context, 1};

    // Only the two callback-related fields of the scratch are initialised.
    struct hs_scratch scratch;
    scratch.core_info.userCallback = onEvent;
    scratch.core_info.userContext = &noodle_ctx;

    const uint8_t *haystack = reinterpret_cast<const uint8_t *>(data);
    const hwlm_error_t status =
        noodExec(&(database->noodle_database), haystack, length, 0,
                 noodle_to_hs_callback, &scratch);
    return hwlm_to_hs_error(status);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// --- char_set (Truffle) ---
|
||||
|
||||
/**
 * Report every byte of @p data that belongs to the compiled character set.
 *
 * Truffle (wide variant when CAN_USE_WIDE_TRUFFLE is defined) is called
 * repeatedly, each time resuming one byte after the previous hit. For each
 * hit the callback receives the id stored in char_id_map for that byte and the
 * half-open range [offset, offset + 1).
 *
 * @param database compiled character set (must not be null)
 * @param data     buffer to scan (must not be null)
 * @param length   number of bytes in @p data
 * @param onEvent  user match callback (must not be null)
 * @param context  opaque pointer forwarded to @p onEvent
 * @return HS_SUCCESS, including when the callback stopped the scan
 *
 * NOTE(review): the scan stops when onEvent returns 0. hs_runtime.h documents
 * match_event_handler as "non-zero to cease matching" -- confirm the polarity
 * is intended for the direct API, and whether a user-requested stop should be
 * reported with a dedicated status instead of HS_SUCCESS.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_char_set_search(
    const hs_char_set_compiled_pattern *database, const char *data,
    size_t length, match_event_handler onEvent, void *context) {
    assert(onEvent != nullptr &&
           "hs_char_set_search called with nullptr callback");
    assert(data != nullptr &&
           "hs_char_set_search called with nullptr buffer");
    assert(database != nullptr &&
           "hs_char_set_search called with nullptr database");

    const u8 *const buf_start = reinterpret_cast<const u8 *>(data);
    // buf_end is the first byte past the buffer, so current_buf == buf_end
    // means nothing is left to scan.
    const u8 *const buf_end = buf_start + length;

    const u8 *current_buf = buf_start;
    while (current_buf < buf_end) {
        const u8 *current_match;
#ifdef CAN_USE_WIDE_TRUFFLE
        current_match = truffleExecWide(loadu256(database->wide_mask),
                                        current_buf, buf_end);
#else
        current_match = truffleExec(load128(database->mask1),
                                    load128(database->mask2),
                                    current_buf, buf_end);
#endif
        // current_match points AT the matching byte (not past it), or is
        // buf_end when nothing matched. Stop here on a miss instead of
        // computing buf_end + 1, which is not a valid pointer.
        if (current_match >= buf_end) {
            break;
        }

        size_t id = database->char_id_map[*current_match];
        size_t match_start = current_match - buf_start;
        if (!onEvent(id, match_start, match_start + 1, 0, context)) {
            // user requested to stop matching
            break;
        }
        current_buf = current_match + 1;
    }

    return HS_SUCCESS;
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// --- single_char_pair (Noodle) ---
|
||||
|
||||
/**
 * Scan @p data for the two-character sequence compiled into @p database.
 *
 * The search is delegated to noodExec(); hits are routed through
 * noodle_to_hs_callback, which gets the user callback and a noodle_context
 * (user context + pattern length of 2) via the scratch prepared here.
 *
 * @param database compiled pattern (must not be null)
 * @param data     buffer to scan (must not be null)
 * @param length   number of bytes in @p data
 * @param onEvent  user match callback (must not be null)
 * @param context  opaque pointer stored in the callback context
 * @return the noodExec() status translated by hwlm_to_hs_error()
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_single_char_pair_search(
    const hs_single_char_pair_compiled_pattern *database,
    const char *data, size_t length, match_event_handler onEvent,
    void *context) {
    assert(onEvent != nullptr &&
           "hs_single_char_pair_search called with nullptr callback");
    assert(data != nullptr &&
           "hs_single_char_pair_search called with nullptr buffer");
    assert(database != nullptr &&
           "hs_single_char_pair_search called with nullptr database");

    // A pair is a pattern of length two.
    struct noodle_context noodle_ctx = {context, 2};

    // Only the two callback-related fields of the scratch are initialised.
    struct hs_scratch scratch;
    scratch.core_info.userCallback = onEvent;
    scratch.core_info.userContext = &noodle_ctx;

    const uint8_t *haystack = reinterpret_cast<const uint8_t *>(data);
    const hwlm_error_t status =
        noodExec(&(database->noodle_database), haystack, length, 0,
                 noodle_to_hs_callback, &scratch);
    return hwlm_to_hs_error(status);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// --- char_pair_set (Double shufti) ---
|
||||
|
||||
/**
 * Report every occurrence in @p data of any pair from the compiled pair set.
 *
 * Double shufti locates the first byte of a candidate pair; since it does not
 * say WHICH pair matched, the two bytes are then compared against the stored
 * pair list (all_pairs) with a vector search to recover the pair index, which
 * is reported as the match id with the range [offset, offset + 2).
 *
 * @param database compiled pair set (must not be null)
 * @param data     buffer to scan (must not be null)
 * @param length   number of bytes in @p data
 * @param onEvent  user match callback (must not be null)
 * @param context  opaque pointer forwarded to @p onEvent
 * @return HS_SUCCESS, including when the callback stopped the scan
 *
 * NOTE(review): the scan stops when onEvent returns 0. hs_runtime.h documents
 * match_event_handler as "non-zero to cease matching" -- confirm the polarity
 * is intended for the direct API.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_char_pair_set_search(
    const hs_char_pair_set_compiled_pattern *database, const char *data,
    size_t length, match_event_handler onEvent, void *context) {
    assert(onEvent != nullptr &&
           "hs_char_pair_set_search called with nullptr callback");
    assert(data != nullptr &&
           "hs_char_pair_set_search called with nullptr buffer");
    assert(database != nullptr &&
           "hs_char_pair_set_search called with nullptr database");

    const u8 *const buf_start = reinterpret_cast<const u8 *>(data);
    // buf_end is the first byte past the buffer, so current_buf == buf_end
    // means nothing is left to scan.
    const u8 *const buf_end = buf_start + length;

    // Loop invariants of the pair-identification step.
    const size_t width = SuperVector<VECTORSIZE>::mask_width();
    const size_t vector_match_iterations_needed =
        ((database->dshufti_database.pair_count - 1) / (VECTORSIZE / 2));

    const u8 *current_buf = buf_start;
    while (current_buf < buf_end) {
        const u8 *current_match = shuftiDoubleExec(
            load128(database->dshufti_database.mask1),
            load128(database->dshufti_database.mask2),
            load128(database->dshufti_database.mask3),
            load128(database->dshufti_database.mask4), current_buf, buf_end);
        // current_match points AT the first byte of the pair (not past it),
        // or is buf_end when nothing matched. Stop on a miss instead of
        // computing buf_end + 1, which is not a valid pointer.
        if (current_match >= buf_end) {
            break;
        }

        // A pair needs two bytes. Should the matcher ever report the very
        // last byte (TODO confirm whether shuftiDoubleExec can do so), there
        // is nothing to identify and reading two bytes would overrun.
        if (buf_end - current_match >= 2) {
            // Read the candidate pair with memcpy: current_match has no
            // alignment guarantee, so dereferencing it as a u16 is not safe.
            u16 raw_pair;
            memcpy(&raw_pair, current_match, sizeof(raw_pair));
            SuperVector<VECTORSIZE> found_pair =
                SuperVector<VECTORSIZE>(raw_pair);

            vector_mask_type merged_mask = 0;
            size_t loop = 0;
            for (; loop <= vector_match_iterations_needed; loop++) {
                SuperVector<VECTORSIZE> all_pair =
                    SuperVector<VECTORSIZE>::load(
                        database->dshufti_database.all_pairs +
                        (VECTORSIZE * loop));
                // A partially filled last vector is fine: a real pair is
                // expected to match before the unused tail is reached.
                vector_mask_type mask = all_pair.eqmask(found_pair);
                // <width> bits are set per matching byte. Keep only the
                // positions where two consecutive bytes both matched...
                merged_mask = mask & (mask >> width);
                // ...then keep a single bit per lane, on every other lane
                // only, so that only pair-aligned positions survive.
                merged_mask =
                    merged_mask & database->dshufti_database.bit_filter_mask;
                if (merged_mask) {
                    break;
                }
            }

            // Only report when a stored pair was actually identified:
            // counting trailing zeros of an empty mask is meaningless.
            if (merged_mask) {
                // The first surviving bit gives the index of the pair.
                unsigned int id = (ctz64(merged_mask) / width / 2) +
                                  (loop * (VECTORSIZE / 2));
                size_t match_start = current_match - buf_start;
                if (!onEvent(id, match_start, match_start + 2, 0, context)) {
                    // user requested to stop matching
                    break;
                }
            }
        }
        current_buf = current_match + 1;
    }

    return HS_SUCCESS;
}
|
207
src/hs_direct_search.h
Normal file
207
src/hs_direct_search.h
Normal file
@ -0,0 +1,207 @@
|
||||
/*
|
||||
* Copyright (c) 2024-2025, Arm ltd
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef DIRECT_SEARCH_H
|
||||
#define DIRECT_SEARCH_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "allocator.h"
|
||||
|
||||
#include "fdr/fdr_internal.h"
|
||||
#include "util/arch.h"
|
||||
|
||||
/*
|
||||
* FDR_pattern_storage memory layout:
|
||||
*
|
||||
* |-------------------------------------------------|
|
||||
* | size_t pattern_count |
|
||||
* |------------------------|------------------------|
|
||||
* | pattern_raw_storage : char* pattern_ptrs[] |
|
||||
* | :------------------------|
|
||||
* | : size_t pattern_sizes[] |
|
||||
* | :------------------------|
|
||||
* | : char actual_storage[] |
|
||||
* |------------------------|------------------------|
|
||||
*
|
||||
* Use size_fdr_pattern() to get the size to allocate.
|
||||
*/
|
||||
|
||||
// Variable-size store holding a copy of the literal patterns. Only the header
// is declared here; the pointer table, the size table and the string bytes
// live in the trailing flexible array and are reached through the
// get_pattern_*() accessors.
struct FDR_pattern_storage {
|
||||
// Number of patterns held; also the length of the pointer and size tables.
size_t pattern_count;
|
||||
// Start of the trailing region (pointer table first).
char pattern_raw_storage[];
|
||||
};
|
||||
|
||||
static inline char **get_pattern_ptrs(struct FDR_pattern_storage *pat) {
|
||||
// cppcheck-suppress cstyleCast
|
||||
return (char **)((char *)pat +
|
||||
offsetof(struct FDR_pattern_storage, pattern_raw_storage));
|
||||
}
|
||||
|
||||
static inline char *const *
|
||||
get_const_pattern_ptrs(const struct FDR_pattern_storage *pat) {
|
||||
// cppcheck-suppress cstyleCast
|
||||
return (char *const *)((const char *)pat +
|
||||
offsetof(struct FDR_pattern_storage,
|
||||
pattern_raw_storage));
|
||||
}
|
||||
|
||||
static inline size_t *get_pattern_sizes(struct FDR_pattern_storage *pat) {
|
||||
// cppcheck-suppress cstyleCast
|
||||
return (size_t *)((char *)get_pattern_ptrs(pat) +
|
||||
pat->pattern_count * sizeof(char *));
|
||||
}
|
||||
|
||||
static inline const size_t *
|
||||
get_const_pattern_sizes(const struct FDR_pattern_storage *pat) {
|
||||
// cppcheck-suppress cstyleCast
|
||||
return (const size_t *)((const char *)get_const_pattern_ptrs(pat) +
|
||||
pat->pattern_count * sizeof(char *));
|
||||
}
|
||||
|
||||
static inline char *
|
||||
get_pattern_string_storage(struct FDR_pattern_storage *pat) {
|
||||
return (char *)get_pattern_sizes(pat) + pat->pattern_count * sizeof(size_t);
|
||||
}
|
||||
|
||||
static inline const char *
|
||||
get_const_pattern_string_storage(const struct FDR_pattern_storage *pat) {
|
||||
return (const char *)get_const_pattern_sizes(pat) +
|
||||
pat->pattern_count * sizeof(size_t);
|
||||
}
|
||||
|
||||
static
|
||||
void init_pattern_store(struct FDR_pattern_storage *storage,
|
||||
const char **in_expression, size_t in_pattern_count,
|
||||
const size_t *in_expression_length) {
|
||||
storage->pattern_count = in_pattern_count;
|
||||
memcpy(get_pattern_sizes(storage), in_expression_length,
|
||||
storage->pattern_count);
|
||||
char *next_string = get_pattern_string_storage(storage);
|
||||
for (size_t i = 0; i < storage->pattern_count; i++) {
|
||||
memcpy(next_string, in_expression[i], in_expression_length[i]);
|
||||
get_pattern_ptrs(storage)[i] = next_string;
|
||||
get_pattern_sizes(storage)[i] = in_expression_length[i];
|
||||
next_string += in_expression_length[i];
|
||||
}
|
||||
}
|
||||
|
||||
/* Convenience wrapper around init_pattern_store() for a store holding
 * exactly one pattern. */
static inline
void init_pattern_store_single(struct FDR_pattern_storage *storage,
                               const char *in_expression,
                               const size_t in_expression_length) {
    // Present the single pattern as one-element arrays.
    const char *single_expression[1] = { in_expression };
    const size_t single_length[1] = { in_expression_length };
    init_pattern_store(storage, single_expression, 1, single_length);
}
|
||||
|
||||
static
|
||||
size_t size_fdr_pattern(size_t in_pattern_count,
|
||||
const size_t *in_expression_length) {
|
||||
size_t total_string_size = 0;
|
||||
for (size_t i = 0; i < in_pattern_count; i++) {
|
||||
total_string_size += in_expression_length[i];
|
||||
}
|
||||
size_t ptr_array_size = in_pattern_count * sizeof(char *);
|
||||
size_t pattern_sizes_array_size = in_pattern_count * sizeof(size_t);
|
||||
size_t required_mem = sizeof(struct FDR_pattern_storage) + ptr_array_size +
|
||||
pattern_sizes_array_size + total_string_size;
|
||||
return required_mem;
|
||||
}
|
||||
|
||||
/*
|
||||
* combined_fdr_database memory layout:
|
||||
*
|
||||
* |-------------------------------------------------|
|
||||
* | FDR *database |
|
||||
* |-------------------------------------------------|
|
||||
* | FDR_pattern_storage *patterns |
|
||||
* |------------------------|------------------------|
|
||||
* | raw_storage : FDR fdr_storage |
|
||||
* | :------------------------|
|
||||
* | : FDR_pattern_storage |
|
||||
* |------------------------|------------------------|
|
||||
*
|
||||
* Use size_fdr_database() to get the size to allocate.
|
||||
*/
|
||||
// Single allocation bundling an FDR engine with a copy of its patterns.
// Both pointers are set by init_combined_fdr_database*() to locations inside
// raw_storage.
struct combined_fdr_database {
|
||||
// FDR engine; points at the start of raw_storage.
struct FDR *database;
|
||||
// Pattern copy; points fdr_size bytes into raw_storage.
struct FDR_pattern_storage *patterns;
|
||||
// Backing bytes for the two regions above.
unsigned char raw_storage[];
|
||||
};
|
||||
|
||||
void init_combined_fdr_database(struct combined_fdr_database *database,
|
||||
size_t fdr_size, const char **in_expression,
|
||||
size_t in_pattern_count,
|
||||
const size_t *in_expression_length);
|
||||
|
||||
void init_combined_fdr_database_single(struct combined_fdr_database *database,
|
||||
size_t fdr_size,
|
||||
const char *in_expression,
|
||||
const size_t in_expression_length);
|
||||
static inline
|
||||
size_t size_fdr_database(size_t fdr_size, size_t in_pattern_count,
|
||||
const size_t *in_expression_length) {
|
||||
return sizeof(struct combined_fdr_database) +
|
||||
size_fdr_pattern(in_pattern_count, in_expression_length) + fdr_size;
|
||||
}
|
||||
|
||||
/* Same as size_fdr_database() for a database holding exactly one pattern. */
static inline
size_t size_fdr_database_single(size_t fdr_size,
                                const size_t in_expression_length) {
    // Present the single length as a one-element array.
    const size_t single_length[1] = { in_expression_length };
    return size_fdr_database(fdr_size, 1, single_length);
}
|
||||
|
||||
hwlmcb_rv_t HS_CDECL noodle_to_hs_callback(size_t end, u32 id,
|
||||
struct hs_scratch *scratch);
|
||||
|
||||
// Receive the FDR callback and perform the check for longer patterns (>8 char)
|
||||
hwlmcb_rv_t HS_CDECL FDR_to_hs_callback(size_t end, u32 id,
|
||||
struct hs_scratch *scratch);
|
||||
|
||||
// Per-scan context built by the FDR-backed search functions and stored in the
// scratch as userContext for FDR_to_hs_callback.
struct FDR_cb_context {
|
||||
// Context pointer supplied by the caller of the search function.
void *usr_context;
|
||||
// Pattern copy of the database being scanned.
const struct FDR_pattern_storage *patterns;
|
||||
// Buffer being scanned.
const char *buffer;
|
||||
// Length of buffer in bytes.
size_t buffer_length;
|
||||
};
|
||||
|
||||
// Per-scan context built by the noodle-backed search functions and stored in
// the scratch as userContext for noodle_to_hs_callback.
struct noodle_context {
|
||||
// Context pointer supplied by the caller of the search function.
void *usr_context;
|
||||
// Length of the searched pattern (1 for a single char, 2 for a pair).
u8 pattern_length;
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // DIRECT_SEARCH_H
|
495
src/hs_direct_search_compile.cpp
Normal file
495
src/hs_direct_search_compile.cpp
Normal file
@ -0,0 +1,495 @@
|
||||
/*
|
||||
* Copyright (c) 2024-2025, Arm ltd
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
|
||||
#include "hs_common.h"
|
||||
#include "hs_compile.h"
|
||||
#include "hs_direct_search.h"
|
||||
#include "hs_direct_search_types.h"
|
||||
|
||||
#include "allocator.h" // hs_database_alloc()
|
||||
#include "grey.h"
|
||||
#include "hwlm/hwlm.h" // HWLM_LITERAL_MAX_LEN
|
||||
#include "hwlm/hwlm_internal.h" // HWLM_ENGINE_FDR
|
||||
#include "hwlm/hwlm_literal.h" // ue2::hwlmLiteral
|
||||
#include "hwlm/noodle_internal.h" // noodTable
|
||||
#include "ue2common.h" // likely() - unlikely()
|
||||
#include "util/arch.h" // CAN_USE_WIDE_TRUFFLE
|
||||
#include "util/bytecode_ptr.h"
|
||||
#include "util/charreach.h"
|
||||
#include "util/flat_containers.h" // flat_set
|
||||
#include "util/supervector/supervector.hpp"
|
||||
#include "util/target_info.h" // target_t
|
||||
|
||||
#include "fdr/fdr_compile.h"
|
||||
#include "hwlm/noodle_build.h"
|
||||
#include "nfa/shufticompile.h"
|
||||
#include "nfa/trufflecompile.h"
|
||||
|
||||
typedef typename SuperVector<VECTORSIZE>::comparemask_type vector_mask_type;
|
||||
|
||||
void init_combined_fdr_database(struct combined_fdr_database *database,
|
||||
size_t fdr_size, const char **in_expression,
|
||||
size_t in_pattern_count,
|
||||
const size_t *in_expression_length) {
|
||||
database->database = reinterpret_cast<FDR *>(database->raw_storage);
|
||||
database->patterns = reinterpret_cast<FDR_pattern_storage *>(
|
||||
database->raw_storage + fdr_size);
|
||||
init_pattern_store(database->patterns, in_expression, in_pattern_count,
|
||||
in_expression_length);
|
||||
};
|
||||
|
||||
void init_combined_fdr_database_single(struct combined_fdr_database *database,
|
||||
size_t fdr_size,
|
||||
const char *in_expression,
|
||||
const size_t in_expression_length) {
|
||||
database->database = reinterpret_cast<FDR *>(database->raw_storage);
|
||||
database->patterns = reinterpret_cast<FDR_pattern_storage *>(
|
||||
database->raw_storage + fdr_size);
|
||||
init_pattern_store_single(database->patterns, in_expression,
|
||||
in_expression_length);
|
||||
};
|
||||
|
||||
inline void generic_free(void *database) {
|
||||
if (likely(database)) {
|
||||
hs_database_free(database);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// --- short_literal (Noodle) ---
|
||||
|
||||
/**
 * Compile a short literal (at most HS_SHORT_PATTERN_THRESHOLD bytes) into a
 * noodle-backed pattern.
 *
 * @param expression        literal bytes (must not be null)
 * @param expression_length number of bytes in @p expression (must be > 0)
 * @param output_database   receives the compiled pattern on success; release
 *                          it with hs_free_short_literal_pattern()
 * @return HS_SUCCESS on success, HS_INVALID when the literal is longer than
 *         HS_SHORT_PATTERN_THRESHOLD, HS_NOMEM when the allocation fails,
 *         HS_UNKNOWN_ERROR when the noodle table could not be built.
 *         @p output_database is only written on success.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_compile_short_literal_search(
    const char *expression, size_t expression_length,
    hs_short_literal_compiled_pattern **output_database) {
    assert(expression_length > 0 &&
           "hs_compile_short_literal_search called with an empty pattern");
    assert(expression != nullptr &&
           "hs_compile_short_literal_search called with nullptr");
    assert(output_database != nullptr &&
           "hs_compile_short_literal_search called with nullptr");
    if (unlikely(expression_length > HS_SHORT_PATTERN_THRESHOLD)) {
        return HS_INVALID;
    }
    /*
     * Exposing caseness at the api level may restrict our ability to change
     * the backing algorithm, so we decided to make all algo case sensitive
     */
    bool is_case_insensitive = false;
    bool only_need_first_match = false;
    ue2::hwlmLiteral lit(std::string(expression, expression_length),
                         is_case_insensitive, only_need_first_match, 0,
                         HWLM_ALL_GROUPS, {}, {});

    hs_short_literal_compiled_pattern *database =
        reinterpret_cast<hs_short_literal_compiled_pattern *>(
            hs_database_alloc(sizeof(hs_short_literal_compiled_pattern)));
    if (unlikely(database == nullptr)) {
        return HS_NOMEM;
    }
    ue2::bytecode_ptr<noodTable> bytecode_database = ue2::noodBuildTable(lit);
    if (unlikely(bytecode_database.get() == nullptr)) {
        // The pattern is never handed to the caller on this path, so it has
        // to be released here.
        hs_database_free(database);
        return HS_UNKNOWN_ERROR;
    }
    database->pattern_length = expression_length;
    memcpy(&(database->noodle_database), bytecode_database.get(),
           sizeof(noodTable));
    *output_database = database;

    return HS_SUCCESS;
}
|
||||
|
||||
/**
 * Release a pattern produced by hs_compile_short_literal_search().
 * A null pointer is ignored.
 */
HS_PUBLIC_API
void hs_free_short_literal_pattern(
    hs_short_literal_compiled_pattern *database) {
    if (database == nullptr) {
        return;
    }
    hs_database_free(database);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// --- long_literal (FDR) ---
|
||||
|
||||
/**
 * Compile a single literal of arbitrary length into an FDR-backed pattern.
 *
 * Only the first HWLM_LITERAL_MAX_LEN bytes are given to FDR; the full
 * literal is stored next to the engine so that longer strings can be checked
 * in the match callback.
 *
 * @param expression        literal bytes (must not be null)
 * @param expression_length number of bytes in @p expression (must be > 0)
 * @param output_database   receives the compiled pattern on success; release
 *                          it with hs_free_long_literal_pattern()
 * @return HS_SUCCESS on success, HS_NOMEM when the allocation fails,
 *         HS_UNKNOWN_ERROR when the FDR engine could not be built.
 *         @p output_database is only written on success.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_compile_long_literal_search(
    const char *expression, size_t expression_length,
    hs_long_literal_compiled_pattern_t **output_database) {
    assert(expression_length > 0 &&
           "hs_compile_long_literal_search called with an empty pattern");
    assert(expression != nullptr &&
           "hs_compile_long_literal_search called with nullptr");
    assert(output_database != nullptr &&
           "hs_compile_long_literal_search called with nullptr");
    /*
     * Exposing caseness at the api level may restrict our ability to change
     * the backing algorithm, so we decided to make all algo case sensitive
     */
    bool is_case_insensitive = false;
    bool only_need_first_match = false;
    std::vector<ue2::hwlmLiteral> lits;
    // longer strings are checked in the callback
    ue2::hwlmLiteral lit(
        std::string(expression,
                    std::min(expression_length, (size_t)HWLM_LITERAL_MAX_LEN)),
        is_case_insensitive, only_need_first_match, 0, HWLM_ALL_GROUPS, {}, {});
    lits.push_back(lit);

    ue2::Grey g = ue2::Grey();
    u8 engType = HWLM_ENGINE_FDR;
    bool make_small = false;

    hs_platform_info platform_info;
    hs_populate_platform(&platform_info);

    ue2::target_t target = ue2::target_t(platform_info);

    std::unique_ptr<ue2::HWLMProto> proto =
        ue2::fdrBuildProto(engType, lits, make_small, target, g);
    // proto is dereferenced just below; bail out rather than dereference an
    // empty pointer if no prototype could be produced.
    if (unlikely(!proto)) {
        return HS_UNKNOWN_ERROR;
    }

    ue2::bytecode_ptr<FDR> bytecode_database = ue2::fdrBuildTable(*proto, g);
    if (unlikely(bytecode_database.get() == nullptr)) {
        return HS_UNKNOWN_ERROR;
    }
    size_t fdr_size = bytecode_database.get()->size;

    // One allocation holds the engine and the full-length copy of the literal.
    size_t mem_required = size_fdr_database_single(fdr_size, expression_length);
    struct combined_fdr_database *combined_database =
        reinterpret_cast<struct combined_fdr_database *>(
            hs_database_alloc(mem_required));
    if (unlikely(combined_database == nullptr)) {
        return HS_NOMEM;
    }
    init_combined_fdr_database_single(combined_database, fdr_size, expression,
                                      expression_length);
    memcpy(combined_database->database, bytecode_database.get(), fdr_size);
    *output_database = reinterpret_cast<hs_long_literal_compiled_pattern_t *>(
        combined_database);

    return HS_SUCCESS;
}
|
||||
|
||||
/**
 * Release a pattern produced by hs_compile_long_literal_search().
 * A null pointer is ignored.
 */
HS_PUBLIC_API
void hs_free_long_literal_pattern(
    hs_long_literal_compiled_pattern_t *database) {
    if (database == nullptr) {
        return;
    }
    hs_database_free(database);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// --- multi_literal (FDR) ---
|
||||
|
||||
/**
 * Compile a set of literals of arbitrary length into an FDR-backed pattern.
 *
 * Literal i is registered with id i. Only the first HWLM_LITERAL_MAX_LEN
 * bytes of each literal are given to FDR; the full literals are stored next
 * to the engine so that longer strings can be checked in the match callback.
 *
 * @param expression        array of @p pattern_count literals (no entry null)
 * @param pattern_count     number of literals (must be > 0)
 * @param expression_length length in bytes of each literal (each > 0)
 * @param output_database   receives the compiled pattern on success; release
 *                          it with hs_free_multi_literal_pattern()
 * @return HS_SUCCESS on success, HS_NOMEM when the allocation fails,
 *         HS_UNKNOWN_ERROR when the FDR engine could not be built.
 *         @p output_database is only written on success.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_compile_multi_literal_search(
    const char **expression, size_t pattern_count,
    const size_t *expression_length,
    hs_multi_literal_compiled_pattern_t **output_database) {
    assert(pattern_count > 0 &&
           "hs_compile_multi_literal_search called with no pattern");
    assert(expression != nullptr &&
           "hs_compile_multi_literal_search called with nullptr");
    assert(expression_length != nullptr &&
           "hs_compile_multi_literal_search called with nullptr");
    assert(output_database != nullptr &&
           "hs_compile_multi_literal_search called with nullptr");
    /*
     * Exposing caseness at the api level may restrict our ability to change
     * the backing algorithm, so we decided to make all algo case sensitive
     */
    bool is_case_insensitive = false;
    bool only_need_first_match = false;
    std::vector<ue2::hwlmLiteral> lits;
    lits.reserve(pattern_count);
    for (size_t i = 0; i < pattern_count; i++) {
        assert(expression_length[i] > 0 && expression[i] &&
               "hs_compile_multi_literal_search called with an empty pattern");
        // longer strings are checked in the callback
        ue2::hwlmLiteral lit(
            std::string(expression[i], std::min(expression_length[i],
                                                (size_t)HWLM_LITERAL_MAX_LEN)),
            is_case_insensitive, only_need_first_match, i, HWLM_ALL_GROUPS, {},
            {});
        lits.push_back(lit);
    }

    ue2::Grey g = ue2::Grey();
    u8 engType = HWLM_ENGINE_FDR;
    bool make_small = false;

    hs_platform_info platform_info;
    hs_populate_platform(&platform_info);

    ue2::target_t target = ue2::target_t(platform_info);

    std::unique_ptr<ue2::HWLMProto> proto =
        ue2::fdrBuildProto(engType, lits, make_small, target, g);
    // proto is dereferenced just below; bail out rather than dereference an
    // empty pointer if no prototype could be produced.
    if (unlikely(!proto)) {
        return HS_UNKNOWN_ERROR;
    }

    ue2::bytecode_ptr<FDR> bytecode_database = ue2::fdrBuildTable(*proto, g);
    if (unlikely(bytecode_database.get() == nullptr)) {
        return HS_UNKNOWN_ERROR;
    }
    size_t fdr_size = bytecode_database.get()->size;

    // One allocation holds the engine and the full-length copy of the
    // literals.
    size_t mem_required =
        size_fdr_database(fdr_size, pattern_count, expression_length);
    struct combined_fdr_database *combined_database =
        reinterpret_cast<struct combined_fdr_database *>(
            hs_database_alloc(mem_required));
    if (unlikely(combined_database == nullptr)) {
        return HS_NOMEM;
    }
    init_combined_fdr_database(combined_database, fdr_size, expression,
                               pattern_count, expression_length);
    memcpy(combined_database->database, bytecode_database.get(), fdr_size);
    *output_database = reinterpret_cast<hs_multi_literal_compiled_pattern_t *>(
        combined_database);

    return HS_SUCCESS;
}
|
||||
|
||||
/**
 * Release a pattern produced by hs_compile_multi_literal_search().
 * A null pointer is ignored.
 */
HS_PUBLIC_API
void hs_free_multi_literal_pattern(
    hs_multi_literal_compiled_pattern_t *database) {
    if (database == nullptr) {
        return;
    }
    hs_database_free(database);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// --- single_char (Noodle) ---
|
||||
|
||||
/**
 * Compile a single character into a noodle-backed pattern.
 *
 * @param character       the byte to search for
 * @param output_database receives the compiled pattern on success; release it
 *                        with hs_free_single_char_pattern()
 * @return HS_SUCCESS on success, HS_NOMEM when the allocation fails,
 *         HS_UNKNOWN_ERROR when the noodle table could not be built.
 *         @p output_database is only written on success.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_compile_single_char_search(
    const char character, hs_single_char_compiled_pattern **output_database) {
    assert(output_database != nullptr &&
           "hs_compile_single_char_search called with nullptr");

    /*
     * Exposing caseness at the api level may restrict our ability to change
     * the backing algorithm, so we decided to make all algo case sensitive
     */
    bool is_case_insensitive = false;
    bool only_need_first_match = false;
    ue2::hwlmLiteral lit(std::string(&character, 1), is_case_insensitive,
                         only_need_first_match, 0, HWLM_ALL_GROUPS, {}, {});

    hs_single_char_compiled_pattern *database =
        reinterpret_cast<hs_single_char_compiled_pattern *>(
            hs_database_alloc(sizeof(hs_single_char_compiled_pattern)));
    if (unlikely(database == nullptr)) {
        return HS_NOMEM;
    }
    ue2::bytecode_ptr<noodTable> bytecode_database = ue2::noodBuildTable(lit);
    if (unlikely(bytecode_database.get() == nullptr)) {
        // The pattern is never handed to the caller on this path, so it has
        // to be released here.
        hs_database_free(database);
        return HS_UNKNOWN_ERROR;
    }
    memcpy(&(database->noodle_database), bytecode_database.get(),
           sizeof(noodTable));
    *output_database = database;

    return HS_SUCCESS;
}
|
||||
|
||||
/**
 * Release a pattern produced by hs_compile_single_char_search().
 * A null pointer is ignored.
 */
HS_PUBLIC_API
void hs_free_single_char_pattern(
    hs_single_char_compiled_pattern *database) {
    if (database == nullptr) {
        return;
    }
    hs_database_free(database);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// --- char_set (Truffle) ---
|
||||
|
||||
/**
 * Compile a set of characters into a truffle-backed pattern.
 *
 * The position of each character in @p character_array is recorded in
 * char_id_map and becomes the id reported when that character is found (for a
 * character listed several times, the last position wins).
 *
 * @param character_array the characters of the set (must not be null)
 * @param character_count number of entries in @p character_array (1..256)
 * @param output_database receives the compiled pattern on success; release it
 *                        with hs_free_char_set_pattern()
 * @return HS_SUCCESS on success, HS_INVALID when more than 256 characters are
 *         given, HS_NOMEM when the allocation fails.
 *         @p output_database is only written on success.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL
hs_compile_char_set_search(const char *character_array, size_t character_count,
                           hs_char_set_compiled_pattern **output_database) {
    assert(character_count > 0 &&
           "hs_compile_char_set_search called with an empty set");
    assert(character_array != nullptr &&
           "hs_compile_char_set_search called with nullptr");
    assert(output_database != nullptr &&
           "hs_compile_char_set_search called with nullptr");

    // char_id_map is indexed by a byte value and the ids are byte sized, so
    // at most one entry per possible byte value can be represented.
    const size_t max_set_size = 256;
    if (unlikely(character_count > max_set_size)) {
        return HS_INVALID;
    }

    const ue2::CharReach cr =
        ue2::CharReach(std::string(character_array, character_count));
    truffle_storage *database = reinterpret_cast<truffle_storage *>(
        hs_database_alloc(sizeof(truffle_storage)));
    if (unlikely(database == nullptr)) {
        return HS_NOMEM;
    }
    // hs_database_alloc is meant to align to a machine word (likely 64b), which
    // is actually required here
    assert((((intptr_t)(database) & 3) == 0) &&
           "user-provided alloc didn't meet alignment requirement in "
           "hs_compile_char_set_search");
    // A size_t counter is required: a u8 counter can never reach a bound of
    // 256, so the loop would not terminate for a full set.
    for (size_t i = 0; i < character_count; i++) {
        database->char_id_map[(u8)character_array[i]] = static_cast<u8>(i);
    }

#ifdef CAN_USE_WIDE_TRUFFLE
    ue2::truffleBuildMasksWide(cr, database->wide_mask);
#else
    ue2::truffleBuildMasks(cr, database->mask1,
                           database->mask2);
#endif

    *output_database = database;

    return HS_SUCCESS;
}
|
||||
|
||||
/**
 * Release a pattern produced by hs_compile_char_set_search().
 * A null pointer is ignored.
 */
HS_PUBLIC_API
void hs_free_char_set_pattern(hs_char_set_compiled_pattern *database) {
    if (database == nullptr) {
        return;
    }
    hs_database_free(database);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// --- single_char_pair (Noodle) ---
|
||||
|
||||
/**
 * Compile a two-character sequence into a noodle-backed pattern.
 *
 * @param pair            the two bytes to search for; exactly two bytes are
 *                        read (must not be null)
 * @param output_database receives the compiled pattern on success; release it
 *                        with hs_free_single_char_pair_pattern()
 * @return HS_SUCCESS on success, HS_NOMEM when the allocation fails,
 *         HS_UNKNOWN_ERROR when the noodle table could not be built.
 *         @p output_database is only written on success.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_compile_single_char_pair_search(
    const char *pair, hs_single_char_pair_compiled_pattern **output_database) {
    assert(pair != nullptr &&
           "hs_compile_single_char_pair_search called with nullptr");
    assert(output_database != nullptr &&
           "hs_compile_single_char_pair_search called with nullptr");

    /*
     * Exposing caseness at the api level may restrict our ability to change
     * the backing algorithm, so we decided to make all algo case sensitive
     */
    bool is_case_insensitive = false;
    bool only_need_first_match = false;
    ue2::hwlmLiteral lit(std::string(pair, 2), is_case_insensitive,
                         only_need_first_match, 0, HWLM_ALL_GROUPS, {}, {});

    hs_single_char_pair_compiled_pattern *database =
        reinterpret_cast<hs_single_char_pair_compiled_pattern *>(
            hs_database_alloc(sizeof(hs_single_char_pair_compiled_pattern)));
    if (unlikely(database == nullptr)) {
        return HS_NOMEM;
    }
    ue2::bytecode_ptr<noodTable> bytecode_database = ue2::noodBuildTable(lit);
    if (unlikely(bytecode_database.get() == nullptr)) {
        // The pattern is never handed to the caller on this path, so it has
        // to be released here.
        hs_database_free(database);
        return HS_UNKNOWN_ERROR;
    }
    memcpy(&(database->noodle_database), bytecode_database.get(),
           sizeof(noodTable));
    *output_database = database;

    return HS_SUCCESS;
}
|
||||
|
||||
/**
 * Release a pattern produced by hs_compile_single_char_pair_search().
 * A null pointer is ignored.
 */
HS_PUBLIC_API
void hs_free_single_char_pair_pattern(
    hs_single_char_pair_compiled_pattern *database) {
    if (database == nullptr) {
        return;
    }
    hs_database_free(database);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// --- char_pair_set (Double shufti) ---
|
||||
|
||||
/**
 * Compile a set of ordered character pairs into a double-shufti pattern.
 *
 * @param expression
 *      The pairs, laid out back to back: pair i occupies bytes 2*i and
 *      2*i + 1, so 2 * pair_count bytes are read. No terminator is required.
 * @param pair_count
 *      Number of pairs in @p expression. Must be greater than zero.
 * @param output_database
 *      On success, receives the newly allocated compiled pattern. It is left
 *      untouched on failure.
 *
 * @return
 *      HS_SUCCESS on success; HS_NOMEM if the pattern storage could not be
 *      allocated; HS_COMPILER_ERROR if the double-shufti masks could not be
 *      built for this set of pairs.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_compile_char_pair_set_search(
    const char *expression, size_t pair_count,
    hs_char_pair_set_compiled_pattern **output_database) {
    assert(pair_count > 0 &&
           "hs_compile_char_pair_set_search called with an empty set");
    assert(expression != nullptr &&
           "hs_compile_char_pair_set_search called with nullptr");
    assert(output_database != nullptr &&
           "hs_compile_char_pair_set_search called with nullptr");

    // The index must be as wide as pair_count: a narrower counter wraps
    // around before reaching a large pair_count and the loop never ends.
    ue2::flat_set<std::pair<u8, u8>> pairs;
    for (size_t i = 0; i < pair_count; i++) {
        pairs.insert(
            std::make_pair((u8)expression[2 * i], (u8)expression[2 * i + 1]));
    }

    // The raw pairs are stored right after the fixed part of the pattern
    // (flexible array member), hence the extra 2 bytes per pair.
    hs_char_pair_set_compiled_pattern *database =
        reinterpret_cast<hs_char_pair_set_compiled_pattern *>(hs_database_alloc(
            sizeof(hs_char_pair_set_compiled_pattern) +
            sizeof(char) * 2 * pair_count));
    if (unlikely(database == nullptr)) {
        return HS_NOMEM;
    }
    // hs_database_alloc is meant to align to a machine word (likely 64b), which
    // is actually required here
    // NOTE(review): the check below only verifies 4-byte alignment while the
    // storage type declares stricter alignas requirements -- confirm intent.
    assert((((intptr_t)(database) & 3) == 0) &&
           "user-provided alloc didn't meet alignment requirement in "
           "hs_compile_char_pair_set_search");

    bool success = ue2::shuftiBuildDoubleMasks(
        ue2::CharReach(), pairs, database->dshufti_database.mask1,
        database->dshufti_database.mask2, database->dshufti_database.mask3,
        database->dshufti_database.mask4);

    if (!success) {
        // Nothing has been published to the caller yet, so the storage must
        // be released here or it is lost.
        generic_free(database);
        return HS_COMPILER_ERROR;
    }

    database->dshufti_database.pair_count = pair_count;

    size_t width = SuperVector<VECTORSIZE>::mask_width();
    assert(width <= 4 &&
           "Code needs rework if supervector's mask are bigger than 4");
    assert(width != 3 &&
           "Code needs rework if supervector's mask aren't a power of 2");
    // we need a mask such that every 2*width bits, only the lsb is set to 1
    // so for a width of 4, we repeat 0X01
    unsigned char bit_filter_mask = 0;
    // 2*width divides 8 for the widths allowed above, so the counter lands
    // exactly on 0 and the loop terminates.
    for (size_t i = 8; i > 0; i -= 2 * width) {
        bit_filter_mask = bit_filter_mask << (2 * width) | 0x1;
    }
    // Replicate the one-byte pattern across the whole stored mask.
    memset(&(database->dshufti_database.bit_filter_mask), bit_filter_mask,
           sizeof(vector_mask_type));
    // Keep a verbatim copy of the input pairs alongside the masks.
    memcpy(database->dshufti_database.all_pairs, expression, 2 * pair_count);

    *output_database = database;

    return HS_SUCCESS;
}
|
||||
|
||||
/**
 * Release a compiled character-pair-set pattern.
 *
 * The pointer is handed straight to generic_free(); nothing else is done
 * here.
 *
 * @param database
 *      The compiled pattern to release.
 */
HS_PUBLIC_API
void hs_free_char_pair_set_pattern(
    hs_char_pair_set_compiled_pattern *database) {
    generic_free(database);
}
|
||||
|
87
src/hs_direct_search_types.h
Normal file
87
src/hs_direct_search_types.h
Normal file
@ -0,0 +1,87 @@
|
||||
/*
|
||||
* Copyright (c) 2024-2025, Arm ltd
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef DIRECT_SEARCH_TYPES_H
|
||||
#define DIRECT_SEARCH_TYPES_H
|
||||
|
||||
#include <stdalign.h>
|
||||
|
||||
#include "util/supervector/supervector.hpp"
|
||||
|
||||
#include "fdr/fdr_internal.h"
|
||||
#include "hwlm/noodle_internal.h"
|
||||
|
||||
|
||||
/* Storage for a compiled short-literal pattern: a Noodle table plus a length. */
struct hs_short_literal_compiled_pattern {
|
||||
// Noodle search table for the literal.
noodTable noodle_database;
|
||||
// presumably the literal's length in bytes (fits in 8 bits) -- TODO confirm
// against the short-literal compile function.
u8 pattern_length;
|
||||
};
|
||||
|
||||
/* Storage for a compiled long-literal pattern; wraps a combined FDR database. */
struct hs_long_literal_compiled_pattern {
|
||||
// presumably declared in fdr/fdr_internal.h -- TODO confirm.
struct combined_fdr_database fdr_database;
|
||||
};
|
||||
|
||||
/*
 * Storage for a compiled set of literals; same member layout as the
 * long-literal pattern (a single combined FDR database).
 */
struct hs_multi_literal_compiled_pattern {
|
||||
// presumably declared in fdr/fdr_internal.h -- TODO confirm.
struct combined_fdr_database fdr_database;
|
||||
};
|
||||
|
||||
/* Storage for a compiled single-character pattern; wraps a Noodle table. */
struct hs_single_char_compiled_pattern {
|
||||
// Noodle search table for the character.
struct noodTable noodle_database;
|
||||
};
|
||||
|
||||
/* Storage for a compiled single-character-pair pattern; wraps a Noodle table. */
struct hs_single_char_pair_compiled_pattern {
|
||||
// Filled by hs_compile_single_char_pair_search() with a sizeof(noodTable)
// byte copy of the table built for the two-byte literal.
struct noodTable noodle_database;
|
||||
};
|
||||
|
||||
/*
 * Storage for a compiled character-set pattern (also named truffle_storage).
 *
 * The anonymous union overlays two mask layouts on the same 32 bytes; the
 * character-set compile function fills wide_mask when CAN_USE_WIDE_TRUFFLE is
 * defined and mask1/mask2 otherwise.
 */
typedef struct hs_char_set_compiled_pattern {
|
||||
union
|
||||
{
|
||||
// Two 16-byte masks, written by ue2::truffleBuildMasks.
struct {
|
||||
// aligned with no argument leaves the alignment value to the compiler.
uint8_t mask1[16] __attribute__((aligned));
|
||||
uint8_t mask2[16] __attribute__((aligned));
|
||||
};
|
||||
// Single 32-byte mask, written by ue2::truffleBuildMasksWide.
uint8_t wide_mask[32] __attribute__((aligned));
|
||||
};
|
||||
// Indexed by byte value: holds the position of that character in the array
// given at compile time, so a matched character can be turned into an id.
u8 char_id_map[256];
|
||||
} truffle_storage;
|
||||
|
||||
/*
 * Double-shufti data for a set of character pairs, followed in the same
 * allocation by a copy of the pairs themselves.
 */
struct dshufti_storage {
|
||||
// The four 16-byte masks written by ue2::shuftiBuildDoubleMasks.
alignas(16) uint8_t mask1[16];
|
||||
alignas(16) uint8_t mask2[16];
|
||||
alignas(16) uint8_t mask3[16];
|
||||
alignas(16) uint8_t mask4[16];
|
||||
// Number of pairs stored in all_pairs.
size_t pair_count;
|
||||
// Every byte is set to the same pattern at compile time: one bit set every
// 2 * mask_width bits (0x01 repeated for a mask width of 4).
typename SuperVector<VECTORSIZE>::comparemask_type bit_filter_mask;
|
||||
// Flexible array member: 2 * pair_count bytes copied verbatim from the
// compile-time input, pair i at bytes 2*i and 2*i + 1. The space is part of
// the allocation made by hs_compile_char_pair_set_search().
alignas(VECTORSIZE) uint8_t all_pairs[];
|
||||
};
|
||||
|
||||
/*
 * Storage for a compiled character-pair-set pattern. Its only member ends in
 * a flexible array, so instances are allocated with extra trailing space for
 * the pairs.
 */
struct hs_char_pair_set_compiled_pattern {
|
||||
struct dshufti_storage dshufti_database;
|
||||
};
|
||||
#endif // DIRECT_SEARCH_TYPES_H
|
221
src/hs_runtime.h
221
src/hs_runtime.h
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2018, Intel Corporation
|
||||
* Copyright (c) 2024-2025, Arm ltd
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -614,6 +615,226 @@ hs_error_t HS_CDECL hs_free_scratch(hs_scratch_t *scratch);
|
||||
*/
|
||||
#define HS_OFFSET_PAST_HORIZON (~0ULL)
|
||||
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
* The following functions are part of the extended API.
|
||||
* This extension offers direct access to search algorithms
|
||||
* allowing the user to minimise calling overhead for simple
|
||||
* search use cases where type of the search is known.
|
||||
*
|
||||
* All search functions handle a limited kind of patterns. For more generic
|
||||
* patterns, use @ref hs_scan()
|
||||
*
|
||||
* NOTE: All search functions are considered case-sensitive.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @defgroup DIRECT_API_RUNTIME
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/** Callback return value indicating that we should continue matching. */
|
||||
#define CB_CONTINUE_MATCHING (int)(~0U)
|
||||
|
||||
/** Callback return value indicating that we should halt matching. */
|
||||
#define CB_TERMINATE_MATCHING (int)0
|
||||
|
||||
/**
|
||||
* Search the given data for the short literal pattern up to
|
||||
* @ref HS_SHORT_PATTERN_THRESHOLD chars long. For longer patterns, use @ref
|
||||
* hs_long_literal_search(). Other options exist for character pairs or sets.
|
||||
*
|
||||
* @param database
|
||||
* The compiled pattern returned by @ref hs_compile_short_literal_search()
|
||||
* @param data
|
||||
* Pointer to the data to be scanned.
|
||||
* @param length
|
||||
* The number of bytes to scan.
|
||||
* @param onEvent
|
||||
* Pointer to a @ref match_event_handler callback function. If a NULL
|
||||
* pointer is given, no matches will be returned.
|
||||
* The "flag" argument is unused.
|
||||
* @param context
|
||||
* The user defined pointer which will be passed to the callback function.
|
||||
*
|
||||
* @return
|
||||
* Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
|
||||
* match callback indicated that scanning should stop; other values on
|
||||
* error.
|
||||
*/
|
||||
hs_error_t HS_CDECL hs_short_literal_search(
|
||||
const hs_short_literal_compiled_pattern_t *database, const char *data,
|
||||
size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
|
||||
/**
|
||||
* Search the given data for the long literal pattern.
|
||||
*
|
||||
* If the pattern length is less than or equal to @ref HS_SHORT_PATTERN_THRESHOLD,
|
||||
* @ref hs_short_literal_search() may be faster.
|
||||
*
|
||||
* @param database
|
||||
* The compiled pattern returned by @ref hs_compile_long_literal_search()
|
||||
* @param data
|
||||
* Pointer to the data to be scanned.
|
||||
* @param length
|
||||
* The number of bytes to scan.
|
||||
* @param onEvent
|
||||
* Pointer to a @ref match_event_handler callback function. If a NULL
|
||||
* pointer is given, no matches will be returned.
|
||||
* The "flag" argument is unused.
|
||||
* @param context
|
||||
* The user defined pointer which will be passed to the callback function.
|
||||
*
|
||||
* @return
|
||||
* Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
|
||||
* match callback indicated that scanning should stop; other values on
|
||||
* error.
|
||||
*/
|
||||
hs_error_t HS_CDECL hs_long_literal_search(
|
||||
const hs_long_literal_compiled_pattern_t *database, const char *data,
|
||||
size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
|
||||
/**
|
||||
* Search the given data for several long literal patterns at once.
|
||||
*
|
||||
* @param database
|
||||
* The compiled pattern returned by @ref hs_compile_multi_literal_search()
|
||||
* @param data
|
||||
* Pointer to the data to be scanned.
|
||||
* @param length
|
||||
* The number of bytes to scan.
|
||||
* @param onEvent
|
||||
* Pointer to a @ref match_event_handler callback function. If a NULL
|
||||
* pointer is given, no matches will be returned.
|
||||
* The "flag" argument is unused.
|
||||
* The reported ID is the index of the matching literal.
|
||||
* @param context
|
||||
* The user defined pointer which will be passed to the callback function.
|
||||
*
|
||||
* @return
|
||||
* Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
|
||||
* match callback indicated that scanning should stop; other values on
|
||||
* error.
|
||||
*/
|
||||
hs_error_t HS_CDECL hs_multi_literal_search(
|
||||
const hs_multi_literal_compiled_pattern_t *database, const char *data,
|
||||
size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
|
||||
/**
|
||||
* Search the given data for any occurrence of the given character.
|
||||
*
|
||||
* @param database
|
||||
* The compiled pattern returned by @ref hs_compile_single_char_search()
|
||||
* @param data
|
||||
* Pointer to the data to be scanned.
|
||||
* @param length
|
||||
* The number of bytes to scan.
|
||||
* @param onEvent
|
||||
* Pointer to a @ref match_event_handler callback function. If a NULL
|
||||
* pointer is given, no matches will be returned.
|
||||
* The "flag" argument is unused.
|
||||
* @param context
|
||||
* The user defined pointer which will be passed to the callback function.
|
||||
*
|
||||
* @return
|
||||
* Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
|
||||
* match callback indicated that scanning should stop; other values on
|
||||
* error.
|
||||
*/
|
||||
hs_error_t HS_CDECL hs_single_char_search(
|
||||
const hs_single_char_compiled_pattern_t *database, const char *data,
|
||||
size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
|
||||
/**
|
||||
* Search the given data for occurrences of any character from the given
|
||||
* character set.
|
||||
*
|
||||
* @param database
|
||||
* The compiled pattern returned by @ref hs_compile_char_set_search()
|
||||
* @param data
|
||||
* Pointer to the data to be scanned.
|
||||
* @param length
|
||||
* The number of bytes to scan.
|
||||
* @param onEvent
|
||||
* Pointer to a @ref match_event_handler callback function. If a NULL
|
||||
* pointer is given, no matches will be returned.
|
||||
* The "flag" argument is unused.
|
||||
* The reported ID is the index of the matching char.
|
||||
* @param context
|
||||
* The user defined pointer which will be passed to the callback function.
|
||||
*
|
||||
* @return
|
||||
* Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
|
||||
* match callback indicated that scanning should stop; other values on
|
||||
* error.
|
||||
*/
|
||||
hs_error_t HS_CDECL hs_char_set_search(
|
||||
const hs_char_set_compiled_pattern_t *database, const char *data,
|
||||
size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
|
||||
/**
|
||||
* Search the given data for occurrences of the given ordered character pair
|
||||
* ("Aj" won't match "jA").
|
||||
*
|
||||
* @param database
|
||||
* The compiled pattern returned by @ref hs_compile_single_char_pair_search()
|
||||
* @param data
|
||||
* Pointer to the data to be scanned.
|
||||
* @param length
|
||||
* The number of bytes to scan.
|
||||
* @param onEvent
|
||||
* Pointer to a @ref match_event_handler callback function. If a NULL
|
||||
* pointer is given, no matches will be returned.
|
||||
* The "flag" argument is unused.
|
||||
* @param context
|
||||
* The user defined pointer which will be passed to the callback function.
|
||||
*
|
||||
* @return
|
||||
* Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
|
||||
* match callback indicated that scanning should stop; other values on
|
||||
* error.
|
||||
*/
|
||||
hs_error_t HS_CDECL hs_single_char_pair_search(
|
||||
const hs_single_char_pair_compiled_pattern_t *database, const char *data,
|
||||
size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
|
||||
/**
|
||||
* Search the given data for occurrences of any of the ordered character pairs
|
||||
* from the given set ("Aj" won't match "jA")
|
||||
*
|
||||
* @param database
|
||||
* The compiled pattern returned by @ref
|
||||
* hs_compile_char_pair_set_search()
|
||||
* @param data
|
||||
* Pointer to the data to be scanned.
|
||||
* @param length
|
||||
* The number of bytes to scan.
|
||||
* @param onEvent
|
||||
* Pointer to a @ref match_event_handler callback function. If a NULL
|
||||
* pointer is given, no matches will be returned.
|
||||
* The "flag" argument is unused.
|
||||
* The reported ID is the index of the matching pair.
|
||||
* @param context
|
||||
* The user defined pointer which will be passed to the callback function.
|
||||
*
|
||||
* @return
|
||||
* Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
|
||||
* match callback indicated that scanning should stop; other values on
|
||||
* error.
|
||||
*/
|
||||
hs_error_t HS_CDECL hs_char_pair_set_search(
|
||||
const hs_char_pair_set_compiled_pattern_t *database, const char *data,
|
||||
size_t length, match_event_handler onEvent,
|
||||
void *context);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user