mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
Implement Direct API
The API now provides searches for: - short literal (up to 8 char) - long literal - (long) literals set - single char - char set - single pair - pair set Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
This commit is contained in:
parent
c057c7f0f0
commit
e91c75f139
@ -304,6 +304,7 @@ set (hs_exec_SRCS
|
|||||||
src/crc32.h
|
src/crc32.h
|
||||||
src/report.h
|
src/report.h
|
||||||
src/runtime.c
|
src/runtime.c
|
||||||
|
src/hs_direct_search.cpp
|
||||||
src/stream_compress.c
|
src/stream_compress.c
|
||||||
src/stream_compress.h
|
src/stream_compress.h
|
||||||
src/stream_compress_impl.h
|
src/stream_compress_impl.h
|
||||||
@ -484,6 +485,7 @@ SET (hs_compile_SRCS
|
|||||||
src/hs.cpp
|
src/hs.cpp
|
||||||
src/hs_internal.h
|
src/hs_internal.h
|
||||||
src/hs_version.h.in
|
src/hs_version.h.in
|
||||||
|
src/hs_direct_search_compile.cpp
|
||||||
src/scratch.h
|
src/scratch.h
|
||||||
src/state.h
|
src/state.h
|
||||||
src/ue2common.h
|
src/ue2common.h
|
||||||
|
21
hs.def
21
hs.def
@ -41,3 +41,24 @@ EXPORTS
|
|||||||
hs_stream_size
|
hs_stream_size
|
||||||
hs_valid_platform
|
hs_valid_platform
|
||||||
hs_version
|
hs_version
|
||||||
|
hs_short_literal_search
|
||||||
|
hs_long_literal_search
|
||||||
|
hs_multi_literal_search
|
||||||
|
hs_single_char_search
|
||||||
|
hs_char_set_search
|
||||||
|
hs_single_char_pair_search
|
||||||
|
hs_char_pair_set_search
|
||||||
|
hs_compile_short_literal_search
|
||||||
|
hs_compile_long_literal_search
|
||||||
|
hs_compile_multi_literal_search
|
||||||
|
hs_compile_single_char_search
|
||||||
|
hs_compile_char_set_search
|
||||||
|
hs_compile_single_char_pair_search
|
||||||
|
hs_compile_char_pair_set_search
|
||||||
|
hs_free_short_literal_pattern
|
||||||
|
hs_free_long_literal_pattern
|
||||||
|
hs_free_multi_literal_pattern
|
||||||
|
hs_free_single_char_pattern
|
||||||
|
hs_free_char_set_pattern
|
||||||
|
hs_free_single_char_pair_pattern
|
||||||
|
hs_free_char_pair_set_pattern
|
@ -33,4 +33,11 @@ EXPORTS
|
|||||||
hs_set_stream_allocator
|
hs_set_stream_allocator
|
||||||
hs_stream_size
|
hs_stream_size
|
||||||
hs_valid_platform
|
hs_valid_platform
|
||||||
hs_version
|
hs_version
|
||||||
|
hs_short_literal_search
|
||||||
|
hs_long_literal_search
|
||||||
|
hs_multi_literal_search
|
||||||
|
hs_single_char_search
|
||||||
|
hs_char_set_search
|
||||||
|
hs_single_char_pair_search
|
||||||
|
hs_char_pair_set_search
|
@ -1,6 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2016-2020, Intel Corporation
|
* Copyright (c) 2016-2020, Intel Corporation
|
||||||
* Copyright (c) 2024, VectorCamp PC
|
* Copyright (c) 2024, VectorCamp PC
|
||||||
|
* Copyright (c) 2025, Arm ltd
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@ -352,6 +353,99 @@ CONNECT_DISPATCH_2(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_strea
|
|||||||
CONNECT_ARGS_3(hs_error_t, hs_reset_and_expand_stream, to_stream,
|
CONNECT_ARGS_3(hs_error_t, hs_reset_and_expand_stream, to_stream,
|
||||||
buf, buf_size, scratch, onEvent, context);
|
buf, buf_size, scratch, onEvent, context);
|
||||||
|
|
||||||
|
/** DIRECT API **/
|
||||||
|
|
||||||
|
CREATE_DISPATCH(hs_error_t, hs_short_literal_search,
|
||||||
|
const hs_short_literal_compiled_pattern_t *database,
|
||||||
|
const char *data, size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_short_literal_search, database, data, length,
|
||||||
|
onEvent, context);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_short_literal_search,
|
||||||
|
const hs_short_literal_compiled_pattern_t *database,
|
||||||
|
const char *data, size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_short_literal_search, database, data, length,
|
||||||
|
onEvent, context);
|
||||||
|
|
||||||
|
CREATE_DISPATCH(hs_error_t, hs_long_literal_search,
|
||||||
|
const hs_long_literal_compiled_pattern_t *database,
|
||||||
|
const char *data, size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_long_literal_search, database, data, length,
|
||||||
|
onEvent, context);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_long_literal_search,
|
||||||
|
const hs_long_literal_compiled_pattern_t *database,
|
||||||
|
const char *data, size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_long_literal_search, database, data, length,
|
||||||
|
onEvent, context);
|
||||||
|
|
||||||
|
CREATE_DISPATCH(hs_error_t, hs_multi_literal_search,
|
||||||
|
const hs_multi_literal_compiled_pattern_t *database,
|
||||||
|
const char *data, size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_multi_literal_search, database, data, length,
|
||||||
|
onEvent, context);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_multi_literal_search,
|
||||||
|
const hs_multi_literal_compiled_pattern_t *database,
|
||||||
|
const char *data, size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_multi_literal_search, database, data, length,
|
||||||
|
onEvent, context);
|
||||||
|
|
||||||
|
CREATE_DISPATCH(hs_error_t, hs_single_char_search,
|
||||||
|
const hs_single_char_compiled_pattern_t *database,
|
||||||
|
const char *data, size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_single_char_search, database, data, length,
|
||||||
|
onEvent, context);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_single_char_search,
|
||||||
|
const hs_single_char_compiled_pattern_t *database,
|
||||||
|
const char *data, size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_single_char_search, database, data, length,
|
||||||
|
onEvent, context);
|
||||||
|
|
||||||
|
CREATE_DISPATCH(hs_error_t, hs_char_set_search,
|
||||||
|
const hs_char_set_compiled_pattern_t *database,
|
||||||
|
const char *data, size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_char_set_search, database, data, length, onEvent,
|
||||||
|
context);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_char_set_search,
|
||||||
|
const hs_char_set_compiled_pattern_t *database,
|
||||||
|
const char *data, size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_char_set_search, database, data, length, onEvent,
|
||||||
|
context);
|
||||||
|
|
||||||
|
CREATE_DISPATCH(hs_error_t, hs_single_char_pair_search,
|
||||||
|
const hs_single_char_pair_compiled_pattern_t *database,
|
||||||
|
const char *data, size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_single_char_pair_search, database, data, length,
|
||||||
|
onEvent, context);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_single_char_pair_search,
|
||||||
|
const hs_single_char_pair_compiled_pattern_t *database,
|
||||||
|
const char *data, size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_single_char_pair_search, database, data, length,
|
||||||
|
onEvent, context);
|
||||||
|
|
||||||
|
CREATE_DISPATCH(hs_error_t, hs_char_pair_set_search,
|
||||||
|
const hs_char_pair_set_compiled_pattern_t *database,
|
||||||
|
const char *data, size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
CONNECT_ARGS_1(hs_error_t, hs_char_pair_set_search, database, data, length,
|
||||||
|
onEvent, context);
|
||||||
|
CONNECT_DISPATCH_2(hs_error_t, hs_char_pair_set_search,
|
||||||
|
const hs_char_pair_set_compiled_pattern_t *database,
|
||||||
|
const char *data, size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
CONNECT_ARGS_3(hs_error_t, hs_char_pair_set_search, database, data, length,
|
||||||
|
onEvent, context);
|
||||||
|
|
||||||
/** INTERNALS **/
|
/** INTERNALS **/
|
||||||
|
|
||||||
CREATE_DISPATCH(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen);
|
CREATE_DISPATCH(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen);
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2015-2019, Intel Corporation
|
* Copyright (c) 2015-2019, Intel Corporation
|
||||||
|
* Copyright (c) 2024-2025, Arm ltd
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@ -585,6 +586,90 @@ hs_error_t HS_CDECL hs_valid_platform(void);
|
|||||||
|
|
||||||
/** @} */
|
/** @} */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The following functions are part of the extended API.
|
||||||
|
* This extension offers direct access to search algorithms
|
||||||
|
* allowing the user to minimise calling overhead for simple
|
||||||
|
* search use cases where the type of search is known.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @defgroup DIRECT_API_COMMON
|
||||||
|
*
|
||||||
|
* @{
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The size threshold after which a pattern is considered long and must be fed
|
||||||
|
* to @ref hs_compile_long_literal_search(). Patterns up to this length may be
|
||||||
|
* fed to hs_compile_short_literal_search() instead.
|
||||||
|
*/
|
||||||
|
#define HS_SHORT_PATTERN_THRESHOLD 8
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The compiled pattern type for searching for short literals
|
||||||
|
*
|
||||||
|
* Generated by @ref hs_compile_short_literal_search() and to be freed with @ref
|
||||||
|
* hs_free_short_literal_pattern
|
||||||
|
*/
|
||||||
|
typedef struct hs_short_literal_compiled_pattern
|
||||||
|
hs_short_literal_compiled_pattern_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The compiled pattern type for searching for long literals
|
||||||
|
*
|
||||||
|
* Generated by @ref hs_compile_long_literal_search() and to be freed with @ref
|
||||||
|
* hs_free_long_literal_pattern
|
||||||
|
*/
|
||||||
|
typedef struct hs_long_literal_compiled_pattern
|
||||||
|
hs_long_literal_compiled_pattern_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The compiled pattern type for searching for several literals
|
||||||
|
*
|
||||||
|
* Generated by @ref hs_compile_multi_literal_search() and to be freed with @ref
|
||||||
|
* hs_free_multi_literal_pattern
|
||||||
|
*/
|
||||||
|
typedef struct hs_multi_literal_compiled_pattern
|
||||||
|
hs_multi_literal_compiled_pattern_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The compiled pattern type for searching for a single character
|
||||||
|
*
|
||||||
|
* Generated by @ref hs_compile_single_char_search() and to be freed with @ref
|
||||||
|
* hs_free_single_char_pattern
|
||||||
|
*/
|
||||||
|
typedef struct hs_single_char_compiled_pattern
|
||||||
|
hs_single_char_compiled_pattern_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The compiled pattern type for searching for a character set
|
||||||
|
*
|
||||||
|
* Generated by @ref hs_compile_char_set_search() and to be freed with @ref
|
||||||
|
* hs_free_char_set_pattern
|
||||||
|
*/
|
||||||
|
typedef struct hs_char_set_compiled_pattern hs_char_set_compiled_pattern_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The compiled pattern type for searching for a character pair
|
||||||
|
*
|
||||||
|
* Generated by @ref hs_compile_single_char_pair_search() and to be freed with @ref
|
||||||
|
* hs_free_single_char_pair_pattern
|
||||||
|
*/
|
||||||
|
typedef struct hs_single_char_pair_compiled_pattern
|
||||||
|
hs_single_char_pair_compiled_pattern_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The compiled pattern type for searching for a set of character pairs
|
||||||
|
*
|
||||||
|
* Generated by @ref hs_compile_char_pair_set_search() and to be freed with
|
||||||
|
* @ref hs_free_char_pair_set_pattern
|
||||||
|
*/
|
||||||
|
typedef struct hs_char_pair_set_compiled_pattern
|
||||||
|
hs_char_pair_set_compiled_pattern_t;
|
||||||
|
|
||||||
|
/** @} */
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} /* extern "C" */
|
} /* extern "C" */
|
||||||
#endif
|
#endif
|
||||||
|
271
src/hs_compile.h
271
src/hs_compile.h
@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2015-2021, Intel Corporation
|
* Copyright (c) 2015-2021, Intel Corporation
|
||||||
|
* Copyright (c) 2024-2025, Arm ltd
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@ -1211,6 +1212,276 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform);
|
|||||||
|
|
||||||
/** @} */
|
/** @} */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The following functions are part of the extended API.
|
||||||
|
* This extension offers direct access to search algorithms
|
||||||
|
* allowing the user to minimise calling overhead for simple
|
||||||
|
* search use cases where the type of search is known.
|
||||||
|
*
|
||||||
|
* All search functions handle a limited type of pattern.
|
||||||
|
* For more generic patterns, use @ref hs_compile().
|
||||||
|
*
|
||||||
|
* NOTE: All search functions are considered case-sensitive.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @defgroup DIRECT_API_COMPILE
|
||||||
|
*
|
||||||
|
* @{
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compiles a short literal expression used in @ref hs_short_literal_search().
|
||||||
|
*
|
||||||
|
* The expression must be at most @ref HS_SHORT_PATTERN_THRESHOLD characters
|
||||||
|
* long. For longer expressions, use @ref hs_compile_long_literal_search() and
|
||||||
|
* @ref hs_long_literal_search() instead.
|
||||||
|
*
|
||||||
|
* @param expression
|
||||||
|
* The expression to parse. Note that this string must represent ONLY the
|
||||||
|
* pattern to be matched, with no delimiters. Null characters are accepted
|
||||||
|
* as part of the expression.
|
||||||
|
*
|
||||||
|
* @param expression_length
|
||||||
|
* The length of the expression in bytes. Up to @ref
|
||||||
|
* HS_SHORT_PATTERN_THRESHOLD characters long.
|
||||||
|
*
|
||||||
|
* @param output_database
|
||||||
|
* Returns pointer to buffer containing @ref
|
||||||
|
* hs_short_literal_compiled_pattern_t. The buffer must be freed with
|
||||||
|
* @ref hs_free_short_literal_pattern.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* @ref HS_SUCCESS is returned on successful compilation; @ref
|
||||||
|
* HS_COMPILER_ERROR otherwise.
|
||||||
|
*/
|
||||||
|
hs_error_t HS_CDECL hs_compile_short_literal_search(
|
||||||
|
const char *expression, size_t expression_length,
|
||||||
|
hs_short_literal_compiled_pattern_t **output_database);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Free a short literal pattern.
|
||||||
|
*
|
||||||
|
* @param database
|
||||||
|
* The @ref hs_short_literal_compiled_pattern_t pointer to be freed.
|
||||||
|
*/
|
||||||
|
void HS_CDECL
|
||||||
|
hs_free_short_literal_pattern(hs_short_literal_compiled_pattern_t *database);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compiles a literal expression used in @ref hs_long_literal_search().
|
||||||
|
*
|
||||||
|
* There is no size limit. For expressions up to @ref
|
||||||
|
* HS_SHORT_PATTERN_THRESHOLD characters long, @ref
|
||||||
|
* hs_compile_short_literal_search() and @ref hs_short_literal_search() might be
|
||||||
|
* faster
|
||||||
|
*
|
||||||
|
* @param expression
|
||||||
|
* The expression to parse. Note that this string must represent ONLY the
|
||||||
|
* pattern to be matched, with no delimiters. Null characters are accepted
|
||||||
|
* as part of the expression.
|
||||||
|
*
|
||||||
|
* @param expression_length
|
||||||
|
* The length of the expression in bytes.
|
||||||
|
*
|
||||||
|
* @param output_database
|
||||||
|
* Returns pointer to buffer containing @ref
|
||||||
|
* hs_long_literal_compiled_pattern_t. The buffer must be freed with
|
||||||
|
* @ref hs_free_long_literal_pattern.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* @ref HS_SUCCESS is returned on successful compilation; @ref
|
||||||
|
* HS_COMPILER_ERROR otherwise.
|
||||||
|
*/
|
||||||
|
hs_error_t HS_CDECL hs_compile_long_literal_search(
|
||||||
|
const char *expression, size_t expression_length,
|
||||||
|
hs_long_literal_compiled_pattern_t **output_database);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Free a long literal pattern.
|
||||||
|
*
|
||||||
|
* @param database
|
||||||
|
* The @ref hs_long_literal_compiled_pattern_t pointer to be freed.
|
||||||
|
*/
|
||||||
|
void HS_CDECL
|
||||||
|
hs_free_long_literal_pattern(hs_long_literal_compiled_pattern_t *database);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compiles several literal expressions used in @ref hs_multi_literal_search().
|
||||||
|
*
|
||||||
|
* There is no size limit.
|
||||||
|
*
|
||||||
|
* @param expression
|
||||||
|
* The array of expressions to parse. Note that the strings must represent
|
||||||
|
* ONLY the patterns to be matched, with no delimiters. Null characters are
|
||||||
|
* accepted as part of the expression. The expression id in
|
||||||
|
* @ref match_event_handler will match the order of the expression given
|
||||||
|
* here (ie: expression[0] will be id 0).
|
||||||
|
*
|
||||||
|
* @param pattern_count
|
||||||
|
* The number of expressions in the @p expression array.
|
||||||
|
*
|
||||||
|
* @param expression_length
|
||||||
|
* The array of length of each expression in the @p expression array.
|
||||||
|
* Expressed in bytes.
|
||||||
|
*
|
||||||
|
* @param output_database
|
||||||
|
* Returns pointer to buffer containing @ref
|
||||||
|
* hs_multi_literal_compiled_pattern_t. The buffer must be freed with
|
||||||
|
* @ref hs_free_multi_literal_pattern.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* @ref HS_SUCCESS is returned on successful compilation; @ref
|
||||||
|
* HS_COMPILER_ERROR otherwise.
|
||||||
|
*/
|
||||||
|
hs_error_t HS_CDECL hs_compile_multi_literal_search(
|
||||||
|
const char **expression, size_t pattern_count,
|
||||||
|
const size_t *expression_length,
|
||||||
|
hs_multi_literal_compiled_pattern_t **output_database);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Free a multi literal pattern.
|
||||||
|
*
|
||||||
|
* @param database
|
||||||
|
* The @ref hs_multi_literal_compiled_pattern_t pointer to be freed.
|
||||||
|
*/
|
||||||
|
void HS_CDECL
|
||||||
|
hs_free_multi_literal_pattern(hs_multi_literal_compiled_pattern_t *database);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compiles a single character used in @ref hs_single_char_search().
|
||||||
|
*
|
||||||
|
* @param character
|
||||||
|
* The single character to be searched. It is case sensitive.
|
||||||
|
*
|
||||||
|
* @param output_database
|
||||||
|
* Returns pointer to buffer containing @ref
|
||||||
|
* hs_single_char_compiled_pattern_t. The buffer must be freed with
|
||||||
|
* @ref hs_free_single_char_pattern.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* @ref HS_SUCCESS is returned on successful compilation; @ref
|
||||||
|
* HS_COMPILER_ERROR otherwise.
|
||||||
|
*/
|
||||||
|
hs_error_t HS_CDECL hs_compile_single_char_search(
|
||||||
|
const char character, hs_single_char_compiled_pattern_t **output_database);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Free a single char pattern
|
||||||
|
* @param database
|
||||||
|
* The @ref hs_single_char_compiled_pattern_t pointer to be freed.
|
||||||
|
*/
|
||||||
|
void HS_CDECL
|
||||||
|
hs_free_single_char_pattern(hs_single_char_compiled_pattern_t *database);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compiles a set of characters used in @ref hs_char_set_search().
|
||||||
|
*
|
||||||
|
* @param character_array
|
||||||
|
* The string or character array containing all the characters in the set.
|
||||||
|
* It is case sensitive. Null terminator is optional.
|
||||||
|
*
|
||||||
|
* @param character_count
|
||||||
|
* The number of characters in @p character_array
|
||||||
|
*
|
||||||
|
* @param output_database
|
||||||
|
* Returns pointer to buffer containing @ref
|
||||||
|
* hs_char_set_compiled_pattern_t. The buffer must be freed with
|
||||||
|
* @ref hs_free_char_set_pattern.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* @ref HS_SUCCESS is returned on successful compilation; @ref
|
||||||
|
* HS_COMPILER_ERROR otherwise.
|
||||||
|
*/
|
||||||
|
hs_error_t HS_CDECL hs_compile_char_set_search(
|
||||||
|
const char *character_array, size_t character_count,
|
||||||
|
hs_char_set_compiled_pattern_t **output_database);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Free a char set pattern.
|
||||||
|
*
|
||||||
|
* @param database
|
||||||
|
* The @ref hs_char_set_compiled_pattern_t pointer to be freed.
|
||||||
|
*/
|
||||||
|
void HS_CDECL
|
||||||
|
hs_free_char_set_pattern(hs_char_set_compiled_pattern_t *database);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compiles a pair of characters used in @ref hs_single_char_pair_search().
|
||||||
|
*
|
||||||
|
* NOTE: The character order matters in the pair. "Aj" won't match "jA"
|
||||||
|
*
|
||||||
|
* @param pair
|
||||||
|
* The string or character array containing the pair. Null terminator is
|
||||||
|
* optional.
|
||||||
|
*
|
||||||
|
* @param output_database
|
||||||
|
* Returns pointer to buffer containing @ref
|
||||||
|
* hs_single_char_pair_compiled_pattern_t. The buffer must be freed with
|
||||||
|
* @ref hs_free_single_char_pair_pattern.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* @ref HS_SUCCESS is returned on successful compilation; @ref
|
||||||
|
* HS_COMPILER_ERROR otherwise.
|
||||||
|
*/
|
||||||
|
hs_error_t HS_CDECL hs_compile_single_char_pair_search(
|
||||||
|
const char *pair, hs_single_char_pair_compiled_pattern_t **output_database);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Free a single char pair pattern.
|
||||||
|
*
|
||||||
|
* @param database
|
||||||
|
* The @ref hs_single_char_pair_compiled_pattern_t pointer to be freed.
|
||||||
|
*/
|
||||||
|
void HS_CDECL hs_free_single_char_pair_pattern(
|
||||||
|
hs_single_char_pair_compiled_pattern_t *database);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compiles several character pairs used in @ref hs_char_pair_set_search().
|
||||||
|
*
|
||||||
|
* IMPORTANT: Compilation is only guaranteed for up to 8 pairs. If you search
|
||||||
|
* for more, internal compression may attempt to merge adjacent patterns
|
||||||
|
* (e.g., [ab, ac, ad] becomes a[bcd]) to reduce the total to 8 pairs. If the
|
||||||
|
* compression is insufficient, compilation will fail with
|
||||||
|
* @ref HS_COMPILER_ERROR. In such cases, use @ref hs_multi_literal_search() instead.
|
||||||
|
* The compression does not affect the match IDs returned by
|
||||||
|
* @ref hs_char_pair_set_search(). For example, a[bcd] will still report "ab" as
|
||||||
|
* ID 0, "ac" as ID 1, and "ad" as ID 2.
|
||||||
|
*
|
||||||
|
* NOTE: The character order matters in the pair. "Aj" won't match "jA"
|
||||||
|
*
|
||||||
|
* @param expression
|
||||||
|
* The concatenation of all pairs to be parsed. If one wants to search for
|
||||||
|
* "ab" or "Cd", then @p expression would be ['a','b','C','d']. Null
|
||||||
|
* terminator is ignored, use @p pair_count to set the length.
|
||||||
|
*
|
||||||
|
* @param pair_count
|
||||||
|
* The number of character pairs in @p expression
|
||||||
|
*
|
||||||
|
* @param output_database
|
||||||
|
* Returns pointer to buffer containing @ref
|
||||||
|
* hs_char_pair_set_compiled_pattern_t. The buffer must be freed with
|
||||||
|
* @ref hs_free_char_pair_set_pattern.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* @ref HS_SUCCESS is returned on successful compilation; @ref
|
||||||
|
* HS_COMPILER_ERROR otherwise.
|
||||||
|
*/
|
||||||
|
hs_error_t HS_CDECL hs_compile_char_pair_set_search(
|
||||||
|
const char *expression, size_t pair_count,
|
||||||
|
hs_char_pair_set_compiled_pattern_t **output_database);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Free a char pair set pattern.
|
||||||
|
*
|
||||||
|
* @param database
|
||||||
|
* The @ref hs_char_pair_set_compiled_pattern_t pointer to be freed.
|
||||||
|
*/
|
||||||
|
void HS_CDECL
|
||||||
|
hs_free_char_pair_set_pattern(hs_char_pair_set_compiled_pattern_t *database);
|
||||||
|
|
||||||
|
/** @} */
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} /* extern "C" */
|
} /* extern "C" */
|
||||||
#endif
|
#endif
|
||||||
|
435
src/hs_direct_search.cpp
Normal file
435
src/hs_direct_search.cpp
Normal file
@ -0,0 +1,435 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2024-2025, Arm ltd
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
#include "hs_common.h"
|
||||||
|
#include "hs_runtime.h"
|
||||||
|
#include "hs_direct_search.h"
|
||||||
|
#include "hs_direct_search_types.h"
|
||||||
|
|
||||||
|
#include "scratch.h"
|
||||||
|
#include "util/arch.h" // CAN_USE_WIDE_TRUFFLE
|
||||||
|
#include "util/bitutils.h" // ctz64()
|
||||||
|
#include "util/simd_utils.h" // load128()
|
||||||
|
#include "util/supervector/supervector.hpp"
|
||||||
|
|
||||||
|
#include "fdr/fdr.h"
|
||||||
|
#include "hwlm/noodle_engine.h"
|
||||||
|
#include "nfa/shufti.h"
|
||||||
|
#include "nfa/truffle.h"
|
||||||
|
|
||||||
|
typedef typename SuperVector<VECTORSIZE>::comparemask_type vector_mask_type;
|
||||||
|
|
||||||
|
static_assert((uint64_t)CB_CONTINUE_MATCHING == HWLM_CONTINUE_MATCHING,
|
||||||
|
"CB_CONTINUE_MATCHING doesn't match HWLM_CONTINUE_MATCHING");
|
||||||
|
static_assert((uint64_t)CB_TERMINATE_MATCHING == HWLM_TERMINATE_MATCHING,
|
||||||
|
"CB_TERMINATE_MATCHING doesn't match HWLM_TERMINATE_MATCHING");
|
||||||
|
|
||||||
|
/**
 * Translate an HWLM status code into the matching public hs_error_t value.
 *
 * @param error  status code to translate
 *
 * @return HS_SUCCESS for HWLM_SUCCESS, HS_SCAN_TERMINATED for HWLM_TERMINATED,
 *         HS_COMPILER_ERROR for HWLM_LITERAL_MAX_LEN, and HS_UNKNOWN_ERROR for
 *         HWLM_ERROR_UNKNOWN or any other value.
 */
static inline hs_error_t hwlm_to_hs_error(const hwlm_error_t error) {
    if (error == HWLM_SUCCESS) {
        return HS_SUCCESS;
    }
    if (error == HWLM_TERMINATED) {
        return HS_SCAN_TERMINATED;
    }
    // NOTE(review): HWLM_LITERAL_MAX_LEN is used elsewhere in this file as a
    // byte count rather than a status code; confirm it is really meant to be
    // handled as an hwlm_error_t value here.
    if (error == HWLM_LITERAL_MAX_LEN) {
        return HS_COMPILER_ERROR;
    }
    // HWLM_ERROR_UNKNOWN and every unrecognised code end up here.
    return HS_UNKNOWN_ERROR;
}
|
||||||
|
|
||||||
|
// convert the callback type of Noodle
|
||||||
|
hwlmcb_rv_t HS_CDECL noodle_to_hs_callback(size_t end, u32 id,
|
||||||
|
struct hs_scratch *scratch) {
|
||||||
|
struct noodle_context *storage = reinterpret_cast<struct noodle_context *>(
|
||||||
|
scratch->core_info.userContext);
|
||||||
|
// hwlm's end is the last char of the pattern, but hs's end is the first
|
||||||
|
// char after the pattern
|
||||||
|
size_t match_start = end + 1 - storage->pattern_length;
|
||||||
|
return (hwlmcb_rv_t)(scratch->core_info.userCallback(
|
||||||
|
id, match_start, end + 1, 0, storage->usr_context));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Receive the FDR callback and perform the check for longer patterns (>8 char)
|
||||||
|
hwlmcb_rv_t HS_CDECL FDR_to_hs_callback(size_t end, u32 id,
|
||||||
|
struct hs_scratch *scratch) {
|
||||||
|
const struct FDR_cb_context *combined_ctx =
|
||||||
|
reinterpret_cast<struct FDR_cb_context *>(
|
||||||
|
scratch->core_info.userContext);
|
||||||
|
const FDR_pattern_storage *ps = combined_ctx->patterns;
|
||||||
|
size_t pattern_length = get_const_pattern_sizes(ps)[id];
|
||||||
|
size_t start_offset =
|
||||||
|
end + 1 - std::min(pattern_length, (size_t)HWLM_LITERAL_MAX_LEN);
|
||||||
|
if (pattern_length > HWLM_LITERAL_MAX_LEN) {
|
||||||
|
// long pattern for FDR, we need to confirm it.
|
||||||
|
const char *pattern = get_const_pattern_ptrs(ps)[id];
|
||||||
|
const char *buffer = combined_ctx->buffer;
|
||||||
|
size_t buffer_length = combined_ctx->buffer_length;
|
||||||
|
|
||||||
|
if (start_offset + pattern_length > buffer_length) {
|
||||||
|
// pattern too long for the remaining buffer, no match
|
||||||
|
return HWLM_CONTINUE_MATCHING;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *confirm_buffer_start =
|
||||||
|
buffer + start_offset + HWLM_LITERAL_MAX_LEN;
|
||||||
|
const char *confirm_pattern_start = pattern + HWLM_LITERAL_MAX_LEN;
|
||||||
|
size_t confirm_len = pattern_length - HWLM_LITERAL_MAX_LEN;
|
||||||
|
|
||||||
|
if (confirm_len >= VECTORSIZE) {
|
||||||
|
while (confirm_len > VECTORSIZE) {
|
||||||
|
SuperVector<VECTORSIZE> buffer_vector =
|
||||||
|
SuperVector<VECTORSIZE>::loadu(confirm_buffer_start);
|
||||||
|
SuperVector<VECTORSIZE> pattern_vector =
|
||||||
|
SuperVector<VECTORSIZE>::loadu(confirm_pattern_start);
|
||||||
|
vector_mask_type mask = buffer_vector.eqmask(pattern_vector);
|
||||||
|
if(~mask)
|
||||||
|
// don't match the pattern, continue searching
|
||||||
|
return HWLM_CONTINUE_MATCHING;
|
||||||
|
confirm_buffer_start += VECTORSIZE;
|
||||||
|
confirm_pattern_start += VECTORSIZE;
|
||||||
|
confirm_len -= VECTORSIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
// unaligned load: we cannot risk loading any extra byte, so we run
|
||||||
|
// the vector one last time with an offset to overlap the previous
|
||||||
|
// check, but avoid overflowing.
|
||||||
|
size_t overlap = VECTORSIZE - confirm_len;
|
||||||
|
SuperVector<VECTORSIZE> buffer_vector =
|
||||||
|
SuperVector<VECTORSIZE>::loadu(confirm_buffer_start - overlap);
|
||||||
|
SuperVector<VECTORSIZE> pattern_vector =
|
||||||
|
SuperVector<VECTORSIZE>::loadu(confirm_pattern_start - overlap);
|
||||||
|
vector_mask_type mask = buffer_vector.eqmask(pattern_vector);
|
||||||
|
if(~mask)
|
||||||
|
// don't match the pattern, continue searching
|
||||||
|
return HWLM_CONTINUE_MATCHING;
|
||||||
|
} else {
|
||||||
|
size_t confirm_64 = confirm_len / 8;
|
||||||
|
for (size_t i = 0; i < confirm_64; i++) {
|
||||||
|
if ((reinterpret_cast<const uint64_t *>(confirm_buffer_start))[i] !=
|
||||||
|
(reinterpret_cast<const uint64_t *>(confirm_pattern_start))[i])
|
||||||
|
// don't match the pattern, continue searching
|
||||||
|
return HWLM_CONTINUE_MATCHING;
|
||||||
|
}
|
||||||
|
confirm_len = confirm_len % 8;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < confirm_len; i++) {
|
||||||
|
if (confirm_buffer_start[i] != confirm_pattern_start[i])
|
||||||
|
// don't match the pattern, continue searching
|
||||||
|
return HWLM_CONTINUE_MATCHING;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// we have a valid match. Call the user callback
|
||||||
|
return (hwlmcb_rv_t)(scratch->core_info.userCallback(
|
||||||
|
id, start_offset, start_offset + pattern_length, 0,
|
||||||
|
combined_ctx->usr_context));
|
||||||
|
} else {
|
||||||
|
// short pattern, no confirmation needed
|
||||||
|
return (hwlmcb_rv_t)(scratch->core_info.userCallback(
|
||||||
|
id, start_offset, end + 1, 0, combined_ctx->usr_context));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// --- short_literal (Noodle) ---
|
||||||
|
|
||||||
|
/**
 * Scan a buffer for a compiled short literal with the Noodle matcher.
 *
 * @param database Compiled short-literal pattern; must not be null.
 * @param data     Buffer to scan; must not be null.
 * @param length   Number of bytes of @p data to scan.
 * @param onEvent  User callback, stored in the scratch for
 *                 noodle_to_hs_callback; must not be null.
 * @param context  Opaque user pointer carried in the noodle_context.
 * @return The noodExec() status converted by hwlm_to_hs_error().
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_short_literal_search(
    const hs_short_literal_compiled_pattern *database, const char *data,
    size_t length, match_event_handler onEvent, void *context) {
    assert(onEvent != nullptr &&
           "hs_short_literal_search called with nullptr callback");
    assert(data != nullptr &&
           "hs_short_literal_search called with nullptr buffer");
    assert(database != nullptr &&
           "hs_short_literal_search called with nullptr database");

    // Per-call state handed to the callback through the scratch.
    struct noodle_context match_info;
    match_info.pattern_length = database->pattern_length;
    match_info.usr_context = context;

    // Only the fields set below are initialised on this stack scratch.
    struct hs_scratch scratch;
    scratch.core_info.userCallback = onEvent;
    scratch.core_info.userContext = &match_info;

    const uint8_t *haystack = reinterpret_cast<const uint8_t *>(data);
    const hwlm_error_t status =
        noodExec(&(database->noodle_database), haystack, length, 0,
                 noodle_to_hs_callback, &scratch);
    return hwlm_to_hs_error(status);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// --- long_literal (FDR) ---
|
||||||
|
|
||||||
|
/**
 * Scan a buffer for a compiled long literal with the FDR matcher.
 *
 * FDR_to_hs_callback receives the scratch built here; the FDR_cb_context it
 * carries holds the user context, the stored patterns and the scanned buffer.
 *
 * @param database Compiled long-literal pattern; must not be null.
 * @param data     Buffer to scan; must not be null.
 * @param length   Number of bytes of @p data to scan.
 * @param onEvent  User callback stored in the scratch; must not be null.
 * @param context  Opaque user pointer carried in the FDR_cb_context.
 * @return The fdrExec() status converted by hwlm_to_hs_error().
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_long_literal_search(
    const hs_long_literal_compiled_pattern_t *database, const char *data,
    size_t length, match_event_handler onEvent,
    void *context) {
    assert(onEvent != nullptr &&
           "hs_long_literal_search called with nullptr callback");
    assert(data != nullptr &&
           "hs_long_literal_search called with nullptr buffer");
    assert(database != nullptr &&
           "hs_long_literal_search called with nullptr database");

    // Everything the confirmation callback needs about this scan.
    struct FDR_cb_context cb_state;
    cb_state.usr_context = context;
    cb_state.patterns = database->fdr_database.patterns;
    cb_state.buffer = data;
    cb_state.buffer_length = length;

    // Only the fields set below are initialised on this stack scratch.
    struct hs_scratch scratch;
    scratch.fdr_conf = nullptr;
    scratch.core_info.userCallback = onEvent;
    scratch.core_info.userContext = &cb_state;

    const uint8_t *haystack = reinterpret_cast<const uint8_t *>(data);
    const hwlm_error_t status =
        fdrExec(database->fdr_database.database, haystack, length, 0,
                FDR_to_hs_callback, &scratch, HWLM_ALL_GROUPS);
    return hwlm_to_hs_error(status);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// --- multi_literal (FDR) ---
|
||||||
|
|
||||||
|
/**
 * Scan a buffer for a compiled set of literals with the FDR matcher.
 *
 * FDR_to_hs_callback receives the scratch built here; the FDR_cb_context it
 * carries holds the user context, the stored patterns and the scanned buffer.
 *
 * @param database Compiled literal set; must not be null.
 * @param data     Buffer to scan; must not be null.
 * @param length   Number of bytes of @p data to scan.
 * @param onEvent  User callback stored in the scratch; must not be null.
 * @param context  Opaque user pointer carried in the FDR_cb_context.
 * @return The fdrExec() status converted by hwlm_to_hs_error().
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_multi_literal_search(
    const hs_multi_literal_compiled_pattern_t *database, const char *data,
    size_t length, match_event_handler onEvent, void *context) {
    assert(onEvent != nullptr &&
           "hs_multi_literal_search called with nullptr callback");
    assert(data != nullptr &&
           "hs_multi_literal_search called with nullptr buffer");
    assert(database != nullptr &&
           "hs_multi_literal_search called with nullptr database");

    // Everything the confirmation callback needs about this scan.
    struct FDR_cb_context cb_state;
    cb_state.usr_context = context;
    cb_state.patterns = database->fdr_database.patterns;
    cb_state.buffer = data;
    cb_state.buffer_length = length;

    // Only the fields set below are initialised on this stack scratch.
    struct hs_scratch scratch;
    scratch.fdr_conf = nullptr;
    scratch.core_info.userCallback = onEvent;
    scratch.core_info.userContext = &cb_state;

    const uint8_t *haystack = reinterpret_cast<const uint8_t *>(data);
    const hwlm_error_t status =
        fdrExec(database->fdr_database.database, haystack, length, 0,
                FDR_to_hs_callback, &scratch, HWLM_ALL_GROUPS);
    return hwlm_to_hs_error(status);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// --- single_char (Noodle) ---
|
||||||
|
|
||||||
|
/**
 * Scan a buffer for a single compiled character with the Noodle matcher.
 *
 * @param database Compiled single-char pattern; must not be null.
 * @param data     Buffer to scan; must not be null.
 * @param length   Number of bytes of @p data to scan.
 * @param onEvent  User callback, stored in the scratch for
 *                 noodle_to_hs_callback; must not be null.
 * @param context  Opaque user pointer carried in the noodle_context.
 * @return The noodExec() status converted by hwlm_to_hs_error().
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_single_char_search(
    const hs_single_char_compiled_pattern *database, const char *data,
    size_t length, match_event_handler onEvent, void *context) {
    assert(onEvent != nullptr &&
           "hs_single_char_search called with nullptr callback");
    assert(data != nullptr &&
           "hs_single_char_search called with nullptr buffer");
    assert(database != nullptr &&
           "hs_single_char_search called with nullptr database");

    // A single character is a pattern of length one.
    struct noodle_context match_info;
    match_info.pattern_length = 1;
    match_info.usr_context = context;

    // Only the fields set below are initialised on this stack scratch.
    struct hs_scratch scratch;
    scratch.core_info.userCallback = onEvent;
    scratch.core_info.userContext = &match_info;

    const uint8_t *haystack = reinterpret_cast<const uint8_t *>(data);
    const hwlm_error_t status =
        noodExec(&(database->noodle_database), haystack, length, 0,
                 noodle_to_hs_callback, &scratch);
    return hwlm_to_hs_error(status);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// --- char_set (Truffle) ---
|
||||||
|
|
||||||
|
/**
 * Report every byte of a buffer that belongs to a compiled character set,
 * using the Truffle matcher.
 *
 * Each hit is reported as onEvent(id, offset, offset + 1, 0, context), where
 * id comes from database->char_id_map indexed by the matching byte. Scanning
 * stops early when the callback returns a value that tests false.
 *
 * @param database Compiled character set; must not be null.
 * @param data     Buffer to scan; must not be null.
 * @param length   Number of bytes of @p data to scan.
 * @param onEvent  Match callback; must not be null.
 * @param context  Opaque pointer passed through to @p onEvent.
 * @return Always HS_SUCCESS.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_char_set_search(
    const hs_char_set_compiled_pattern *database, const char *data,
    size_t length, match_event_handler onEvent, void *context) {
    assert(onEvent != nullptr &&
           "hs_char_set_search called with nullptr callback");
    assert(data != nullptr &&
           "hs_char_set_search called with nullptr buffer");
    assert(database != nullptr &&
           "hs_char_set_search called with nullptr database");

    const u8 *const buf_start = reinterpret_cast<const u8 *>(data);
    // One past the last byte: cursor == buf_end means nothing is left to scan.
    const u8 *const buf_end = buf_start + length;

    for (const u8 *cursor = buf_start; cursor < buf_end;) {
        // The matcher yields a pointer to the matching byte itself (not past
        // it), or buf_end when the rest of the buffer holds no match.
#ifdef CAN_USE_WIDE_TRUFFLE
        const u8 *const hit = truffleExecWide(
            loadu256(database->wide_mask), cursor, buf_end);
#else
        const u8 *const hit = truffleExec(load128(database->mask1),
                                          load128(database->mask2),
                                          cursor, buf_end);
#endif
        if (hit >= buf_end) {
            // nothing more to find
            break;
        }

        const size_t id = database->char_id_map[*hit];
        const size_t match_start = hit - buf_start;
        if (!onEvent(id, match_start, match_start + 1, 0, context)) {
            // user requested to stop matching
            break;
        }

        // resume right after the reported byte
        cursor = hit + 1;
    }

    return HS_SUCCESS;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// --- single_char_pair (Noodle) ---
|
||||||
|
|
||||||
|
/**
 * Scan a buffer for a single compiled pair of characters with the Noodle
 * matcher.
 *
 * @param database Compiled char-pair pattern; must not be null.
 * @param data     Buffer to scan; must not be null.
 * @param length   Number of bytes of @p data to scan.
 * @param onEvent  User callback, stored in the scratch for
 *                 noodle_to_hs_callback; must not be null.
 * @param context  Opaque user pointer carried in the noodle_context.
 * @return The noodExec() status converted by hwlm_to_hs_error().
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_single_char_pair_search(
    const hs_single_char_pair_compiled_pattern *database,
    const char *data, size_t length, match_event_handler onEvent,
    void *context) {
    assert(onEvent != nullptr &&
           "hs_single_char_pair_search called with nullptr callback");
    assert(data != nullptr &&
           "hs_single_char_pair_search called with nullptr buffer");
    assert(database != nullptr &&
           "hs_single_char_pair_search called with nullptr database");

    // A pair is a pattern of length two.
    struct noodle_context match_info;
    match_info.pattern_length = 2;
    match_info.usr_context = context;

    // Only the fields set below are initialised on this stack scratch.
    struct hs_scratch scratch;
    scratch.core_info.userCallback = onEvent;
    scratch.core_info.userContext = &match_info;

    const uint8_t *haystack = reinterpret_cast<const uint8_t *>(data);
    const hwlm_error_t status =
        noodExec(&(database->noodle_database), haystack, length, 0,
                 noodle_to_hs_callback, &scratch);
    return hwlm_to_hs_error(status);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// --- char_pair_set (Double shufti) ---
|
||||||
|
|
||||||
|
/**
 * Report every occurrence of any pair from a compiled set of character
 * pairs, using the double-shufti matcher.
 *
 * Each hit is reported as onEvent(id, offset, offset + 2, 0, context). The id
 * is recovered by comparing the two matched bytes against the stored
 * all_pairs table. Scanning stops early when the callback returns a value
 * that tests false.
 *
 * @param database Compiled pair set; must not be null.
 * @param data     Buffer to scan; must not be null.
 * @param length   Number of bytes of @p data to scan.
 * @param onEvent  Match callback; must not be null.
 * @param context  Opaque pointer passed through to @p onEvent.
 * @return Always HS_SUCCESS.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_char_pair_set_search(
    const hs_char_pair_set_compiled_pattern *database, const char *data,
    size_t length, match_event_handler onEvent, void *context) {
    assert(onEvent != nullptr &&
           "hs_char_pair_set_search called with nullptr callback");
    assert(data != nullptr &&
           "hs_char_pair_set_search called with nullptr buffer");
    assert(database != nullptr &&
           "hs_char_pair_set_search called with nullptr database");

    const u8 *current_buf = reinterpret_cast<const u8 *>(data);
    // buf_end must be the first char past the buffer, so current_buf==buf_end
    // means current_buf is empty.
    const u8 *buf_end = reinterpret_cast<const u8 *>(data) + length;
    while (current_buf < buf_end) {
        const u8 *current_match = shuftiDoubleExec(
            load128(database->dshufti_database.mask1),
            load128(database->dshufti_database.mask2),
            load128(database->dshufti_database.mask3),
            load128(database->dshufti_database.mask4), current_buf, buf_end);
        // current_match is the pointer to the matching char, NOT past the
        // matching char. or buf_end if no match.
        if (current_match < buf_end) {
            // Shufti doesn't return which pair matched so we have to find out.
            // Use a 16 bits vector search on the original pattern string,
            // then return the <first match>/2 as ID.

            // Copy the two matched bytes out with memcpy: current_match has
            // no alignment guarantee and the buffer is not a u16 object, so a
            // direct u16 dereference would be an unaligned, type-punned load.
            // NOTE(review): this reads two bytes at current_match, which
            // assumes the matcher never reports the last byte of the buffer
            // - confirm.
            u16 matched_pair;
            memcpy(&matched_pair, current_match, sizeof(matched_pair));
            SuperVector<VECTORSIZE> found_pair =
                SuperVector<VECTORSIZE>(matched_pair);
            size_t width = SuperVector<VECTORSIZE>::mask_width();
            SuperVector<VECTORSIZE> all_pair;
            vector_mask_type mask;
            vector_mask_type merged_mask;
            size_t loop = 0;
            size_t vector_match_iterations_needed =
                ((database->dshufti_database.pair_count - 1) /
                 (VECTORSIZE / 2));
            for (; loop <= vector_match_iterations_needed; loop++) {
                all_pair = SuperVector<VECTORSIZE>::load(
                    database->dshufti_database.all_pairs + (VECTORSIZE * loop));
                // It is fine if the vector isn't filled as we are guaranteed to
                // have a match before reaching the garbage data
                mask = all_pair.eqmask(found_pair);
                // now we have <width> bit set to 1 when a char match.
                // first we merge the lane result to keep only consecutive
                // matches
                merged_mask = mask & (mask >> width);
                // Then we filter to keep only a single bit per lane, and only
                // every other lane
                merged_mask =
                    merged_mask & database->dshufti_database.bit_filter_mask;
                if (merged_mask) {
                    break;
                }
            }
            // And finally we can ctz to get the first pair that match
            unsigned int id =
                (ctz64(merged_mask) / width / 2) + (loop * (VECTORSIZE / 2));
            size_t match_start =
                current_match - reinterpret_cast<const u8 *>(data);
            if (!onEvent(id, match_start, match_start + 2, 0, context)) {
                // user requested to stop matching
                break;
            }
        }
        current_buf = current_match + 1;
    }

    return HS_SUCCESS;
}
|
207
src/hs_direct_search.h
Normal file
207
src/hs_direct_search.h
Normal file
@ -0,0 +1,207 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2024-2025, Arm ltd
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef DIRECT_SEARCH_H
|
||||||
|
#define DIRECT_SEARCH_H
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include "allocator.h"
|
||||||
|
|
||||||
|
#include "fdr/fdr_internal.h"
|
||||||
|
#include "util/arch.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* FDR_pattern_storage memory layout:
|
||||||
|
*
|
||||||
|
* |-------------------------------------------------|
|
||||||
|
* | size_t pattern_count |
|
||||||
|
* |------------------------|------------------------|
|
||||||
|
* | pattern_raw_storage : char* pattern_ptrs[] |
|
||||||
|
* | :------------------------|
|
||||||
|
* | : size_t pattern_sizes[] |
|
||||||
|
* | :------------------------|
|
||||||
|
* | : char actual_storage[] |
|
||||||
|
* |------------------------|------------------------|
|
||||||
|
*
|
||||||
|
* Use size_fdr_pattern() to get the size to allocate.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Header of the variable-size pattern store; the pointer table, size table
 * and string bytes all live in the trailing raw storage. */
struct FDR_pattern_storage {
    /* number of patterns held in the store */
    size_t pattern_count;
    /* start of the packed tables; reached through the get_*() accessors */
    char pattern_raw_storage[];
};
|
||||||
|
|
||||||
|
static inline char **get_pattern_ptrs(struct FDR_pattern_storage *pat) {
|
||||||
|
// cppcheck-suppress cstyleCast
|
||||||
|
return (char **)((char *)pat +
|
||||||
|
offsetof(struct FDR_pattern_storage, pattern_raw_storage));
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline char *const *
|
||||||
|
get_const_pattern_ptrs(const struct FDR_pattern_storage *pat) {
|
||||||
|
// cppcheck-suppress cstyleCast
|
||||||
|
return (char *const *)((const char *)pat +
|
||||||
|
offsetof(struct FDR_pattern_storage,
|
||||||
|
pattern_raw_storage));
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline size_t *get_pattern_sizes(struct FDR_pattern_storage *pat) {
|
||||||
|
// cppcheck-suppress cstyleCast
|
||||||
|
return (size_t *)((char *)get_pattern_ptrs(pat) +
|
||||||
|
pat->pattern_count * sizeof(char *));
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline const size_t *
|
||||||
|
get_const_pattern_sizes(const struct FDR_pattern_storage *pat) {
|
||||||
|
// cppcheck-suppress cstyleCast
|
||||||
|
return (const size_t *)((const char *)get_const_pattern_ptrs(pat) +
|
||||||
|
pat->pattern_count * sizeof(char *));
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline char *
|
||||||
|
get_pattern_string_storage(struct FDR_pattern_storage *pat) {
|
||||||
|
return (char *)get_pattern_sizes(pat) + pat->pattern_count * sizeof(size_t);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline const char *
|
||||||
|
get_const_pattern_string_storage(const struct FDR_pattern_storage *pat) {
|
||||||
|
return (const char *)get_const_pattern_sizes(pat) +
|
||||||
|
pat->pattern_count * sizeof(size_t);
|
||||||
|
}
|
||||||
|
|
||||||
|
static
|
||||||
|
void init_pattern_store(struct FDR_pattern_storage *storage,
|
||||||
|
const char **in_expression, size_t in_pattern_count,
|
||||||
|
const size_t *in_expression_length) {
|
||||||
|
storage->pattern_count = in_pattern_count;
|
||||||
|
memcpy(get_pattern_sizes(storage), in_expression_length,
|
||||||
|
storage->pattern_count);
|
||||||
|
char *next_string = get_pattern_string_storage(storage);
|
||||||
|
for (size_t i = 0; i < storage->pattern_count; i++) {
|
||||||
|
memcpy(next_string, in_expression[i], in_expression_length[i]);
|
||||||
|
get_pattern_ptrs(storage)[i] = next_string;
|
||||||
|
get_pattern_sizes(storage)[i] = in_expression_length[i];
|
||||||
|
next_string += in_expression_length[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Single-pattern convenience wrapper around init_pattern_store(). */
static inline
void init_pattern_store_single(struct FDR_pattern_storage *storage,
                               const char *in_expression,
                               const size_t in_expression_length) {
    const char *only_expression[1] = { in_expression };
    const size_t only_length[1] = { in_expression_length };
    init_pattern_store(storage, only_expression, 1, only_length);
}
|
||||||
|
|
||||||
|
static
|
||||||
|
size_t size_fdr_pattern(size_t in_pattern_count,
|
||||||
|
const size_t *in_expression_length) {
|
||||||
|
size_t total_string_size = 0;
|
||||||
|
for (size_t i = 0; i < in_pattern_count; i++) {
|
||||||
|
total_string_size += in_expression_length[i];
|
||||||
|
}
|
||||||
|
size_t ptr_array_size = in_pattern_count * sizeof(char *);
|
||||||
|
size_t pattern_sizes_array_size = in_pattern_count * sizeof(size_t);
|
||||||
|
size_t required_mem = sizeof(struct FDR_pattern_storage) + ptr_array_size +
|
||||||
|
pattern_sizes_array_size + total_string_size;
|
||||||
|
return required_mem;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* combined_fdr_database memory layout:
|
||||||
|
*
|
||||||
|
* |-------------------------------------------------|
|
||||||
|
* | FDR *database |
|
||||||
|
* |-------------------------------------------------|
|
||||||
|
* | FDR_pattern_storage *patterns |
|
||||||
|
* |------------------------|------------------------|
|
||||||
|
* | raw_storage : FDR fdr_storage |
|
||||||
|
* | :------------------------|
|
||||||
|
* | : FDR_pattern_storage |
|
||||||
|
* |------------------------|------------------------|
|
||||||
|
*
|
||||||
|
* Use size_fdr_database() to get the size to allocate.
|
||||||
|
*/
|
||||||
|
/* One allocation holding an FDR table followed by its pattern store. */
struct combined_fdr_database {
    /* FDR table; set to the start of raw_storage by the init functions */
    struct FDR *database;
    /* pattern store; set to raw_storage + fdr_size by the init functions */
    struct FDR_pattern_storage *patterns;
    /* backing bytes for both of the objects above */
    unsigned char raw_storage[];
};
|
||||||
|
|
||||||
|
/*
 * Set up the internal pointers of a combined database and copy the patterns
 * into its pattern store. The FDR table bytes themselves are not written
 * here.
 *
 * database             - allocation of at least size_fdr_database() bytes
 * fdr_size             - size in bytes reserved for the FDR table
 * in_expression        - in_pattern_count pattern byte strings
 * in_pattern_count     - number of patterns
 * in_expression_length - length in bytes of each pattern
 */
void init_combined_fdr_database(struct combined_fdr_database *database,
                                size_t fdr_size, const char **in_expression,
                                size_t in_pattern_count,
                                const size_t *in_expression_length);
|
||||||
|
|
||||||
|
/*
 * Single-pattern form of init_combined_fdr_database(): set up the internal
 * pointers of a combined database and copy one pattern into its pattern
 * store. The FDR table bytes themselves are not written here.
 */
void init_combined_fdr_database_single(struct combined_fdr_database *database,
                                       size_t fdr_size,
                                       const char *in_expression,
                                       const size_t in_expression_length);
|
||||||
|
static inline
|
||||||
|
size_t size_fdr_database(size_t fdr_size, size_t in_pattern_count,
|
||||||
|
const size_t *in_expression_length) {
|
||||||
|
return sizeof(struct combined_fdr_database) +
|
||||||
|
size_fdr_pattern(in_pattern_count, in_expression_length) + fdr_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Single-pattern form of size_fdr_database(). */
static inline
size_t size_fdr_database_single(size_t fdr_size,
                                const size_t in_expression_length) {
    const size_t only_length[1] = { in_expression_length };
    return size_fdr_database(fdr_size, 1, only_length);
}
|
||||||
|
|
||||||
|
// Match callback handed to noodExec() by the Noodle-backed search functions.
hwlmcb_rv_t HS_CDECL noodle_to_hs_callback(size_t end, u32 id,
                                           struct hs_scratch *scratch);
|
||||||
|
|
||||||
|
// Receive the FDR callback and perform the check for longer patterns (>8 char)
|
||||||
|
hwlmcb_rv_t HS_CDECL FDR_to_hs_callback(size_t end, u32 id,
|
||||||
|
struct hs_scratch *scratch);
|
||||||
|
|
||||||
|
/* Per-scan state placed in the scratch user context for FDR searches. */
struct FDR_cb_context {
    /* opaque pointer supplied by the caller of the search function */
    void *usr_context;
    /* pattern store of the database being searched */
    const struct FDR_pattern_storage *patterns;
    /* buffer being scanned */
    const char *buffer;
    /* length of buffer in bytes */
    size_t buffer_length;
};
|
||||||
|
|
||||||
|
struct noodle_context {
|
||||||
|
void *usr_context;
|
||||||
|
u8 pattern_length;
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
} // extern "C"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif // DIRECT_SEARCH_H
|
495
src/hs_direct_search_compile.cpp
Normal file
495
src/hs_direct_search_compile.cpp
Normal file
@ -0,0 +1,495 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2024-2025, Arm ltd
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
#include "hs_common.h"
|
||||||
|
#include "hs_compile.h"
|
||||||
|
#include "hs_direct_search.h"
|
||||||
|
#include "hs_direct_search_types.h"
|
||||||
|
|
||||||
|
#include "allocator.h" // hs_database_alloc()
|
||||||
|
#include "grey.h"
|
||||||
|
#include "hwlm/hwlm.h" // HWLM_LITERAL_MAX_LEN
|
||||||
|
#include "hwlm/hwlm_internal.h" // HWLM_ENGINE_FDR
|
||||||
|
#include "hwlm/hwlm_literal.h" // ue2::hwlmLiteral
|
||||||
|
#include "hwlm/noodle_internal.h" // noodTable
|
||||||
|
#include "ue2common.h" // likely() - unlikely()
|
||||||
|
#include "util/arch.h" // CAN_USE_WIDE_TRUFFLE
|
||||||
|
#include "util/bytecode_ptr.h"
|
||||||
|
#include "util/charreach.h"
|
||||||
|
#include "util/flat_containers.h" // flat_set
|
||||||
|
#include "util/supervector/supervector.hpp"
|
||||||
|
#include "util/target_info.h" // target_t
|
||||||
|
|
||||||
|
#include "fdr/fdr_compile.h"
|
||||||
|
#include "hwlm/noodle_build.h"
|
||||||
|
#include "nfa/shufticompile.h"
|
||||||
|
#include "nfa/trufflecompile.h"
|
||||||
|
|
||||||
|
typedef typename SuperVector<VECTORSIZE>::comparemask_type vector_mask_type;
|
||||||
|
|
||||||
|
/*
 * Lay out a combined database inside its raw storage: the FDR table occupies
 * the first fdr_size bytes and the pattern store follows it. The patterns are
 * copied into the store; the FDR table bytes are left for the caller to fill.
 */
void init_combined_fdr_database(struct combined_fdr_database *database,
                                size_t fdr_size, const char **in_expression,
                                size_t in_pattern_count,
                                const size_t *in_expression_length) {
    unsigned char *const fdr_area = database->raw_storage;
    unsigned char *const pattern_area = fdr_area + fdr_size;

    database->database = reinterpret_cast<FDR *>(fdr_area);
    database->patterns = reinterpret_cast<FDR_pattern_storage *>(pattern_area);

    init_pattern_store(database->patterns, in_expression, in_pattern_count,
                       in_expression_length);
}
|
||||||
|
|
||||||
|
/*
 * Single-pattern form of the combined database set-up: the FDR table occupies
 * the first fdr_size bytes of the raw storage and the pattern store, holding
 * a copy of the one pattern, follows it. The FDR table bytes are left for the
 * caller to fill.
 */
void init_combined_fdr_database_single(struct combined_fdr_database *database,
                                       size_t fdr_size,
                                       const char *in_expression,
                                       const size_t in_expression_length) {
    unsigned char *const fdr_area = database->raw_storage;
    unsigned char *const pattern_area = fdr_area + fdr_size;

    database->database = reinterpret_cast<FDR *>(fdr_area);
    database->patterns = reinterpret_cast<FDR_pattern_storage *>(pattern_area);

    init_pattern_store_single(database->patterns, in_expression,
                              in_expression_length);
}
|
||||||
|
|
||||||
|
inline void generic_free(void *database) {
|
||||||
|
if (likely(database)) {
|
||||||
|
hs_database_free(database);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// --- short_literal (Noodle) ---
|
||||||
|
|
||||||
|
/**
 * Compile a short literal for use with hs_short_literal_search().
 *
 * The pattern is compiled case sensitive with the Noodle builder.
 *
 * @param expression        Pattern bytes; must not be null.
 * @param expression_length Pattern length in bytes; must be greater than 0
 *                          and at most HS_SHORT_PATTERN_THRESHOLD.
 * @param output_database   Receives the compiled pattern on success; release
 *                          it with hs_free_short_literal_pattern().
 * @return HS_SUCCESS on success, HS_INVALID if the pattern is longer than
 *         HS_SHORT_PATTERN_THRESHOLD, HS_UNKNOWN_ERROR if the Noodle table
 *         could not be built, HS_NOMEM if the allocation failed.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_compile_short_literal_search(
    const char *expression, size_t expression_length,
    hs_short_literal_compiled_pattern **output_database) {
    assert(expression_length > 0 &&
           "hs_compile_short_literal_search called with an empty pattern");
    assert(expression != nullptr &&
           "hs_compile_short_literal_search called with nullptr");
    assert(output_database != nullptr &&
           "hs_compile_short_literal_search called with nullptr");
    if (unlikely(expression_length > HS_SHORT_PATTERN_THRESHOLD)) {
        return HS_INVALID;
    }
    /*
     * Exposing caseness at the api level may restrict our ability to change
     * the backing algorithm, so we decided to make all algo case sensitive
     */
    bool is_case_insensitive = false;
    bool only_need_first_match = false;
    ue2::hwlmLiteral lit(std::string(expression, expression_length),
                         is_case_insensitive, only_need_first_match, 0,
                         HWLM_ALL_GROUPS, {}, {});

    // Build the table before allocating the result, so that a failed build
    // leaves nothing behind that would need to be released.
    ue2::bytecode_ptr<noodTable> bytecode_database = ue2::noodBuildTable(lit);
    if (unlikely(bytecode_database.get() == nullptr)) {
        return HS_UNKNOWN_ERROR;
    }

    hs_short_literal_compiled_pattern *database =
        reinterpret_cast<hs_short_literal_compiled_pattern *>(hs_database_alloc(
            sizeof(hs_short_literal_compiled_pattern)));
    if (unlikely(database == nullptr)) {
        return HS_NOMEM;
    }

    database->pattern_length = expression_length;
    memcpy(&(database->noodle_database), bytecode_database.get(),
           sizeof(noodTable));
    *output_database = database;

    return HS_SUCCESS;
}
|
||||||
|
|
||||||
|
/**
 * Release a pattern produced by hs_compile_short_literal_search().
 * A null pointer is ignored.
 */
HS_PUBLIC_API
void hs_free_short_literal_pattern(
    hs_short_literal_compiled_pattern *database) {
    if (likely(database)) {
        hs_database_free(database);
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// --- long_literal (FDR) ---
|
||||||
|
|
||||||
|
/**
 * Compile a literal of any length for use with hs_long_literal_search().
 *
 * Only the first HWLM_LITERAL_MAX_LEN bytes of the pattern are given to the
 * FDR builder; the full pattern is stored next to the FDR table so that the
 * remainder can be checked in the match callback. Matching is case sensitive.
 *
 * @param expression        Pattern bytes; must not be null.
 * @param expression_length Pattern length in bytes; must be greater than 0.
 * @param output_database   Receives the compiled pattern on success; release
 *                          it with hs_free_long_literal_pattern().
 * @return HS_SUCCESS on success, HS_UNKNOWN_ERROR if the FDR proto or table
 *         could not be built, HS_NOMEM if the allocation failed.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_compile_long_literal_search(
    const char *expression, size_t expression_length,
    hs_long_literal_compiled_pattern_t **output_database) {
    assert(expression_length > 0 &&
           "hs_compile_long_literal_search called with an empty pattern");
    assert(expression != nullptr &&
           "hs_compile_long_literal_search called with nullptr");
    assert(output_database != nullptr &&
           "hs_compile_long_literal_search called with nullptr");
    /*
     * Exposing caseness at the api level may restrict our ability to change
     * the backing algorithm, so we decided to make all algo case sensitive
     */
    bool is_case_insensitive = false;
    bool only_need_first_match = false;
    std::vector<ue2::hwlmLiteral> lits;
    // longer strings are checked in the callback
    ue2::hwlmLiteral lit(
        std::string(expression,
                    std::min(expression_length,
                             static_cast<size_t>(HWLM_LITERAL_MAX_LEN))),
        is_case_insensitive, only_need_first_match, 0, HWLM_ALL_GROUPS, {}, {});
    lits.push_back(lit);

    ue2::Grey g = ue2::Grey();
    u8 engType = HWLM_ENGINE_FDR;
    bool make_small = false;

    hs_platform_info platform_info;
    hs_populate_platform(&platform_info);

    ue2::target_t target = ue2::target_t(platform_info);

    std::unique_ptr<ue2::HWLMProto> proto =
        ue2::fdrBuildProto(engType, lits, make_small, target, g);
    // proto is dereferenced just below; refuse to continue without one.
    if (unlikely(!proto)) {
        return HS_UNKNOWN_ERROR;
    }

    ue2::bytecode_ptr<FDR> bytecode_database = ue2::fdrBuildTable(*proto, g);
    if (unlikely(bytecode_database.get() == nullptr)) {
        return HS_UNKNOWN_ERROR;
    }
    size_t fdr_size = bytecode_database.get()->size;

    // One allocation holds the FDR table followed by the full pattern.
    size_t mem_required = size_fdr_database_single(fdr_size, expression_length);
    struct combined_fdr_database *combined_database =
        reinterpret_cast<struct combined_fdr_database *>(
            hs_database_alloc(mem_required));
    if (unlikely(combined_database == nullptr)) {
        return HS_NOMEM;
    }
    init_combined_fdr_database_single(combined_database, fdr_size, expression,
                                      expression_length);
    memcpy(combined_database->database, bytecode_database.get(), fdr_size);
    *output_database = reinterpret_cast<hs_long_literal_compiled_pattern_t *>(
        combined_database);

    return HS_SUCCESS;
}
|
||||||
|
|
||||||
|
/**
 * Release a pattern produced by hs_compile_long_literal_search().
 * A null pointer is ignored.
 */
HS_PUBLIC_API
void hs_free_long_literal_pattern(
    hs_long_literal_compiled_pattern_t *database) {
    if (likely(database)) {
        hs_database_free(database);
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// --- multi_literal (FDR) ---
|
||||||
|
|
||||||
|
/**
 * Compile a set of literals into a database usable by the multi-literal
 * search.
 *
 * Each literal is registered with its index in @p expression as its ID. Only
 * the first HWLM_LITERAL_MAX_LEN bytes of a literal are given to the FDR
 * builder; per the original note, longer strings are checked in the callback.
 *
 * @param expression
 *      Array of @p pattern_count pointers to the literal bytes. Must not be
 *      null, and no entry may be null (asserted).
 * @param pattern_count
 *      Number of literals. Must be greater than zero (asserted).
 * @param expression_length
 *      Array of @p pattern_count literal lengths. Every length must be
 *      greater than zero (asserted).
 * @param output_database
 *      Receives the compiled pattern on success. The block is obtained from
 *      hs_database_alloc(); hs_free_multi_literal_pattern() releases it.
 *
 * @return
 *      HS_SUCCESS on success; HS_UNKNOWN_ERROR if the FDR prototype or table
 *      could not be built; HS_NOMEM if the output block could not be
 *      allocated.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_compile_multi_literal_search(
    const char **expression, size_t pattern_count,
    const size_t *expression_length,
    hs_multi_literal_compiled_pattern_t **output_database) {
    assert(pattern_count > 0 &&
           "hs_compile_multi_literal_search called with no pattern");
    assert(expression != nullptr &&
           "hs_compile_multi_literal_search called with nullptr");
    assert(expression_length != nullptr &&
           "hs_compile_multi_literal_search called with nullptr");
    assert(output_database != nullptr &&
           "hs_compile_multi_literal_search called with nullptr");

    /*
     * Exposing case sensitivity at the API level may restrict our ability to
     * change the backing algorithm, so every direct-search algorithm is
     * case-sensitive.
     */
    bool is_case_insensitive = false;
    bool only_need_first_match = false;

    std::vector<ue2::hwlmLiteral> lits;
    lits.reserve(pattern_count);
    for (size_t i = 0; i < pattern_count; i++) {
        assert(expression_length[i] > 0 && expression[i] &&
               "hs_compile_multi_literal_search called with an empty pattern");
        // longer strings are checked in the callback
        ue2::hwlmLiteral lit(
            std::string(expression[i], std::min(expression_length[i],
                                                (size_t)HWLM_LITERAL_MAX_LEN)),
            is_case_insensitive, only_need_first_match, i, HWLM_ALL_GROUPS, {},
            {});
        lits.push_back(lit);
    }

    ue2::Grey g = ue2::Grey();
    u8 engType = HWLM_ENGINE_FDR;
    bool make_small = false;

    hs_platform_info platform_info;
    hs_populate_platform(&platform_info);

    ue2::target_t target = ue2::target_t(platform_info);

    std::unique_ptr<ue2::HWLMProto> proto =
        ue2::fdrBuildProto(engType, lits, make_small, target, g);
    // Dereferencing an empty unique_ptr is undefined behaviour, so a failed
    // prototype build must be reported before fdrBuildTable() is called.
    if (unlikely(!proto)) {
        return HS_UNKNOWN_ERROR;
    }

    ue2::bytecode_ptr<FDR> bytecode_database = ue2::fdrBuildTable(*proto, g);
    if (unlikely(bytecode_database.get() == nullptr)) {
        return HS_UNKNOWN_ERROR;
    }
    size_t fdr_size = bytecode_database.get()->size;

    size_t mem_required =
        size_fdr_database(fdr_size, pattern_count, expression_length);
    struct combined_fdr_database *combined_database =
        reinterpret_cast<struct combined_fdr_database *>(
            hs_database_alloc(mem_required));
    if (unlikely(combined_database == nullptr)) {
        return HS_NOMEM;
    }

    init_combined_fdr_database(combined_database, fdr_size, expression,
                               pattern_count, expression_length);
    // The FDR bytecode is copied into the user-visible block.
    memcpy(combined_database->database, bytecode_database.get(), fdr_size);
    *output_database = reinterpret_cast<hs_multi_literal_compiled_pattern_t *>(
        combined_database);

    return HS_SUCCESS;
}
|
||||||
|
|
||||||
|
/**
 * Release a compiled multi-literal pattern.
 *
 * The pointer is handed unchanged to generic_free(), the helper shared by
 * all hs_free_*_pattern() entry points in this file.
 *
 * @param database
 *      The compiled pattern to release.
 */
HS_PUBLIC_API
void hs_free_multi_literal_pattern(
    hs_multi_literal_compiled_pattern_t *database) {
    generic_free(database);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// --- single_char (Noodle) ---
|
||||||
|
|
||||||
|
/**
 * Compile a single character into a Noodle table for the single-char search.
 *
 * @param character
 *      The byte to search for. It is compiled as a one-byte, case-sensitive
 *      literal with ID 0.
 * @param output_database
 *      Receives the compiled pattern on success. Must not be null (asserted).
 *      The block is obtained from hs_database_alloc();
 *      hs_free_single_char_pattern() releases it.
 *
 * @return
 *      HS_SUCCESS on success; HS_UNKNOWN_ERROR if the Noodle table could not
 *      be built; HS_NOMEM if the output block could not be allocated.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_compile_single_char_search(
    const char character, hs_single_char_compiled_pattern **output_database) {
    assert(output_database != nullptr &&
           "hs_compile_single_char_search called with nullptr");

    /*
     * Exposing case sensitivity at the API level may restrict our ability to
     * change the backing algorithm, so every direct-search algorithm is
     * case-sensitive.
     */
    bool is_case_insensitive = false;
    bool only_need_first_match = false;
    ue2::hwlmLiteral lit(std::string(&character, 1), is_case_insensitive,
                         only_need_first_match, 0, HWLM_ALL_GROUPS, {}, {});

    // Build the table before allocating the output block: a build failure
    // then has nothing to release, so the early return cannot leak.
    ue2::bytecode_ptr<noodTable> bytecode_database = ue2::noodBuildTable(lit);
    if (unlikely(bytecode_database.get() == nullptr)) {
        return HS_UNKNOWN_ERROR;
    }

    hs_single_char_compiled_pattern *database =
        reinterpret_cast<hs_single_char_compiled_pattern *>(hs_database_alloc(
            sizeof(hs_single_char_compiled_pattern)));
    if (unlikely(database == nullptr)) {
        return HS_NOMEM;
    }

    memcpy(&(database->noodle_database), bytecode_database.get(),
           sizeof(noodTable));
    *output_database = database;

    return HS_SUCCESS;
}
|
||||||
|
|
||||||
|
/**
 * Release a compiled single-character pattern.
 *
 * The pointer is handed unchanged to generic_free(), the helper shared by
 * all hs_free_*_pattern() entry points in this file.
 *
 * @param database
 *      The compiled pattern to release.
 */
HS_PUBLIC_API
void hs_free_single_char_pattern(
    hs_single_char_compiled_pattern *database) {
    generic_free(database);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// --- char_set (Truffle) ---
|
||||||
|
|
||||||
|
/**
 * Compile a character set into Truffle masks for the char-set search.
 *
 * Besides the masks, the compiled pattern records for every character of the
 * set its index in @p character_array, so that the index can be recovered
 * from a matching character. If a character is listed more than once, the
 * index of its last occurrence is the one recorded.
 *
 * @param character_array
 *      The characters of the set. Must not be null (asserted).
 * @param character_count
 *      Number of entries in @p character_array. Must be greater than zero
 *      (asserted) and at most 256, because indices are stored as u8.
 * @param output_database
 *      Receives the compiled pattern on success. Must not be null (asserted).
 *      The block is obtained from hs_database_alloc();
 *      hs_free_char_set_pattern() releases it.
 *
 * @return
 *      HS_SUCCESS on success; HS_INVALID if @p character_count exceeds 256;
 *      HS_NOMEM if the output block could not be allocated.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL
hs_compile_char_set_search(const char *character_array, size_t character_count,
                           hs_char_set_compiled_pattern **output_database) {
    assert(character_count > 0 &&
           "hs_compile_char_set_search called with an empty set");
    assert(character_array != nullptr &&
           "hs_compile_char_set_search called with nullptr");
    assert(output_database != nullptr &&
           "hs_compile_char_set_search called with nullptr");

    // char_id_map stores each index in a u8, so indices above 255 cannot be
    // represented. Reject such input before anything is allocated.
    if (unlikely(character_count > 256)) {
        return HS_INVALID;
    }

    const ue2::CharReach cr =
        ue2::CharReach(std::string(character_array, character_count));
    truffle_storage *database = reinterpret_cast<truffle_storage *>(
        hs_database_alloc(sizeof(truffle_storage)));
    // A null pointer would pass the alignment assert below and then be
    // dereferenced, so allocation failure is reported explicitly.
    if (unlikely(database == nullptr)) {
        return HS_NOMEM;
    }
    // hs_database_alloc is meant to align to a machine word (likely 64b), which
    // is actually required here
    assert((((intptr_t)(database) & 3) == 0) &&
           "user-provided alloc didn't meet alignment requirement in "
           "hs_compile_char_set_search");

    // size_t counter: a u8 counter wraps to 0 after 255 and would never
    // terminate for character_count == 256.
    for (size_t i = 0; i < character_count; i++) {
        database->char_id_map[(u8)character_array[i]] = (u8)i;
    }

#ifdef CAN_USE_WIDE_TRUFFLE
    ue2::truffleBuildMasksWide(cr, database->wide_mask);
#else
    ue2::truffleBuildMasks(cr, database->mask1,
                           database->mask2);
#endif

    *output_database = database;

    return HS_SUCCESS;
}
|
||||||
|
|
||||||
|
/**
 * Release a compiled character-set pattern.
 *
 * The pointer is handed unchanged to generic_free(), the helper shared by
 * all hs_free_*_pattern() entry points in this file.
 *
 * @param database
 *      The compiled pattern to release.
 */
HS_PUBLIC_API
void hs_free_char_set_pattern(hs_char_set_compiled_pattern *database) {
    generic_free(database);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// --- single_char_pair (Noodle) ---
|
||||||
|
|
||||||
|
/**
 * Compile an ordered character pair into a Noodle table for the single-pair
 * search.
 *
 * @param pair
 *      Pointer to the pair. Exactly two bytes are read from it; they are
 *      compiled as a two-byte, case-sensitive literal with ID 0. Must not be
 *      null (asserted).
 * @param output_database
 *      Receives the compiled pattern on success. Must not be null (asserted).
 *      The block is obtained from hs_database_alloc();
 *      hs_free_single_char_pair_pattern() releases it.
 *
 * @return
 *      HS_SUCCESS on success; HS_UNKNOWN_ERROR if the Noodle table could not
 *      be built; HS_NOMEM if the output block could not be allocated.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_compile_single_char_pair_search(
    const char *pair, hs_single_char_pair_compiled_pattern **output_database) {
    assert(pair != nullptr &&
           "hs_compile_single_char_pair_search called with nullptr");
    assert(output_database != nullptr &&
           "hs_compile_single_char_pair_search called with nullptr");

    /*
     * Exposing case sensitivity at the API level may restrict our ability to
     * change the backing algorithm, so every direct-search algorithm is
     * case-sensitive.
     */
    bool is_case_insensitive = false;
    bool only_need_first_match = false;
    ue2::hwlmLiteral lit(std::string(pair, 2), is_case_insensitive,
                         only_need_first_match, 0, HWLM_ALL_GROUPS, {}, {});

    // Build the table before allocating the output block: a build failure
    // then has nothing to release, so the early return cannot leak.
    ue2::bytecode_ptr<noodTable> bytecode_database = ue2::noodBuildTable(lit);
    if (unlikely(bytecode_database.get() == nullptr)) {
        return HS_UNKNOWN_ERROR;
    }

    hs_single_char_pair_compiled_pattern *database =
        reinterpret_cast<hs_single_char_pair_compiled_pattern *>(
            hs_database_alloc(sizeof(hs_single_char_pair_compiled_pattern)));
    if (unlikely(database == nullptr)) {
        return HS_NOMEM;
    }

    memcpy(&(database->noodle_database), bytecode_database.get(),
           sizeof(noodTable));
    *output_database = database;

    return HS_SUCCESS;
}
|
||||||
|
|
||||||
|
/**
 * Release a compiled single-character-pair pattern.
 *
 * The pointer is handed unchanged to generic_free(), the helper shared by
 * all hs_free_*_pattern() entry points in this file.
 *
 * @param database
 *      The compiled pattern to release.
 */
HS_PUBLIC_API
void hs_free_single_char_pair_pattern(
    hs_single_char_pair_compiled_pattern *database) {
    generic_free(database);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// --- char_pair_set (Double shufti) ---
|
||||||
|
|
||||||
|
/**
 * Compile a set of ordered character pairs into double-shufti masks for the
 * pair-set search.
 *
 * The compiled pattern also keeps a verbatim copy of the input pairs and a
 * bit-filter mask derived from the SuperVector compare-mask width.
 *
 * @param expression
 *      The pairs, stored back to back: pair i occupies bytes 2*i and 2*i+1,
 *      so 2 * @p pair_count bytes are read. Must not be null (asserted).
 * @param pair_count
 *      Number of pairs. Must be greater than zero (asserted).
 * @param output_database
 *      Receives the compiled pattern on success. Must not be null (asserted).
 *      The block is obtained from hs_database_alloc();
 *      hs_free_char_pair_set_pattern() releases it.
 *
 * @return
 *      HS_SUCCESS on success; HS_NOMEM if the output block could not be
 *      allocated; HS_COMPILER_ERROR if the double-shufti masks could not be
 *      built for this set.
 */
HS_PUBLIC_API
hs_error_t HS_CDECL hs_compile_char_pair_set_search(
    const char *expression, size_t pair_count,
    hs_char_pair_set_compiled_pattern **output_database) {
    assert(pair_count > 0 &&
           "hs_compile_char_pair_set_search called with an empty set");
    assert(expression != nullptr &&
           "hs_compile_char_pair_set_search called with nullptr");
    assert(output_database != nullptr &&
           "hs_compile_char_pair_set_search called with nullptr");

    ue2::flat_set<std::pair<u8, u8>> pairs;
    // size_t counter: a u8 counter wraps to 0 after 255 and would never
    // terminate for pair_count >= 256.
    for (size_t i = 0; i < pair_count; i++) {
        pairs.insert(
            std::make_pair((u8)expression[2 * i], (u8)expression[2 * i + 1]));
    }

    // The pairs are appended after the fixed-size part of the structure.
    hs_char_pair_set_compiled_pattern *database =
        reinterpret_cast<hs_char_pair_set_compiled_pattern *>(hs_database_alloc(
            sizeof(hs_char_pair_set_compiled_pattern) +
            sizeof(char) * 2 * pair_count));
    // A null pointer would pass the alignment assert below and then be
    // dereferenced, so allocation failure is reported explicitly.
    if (unlikely(database == nullptr)) {
        return HS_NOMEM;
    }
    // hs_database_alloc is meant to align to a machine word (likely 64b), which
    // is actually required here
    assert((((intptr_t)(database) & 3) == 0) &&
           "user-provided alloc didn't meet alignment requirement in "
           "hs_compile_char_pair_set_search");

    bool success = ue2::shuftiBuildDoubleMasks(
        ue2::CharReach(), pairs, database->dshufti_database.mask1,
        database->dshufti_database.mask2, database->dshufti_database.mask3,
        database->dshufti_database.mask4);

    if (!success) {
        // The block has not been handed to the caller yet; release it here so
        // the failure path does not leak.
        generic_free(database);
        return HS_COMPILER_ERROR;
    }

    database->dshufti_database.pair_count = pair_count;

    size_t width = SuperVector<VECTORSIZE>::mask_width();
    assert(width <= 4 &&
           "Code needs rework if supervector's mask are bigger than 4");
    assert(width != 3 &&
           "Code needs rework if supervector's mask aren't a power of 2");
    // we need a mask such that every 2*width bits, only the lsb is set to 1
    // so for a width of 4, we repeat 0X01
    unsigned char bit_filter_mask = 0;
    for (size_t i = 8; i > 0; i -= 2 * width) {
        bit_filter_mask = bit_filter_mask << (2 * width) | 0x1;
    }
    // Every byte of the stored mask gets the same one-byte pattern.
    memset(&(database->dshufti_database.bit_filter_mask), bit_filter_mask,
           sizeof(vector_mask_type));
    memcpy(database->dshufti_database.all_pairs, expression, 2 * pair_count);

    *output_database = database;

    return HS_SUCCESS;
}
|
||||||
|
|
||||||
|
/**
 * Release a compiled character-pair-set pattern.
 *
 * The pointer is handed unchanged to generic_free(), the helper shared by
 * all hs_free_*_pattern() entry points in this file.
 *
 * @param database
 *      The compiled pattern to release.
 */
HS_PUBLIC_API
void hs_free_char_pair_set_pattern(
    hs_char_pair_set_compiled_pattern *database) {
    generic_free(database);
}
|
||||||
|
|
87
src/hs_direct_search_types.h
Normal file
87
src/hs_direct_search_types.h
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2024-2025, Arm ltd
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef DIRECT_SEARCH_TYPES_H
|
||||||
|
#define DIRECT_SEARCH_TYPES_H
|
||||||
|
|
||||||
|
#include <stdalign.h>
|
||||||
|
|
||||||
|
#include "util/supervector/supervector.hpp"
|
||||||
|
|
||||||
|
#include "fdr/fdr_internal.h"
|
||||||
|
#include "hwlm/noodle_internal.h"
|
||||||
|
|
||||||
|
|
||||||
|
struct hs_short_literal_compiled_pattern {
|
||||||
|
noodTable noodle_database;
|
||||||
|
u8 pattern_length;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct hs_long_literal_compiled_pattern {
|
||||||
|
struct combined_fdr_database fdr_database;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct hs_multi_literal_compiled_pattern {
|
||||||
|
struct combined_fdr_database fdr_database;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct hs_single_char_compiled_pattern {
|
||||||
|
struct noodTable noodle_database;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct hs_single_char_pair_compiled_pattern {
|
||||||
|
struct noodTable noodle_database;
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef struct hs_char_set_compiled_pattern {
|
||||||
|
union
|
||||||
|
{
|
||||||
|
struct {
|
||||||
|
uint8_t mask1[16] __attribute__((aligned));
|
||||||
|
uint8_t mask2[16] __attribute__((aligned));
|
||||||
|
};
|
||||||
|
uint8_t wide_mask[32] __attribute__((aligned));
|
||||||
|
};
|
||||||
|
// allows us to get the id from the character
|
||||||
|
u8 char_id_map[256];
|
||||||
|
} truffle_storage;
|
||||||
|
|
||||||
|
struct dshufti_storage {
|
||||||
|
alignas(16) uint8_t mask1[16];
|
||||||
|
alignas(16) uint8_t mask2[16];
|
||||||
|
alignas(16) uint8_t mask3[16];
|
||||||
|
alignas(16) uint8_t mask4[16];
|
||||||
|
size_t pair_count;
|
||||||
|
typename SuperVector<VECTORSIZE>::comparemask_type bit_filter_mask;
|
||||||
|
alignas(VECTORSIZE) uint8_t all_pairs[];
|
||||||
|
};
|
||||||
|
|
||||||
|
struct hs_char_pair_set_compiled_pattern {
|
||||||
|
struct dshufti_storage dshufti_database;
|
||||||
|
};
|
||||||
|
#endif // DIRECT_SEARCH_TYPES_H
|
221
src/hs_runtime.h
221
src/hs_runtime.h
@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2015-2018, Intel Corporation
|
* Copyright (c) 2015-2018, Intel Corporation
|
||||||
|
* Copyright (c) 2024-2025, Arm ltd
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@ -614,6 +615,226 @@ hs_error_t HS_CDECL hs_free_scratch(hs_scratch_t *scratch);
|
|||||||
*/
|
*/
|
||||||
#define HS_OFFSET_PAST_HORIZON (~0ULL)
|
#define HS_OFFSET_PAST_HORIZON (~0ULL)
|
||||||
|
|
||||||
|
/** @} */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The following functions are part of the extended API.
|
||||||
|
* This extension offers direct access to search algorithms
|
||||||
|
* allowing the user to minimise calling overhead for simple
|
||||||
|
* search use cases where type of the search is known.
|
||||||
|
*
|
||||||
|
* All search functions handle a limited kind of patterns. For more generic
|
||||||
|
* patterns, use @ref hs_scan()
|
||||||
|
*
|
||||||
|
* NOTE: All search functions are considered case-sensitive.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @defgroup DIRECT_API_RUNTIME
|
||||||
|
*
|
||||||
|
* @{
|
||||||
|
*/
|
||||||
|
|
||||||
|
/** Callback return value indicating that we should continue matching. */
|
||||||
|
#define CB_CONTINUE_MATCHING (int)(~0U)
|
||||||
|
|
||||||
|
/** Callback return value indicating that we should halt matching. */
|
||||||
|
#define CB_TERMINATE_MATCHING (int)0
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Search the given data for the short literal pattern up to
|
||||||
|
* @ref HS_SHORT_PATTERN_THRESHOLD chars long. For longer patterns, use @ref
|
||||||
|
* hs_long_literal_search(). Other options exist for character pairs or sets.
|
||||||
|
*
|
||||||
|
* @param database
|
||||||
|
* The compiled pattern returned by @ref hs_compile_short_literal_search()
|
||||||
|
* @param data
|
||||||
|
* Pointer to the data to be scanned.
|
||||||
|
* @param length
|
||||||
|
* The number of bytes to scan.
|
||||||
|
* @param onEvent
|
||||||
|
* Pointer to a @ref match_event_handler callback function. If a NULL
|
||||||
|
* pointer is given, no matches will be returned.
|
||||||
|
* The "flag" argument is unused.
|
||||||
|
* @param context
|
||||||
|
* The user defined pointer which will be passed to the callback function.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
|
||||||
|
* match callback indicated that scanning should stop; other values on
|
||||||
|
* error.
|
||||||
|
*/
|
||||||
|
hs_error_t HS_CDECL hs_short_literal_search(
|
||||||
|
const hs_short_literal_compiled_pattern_t *database, const char *data,
|
||||||
|
size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Search the given data for the long literal pattern.
|
||||||
|
*
|
||||||
|
* If the pattern length is less than or equal to @ref HS_SHORT_PATTERN_THRESHOLD,
|
||||||
|
* @ref hs_short_literal_search() may be faster.
|
||||||
|
*
|
||||||
|
* @param database
|
||||||
|
* The compiled pattern returned by @ref hs_compile_long_literal_search()
|
||||||
|
* @param data
|
||||||
|
* Pointer to the data to be scanned.
|
||||||
|
* @param length
|
||||||
|
* The number of bytes to scan.
|
||||||
|
* @param onEvent
|
||||||
|
* Pointer to a @ref match_event_handler callback function. If a NULL
|
||||||
|
* pointer is given, no matches will be returned.
|
||||||
|
* The "flag" argument is unused.
|
||||||
|
* @param context
|
||||||
|
* The user defined pointer which will be passed to the callback function.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
|
||||||
|
* match callback indicated that scanning should stop; other values on
|
||||||
|
* error.
|
||||||
|
*/
|
||||||
|
hs_error_t HS_CDECL hs_long_literal_search(
|
||||||
|
const hs_long_literal_compiled_pattern_t *database, const char *data,
|
||||||
|
size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Search the given data for several long literal patterns at once.
|
||||||
|
*
|
||||||
|
* @param database
|
||||||
|
* The compiled pattern returned by @ref hs_compile_multi_literal_search()
|
||||||
|
* @param data
|
||||||
|
* Pointer to the data to be scanned.
|
||||||
|
* @param length
|
||||||
|
* The number of bytes to scan.
|
||||||
|
* @param onEvent
|
||||||
|
* Pointer to a @ref match_event_handler callback function. If a NULL
|
||||||
|
* pointer is given, no matches will be returned.
|
||||||
|
* The "flag" argument is unused.
|
||||||
|
* The reported ID is the index of the matching literal.
|
||||||
|
* @param context
|
||||||
|
* The user defined pointer which will be passed to the callback function.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
|
||||||
|
* match callback indicated that scanning should stop; other values on
|
||||||
|
* error.
|
||||||
|
*/
|
||||||
|
hs_error_t HS_CDECL hs_multi_literal_search(
|
||||||
|
const hs_multi_literal_compiled_pattern_t *database, const char *data,
|
||||||
|
size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Search the given data for any occurrence of the given character.
|
||||||
|
*
|
||||||
|
* @param database
|
||||||
|
* The compiled pattern returned by @ref hs_compile_single_char_search()
|
||||||
|
* @param data
|
||||||
|
* Pointer to the data to be scanned.
|
||||||
|
* @param length
|
||||||
|
* The number of bytes to scan.
|
||||||
|
* @param onEvent
|
||||||
|
* Pointer to a @ref match_event_handler callback function. If a NULL
|
||||||
|
* pointer is given, no matches will be returned.
|
||||||
|
* The "flag" argument is unused.
|
||||||
|
* @param context
|
||||||
|
* The user defined pointer which will be passed to the callback function.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
|
||||||
|
* match callback indicated that scanning should stop; other values on
|
||||||
|
* error.
|
||||||
|
*/
|
||||||
|
hs_error_t HS_CDECL hs_single_char_search(
|
||||||
|
const hs_single_char_compiled_pattern_t *database, const char *data,
|
||||||
|
size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Search the given data for occurrences of any character from the given
|
||||||
|
* character set.
|
||||||
|
*
|
||||||
|
* @param database
|
||||||
|
* The compiled pattern returned by @ref hs_compile_char_set_search()
|
||||||
|
* @param data
|
||||||
|
* Pointer to the data to be scanned.
|
||||||
|
* @param length
|
||||||
|
* The number of bytes to scan.
|
||||||
|
* @param onEvent
|
||||||
|
* Pointer to a @ref match_event_handler callback function. If a NULL
|
||||||
|
* pointer is given, no matches will be returned.
|
||||||
|
* The "flag" argument is unused.
|
||||||
|
* The reported ID is the index of the matching char.
|
||||||
|
* @param context
|
||||||
|
* The user defined pointer which will be passed to the callback function.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
|
||||||
|
* match callback indicated that scanning should stop; other values on
|
||||||
|
* error.
|
||||||
|
*/
|
||||||
|
hs_error_t HS_CDECL hs_char_set_search(
|
||||||
|
const hs_char_set_compiled_pattern_t *database, const char *data,
|
||||||
|
size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Search the given data for occurrences of the given ordered character pair
|
||||||
|
* ("Aj" won't match "jA").
|
||||||
|
*
|
||||||
|
* @param database
|
||||||
|
* The compiled pattern returned by @ref hs_compile_single_char_pair_search()
|
||||||
|
* @param data
|
||||||
|
* Pointer to the data to be scanned.
|
||||||
|
* @param length
|
||||||
|
* The number of bytes to scan.
|
||||||
|
* @param onEvent
|
||||||
|
* Pointer to a @ref match_event_handler callback function. If a NULL
|
||||||
|
* pointer is given, no matches will be returned.
|
||||||
|
* The "flag" argument is unused.
|
||||||
|
* @param context
|
||||||
|
* The user defined pointer which will be passed to the callback function.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
|
||||||
|
* match callback indicated that scanning should stop; other values on
|
||||||
|
* error.
|
||||||
|
*/
|
||||||
|
hs_error_t HS_CDECL hs_single_char_pair_search(
|
||||||
|
const hs_single_char_pair_compiled_pattern_t *database, const char *data,
|
||||||
|
size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Search the given data for occurrences of any of the ordered character pairs
|
||||||
|
* from the given set ("Aj" won't match "jA")
|
||||||
|
*
|
||||||
|
* @param database
|
||||||
|
* The compiled pattern returned by @ref
|
||||||
|
* hs_compile_char_pair_set_search()
|
||||||
|
* @param data
|
||||||
|
* Pointer to the data to be scanned.
|
||||||
|
* @param length
|
||||||
|
* The number of bytes to scan.
|
||||||
|
* @param onEvent
|
||||||
|
* Pointer to a @ref match_event_handler callback function. If a NULL
|
||||||
|
* pointer is given, no matches will be returned.
|
||||||
|
* The "flag" argument is unused.
|
||||||
|
* The reported ID is the index of the matching pair.
|
||||||
|
* @param context
|
||||||
|
* The user defined pointer which will be passed to the callback function.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
|
||||||
|
* match callback indicated that scanning should stop; other values on
|
||||||
|
* error.
|
||||||
|
*/
|
||||||
|
hs_error_t HS_CDECL hs_char_pair_set_search(
|
||||||
|
const hs_char_pair_set_compiled_pattern_t *database, const char *data,
|
||||||
|
size_t length, match_event_handler onEvent,
|
||||||
|
void *context);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} /* extern "C" */
|
} /* extern "C" */
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
x
Reference in New Issue
Block a user