From 8af4850d8597d9451fb323215ade9e7575fb596a Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Mon, 30 Jan 2017 16:06:48 +1100 Subject: [PATCH] remove 'fast teddy' models --- src/fdr/fdr.c | 4 +- src/fdr/teddy.h | 11 +- src/fdr/teddy_avx2.c | 401 --------------------------- src/fdr/teddy_engine_description.cpp | 2 - 4 files changed, 3 insertions(+), 415 deletions(-) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 5ac8388c..a965ba14 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -771,8 +771,8 @@ typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, static const FDRFUNCTYPE funcs[] = { fdr_engine_exec, - ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fast), - ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fast), + NULL, /* old: fast teddy */ + NULL, /* old: fast teddy */ ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fat), ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fat), ONLY_AVX2(fdr_exec_teddy_avx2_msks2_fat), diff --git a/src/fdr/teddy.h b/src/fdr/teddy.h index e2936723..78cba847 100644 --- a/src/fdr/teddy.h +++ b/src/fdr/teddy.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -104,15 +104,6 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control); -hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control); - -hwlm_error_t -fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control); - #endif /* __AVX2__ */ #endif /* TEDDY_H_ */ diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index 129b99c7..22b74408 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -39,75 +39,6 @@ #if defined(__AVX2__) -static const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = { - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff} -}; - #ifdef ARCH_64_BIT #define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ do { \ @@ -199,22 +130,6 @@ do { \ } while (0); #endif -#define CONFIRM_FAST_TEDDY(var, offset, reason, conf_fn) \ -do { \ - if (unlikely(isnonzero256(var))) { \ - u32 arrCnt = 0; \ - m128 lo = cast256to128(var); \ - m128 hi = movdq_hi(var); \ - bit_array_fast_teddy(lo, bitArr, &arrCnt, offset); \ - bit_array_fast_teddy(hi, bitArr, &arrCnt, offset + 2); \ - for (u32 i = 0; i < arrCnt; i++) { \ - conf_fn(bitArr[i], confBase, reason, a, ptr, &control, \ - &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ - } \ -} while (0); - static really_inline m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, const u8 *buf_history, size_t len_history, @@ -226,183 +141,6 @@ m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, return ret; } -/* - * \brief Copy a block of [0,31] bytes efficiently. - * - * This function is a workaround intended to stop some compilers from - * synthesizing a memcpy function call out of the copy of a small number of - * bytes that we do in vectoredLoad128. - */ -static really_inline -void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) { - switch (len) { - case 0: - break; - case 1: - *dst = *src; - break; - case 2: - unaligned_store_u16(dst, unaligned_load_u16(src)); - break; - case 3: - unaligned_store_u16(dst, unaligned_load_u16(src)); - dst[2] = src[2]; - break; - case 4: - unaligned_store_u32(dst, unaligned_load_u32(src)); - break; - case 5: - case 6: - case 7: - /* Perform copy with two overlapping 4-byte chunks. */ - unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4)); - unaligned_store_u32(dst, unaligned_load_u32(src)); - break; - case 8: - unaligned_store_u64a(dst, unaligned_load_u64a(src)); - break; - case 9: - case 10: - case 11: - case 12: - case 13: - case 14: - case 15: - /* Perform copy with two overlapping 8-byte chunks. */ - unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8)); - unaligned_store_u64a(dst, unaligned_load_u64a(src)); - break; - case 16: - storeu128(dst, loadu128(src)); - break; - default: - /* Perform copy with two overlapping 16-byte chunks. */ - assert(len < 32); - storeu128(dst + len - 16, loadu128(src + len - 16)); - storeu128(dst, loadu128(src)); - break; - } -} - -static really_inline -m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, - const u8 *buf_history, size_t len_history) { - union { - u8 val8[32]; - m256 val256; - } u; - - uintptr_t copy_start; - uintptr_t copy_len; - - if (ptr >= lo) { - uintptr_t avail = (uintptr_t)(hi - ptr); - if (avail >= 32) { - *p_mask = load256(p_mask_arr256[32] + 32); - return loadu256(ptr); - } - *p_mask = load256(p_mask_arr256[avail] + 32); - copy_start = 0; - copy_len = avail; - } else { - // need contains "how many chars to pull from history" - // calculate based on what we need, what we have in the buffer - // and only what we need to make primary confirm work - uintptr_t start = (uintptr_t)(lo - ptr); - uintptr_t i; - for (i = start; ptr + i < lo; i++) { - u.val8[i] = buf_history[len_history - (lo - (ptr + i))]; - } - uintptr_t end = MIN(32, (uintptr_t)(hi - ptr)); - *p_mask = loadu256(p_mask_arr256[end - start] + 32 - start); - copy_start = i; - copy_len = end - i; - } - - // Runt block from the buffer. - copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len); - - return u.val256; -} - -static really_inline -void do_confWithBit1_fast_teddy(u16 bits, const u32 *confBase, - CautionReason reason, - const struct FDR_Runtime_Args *a, - const u8 *ptr, hwlmcb_rv_t *control, - u32 *last_match) { - u32 byte = bits / 8; - u32 cf = confBase[bits % 8]; - const struct FDRConfirm *fdrc = (const struct FDRConfirm *) - ((const u8 *)confBase + cf); - u64a confVal = getConfVal(a, ptr, byte, reason); - confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match, confVal); -} - -static really_inline -void do_confWithBit_fast_teddy(u16 bits, const u32 *confBase, - CautionReason reason, - const struct FDR_Runtime_Args *a, const u8 *ptr, - hwlmcb_rv_t *control, u32 *last_match) { - u32 byte = bits / 8; - u32 cf = confBase[bits % 8]; - if (!cf) { - return; - } - const struct FDRConfirm *fdrc = (const struct FDRConfirm *) - ((const u8 *)confBase + cf); - if (!(fdrc->groups & *control)) { - return; - } - u64a confVal = getConfVal(a, ptr, byte, reason); - confWithBit(fdrc, a, ptr - a->buf + byte, control, last_match, confVal); -} - -static really_inline -void bit_array_fast_teddy(m128 var, u16 *bitArr, u32 *arrCnt, u32 offset) { - if (unlikely(isnonzero128(var))) { -#ifdef ARCH_64_BIT - u64a part_0 = movq(var); - while (unlikely(part_0)) { - bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_0) + - 64 * (offset); - *arrCnt += 1; - } - u64a part_1 = movq(rshiftbyte_m128(var, 8)); - while (unlikely(part_1)) { - bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) + - 64 * (offset + 1); - *arrCnt += 1; - } -#else - u32 part_0 = movd(var); - while (unlikely(part_0)) { - bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_0) + - 32 * (offset * 2); - *arrCnt += 1; - } - u32 part_1 = movd(rshiftbyte_m128(var, 4)); - while (unlikely(part_1)) { - bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) + - 32 * (offset * 2 + 1); - *arrCnt += 1; - } - u32 part_2 = movd(rshiftbyte_m128(var, 8)); - while (unlikely(part_2)) { - bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_2) + - 32 * (offset * 2 + 2); - *arrCnt += 1; - } - u32 part_3 = movd(rshiftbyte_m128(var, 12)); - while (unlikely(part_3)) { - bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_3) + - 32 * (offset * 2 + 3); - *arrCnt += 1; - } -#endif - } -} - static really_inline m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) { m256 mask = set32x8(0xf); @@ -456,13 +194,6 @@ m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, return and256(r, res_shifted_3); } -static really_inline -m256 prep_conf_fast_teddy_m1(m256 val, m256 mask, m256 maskLo, m256 maskHi) { - m256 lo = and256(val, mask); - m256 hi = and256(rshift64_m256(val, 4), mask); - return and256(vpshufb(maskLo, lo), vpshufb(maskHi, hi)); -} - static really_inline const m256 * getMaskBase_avx2(const struct Teddy *teddy) { return (const m256 *)((const u8 *)teddy + sizeof(struct Teddy)); @@ -956,136 +687,4 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, return HWLM_SUCCESS; } -hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 64; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy, 1); - - const m256 maskLo = set2x128(maskBase[0]); - const m256 maskHi = set2x128(maskBase[1]); - const m256 mask = set32x8(0xf); - u16 bitArr[512]; - - const u8 *mainStart = ROUNDUP_PTR(ptr, 32); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 32; - m256 p_mask; - m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, - buf_end, a->buf_history, a->len_history); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); - res_0 = and256(res_0, p_mask); - CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy); - ptr += 32; - } - - if (ptr + 32 < buf_end) { - m256 val_0 = load256(ptr + 0); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); - CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy); - ptr += 32; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - - m256 val_0 = load256(ptr + 0); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); - CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit1_fast_teddy); - - m256 val_1 = load256(ptr + 32); - m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi); - CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit1_fast_teddy); - } - - for (; ptr < buf_end; ptr += 32) { - m256 p_mask; - m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, - buf_end, a->buf_history, a->len_history); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); - res_0 = and256(res_0, p_mask); - CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy); - } - - return HWLM_SUCCESS; -} - -hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 64; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy, 1); - - const m256 maskLo = set2x128(maskBase[0]); - const m256 maskHi = set2x128(maskBase[1]); - const m256 mask = set32x8(0xf); - u16 bitArr[512]; - - const u8 *mainStart = ROUNDUP_PTR(ptr, 32); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 32; - m256 p_mask; - m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, - buf_end, a->buf_history, a->len_history); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); - res_0 = and256(res_0, p_mask); - CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); - ptr += 32; - } - - if (ptr + 32 < buf_end) { - m256 val_0 = load256(ptr + 0); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); - CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); - ptr += 32; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - - m256 val_0 = load256(ptr + 0); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); - CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit_fast_teddy); - - m256 val_1 = load256(ptr + 32); - m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi); - CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit_fast_teddy); - } - - for (; ptr < buf_end; ptr += 32) { - m256 p_mask; - m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, - buf_end, a->buf_history, a->len_history); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); - res_0 = and256(res_0, p_mask); - CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); - } - - return HWLM_SUCCESS; -} - #endif // __AVX2__ diff --git a/src/fdr/teddy_engine_description.cpp b/src/fdr/teddy_engine_description.cpp index 9e876b0b..f7559b13 100644 --- a/src/fdr/teddy_engine_description.cpp +++ b/src/fdr/teddy_engine_description.cpp @@ -65,8 +65,6 @@ bool TeddyEngineDescription::needConfirm(const vector &lits) const void getTeddyDescriptions(vector *out) { static const TeddyEngineDef defns[] = { - { 1, 0 | HS_CPU_FEATURES_AVX2, 1, 8, false }, - { 2, 0 | HS_CPU_FEATURES_AVX2, 1, 8, true }, { 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false }, { 4, 0 | HS_CPU_FEATURES_AVX2, 1, 16, true }, { 5, 0 | HS_CPU_FEATURES_AVX2, 2, 16, false },