Merge 5ae6ca72750fb29c2a9a1db7f2ca8e86d6d7d3e6 into 9e9a10ad01fceb2032ae6e36cb0262c4dbba90c7

Leslie Zhai 2025-06-14 14:08:27 +03:00 committed by GitHub
commit ed87b83f49
31 changed files with 1911 additions and 4 deletions

View File

@ -141,6 +141,8 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64)
include (${CMAKE_MODULE_PATH}/cflags-arm.cmake)
elseif (ARCH_PPC64EL)
include (${CMAKE_MODULE_PATH}/cflags-ppc64le.cmake)
elseif (ARCH_LOONGARCH64)
include (${CMAKE_MODULE_PATH}/cflags-loongarch64.cmake)
else ()
message(FATAL_ERROR "Unsupported platform")
endif ()
@ -293,6 +295,11 @@ elseif (ARCH_PPC64EL)
set (hs_exec_common_SRCS
${hs_exec_common_SRCS}
src/util/arch/ppc64el/cpuid_flags.c)
elseif (ARCH_LOONGARCH64)
set (hs_exec_common_SRCS
${hs_exec_common_SRCS}
src/util/arch/loongarch64/cpuid_flags.c
)
endif ()
set (hs_exec_SRCS
@ -456,6 +463,11 @@ set (hs_exec_SRCS
${hs_exec_SRCS}
src/nfa/vermicelli_simd.cpp
src/util/supervector/arch/ppc64el/impl.cpp)
elseif (ARCH_LOONGARCH64)
set (hs_exec_SRCS
${hs_exec_SRCS}
src/nfa/vermicelli_simd.cpp
src/util/supervector/arch/loongarch64/impl.cpp)
endif()

View File

@ -1,7 +1,7 @@
# About Vectorscan
-A fork of Intel's Hyperscan, modified to run on more platforms. Currently ARM NEON/ASIMD
-and Power VSX are 100% functional. ARM SVE2 support is in ongoing with
+A fork of Intel's Hyperscan, modified to run on more platforms. Currently ARM NEON/ASIMD,
+Power VSX and LoongArch LSX are 100% functional. ARM SVE2 support is in ongoing with
access to hardware now. More platforms will follow in the future.
Further more, starting 5.4.12 there is now a [SIMDe](https://github.com/simd-everywhere/simde)
port, which can be either used for platforms without official SIMD support,

View File

@ -104,6 +104,9 @@ else()
elseif(ARCH_PPC64EL)
set(GNUCC_ARCH power8)
set(TUNE_FLAG power8)
elseif(ARCH_LOONGARCH64)
set(GNUCC_ARCH la464)
set(TUNE_FLAG generic)
else()
set(GNUCC_ARCH native)
set(TUNE_FLAG native)

View File

@ -0,0 +1,21 @@
CHECK_INCLUDE_FILE_CXX(lsxintrin.h HAVE_C_LOONGARCH64_LSXINTRIN_H)
if (HAVE_C_LOONGARCH64_LSXINTRIN_H)
set (INTRIN_INC_H "lsxintrin.h")
else()
message (FATAL_ERROR "No intrinsics header found for LSX")
endif ()
set(ARCH_C_FLAGS "-mlsx")
set(ARCH_CXX_FLAGS "-mlsx")
set(CMAKE_REQUIRED_FLAGS "${ARCH_C_FLAGS}")
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
int main() {
__m128i a = __lsx_vreplgr2vr_w(1);
(void)a;
}" HAVE_LSX)
if (NOT HAVE_LSX)
message(FATAL_ERROR "LSX support required for LoongArch support")
endif ()

View File

@ -24,6 +24,9 @@
/* "Define if building for PPC64EL" */ /* "Define if building for PPC64EL" */
#cmakedefine ARCH_PPC64EL #cmakedefine ARCH_PPC64EL
/* "Define if building for LOONGARCH64" */
#cmakedefine ARCH_LOONGARCH64
/* "Define if cross compiling for AARCH64" */ /* "Define if cross compiling for AARCH64" */
#cmakedefine CROSS_COMPILE_AARCH64 #cmakedefine CROSS_COMPILE_AARCH64
@ -81,6 +84,9 @@
/* C compiler has arm_neon.h */ /* C compiler has arm_neon.h */
#cmakedefine HAVE_C_PPC64EL_ALTIVEC_H #cmakedefine HAVE_C_PPC64EL_ALTIVEC_H
/* C compiler has lsxintrin.h */
#cmakedefine HAVE_C_LOONGARCH64_LSXINTRIN_H
/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to
0 if you don't. */ 0 if you don't. */
#cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP

View File

@ -5,7 +5,8 @@ CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error no
CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64)
CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32) CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32)
CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !(defined(__LITTLE_ENDIAN__) && defined(__VSX__))\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL) CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !(defined(__LITTLE_ENDIAN__) && defined(__VSX__))\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL)
if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_PPC64EL) CHECK_C_SOURCE_COMPILES("#if !(defined(__loongarch_lp64) || defined( __loongarch64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_LOONGARCH64)
if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_PPC64EL OR ARCH_LOONGARCH64)
set(ARCH_64_BIT TRUE) set(ARCH_64_BIT TRUE)
else() else()
set(ARCH_32_BIT TRUE) set(ARCH_32_BIT TRUE)
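
For readability, the LoongArch64 probe added above amounts to the following tiny translation unit once the \n escapes are expanded; it only compiles when the toolchain pre-defines one of the LoongArch64 macros, which is what turns ARCH_LOONGARCH64 on:

/* the ARCH_LOONGARCH64 compile test from arch.cmake, unescaped */
#if !(defined(__loongarch_lp64) || defined( __loongarch64))
#error not 64bit
#endif
int main(void) { return 0; }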

View File

@ -48,6 +48,7 @@
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/arch/x86/cpuid_inline.h"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#elif defined(ARCH_LOONGARCH64)
#endif
#include "util/depth.h"
#include "util/popcount.h"

View File

@ -58,5 +58,7 @@ hs_error_t HS_CDECL hs_valid_platform(void) {
}
#elif defined(ARCH_PPC64EL) || defined(VS_SIMDE_BACKEND)
return HS_SUCCESS;
#elif defined(ARCH_LOONGARCH64)
return HS_SUCCESS;
#endif
}

View File

@ -0,0 +1,75 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2023, Loongson Technology
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Shufti: character class acceleration.
*/
template <uint16_t S>
static really_inline
const SuperVector<S> blockSingleMask(SuperVector<S> mask_lo, SuperVector<S> mask_hi, SuperVector<S> chars) {
const SuperVector<S> low4bits = SuperVector<S>::dup_u8(0xf);
SuperVector<S> c_lo = chars & low4bits;
SuperVector<S> c_hi = chars.template vshr_8_imm<4>();
c_lo = mask_lo.template pshufb<false>(c_lo);
c_hi = mask_hi.template pshufb<false>(c_hi);
return (c_lo & c_hi) > (SuperVector<S>::Zeroes());
}
template <uint16_t S>
static really_inline
SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> chars) {
const SuperVector<S> low4bits = SuperVector<S>::dup_u8(0xf);
SuperVector<S> chars_lo = chars & low4bits;
chars_lo.print8("chars_lo");
SuperVector<S> chars_hi = chars.template vshr_64_imm<4>() & low4bits;
chars_hi.print8("chars_hi");
SuperVector<S> c1_lo = mask1_lo.template pshufb<true>(chars_lo);
c1_lo.print8("c1_lo");
SuperVector<S> c1_hi = mask1_hi.template pshufb<true>(chars_hi);
c1_hi.print8("c1_hi");
SuperVector<S> t1 = c1_lo | c1_hi;
t1.print8("t1");
SuperVector<S> c2_lo = mask2_lo.template pshufb<true>(chars_lo);
c2_lo.print8("c2_lo");
SuperVector<S> c2_hi = mask2_hi.template pshufb<true>(chars_hi);
c2_hi.print8("c2_hi");
SuperVector<S> t2 = c2_lo | c2_hi;
t2.print8("t2");
t2.template vshr_128_imm<1>().print8("t2.vshr_128(1)");
SuperVector<S> t = t1 | (t2.template vshr_128_imm<1>());
t.print8("t");
return !t.eq(SuperVector<S>::Ones());
}
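
As a plain-C++ cross-check of what the two blocks above vectorise, the shufti test for a single byte reduces to two 16-entry nibble tables whose looked-up values must share a set bit. The table contents below are made up for illustration and are not taken from the patch.

#include <cstdint>
#include <cstdio>

// Scalar model of blockSingleMask(): a byte belongs to the class when
// mask_lo[low nibble] AND mask_hi[high nibble] is non-zero.
static bool shufti_scalar(uint8_t c, const uint8_t mask_lo[16],
                          const uint8_t mask_hi[16]) {
    uint8_t lo = c & 0xf;   // chars & low4bits
    uint8_t hi = c >> 4;    // chars.vshr_8_imm<4>()
    return (mask_lo[lo] & mask_hi[hi]) != 0;
}

int main() {
    // Hypothetical tables that accept only 'a' (0x61): bit 0 is set at
    // mask_lo[0x1] and mask_hi[0x6], everything else is zero.
    uint8_t mask_lo[16] = {0}, mask_hi[16] = {0};
    mask_lo[0x1] = 1;
    mask_hi[0x6] = 1;
    const char probe[] = {'a', 'b', 'q'};
    for (char c : probe) {
        printf("'%c' -> %d\n", c, shufti_scalar((uint8_t)c, mask_lo, mask_hi));
    }
    return 0;
}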

View File

@ -0,0 +1,63 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2023, Loongson Technology
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Truffle: character class acceleration.
*
*/
template <uint16_t S>
static really_inline
const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, SuperVector<S> shuf_mask_lo_highset, SuperVector<S> chars) {
chars.print8("chars");
shuf_mask_lo_highclear.print8("shuf_mask_lo_highclear");
shuf_mask_lo_highset.print8("shuf_mask_lo_highset");
SuperVector<S> highconst = SuperVector<S>::dup_u8(0x80);
highconst.print8("highconst");
SuperVector<S> shuf_mask_hi = SuperVector<S>::dup_u64(0x8040201008040201);
shuf_mask_hi.print8("shuf_mask_hi");
SuperVector<S> shuf1 = shuf_mask_lo_highclear.pshufb(chars);
shuf1.print8("shuf1");
SuperVector<S> t1 = chars ^ highconst;
t1.print8("t1");
SuperVector<S> shuf2 = shuf_mask_lo_highset.pshufb(t1);
shuf2.print8("shuf2");
SuperVector<S> t2 = highconst.opandnot(chars.template vshr_64_imm<4>());
t2.print8("t2");
SuperVector<S> shuf3 = shuf_mask_hi.pshufb(t2);
shuf3.print8("shuf3");
SuperVector<S> res = (shuf1 | shuf2) & shuf3;
res.print8("(shuf1 | shuf2) & shuf3");
return !res.eq(SuperVector<S>::Zeroes());
}
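
A hedged scalar restatement of the truffle test above, assuming the usual Hyperscan table layout: the byte's low nibble selects an entry from one of two 16-byte tables (high bit clear vs. high bit set), and bits 4-6 of the byte select which bit of that entry must be set. Table values here are illustrative only.

#include <cstdint>
#include <cstdio>

// Scalar model of blockSingleMask() for truffle.
static bool truffle_scalar(uint8_t c, const uint8_t lo_highclear[16],
                           const uint8_t lo_highset[16]) {
    const uint8_t *table = (c & 0x80) ? lo_highset : lo_highclear;
    uint8_t entry = table[c & 0xf];     // selected by the low nibble
    uint8_t bit = (c >> 4) & 0x7;       // selected by bits 4-6
    return (entry >> bit) & 1;
}

int main() {
    uint8_t lo_highclear[16] = {0}, lo_highset[16] = {0};
    // Accept only 'z' (0x7a): low nibble 0xa, bits 4-6 equal 7.
    lo_highclear[0xa] = 1u << 7;
    const char probe[] = {'z', 'y', 'Z'};
    for (char c : probe) {
        printf("'%c' -> %d\n", c,
               truffle_scalar((uint8_t)c, lo_highclear, lo_highset));
    }
    return 0;
}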

View File

@ -0,0 +1,130 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2023, Loongson Technology
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Vermicelli: single-byte and double-byte acceleration.
*/
template <uint16_t S>
static really_inline
const u8 *vermicelliBlock(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = chars.eq(casemask & data);
return first_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *vermicelliBlockNeg(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = !chars.eq(casemask & data);
return first_zero_match_inverted<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *rvermicelliBlock(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = chars.eq(casemask & data);
return last_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *rvermicelliBlockNeg(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, const u8 *buf, u16 const len) {
data.print8("data");
chars.print8("chars");
casemask.print8("casemask");
SuperVector<S> mask = !chars.eq(casemask & data);
mask.print8("mask");
return last_zero_match_inverted<S>(buf, mask, len);
}
template <uint16_t S, bool check_partial>
static really_inline
const u8 *vermicelliDoubleBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2, SuperVector<S> const casemask,
u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) {
SuperVector<S> v = casemask & data;
SuperVector<S> mask1 = chars1.eq(v);
SuperVector<S> mask2 = chars2.eq(v);
SuperVector<S> mask = mask1 & (mask2 >> 1);
DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]);
bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1));
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) {
mask = mask | ((SuperVector<S>::Ones() >> (S-1)) << (S-1));
}
return first_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S, bool check_partial>
static really_inline
const u8 *rvermicelliDoubleBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2, SuperVector<S> const casemask,
u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) {
SuperVector<S> v = casemask & data;
SuperVector<S> mask1 = chars1.eq(v);
SuperVector<S> mask2 = chars2.eq(v);
SuperVector<S> mask = (mask1 << 1) & mask2;
DEBUG_PRINTF("buf[0] = %02hhx, buf[-1] = %02hhx\n", buf[0], buf[-1]);
bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1));
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) {
mask = mask | (SuperVector<S>::Ones() >> (S-1));
}
return last_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S, bool check_partial>
static really_inline
const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2,
SuperVector<S> const mask1, SuperVector<S> const mask2,
u8 const c1, u8 const c2, u8 const m1, u8 const m2, u8 const *buf, u16 const len) {
SuperVector<S> v1 = chars1.eq(data & mask1);
SuperVector<S> v2 = chars2.eq(data & mask2);
SuperVector<S> mask = v1 & (v2 >> 1);
DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]);
bool partial_match = (check_partial && ((buf[0] & m2) == c2) && ((buf[-1] & m1) == c1));
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) {
mask = mask | ((SuperVector<S>::Ones() >> (S-1)) << (S-1));
}
return first_non_zero_match<S>(buf, mask, len);
}
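
The blocks above are the vector form of a very simple scalar loop; the sketch below shows the single-byte, optionally caseless variant so the casemask trick is easy to see. The 0xdf fold constant and the names follow common Hyperscan usage, but the snippet itself is illustrative, not library code.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar model of the single-byte vermicelli blocks above: every byte is
// ANDed with a case mask (0xff for case-sensitive scans, 0xdf to fold ASCII
// case) and compared against the pre-masked search character.  The vector
// code does the same for 16 bytes at a time and returns the first lane whose
// comparison result is non-zero.
static const uint8_t *vermicelli_scalar(const uint8_t *buf, size_t len,
                                        uint8_t chr, uint8_t casemask) {
    for (size_t i = 0; i < len; i++) {
        if ((buf[i] & casemask) == (chr & casemask)) {
            return buf + i;
        }
    }
    return nullptr;  // no match, like the NULL return in the block helpers
}

int main() {
    const char *text = "Needle in a Haystack";
    const uint8_t *buf = reinterpret_cast<const uint8_t *>(text);
    // Caseless search for 'h': the 0xdf mask folds 'h' and 'H' together.
    const uint8_t *hit = vermicelli_scalar(buf, strlen(text), 'h', 0xdf);
    if (hit) {
        printf("matched '%c' at offset %td\n", *hit, hit - buf);
    }
    return 0;
}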

View File

@ -61,6 +61,8 @@ SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi,
#include "arm/shufti.hpp" #include "arm/shufti.hpp"
#elif defined(ARCH_PPC64EL) #elif defined(ARCH_PPC64EL)
#include "ppc64el/shufti.hpp" #include "ppc64el/shufti.hpp"
#elif defined(ARCH_LOONGARCH64)
#include "loongarch64/shufti.hpp"
#endif #endif
#endif #endif

View File

@ -68,6 +68,8 @@ const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, Supe
#include "arm/truffle.hpp" #include "arm/truffle.hpp"
#elif defined(ARCH_PPC64EL) #elif defined(ARCH_PPC64EL)
#include "ppc64el/truffle.hpp" #include "ppc64el/truffle.hpp"
#elif defined(ARCH_LOONGARCH64)
#include "loongarch64/truffle.hpp"
#endif #endif
#endif #endif

View File

@ -80,6 +80,8 @@ const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> const data, SuperVector<S>
#include "arm/vermicelli.hpp" #include "arm/vermicelli.hpp"
#elif defined(ARCH_PPC64EL) #elif defined(ARCH_PPC64EL)
#include "ppc64el/vermicelli.hpp" #include "ppc64el/vermicelli.hpp"
#elif defined(ARCH_LOONGARCH64)
#include "loongarch64/vermicelli.hpp"
#endif #endif
#endif #endif

View File

@ -42,10 +42,12 @@
#include "util/arch/arm/arm.h" #include "util/arch/arm/arm.h"
#elif defined(ARCH_PPC64EL) #elif defined(ARCH_PPC64EL)
#include "util/arch/ppc64el/ppc64el.h" #include "util/arch/ppc64el/ppc64el.h"
#elif defined(ARCH_LOONGARCH64)
#include "util/arch/loongarch64/loongarch64.h"
#endif #endif
#ifdef __NetBSD__ #ifdef __NetBSD__
#include <strings.h> #include <strings.h>
#endif #endif
#endif // UTIL_ARCH_X86_H_ #endif // UTIL_ARCH_H_

View File

@ -0,0 +1,214 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2023, Loongson Technology
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Bit-twiddling primitives (ctz, compress etc)
*/
#ifndef BITUTILS_ARCH_LOONGARCH64_H
#define BITUTILS_ARCH_LOONGARCH64_H
#include "ue2common.h"
#include "util/popcount.h"
#include "util/arch.h"
#include "util/intrinsics.h"
#include "util/arch/common/bitutils.h"
static really_inline
u32 clz32_impl(u32 x) {
return clz32_impl_c(x);
}
static really_inline
u32 clz64_impl(u64a x) {
return clz64_impl_c(x);
}
static really_inline
u32 ctz32_impl(u32 x) {
return ctz32_impl_c(x);
}
static really_inline
u32 ctz64_impl(u64a x) {
return ctz64_impl_c(x);
}
static really_inline
u32 lg2_impl(u32 x) {
return lg2_impl_c(x);
}
static really_inline
u64a lg2_64_impl(u64a x) {
return lg2_64_impl_c(x);
}
static really_inline
u32 findAndClearLSB_32_impl(u32 *v) {
return findAndClearLSB_32_impl_c(v);
}
static really_inline
u32 findAndClearLSB_64_impl(u64a *v) {
return findAndClearLSB_64_impl_c(v);
}
static really_inline
u32 findAndClearMSB_32_impl(u32 *v) {
return findAndClearMSB_32_impl_c(v);
}
static really_inline
u32 findAndClearMSB_64_impl(u64a *v) {
return findAndClearMSB_64_impl_c(v);
}
static really_inline
u32 compress32_impl(u32 x, u32 m) {
return compress32_impl_c(x, m);
}
static really_inline
u64a compress64_impl(u64a x, u64a m) {
return compress64_impl_c(x, m);
}
static really_inline
m128 compress128_impl(m128 x, m128 m) {
m128 one = set1_2x64(1);
m128 bb = one;
m128 res = zeroes128();
while (isnonzero128(m)) {
m128 mm = sub_2x64(zeroes128(), m);
m128 xm = and128(x, m);
xm = and128(xm, mm);
m128 mask = not128(eq64_m128(xm, zeroes128()));
res = or128(res, and128(bb, mask));
m = and128(m, sub_2x64(m, one));
bb = lshift64_m128(bb, 1);
}
return res;
}
#if defined(HAVE_SVE2_BITPERM)
#include "bitutils_sve.h"
#else
static really_inline
u32 expand32_impl(u32 x, u32 m) {
return expand32_impl_c(x, m);
}
static really_inline
u64a expand64_impl(u64a x, u64a m) {
return expand64_impl_c(x, m);
}
#endif // HAVE_SVE2_BITPERM
static really_inline
m128 expand128_impl(m128 x, m128 m) {
m128 one = set1_2x64(1);
m128 bb = one;
m128 res = zeroes128();
while (isnonzero128(m)) {
m128 xm = and128(x, bb);
m128 mm = sub_2x64(zeroes128(), m);
m128 mask = not128(eq64_m128(xm, zeroes128()));
mask = and128(mask, and128(m, mm));
res = or128(res, mask);
m = and128(m, sub_2x64(m, one));
bb = lshift64_m128(bb, 1);
}
return res;
}
/* returns the first set bit after begin (if not ~0U). If no bit is set after
* begin returns ~0U
*/
static really_inline
u32 bf64_iterate_impl(u64a bitfield, u32 begin) {
if (begin != ~0U) {
/* switch off all bits at or below begin. Note: not legal to shift by
* the size of the datatype or larger. */
assert(begin <= 63);
bitfield &= ~((2ULL << begin) - 1);
}
if (!bitfield) {
return ~0U;
}
return ctz64_impl(bitfield);
}
static really_inline
char bf64_set_impl(u64a *bitfield, u32 i) {
return bf64_set_impl_c(bitfield, i);
}
static really_inline
void bf64_unset_impl(u64a *bitfield, u32 i) {
return bf64_unset_impl_c(bitfield, i);
}
static really_inline
u32 rank_in_mask32_impl(u32 mask, u32 bit) {
return rank_in_mask32_impl_c(mask, bit);
}
static really_inline
u32 rank_in_mask64_impl(u64a mask, u32 bit) {
return rank_in_mask64_impl_c(mask, bit);
}
static really_inline
u32 pext32_impl(u32 x, u32 mask) {
return pext32_impl_c(x, mask);
}
static really_inline
u64a pext64_impl(u64a x, u64a mask) {
return pext64_impl_c(x, mask);
}
/* no LoongArch-specific ANDN sequence here; fall back to the generic C
 * implementation.
 */
static really_inline
u64a andn_impl(const u32 a, const u8 *b) {
return andn_impl_c(a, b);
}
#endif // BITUTILS_ARCH_LOONGARCH64_H
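
The compress128/expand128 loops above process one mask bit per iteration; their 64-bit scalar counterparts behave like the PEXT/PDEP-style reference below. This is a sanity-check model, not the library's implementation.

#include <cstdint>
#include <cstdio>

// compress gathers the bits of x selected by m into the low end of the
// result (PEXT-like); expand scatters the low bits of x into the positions
// selected by m (PDEP-like).
static uint64_t compress64_ref(uint64_t x, uint64_t m) {
    uint64_t res = 0, bb = 1;
    for (; m; m &= m - 1) {            // walk the set bits of the mask
        uint64_t lowest = m & -m;      // current mask bit
        if (x & lowest) {
            res |= bb;                 // emit into the next result bit
        }
        bb <<= 1;
    }
    return res;
}

static uint64_t expand64_ref(uint64_t x, uint64_t m) {
    uint64_t res = 0, bb = 1;
    for (; m; m &= m - 1, bb <<= 1) {
        uint64_t lowest = m & -m;
        if (x & bb) {
            res |= lowest;             // place the bit at the mask position
        }
    }
    return res;
}

int main() {
    printf("compress: %#llx\n",
           (unsigned long long)compress64_ref(0xf0u, 0xf0f0u));  // prints 0xf
    printf("expand:   %#llx\n",
           (unsigned long long)expand64_ref(0x3u, 0xf0f0u));     // prints 0x30
    return 0;
}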

View File

@ -0,0 +1,42 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2023, Loongson Technology
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "util/arch/common/cpuid_flags.h"
#include "ue2common.h"
#include "hs_compile.h" // for HS_MODE_ flags
#include "util/arch.h"
u64a cpuid_flags(void) {
return 0;
}
u32 cpuid_tune(void) {
return HS_TUNE_FAMILY_GENERIC;
}

View File

@ -0,0 +1,47 @@
/*
* Copyright (c) 2017-2020, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2023, Loongson Technology
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Per-platform architecture definitions
*/
#ifndef UTIL_ARCH_LOONGARCH64_H_
#define UTIL_ARCH_LOONGARCH64_H_
#define HAVE_LSX
#define HAVE_SIMD_128_BITS
#if defined(HAVE_SIMD_128_BITS)
#define CHUNKSIZE 128
#define VECTORSIZE 16
#endif
#endif // UTIL_ARCH_LOONGARCH64_H_

View File

@ -0,0 +1,102 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2023, Loongson Technology
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
template <>
really_really_inline
const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) {
v4u32 m = mask.u.u32x4[0];
uint64_t vmax = __lsx_vpickve2gr_du(vpmax_loongarch(m, m), 0);
if (vmax != 0) {
typename SuperVector<16>::comparemask_type z = mask.comparemask();
DEBUG_PRINTF("z %08llx\n", z);
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
u32 pos = ctz64(z) / SuperVector<16>::mask_width();
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
DEBUG_PRINTF("buf + pos %p\n", buf + (pos));
return buf + pos;
} else {
return NULL; // no match
}
}
template <>
really_really_inline
const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) {
v4u32 m = mask.u.u32x4[0];
uint64_t vmax = __lsx_vpickve2gr_du(vpmax_loongarch(m, m), 0);
if (vmax != 0) {
typename SuperVector<16>::comparemask_type z = mask.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
u32 pos = clz64(z) / SuperVector<16>::mask_width();
DEBUG_PRINTF("match @ pos %u\n", pos);
return buf + (15 - pos);
} else {
return NULL; // no match
}
}
template <>
really_really_inline
const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) {
v4u32 m = mask.u.u32x4[0];
uint64_t vmax = __lsx_vpickve2gr_du(vpmax_loongarch(m, m), 0);
if (vmax != 0) {
typename SuperVector<16>::comparemask_type z = mask.comparemask();
DEBUG_PRINTF("z %08llx\n", z);
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
u32 pos = ctz64(z) / SuperVector<16>::mask_width();
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
DEBUG_PRINTF("buf + pos %p\n", buf + pos);
return buf + pos;
} else {
return NULL; // no match
}
}
template <>
really_really_inline
const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) {
v4u32 m = mask.u.u32x4[0];
uint64_t vmax = __lsx_vpickve2gr_du(vpmax_loongarch(m, m), 0);
if (vmax != 0) {
typename SuperVector<16>::comparemask_type z = mask.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
u32 pos = clz64(z) / SuperVector<16>::mask_width();
DEBUG_PRINTF("match @ pos %u\n", pos);
return buf + (15 - pos);
} else {
return NULL; // no match
}
}
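
The index arithmetic shared by all four specialisations is worth spelling out: assuming comparemask() packs mask_width() == 4 bits per byte lane into a 64-bit value, the first matching lane is ctz64(z) / 4 and the last is 15 - clz64(z) / 4. A small self-contained check follows, using GCC builtins for ctz/clz; the mask values are constructed by hand, and z must be non-zero, just as the vmax guard above ensures.

#include <cstdint>
#include <cstdio>

static uint32_t first_lane(uint64_t z) {
    return (uint32_t)(__builtin_ctzll(z) / 4);   // ctz64(z) / mask_width()
}

static uint32_t last_lane(uint64_t z) {
    return 15 - (uint32_t)(__builtin_clzll(z) / 4);
}

int main() {
    // Lanes 2 and 9 matched: each matching byte lane contributes a nibble of
    // ones to the packed comparemask.
    uint64_t z = (0xfULL << (2 * 4)) | (0xfULL << (9 * 4));
    printf("first %u last %u\n", first_lane(z), last_lane(z));  // first 2 last 9
    return 0;
}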

View File

@ -0,0 +1,39 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2023, Loongson Technology
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SIMD_TYPES_LOONGARCH64_H
#define SIMD_TYPES_LOONGARCH64_H
#if !defined(m128) && defined(HAVE_LSX)
typedef v4i32 m128;
#endif
#endif /* SIMD_TYPES_LOONGARCH64_H */

View File

@ -0,0 +1,466 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2023, Loongson Technology
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief SIMD types and primitive operations.
*/
#ifndef ARCH_LOONGARCH64_SIMD_UTILS_H
#define ARCH_LOONGARCH64_SIMD_UTILS_H
#include <stdio.h>
#include <stdbool.h>
#include "ue2common.h"
#include "util/simd_types.h"
#include "util/unaligned.h"
#include "util/intrinsics.h"
#include <string.h> // for memcpy
static really_inline m128 vpmax_loongarch(v4u32 a, v4u32 b) {
u32 result[4];
u32 tmp1 = __lsx_vpickve2gr_wu(a, 0);
u32 tmp2 = __lsx_vpickve2gr_wu(a, 1);
result[0] = (tmp1 >= tmp2) ? tmp1 : tmp2;
tmp1 = __lsx_vpickve2gr_wu(a, 2);
tmp2 = __lsx_vpickve2gr_wu(a, 3);
result[1] = (tmp1 >= tmp2) ? tmp1 : tmp2;
tmp1 = __lsx_vpickve2gr_wu(b, 0);
tmp2 = __lsx_vpickve2gr_wu(b, 1);
result[2] = (tmp1 >= tmp2) ? tmp1 : tmp2;
tmp1 = __lsx_vpickve2gr_wu(b, 2);
tmp2 = __lsx_vpickve2gr_wu(b, 3);
result[3] = (tmp1 >= tmp2) ? tmp1 : tmp2;
v4u32 res = __lsx_vld((uint32_t *)result, 0);
return res;
}
static really_inline m128 ones128(void) {
return __lsx_vreplgr2vr_b(0xFF);
}
static really_inline m128 zeroes128(void) {
return __lsx_vreplgr2vr_w(0);
}
/** \brief Bitwise not for m128*/
static really_inline m128 not128(m128 a) {
return __lsx_vxor_v(a, ones128());
}
/** \brief Return 1 if a and b are different otherwise 0 */
static really_inline int diff128(m128 a, m128 b) {
uint64_t res = __lsx_vpickve2gr_du(__lsx_vsrlni_b_h(zeroes128(), __lsx_vseq_w(a, b), 4), 0);
return (~0ull != res);
}
static really_inline int isnonzero128(m128 a) {
return diff128(a, zeroes128());
}
/**
* "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
* mask indicating which 32-bit words contain differences.
*/
static really_inline u32 diffrich128(m128 a, m128 b) {
static const v4u32 movemask = { 1, 2, 4, 8 };
m128 tmp = __lsx_vand_v(not128(__lsx_vseq_w(a, b)), movemask);
return __lsx_vpickve2gr_wu(tmp, 0) + __lsx_vpickve2gr_wu(tmp, 1) +
__lsx_vpickve2gr_wu(tmp, 2) + __lsx_vpickve2gr_wu(tmp, 3);
}
/**
* "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
* returns a 4-bit mask indicating which 64-bit words contain differences.
*/
static really_inline u32 diffrich64_128(m128 a, m128 b) {
static const v2u64 movemask = { 1, 4 };
m128 tmp = __lsx_vand_v(not128(__lsx_vseq_d(a, b)), movemask);
return __lsx_vpickve2gr_du(tmp, 0) + __lsx_vpickve2gr_du(tmp, 1);
}
static really_really_inline
m128 add_2x64(m128 a, m128 b) {
return __lsx_vadd_d(a, b);
}
static really_really_inline
m128 sub_2x64(m128 a, m128 b) {
return __lsx_vsub_d(a, b);
}
static really_inline
m128 lshift_m128(m128 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(b)) {
return __lsx_vslli_w(a, b);
}
#endif
v4i32 shift_indices = __lsx_vreplgr2vr_w(b);
return __lsx_vsll_w(a, shift_indices);
}
static really_really_inline
m128 rshift_m128(m128 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(b)) {
return __lsx_vsrli_w(a, b);
}
#endif
v4i32 shift_indices = __lsx_vreplgr2vr_w(b);
return __lsx_vsrl_w(a, shift_indices);
}
static really_really_inline
m128 lshift64_m128(m128 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(b)) {
return __lsx_vslli_d(a, b);
}
#endif
v2i64 shift_indices = __lsx_vreplgr2vr_d(b);
return __lsx_vsll_d(a, shift_indices);
}
static really_really_inline
m128 rshift64_m128(m128 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(b)) {
return __lsx_vsrli_d(a, b);
}
#endif
v2i64 shift_indices = __lsx_vreplgr2vr_d(b);
return __lsx_vsrl_d(a, shift_indices);
}
static really_inline m128 eq128(m128 a, m128 b) {
return __lsx_vseq_b(a, b);
}
static really_inline m128 eq64_m128(m128 a, m128 b) {
return __lsx_vseq_d(a, b);
}
static really_inline u32 movemask128(m128 a) {
v16u8 input = (v16u8) a;
v8u16 high_bits = (v8u16) __lsx_vsrli_b(input, 7);
v4u32 paired16 = (v4u32) __lsx_vadd_h(high_bits, __lsx_vsrli_h(high_bits, 7));
v2u64 paired32 = (v2u64) __lsx_vadd_w(paired16, __lsx_vsrli_w(paired16, 14));
v16u8 paired64 = (v16u8) __lsx_vadd_d(paired32, __lsx_vsrli_d(paired32, 28));
return __lsx_vpickve2gr_bu(paired64, 0) | ((int) __lsx_vpickve2gr_bu(paired64, 8) << 8);
}
static really_inline m128 set1_16x8(u8 c) {
return __lsx_vreplgr2vr_b(c);
}
static really_inline m128 set1_4x32(u32 c) {
return __lsx_vreplgr2vr_w(c);
}
static really_inline m128 set1_2x64(u64a c) {
return __lsx_vreplgr2vr_d(c);
}
static really_inline u32 movd(const m128 in) {
return __lsx_vpickve2gr_wu(in, 0);
}
static really_inline u64a movq(const m128 in) {
return __lsx_vpickve2gr_du(in, 0);
}
/* another form of movq */
static really_inline
m128 load_m128_from_u64a(const u64a *p) {
m128 tmp = zeroes128();
return __lsx_vinsgr2vr_d(tmp, *p, 0);
}
static really_inline u32 extract32from128(const m128 in, unsigned imm) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(imm)) {
return __lsx_vpickve2gr_wu(in, imm);
}
#endif
switch (imm) {
case 0:
return __lsx_vpickve2gr_wu(in, 0);
break;
case 1:
return __lsx_vpickve2gr_wu(in, 1);
break;
case 2:
return __lsx_vpickve2gr_wu(in, 2);
break;
case 3:
return __lsx_vpickve2gr_wu(in, 3);
break;
default:
return 0;
break;
}
}
static really_inline u64a extract64from128(const m128 in, unsigned imm) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(imm)) {
return __lsx_vpickve2gr_du(in, imm);
}
#endif
switch (imm) {
case 0:
return __lsx_vpickve2gr_du(in, 0);
break;
case 1:
return __lsx_vpickve2gr_du(in, 1);
break;
default:
return 0;
break;
}
}
static really_inline m128 low64from128(const m128 in) {
m128 ret = zeroes128();
ret = __lsx_vinsgr2vr_d(ret, __lsx_vpickve2gr_d(in, 0), 0);
return ret;
}
static really_inline m128 high64from128(const m128 in) {
m128 ret = zeroes128();
ret = __lsx_vinsgr2vr_d(ret, __lsx_vpickve2gr_d(in, 1), 0);
return ret;
}
static really_inline m128 add128(m128 a, m128 b) {
return __lsx_vadd_q(a, b);
}
static really_inline m128 and128(m128 a, m128 b) {
return __lsx_vand_v(a, b);
}
static really_inline m128 xor128(m128 a, m128 b) {
return __lsx_vxor_v(a, b);
}
static really_inline m128 or128(m128 a, m128 b) {
return __lsx_vor_v(a, b);
}
static really_inline m128 andnot128(m128 a, m128 b) {
return __lsx_vandn_v(a, b);
}
// aligned load
static really_inline m128 load128(const void *ptr) {
assert(ISALIGNED_N(ptr, alignof(m128)));
return __lsx_vld((const int32_t *)ptr, 0);
}
// aligned store
static really_inline void store128(void *ptr, m128 a) {
assert(ISALIGNED_N(ptr, alignof(m128)));
__lsx_vst(a, (int32_t *)ptr, 0);
}
// unaligned load
static really_inline m128 loadu128(const void *ptr) {
return __lsx_vld((const int32_t *)ptr, 0);
}
// unaligned store
static really_inline void storeu128(void *ptr, m128 a) {
__lsx_vst(a, (int32_t *)ptr, 0);
}
// packed unaligned store of first N bytes
static really_inline
void storebytes128(void *ptr, m128 a, unsigned int n) {
assert(n <= sizeof(a));
memcpy(ptr, &a, n);
}
// packed unaligned load of first N bytes, pad with zero
static really_inline
m128 loadbytes128(const void *ptr, unsigned int n) {
m128 a = zeroes128();
assert(n <= sizeof(a));
memcpy(&a, ptr, n);
return a;
}
static really_inline m128 case_algin_vectors(m128 a, m128 b, int offset) {
u8 index_shuf[16];
for(int i = 0; i < 16; i++) {
index_shuf[i] = (uint8_t)offset;
offset += 1;
}
v16u8 index = __lsx_vld((uint8_t *)index_shuf, 0);
return __lsx_vshuf_b(b, a, index);
}
static really_really_inline
m128 palignr_imm(m128 r, m128 l, int offset) {
switch (offset) {
case 0: return l; break;
case 1: return case_algin_vectors(l, r, 1); break;
case 2: return case_algin_vectors(l, r, 2); break;
case 3: return case_algin_vectors(l, r, 3); break;
case 4: return case_algin_vectors(l, r, 4); break;
case 5: return case_algin_vectors(l, r, 5); break;
case 6: return case_algin_vectors(l, r, 6); break;
case 7: return case_algin_vectors(l, r, 7); break;
case 8: return case_algin_vectors(l, r, 8); break;
case 9: return case_algin_vectors(l, r, 9); break;
case 10: return case_algin_vectors(l, r, 10); break;
case 11: return case_algin_vectors(l, r, 11); break;
case 12: return case_algin_vectors(l, r, 12); break;
case 13: return case_algin_vectors(l, r, 13); break;
case 14: return case_algin_vectors(l, r, 14); break;
case 15: return case_algin_vectors(l, r, 15); break;
case 16: return r; break;
default:
return zeroes128();
break;
}
}
static really_really_inline
m128 palignr(m128 r, m128 l, int offset) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
u8 index_shuf[16];
for (int i = 0; i < 16; i++) {
index_shuf[i] = (uint8_t)(offset + i);
}
v16u8 index = __lsx_vld((uint8_t *)index_shuf, 0);
if (__builtin_constant_p(index)) {
return __lsx_vshuf_b(r, l, index);
}
#endif
return palignr_imm(r, l, offset);
}
//#undef CASE_ALIGN_VECTORS
static really_really_inline
m128 rshiftbyte_m128(m128 a, unsigned b) {
if (b == 0) {
return a;
}
return palignr(zeroes128(), a, b);
}
static really_really_inline
m128 lshiftbyte_m128(m128 a, unsigned b) {
if (b == 0) {
return a;
}
return palignr(a, zeroes128(), 16 - b);
}
static really_inline
m128 variable_byte_shift_m128(m128 in, s32 amount) {
assert(amount >= -16 && amount <= 16);
if (amount < 0) {
return palignr_imm(zeroes128(), in, -amount);
} else {
return palignr_imm(in, zeroes128(), 16 - amount);
}
}
static really_inline
m128 mask1bit128(unsigned int n) {
assert(n < sizeof(m128) * 8);
static m128 onebit = { 1, 0 };
m128 mask = lshiftbyte_m128( onebit, n / 8 );
return lshift64_m128( mask, n % 8 );
}
// switches on bit N in the given vector.
static really_inline
void setbit128(m128 *ptr, unsigned int n) {
*ptr = or128(mask1bit128(n), *ptr);
}
// switches off bit N in the given vector.
static really_inline
void clearbit128(m128 *ptr, unsigned int n) {
*ptr = andnot128(mask1bit128(n), *ptr);
}
// tests bit N in the given vector.
static really_inline
char testbit128(m128 val, unsigned int n) {
const m128 mask = mask1bit128(n);
return isnonzero128(and128(mask, val));
}
static really_inline
m128 pshufb_m128(m128 a, m128 b) {
v16u8 tmp = __lsx_vand_v((v16u8)b, __lsx_vreplgr2vr_b(0x8f));
return __lsx_vshuf_b(zeroes128(), a, tmp);
}
static really_inline
m128 max_u8_m128(m128 a, m128 b) {
return __lsx_vmax_bu(a, b);
}
static really_inline
m128 min_u8_m128(m128 a, m128 b) {
return __lsx_vmin_bu(a, b);
}
static really_inline
m128 sadd_u8_m128(m128 a, m128 b) {
return __lsx_vsadd_bu(a, b);
}
static really_inline
m128 sub_u8_m128(m128 a, m128 b) {
return __lsx_vssub_bu(a, b);
}
static really_inline
m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) {
uint32_t ALIGN_ATTR(16) data[4] = { x0, x1, x2, x3 };
return __lsx_vld((uint32_t *) data, 0);
}
static really_inline
m128 set2x64(u64a hi, u64a lo) {
uint64_t ALIGN_ATTR(16) data[2] = { lo, hi };
return __lsx_vld((uint64_t *) data, 0);
}
#endif // ARCH_LOONGARCH64_SIMD_UTILS_H
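
Since several helpers above (rshiftbyte_m128, lshiftbyte_m128, variable_byte_shift_m128) are built on palignr, here is a scalar statement of what palignr(r, l, offset) is expected to return, useful for eyeballing the vshuf_b index construction. It is a reference sketch only, valid for offsets 0 through 16.

#include <cstdint>
#include <cstdio>

// Treat l as the low 16 bytes and r as the high 16 bytes of a 32-byte
// buffer, then return the 16 bytes starting at 'offset'.
static void palignr_ref(const uint8_t r[16], const uint8_t l[16], int offset,
                        uint8_t out[16]) {
    uint8_t concat[32];
    for (int i = 0; i < 16; i++) {
        concat[i] = l[i];
        concat[i + 16] = r[i];
    }
    for (int i = 0; i < 16; i++) {
        out[i] = concat[offset + i];
    }
}

int main() {
    uint8_t l[16], r[16], out[16];
    for (int i = 0; i < 16; i++) {
        l[i] = (uint8_t)i;          // 0..15
        r[i] = (uint8_t)(i + 16);   // 16..31
    }
    palignr_ref(r, l, 3, out);      // bytes 3..18 of the concatenation
    for (int i = 0; i < 16; i++) {
        printf("%d ", out[i]);      // prints 3 4 5 ... 18
    }
    printf("\n");
    return 0;
}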

View File

@ -52,6 +52,8 @@
#include "util/arch/arm/bitutils.h" #include "util/arch/arm/bitutils.h"
#elif defined(ARCH_PPC64EL) #elif defined(ARCH_PPC64EL)
#include "util/arch/ppc64el/bitutils.h" #include "util/arch/ppc64el/bitutils.h"
#elif defined(ARCH_LOONGARCH64)
#include "util/arch/loongarch64/bitutils.h"
#endif #endif
#else #else
#include "util/arch/common/bitutils.h" #include "util/arch/common/bitutils.h"

View File

@ -53,6 +53,10 @@
# define USE_PPC64EL_ALTIVEC_H
#endif
#if defined(HAVE_C_LOONGARCH64_LSXINTRIN_H)
# define USE_LOONGARCH64_LSXINTRIN_H
#endif
#ifdef __cplusplus
# if defined(HAVE_CXX_INTRIN_H)
# define USE_INTRIN_H
@ -74,6 +78,8 @@
# endif
#elif defined(USE_PPC64EL_ALTIVEC_H)
#include <altivec.h>
#elif defined(USE_LOONGARCH64_LSXINTRIN_H)
#include <lsxintrin.h>
#endif
#endif // INTRINSICS_H

View File

@ -58,6 +58,8 @@ const u8 *last_zero_match_inverted(const u8 *buf, SuperVector<S> v, u16 len = S)
#include "util/arch/arm/match.hpp" #include "util/arch/arm/match.hpp"
#elif defined(ARCH_PPC64EL) #elif defined(ARCH_PPC64EL)
#include "util/arch/ppc64el/match.hpp" #include "util/arch/ppc64el/match.hpp"
#elif defined(ARCH_LOONGARCH64)
#include "util/arch/loongarch64/match.hpp"
#endif #endif
#endif #endif

View File

@ -52,6 +52,8 @@ typedef simde__m128i m128;
#include "util/arch/arm/simd_types.h" #include "util/arch/arm/simd_types.h"
#elif defined(ARCH_PPC64EL) #elif defined(ARCH_PPC64EL)
#include "util/arch/ppc64el/simd_types.h" #include "util/arch/ppc64el/simd_types.h"
#elif defined(ARCH_LOONGARCH64)
#include "util/arch/loongarch64/simd_types.h"
#endif #endif

View File

@ -71,6 +71,8 @@ extern const char vbs_mask_data[];
#include "util/arch/arm/simd_utils.h" #include "util/arch/arm/simd_utils.h"
#elif defined(ARCH_PPC64EL) #elif defined(ARCH_PPC64EL)
#include "util/arch/ppc64el/simd_utils.h" #include "util/arch/ppc64el/simd_utils.h"
#elif defined(ARCH_LOONGARCH64)
#include "util/arch/loongarch64/simd_utils.h"
#endif #endif
#endif #endif

View File

@ -0,0 +1,603 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2023, Loongson Technology
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SIMD_IMPL_HPP
#define SIMD_IMPL_HPP
#include <cstdint>
#include "ue2common.h"
#include "util/supervector/supervector.hpp"
// 128-bit LSX implementation
template<>
really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
{
u.v128[0] = v;
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(v8i16_h other)
{
u.s16x8[0] = other;
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(v8u16_h other)
{
u.u16x8[0] = other;
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(v16i8_b other)
{
u.s8x16[0] = other;
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(v16u8_b other)
{
u.u8x16[0] = other;
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(v4i32_w other)
{
u.s32x4[0] = other;
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(v4u32_w other)
{
u.u32x4[0] = other;
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(v2i64_d other)
{
u.s64x2[0] = other;
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(v2u64_d other)
{
u.u64x2[0] = other;
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(int8_t const other)
{
u.s8x16[0] = __lsx_vreplgr2vr_b(other);
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(uint8_t const other)
{
u.u8x16[0] = (v16u8)__lsx_vreplgr2vr_b(other);
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(int16_t const other)
{
u.s16x8[0] = __lsx_vreplgr2vr_h(other);
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(uint16_t const other)
{
u.u16x8[0] = (v8u16)__lsx_vreplgr2vr_h(other);
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(int32_t const other)
{
u.s32x4[0] = __lsx_vreplgr2vr_w(other);
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(uint32_t const other)
{
u.u32x4[0] = (v4u32)__lsx_vreplgr2vr_w(other);
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(int64_t const other)
{
u.s64x2[0] = __lsx_vreplgr2vr_d(other);
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(uint64_t const other)
{
u.u64x2[0] = (v2u64)__lsx_vreplgr2vr_d(other);
}
// Constants
template<>
really_inline SuperVector<16> SuperVector<16>::Ones(void)
{
return {__lsx_vreplgr2vr_b(0xFF)};
}
template<>
really_inline SuperVector<16> SuperVector<16>::Zeroes(void)
{
return {__lsx_vreplgr2vr_b(0)};
}
// Methods
template <>
really_inline void SuperVector<16>::operator=(SuperVector<16> const &other)
{
u.v128[0] = other.u.v128[0];
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const
{
return {__lsx_vand_v(u.u8x16[0], b.u.u8x16[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const
{
return {__lsx_vor_v(u.u8x16[0], b.u.u8x16[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const
{
return {__lsx_vxor_v(u.u8x16[0], b.u.u8x16[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator!() const
{
return {__lsx_vnor_v(u.u8x16[0], u.u8x16[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const
{
return {__lsx_vand_v(__lsx_vnor_v(u.u8x16[0], u.u8x16[0]), b.u.u8x16[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const
{
return {__lsx_vseq_b(u.u8x16[0], b.u.u8x16[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const &b) const
{
return !(*this == b);
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const
{
return {__lsx_vslt_b(b.u.s8x16[0], u.s8x16[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const
{
return {__lsx_vsle_bu(b.u.u8x16[0], u.u8x16[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const
{
return {__lsx_vslt_b(u.s8x16[0], b.u.s8x16[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const
{
return {__lsx_vsle_b(u.s8x16[0], b.u.s8x16[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
{
return (*this == b);
}
template <>
really_inline typename SuperVector<16>::comparemask_type
SuperVector<16>::comparemask(void) const {
return static_cast<typename SuperVector<16>::comparemask_type>(
__lsx_vpickve2gr_du(__lsx_vsrlni_b_h(__lsx_vreplgr2vr_w(0), u.u16x8[0], 4), 0));
}
template <>
really_inline typename SuperVector<16>::comparemask_type
SuperVector<16>::eqmask(SuperVector<16> const b) const {
return eq(b).comparemask();
}
template <> really_inline u32 SuperVector<16>::mask_width() { return 4; }
template <>
really_inline typename SuperVector<16>::comparemask_type
SuperVector<16>::iteration_mask(
typename SuperVector<16>::comparemask_type mask) {
return mask & 0x1111111111111111ull;
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const
{
return {__lsx_vslli_b(u.u8x16[0], N)};
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const
{
return {__lsx_vslli_h(u.u16x8[0], N)};
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const
{
return {__lsx_vslli_w(u.u32x4[0], N)};
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
{
return {__lsx_vslli_d(u.u64x2[0], N)};
}
static really_inline m128 create_index(int offset){
u8 index_shuf[16];
for (int i = 0; i < 16; i++) {
index_shuf[i] = (uint8_t)offset;
offset += 1;
}
v16u8 index = __lsx_vld((uint8_t *)index_shuf,0);
return index;
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const
{
return {__lsx_vshuf_b(u.u8x16[0], __lsx_vreplgr2vr_b(0), create_index(16 - N))};
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_imm() const
{
return vshl_128_imm<N>();
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const
{
return {__lsx_vsrli_b(u.u8x16[0], N)};
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const
{
return {__lsx_vsrli_h(u.u16x8[0], N)};
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const
{
return {__lsx_vsrli_w(u.u32x4[0], N)};
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
{
return {__lsx_vsrli_d(u.u64x2[0], N)};
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const
{
return {__lsx_vshuf_b(__lsx_vreplgr2vr_b(0), u.u8x16[0], create_index(N))};
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_imm() const
{
return vshr_128_imm<N>();
}
#if !defined(HS_OPTIMIZE)
template SuperVector<16> SuperVector<16>::vshl_8_imm<4>() const;
template SuperVector<16> SuperVector<16>::vshl_16_imm<1>() const;
template SuperVector<16> SuperVector<16>::vshl_64_imm<1>() const;
template SuperVector<16> SuperVector<16>::vshl_64_imm<4>() const;
template SuperVector<16> SuperVector<16>::vshl_128_imm<1>() const;
template SuperVector<16> SuperVector<16>::vshl_128_imm<4>() const;
template SuperVector<16> SuperVector<16>::vshr_8_imm<1>() const;
template SuperVector<16> SuperVector<16>::vshr_8_imm<4>() const;
template SuperVector<16> SuperVector<16>::vshr_16_imm<1>() const;
template SuperVector<16> SuperVector<16>::vshr_64_imm<1>() const;
template SuperVector<16> SuperVector<16>::vshr_64_imm<4>() const;
template SuperVector<16> SuperVector<16>::vshr_128_imm<1>() const;
template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const;
#endif
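/* Runtime-count shifts: the per-element variants below splat N into a vector
 * and use the corresponding vsll/vsrl instruction. The whole-register
 * (128-bit) byte shifts take the constant-index fast path when N is known at
 * compile time and otherwise dispatch through the Unroller over every
 * possible byte count. */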
template <>
really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const
{
if (N == 0) return *this;
if (N == 8) return Zeroes();
v16i8 shift_indices = __lsx_vreplgr2vr_b(N);
return { __lsx_vsll_b(u.s8x16[0], shift_indices) };
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const
{
if (N == 0) return *this;
if (N == 16) return Zeroes();
v8i16 shift_indices = __lsx_vreplgr2vr_h(N);
return { __lsx_vsll_h(u.s16x8[0], shift_indices) };
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
{
if (N == 0) return *this;
if (N == 32) return Zeroes();
v4i32 shift_indices = __lsx_vreplgr2vr_w(N);
return { __lsx_vsll_w(u.s32x4[0], shift_indices) };
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
{
if (N == 0) return *this;
if (N == 64) return Zeroes();
v2i64 shift_indices = __lsx_vreplgr2vr_d(N);
return { __lsx_vsll_d(u.s64x2[0], shift_indices) };
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
{
if (N == 0) return *this;
if (N == 16) return Zeroes();
#if defined(HAVE__BUILTIN_CONSTANT_P)
    if (__builtin_constant_p(N)) {
        return {__lsx_vshuf_b(u.u8x16[0], __lsx_vreplgr2vr_b(0), create_index(16 - N))};
    }
#endif
SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {__lsx_vshuf_b(v->u.u8x16[0], __lsx_vreplgr2vr_b(0), create_index(16 - n))}; });
return result;
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const
{
return vshl_128(N);
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const
{
if (N == 0) return *this;
if (N == 8) return Zeroes();
v16i8 shift_indices = __lsx_vreplgr2vr_b(N);
return { __lsx_vsrl_b(u.s8x16[0], shift_indices) };
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
{
if (N == 0) return *this;
if (N == 16) return Zeroes();
v8i16 shift_indices = __lsx_vreplgr2vr_h(N);
return { __lsx_vsrl_h(u.s16x8[0], shift_indices) };
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
{
if (N == 0) return *this;
if (N == 32) return Zeroes();
v4i32 shift_indices = __lsx_vreplgr2vr_w(N);
return { __lsx_vsrl_w(u.s32x4[0], shift_indices) };
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
{
if (N == 0) return *this;
if (N == 64) return Zeroes();
v2i64 shift_indices = __lsx_vreplgr2vr_d(N);
return { __lsx_vsrl_d(u.s64x2[0], shift_indices) };
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
{
if (N == 0) return *this;
if (N == 16) return Zeroes();
#if defined(HAVE__BUILTIN_CONSTANT_P)
    if (__builtin_constant_p(N)) {
        return {__lsx_vshuf_b(__lsx_vreplgr2vr_b(0), u.u8x16[0], create_index(N))};
    }
#endif
SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {__lsx_vshuf_b(__lsx_vreplgr2vr_b(0), v->u.u8x16[0], create_index(n))}; });
return result;
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
{
return vshr_128(N);
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
{
return vshr_128(N);
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
{
return vshl_128(N);
}
template<>
really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N)
{
return Ones().vshr_128(N);
}
template<>
really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N)
{
return Ones().vshl_128(N);
}
template <>
really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
{
return {__lsx_vld((const int32_t *)ptr, 0)};
}
template <>
really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
{
assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
ptr = vectorscan_assume_aligned(ptr, SuperVector::size);
return {__lsx_vld((const int32_t *)ptr, 0)};
}
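/* loadu_maskz: Ones_vshr(16 - len) yields a vector whose low len bytes are
 * 0xFF and whose remaining bytes are zero, so the AND below keeps only the
 * first len bytes of the unaligned load. */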
template <>
really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
{
SuperVector mask = Ones_vshr(16 - len);
SuperVector<16> v = loadu(ptr);
return mask & v;
}
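/* alignr(other, offset) mirrors x86 palignr semantics: the result is bytes
 * offset..15 of other followed by bytes 0..offset-1 of *this, again built
 * from the vshuf.b index window produced by create_index(offset). */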
template<>
really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
{
if (offset == 0) return other;
if (offset == 16) return *this;
#if defined(HAVE__BUILTIN_CONSTANT_P)
    if (__builtin_constant_p(offset)) {
        return {__lsx_vshuf_b(u.u8x16[0], other.u.u8x16[0], create_index(offset))};
    }
#endif
SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (offset == n) result = {__lsx_vshuf_b(v->u.u8x16[0], other.u.u8x16[0], create_index(n))}; });
return result;
}
template<>
template<>
really_inline SuperVector<16> SuperVector<16>::pshufb<false>(SuperVector<16> b)
{
return {__lsx_vshuf_b(__lsx_vreplgr2vr_b(0), u.u8x16[0], b.u.u8x16[0])};
}
template<>
template<>
really_inline SuperVector<16> SuperVector<16>::pshufb<true>(SuperVector<16> b)
{
    /* On Intel, if bit 0x80 of a control byte is set, the result lane is zero;
       otherwise the selected lane is (control & 0xf).
       On LoongArch, any shuffle index >= 16 yields zero; otherwise that lane
       is selected. btranslated converts the Intel-style control bytes into
       indices with LoongArch semantics. */
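    /* For example, an Intel control byte of 0x83 has bit 0x80 set and must
       produce zero: 0x83 & 0x8f is 0x83, which is >= 16, so the LSX shuffle
       also returns zero for that lane. A control byte of 0x23 maps to
       0x23 & 0x8f == 0x03 and selects lane 3 on both architectures. */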
SuperVector<16> btranslated = b & SuperVector<16>::dup_s8(0x8f);
return pshufb<false>(btranslated);
}
template<>
really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len)
{
    SuperVector mask = Ones_vshr(16 - len);
return mask & pshufb(b);
}
#endif // SIMD_IMPL_HPP

View File

@ -0,0 +1,34 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2023, Loongson Technology
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(m128) && defined(HAVE_LSX)
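/* m128 is the generic 128-bit SIMD type used across the code base; with LSX
 * available it maps onto the GCC vector type v4i32. */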
typedef v4i32 m128;
#endif

View File

@ -43,6 +43,8 @@
#include "util/supervector/arch/arm/types.hpp" #include "util/supervector/arch/arm/types.hpp"
#elif defined(ARCH_PPC64EL) #elif defined(ARCH_PPC64EL)
#include "util/supervector/arch/ppc64el/types.hpp" #include "util/supervector/arch/ppc64el/types.hpp"
#elif defined(ARCH_LOONGARCH64)
#include "util/supervector/arch/loongarch64/types.hpp"
#endif
#endif // VS_SIMDE_BACKEND
@ -66,6 +68,11 @@ using Z_TYPE = u64a;
#define Z_BITS 64
#define Z_POSSHIFT 2
#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS - (l)))
#elif defined(ARCH_LOONGARCH64)
using Z_TYPE = u64a;
#define Z_BITS 64
#define Z_POSSHIFT 2
#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS - (l)))
#else
using Z_TYPE = u32;
#define Z_BITS 32
@ -190,6 +197,17 @@ public:
int8x16_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size];
#endif
#if defined(ARCH_LOONGARCH64)
v2u64 ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size];
v2i64 ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size];
v4u32 ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size];
v4i32 ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size];
v8u16 ALIGN_ATTR(BaseVector<16>::size) u16x8[SIZE / BaseVector<16>::size];
v8i16 ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size];
v16u8 ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size];
v16i8 ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size];
#endif
uint64_t u64[SIZE / sizeof(uint64_t)];
int64_t s64[SIZE / sizeof(int64_t)];
uint32_t u32[SIZE / sizeof(uint32_t)];
@ -395,6 +413,8 @@ struct Unroller<End, End>
#include "util/supervector/arch/arm/impl.cpp" #include "util/supervector/arch/arm/impl.cpp"
#elif defined(ARCH_PPC64EL) #elif defined(ARCH_PPC64EL)
#include "util/supervector/arch/ppc64el/impl.cpp" #include "util/supervector/arch/ppc64el/impl.cpp"
#elif defined(ARCH_LOONGARCH64)
#include "util/supervector/arch/loongarch64/impl.cpp"
#endif
#endif
#endif

View File

@ -32,6 +32,7 @@
#include "util/arch/common/cpuid_flags.h" #include "util/arch/common/cpuid_flags.h"
#if defined(ARCH_IA32) || defined(ARCH_X86_64) #if defined(ARCH_IA32) || defined(ARCH_X86_64)
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#elif defined(ARCH_LOONGARCH64)
#endif
namespace ue2 {

View File

@ -673,6 +673,9 @@ TEST(SimdUtilsTest, movq) {
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
    int64x2_t a = { 0x123456789abcdefLL, ~0LL };
    simd = vreinterpretq_s32_s64(a);
#elif defined(ARCH_LOONGARCH64)
v2i64 a = { 0x123456789abcdefLL, ~0LL };
simd = (m128) a;
#elif defined(ARCH_PPC64EL)
#if defined(__clang__) && (__clang_major__ >= 15)
#pragma clang diagnostic push