mirror of
https://github.com/NGSolve/netgen.git
synced 2024-12-27 06:10:34 +05:00
85e8c09ff6
Neither GCC nor Clang defines an __arm64__ preprocessor macro; both use __aarch64__ (MSVC uses _M_ARM64). Add a "64" suffix to the define, i.e. NETGEN_ARCH_ARM64, to make it more obvious that it only refers to aarch64, and to be in line with NETGEN_ARCH_AMD64. Replace the (Clang-specific) __builtin_readcyclecounter with inline asm: - The builtin returns cycles (i.e. it varies with CPU frequency), not time - It may return 0, depending on the PMU settings - It may cause an illegal instruction in case it is not trapped by the kernel, e.g. on FreeBSD. Reading the generic timer/counter CNTVCT_EL0 instead of PMCCNTR_EL0 avoids these pitfalls. The inline asm works on both GCC and Clang, whereas the builtin is Clang-only.
76 lines
2.0 KiB
C++
76 lines
2.0 KiB
C++
#ifndef NETGEN_CORE_SIMD_HPP
|
|
#define NETGEN_CORE_SIMD_HPP
|
|
|
|
/**************************************************************************/
|
|
/* File: simd.hpp */
|
|
/* Author: Joachim Schoeberl, Matthias Hochsteger */
|
|
/* Date: 25. Mar. 16 */
|
|
/**************************************************************************/
|
|
|
|
#include "ngcore_api.hpp"
|
|
|
|
#include "simd_generic.hpp"
|
|
|
|
#ifdef NETGEN_ARCH_AMD64
|
|
#ifndef __SSE__
|
|
#define __SSE__
|
|
#endif
|
|
#include "simd_sse.hpp"
|
|
#endif
|
|
|
|
#ifdef __AVX__
|
|
#include "simd_avx.hpp"
|
|
#endif
|
|
|
|
#ifdef __AVX512F__
|
|
#include "simd_avx512.hpp"
|
|
#endif
|
|
|
|
#ifdef __aarch64__
|
|
#include "simd_arm64.hpp"
|
|
#endif
|
|
|
|
namespace ngcore
|
|
{
|
|
#ifdef NETGEN_ARCH_AMD64
|
|
// Combines four 2-wide double vectors into one 4-wide vector.
//
// v1/v2 are fed through my_mm_hadd_pd to form the first half of the result,
// v3/v4 are fed through it to form the second half.
// NOTE(review): my_mm_hadd_pd is defined outside this file; presumably it is a
// horizontal add, so that lane k of the result holds the sum of the lanes of
// the k-th argument -- confirm against simd_sse.hpp.
NETGEN_INLINE auto HSum (SIMD<double,2> v1, SIMD<double,2> v2, SIMD<double,2> v3, SIMD<double,2> v4)
{
  // First half of the result: built from v1 and v2.
  SIMD<double,2> front_half = my_mm_hadd_pd (v1.Data(), v2.Data());

  // Second half of the result: built from v3 and v4.
  SIMD<double,2> back_half = my_mm_hadd_pd (v3.Data(), v4.Data());

  // Glue the two 2-wide halves together into a 4-wide vector.
  return SIMD<double,4> (front_half, back_half);
}
|
|
|
|
// Free-function shortcut for SIMD<mask64>::GetMaskFromBits.
//
// i is forwarded unchanged; the return value is whatever the static member
// function returns.
// NOTE(review): presumably bit k of i selects lane k of the mask -- confirm
// against the SIMD<mask64> definition.
NETGEN_INLINE auto GetMaskFromBits( unsigned int i )
{
  using MaskVector = SIMD<mask64>;
  return MaskVector::GetMaskFromBits(i);
}
|
|
#endif
|
|
|
|
|
|
// Transposes the 4x4 block of doubles whose rows are a1..a4 and writes the
// result into b1..b4.
//
// The inputs are taken by value, so the outputs may refer to the same objects
// the caller passed as inputs.
//
// The work is done in two stages:
//   1. Unpack is applied to the row pairs (a1,a2) and (a3,a4).
//   2. The Lo()/Hi() halves of the unpacked vectors are recombined into the
//      four output vectors.
// NOTE(review): Unpack is defined outside this file; presumably it interleaves
// the lanes of its two arguments -- confirm against the SIMD backend headers.
NETGEN_INLINE void SIMDTranspose (SIMD<double,4> a1, SIMD<double,4> a2, SIMD <double,4> a3, SIMD<double,4> a4,
                                  SIMD<double,4> & b1, SIMD<double,4> & b2, SIMD<double,4> & b3, SIMD<double,4> & b4)
{
  // Stage 1a: unpack the first two rows.
  SIMD<double,4> upper_first, upper_second;
  std::tie(upper_first, upper_second) = Unpack(a1,a2);

  // Stage 1b: unpack the last two rows.
  SIMD<double,4> lower_first, lower_second;
  std::tie(lower_first, lower_second) = Unpack(a3,a4);

  // Stage 2: the low halves produce b1/b2, the high halves produce b3/b4.
  b1 = SIMD<double,4> (upper_first.Lo(),  lower_first.Lo());
  b2 = SIMD<double,4> (upper_second.Lo(), lower_second.Lo());
  b3 = SIMD<double,4> (upper_first.Hi(),  lower_first.Hi());
  b4 = SIMD<double,4> (upper_second.Hi(), lower_second.Hi());
}
|
|
|
|
template<int N>
|
|
NETGEN_INLINE auto HSum (SIMD<double,N> s1, SIMD<double,N> s2)
|
|
{
|
|
return SIMD<double,2>(HSum(s1), HSum(s2));
|
|
}
|
|
|
|
template<int N>
|
|
NETGEN_INLINE auto HSum (SIMD<double,N> s1, SIMD<double,N> s2, SIMD<double,N> s3, SIMD<double,N> s4 )
|
|
{
|
|
return SIMD<double,4>(HSum(s1), HSum(s2), HSum(s3), HSum(s4));
|
|
}
|
|
}
|
|
|
|
#endif // NETGEN_CORE_SIMD_HPP
|