diff --git a/libsrc/core/ngcore_api.hpp b/libsrc/core/ngcore_api.hpp index b6412157..330e7e33 100644 --- a/libsrc/core/ngcore_api.hpp +++ b/libsrc/core/ngcore_api.hpp @@ -67,6 +67,14 @@ #endif #endif +#if defined(__amd64__) || defined(_M_AMD64) +#define NETGEN_ARCH_AMD64 +#endif + +#if defined(__arm64__) || defined(_M_ARM64) +#define NETGEN_ARCH_ARM +#endif + #ifdef __MAC_OS_X_VERSION_MIN_REQUIRED #if __MAC_OS_X_VERSION_MIN_REQUIRED < 101400 // The c++ standard library on MacOS 10.13 and earlier has no aligned new operator, diff --git a/libsrc/core/simd.hpp b/libsrc/core/simd.hpp index 277dd851..0d69dec1 100644 --- a/libsrc/core/simd.hpp +++ b/libsrc/core/simd.hpp @@ -11,7 +11,7 @@ #include "simd_generic.hpp" -#if (defined(_M_AMD64) || defined(_M_X64) || defined(__SSE__)) +#ifdef NETGEN_ARCH_AMD64 #ifndef __SSE__ #define __SSE__ #endif @@ -28,6 +28,7 @@ namespace ngcore { +#ifdef NETGEN_ARCH_AMD64 NETGEN_INLINE auto HSum (SIMD v1, SIMD v2, SIMD v3, SIMD v4) { SIMD hsum1 = my_mm_hadd_pd (v1.Data(), v2.Data()); @@ -35,6 +36,12 @@ namespace ngcore return SIMD (hsum1, hsum2); } + NETGEN_INLINE auto GetMaskFromBits( unsigned int i ) + { + return SIMD::GetMaskFromBits(i); + } +#endif + NETGEN_INLINE void SIMDTranspose (SIMD a1, SIMD a2, SIMD a3, SIMD a4, SIMD & b1, SIMD & b2, SIMD & b3, SIMD & b4) @@ -59,11 +66,6 @@ namespace ngcore { return SIMD(HSum(s1), HSum(s2), HSum(s3), HSum(s4)); } - - NETGEN_INLINE auto GetMaskFromBits( unsigned int i ) - { - return SIMD::GetMaskFromBits(i); - } } #endif // NETGEN_CORE_SIMD_HPP diff --git a/libsrc/core/simd_avx.hpp b/libsrc/core/simd_avx.hpp index 3f83dae6..0f3112a0 100644 --- a/libsrc/core/simd_avx.hpp +++ b/libsrc/core/simd_avx.hpp @@ -42,7 +42,7 @@ namespace ngcore { __m256i mask; public: - SIMD (size_t i) + SIMD (int64_t i) : mask(my_mm256_cmpgt_epi64(_mm256_set1_epi64x(i), _mm256_set_epi64x(3, 2, 1, 0))) { ; } diff --git a/libsrc/core/simd_generic.hpp b/libsrc/core/simd_generic.hpp index 8ebd399a..849e0922 100644 --- a/libsrc/core/simd_generic.hpp +++ b/libsrc/core/simd_generic.hpp @@ -21,10 +21,10 @@ namespace ngcore return 8; #elif defined __AVX__ return 4; -#elif (defined(_M_AMD64) || defined(_M_X64) || defined(__SSE__)) +#elif defined NETGEN_ARCH_AMD64 return 2; #else - return 1; + return 2; #endif } @@ -58,7 +58,7 @@ namespace ngcore { int64_t mask; public: - SIMD (size_t i) + SIMD (int64_t i) : mask(i > 0 ? -1 : 0) { ; } bool Data() const { return mask; } static constexpr int Size() { return 1; } @@ -67,7 +67,7 @@ namespace ngcore template - class SIMD + class alignas(GetDefaultSIMDSize()*sizeof(int64_t)) SIMD { static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2); static constexpr int N2 = N-N1; @@ -76,7 +76,7 @@ namespace ngcore SIMD hi; public: - SIMD (int i) : lo(i), hi(i-N1) { ; } + SIMD (int64_t i) : lo(i), hi(i-N1 ) { ; } SIMD (SIMD lo_, SIMD hi_) : lo(lo_), hi(hi_) { ; } SIMD Lo() const { return lo; } SIMD Hi() const { return hi; } @@ -104,8 +104,10 @@ namespace ngcore SIMD () {} SIMD (const SIMD &) = default; SIMD & operator= (const SIMD &) = default; - SIMD (int64_t val) { data = val; } - SIMD (std::array arr) + SIMD (int val) : data{val} {} + SIMD (int64_t val) : data{val} {} + SIMD (size_t val) : data(val) {} + explicit SIMD (std::array arr) : data{arr[0]} {} @@ -121,7 +123,7 @@ namespace ngcore }; template - class SIMD + class alignas(GetDefaultSIMDSize()*sizeof(int64_t)) SIMD { static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2); static constexpr int N2 = N-N1; @@ -136,16 +138,18 @@ namespace ngcore SIMD (const SIMD &) = default; SIMD & operator= (const SIMD &) = default; + SIMD (int val) : lo{val}, high{val} { ; } SIMD (int64_t val) : lo{val}, high{val} { ; } + SIMD (size_t val) : lo{val}, high{val} { ; } SIMD (SIMD lo_, SIMD high_) : lo(lo_), high(high_) { ; } - SIMD( std::array arr ) + explicit SIMD( std::array arr ) : lo(detail::array_range(arr, 0)), high(detail::array_range(arr, N1)) {} template - SIMD(const T... vals) + explicit SIMD(const T... vals) : lo(detail::array_range(std::array{vals...}, 0)), high(detail::array_range(std::array{vals...}, N1)) { @@ -204,7 +208,7 @@ namespace ngcore SIMD (size_t val) { data = val; } SIMD (double const * p) { data = *p; } SIMD (double const * p, SIMD mask) { data = mask.Data() ? *p : 0.0; } - SIMD (std::array arr) + explicit SIMD (std::array arr) : data{arr[0]} {} @@ -236,7 +240,7 @@ namespace ngcore template - class SIMD + class alignas(GetDefaultSIMDSize()*sizeof(double)) SIMD { static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2); static constexpr int N2 = N-N1; @@ -253,19 +257,17 @@ namespace ngcore template >::value,int>::type = 0> SIMD (const T & func) { - for(auto i : IntRange(N1)) - lo[i] = func(i); - for(auto i : IntRange(N2)) - high[i] = func(N1+i); + double *p = (double*)this; + for(auto i : IntRange(N)) + p[i] = func(i); } template >::value,int>::type = 0> SIMD & operator= (const T & func) { - for(auto i : IntRange(N1)) - lo[i] = func(i); - for(auto i : IntRange(N2)) - high[i] = func(N1+i); + double *p = (double*)this; + for(auto i : IntRange(N)) + p[i] = func(i); return *this; } @@ -285,13 +287,13 @@ namespace ngcore : lo{p, mask.Lo()}, high{p+N1, mask.Hi()} { } - SIMD( std::array arr ) + explicit SIMD( std::array arr ) : lo(detail::array_range(arr, 0)), high(detail::array_range(arr, N1)) {} template - SIMD(const T... vals) + explicit SIMD(const T... vals) : lo(detail::array_range(std::array{vals...}, 0)), high(detail::array_range(std::array{vals...}, N1)) { @@ -312,7 +314,10 @@ namespace ngcore template> operator std::tuple () - { return std::tuple((*this)[0], (*this)[1]); } + { + double *p = (double*)this; + return std::tuple(p[0], p[1]); + } template> operator std::tuple () @@ -325,6 +330,7 @@ namespace ngcore if constexpr(I(); else return high.template Get(); } + auto Data() const { return *this; } }; @@ -359,42 +365,42 @@ namespace ngcore } template - NETGEN_INLINE SIMD operator< (SIMD & a, SIMD b) + NETGEN_INLINE SIMD operator< (SIMD a, SIMD b) { if constexpr(N==1) return a.Data() < b.Data(); else return { a.Lo() - NETGEN_INLINE SIMD operator<= (SIMD & a, SIMD b) + NETGEN_INLINE SIMD operator<= (SIMD a, SIMD b) { if constexpr(N==1) return a.Data() <= b.Data(); else return { a.Lo()<=b.Lo(), a.Hi()<=b.Hi() }; } template - NETGEN_INLINE SIMD operator> (SIMD & a, SIMD b) + NETGEN_INLINE SIMD operator> (SIMD a, SIMD b) { if constexpr(N==1) return a.Data() > b.Data(); else return { a.Lo()>b.Lo(), a.Hi()>b.Hi() }; } template - NETGEN_INLINE SIMD operator>= (SIMD & a, SIMD b) + NETGEN_INLINE SIMD operator>= (SIMD a, SIMD b) { if constexpr(N==1) return a.Data() >= b.Data(); else return { a.Lo()>=b.Lo(), a.Hi()>=b.Hi() }; } template - NETGEN_INLINE SIMD operator== (SIMD & a, SIMD b) + NETGEN_INLINE SIMD operator== (SIMD a, SIMD b) { if constexpr(N==1) return a.Data() == b.Data(); else return { a.Lo()==b.Lo(), a.Hi()==b.Hi() }; } template - NETGEN_INLINE SIMD operator!= (SIMD & a, SIMD b) + NETGEN_INLINE SIMD operator!= (SIMD a, SIMD b) { if constexpr(N==1) return a.Data() != b.Data(); else return { a.Lo()!=b.Lo(), a.Hi()!=b.Hi() }; @@ -547,6 +553,30 @@ namespace ngcore return ost; } + using std::sqrt; + template + NETGEN_INLINE ngcore::SIMD sqrt (ngcore::SIMD a) { + return ngcore::SIMD([a](int i)->double { return sqrt(a[i]); } ); + } + + using std::fabs; + template + NETGEN_INLINE ngcore::SIMD fabs (ngcore::SIMD a) { + return ngcore::SIMD([a](int i)->double { return fabs(a[i]); } ); + } + + using std::floor; + template + NETGEN_INLINE ngcore::SIMD floor (ngcore::SIMD a) { + return ngcore::SIMD([a](int i)->double { return floor(a[i]); } ); + } + + using std::ceil; + template + NETGEN_INLINE ngcore::SIMD ceil (ngcore::SIMD a) { + return ngcore::SIMD([a](int i)->double { return ceil(a[i]); } ); + } + using std::exp; template NETGEN_INLINE ngcore::SIMD exp (ngcore::SIMD a) { @@ -634,6 +664,11 @@ namespace ngcore { return std::make_tuple(SIMD{a.Data()}, SIMD{b.Data()} ); } + else if constexpr(N==2) + { + return std::make_tuple(SIMD{ a.Lo(), b.Lo() }, + SIMD{ a.Hi(), b.Hi() }); + } else { auto [a1,b1] = Unpack(a.Lo(), b.Lo()); diff --git a/libsrc/core/taskmanager.cpp b/libsrc/core/taskmanager.cpp index a1049a1c..1d88b766 100644 --- a/libsrc/core/taskmanager.cpp +++ b/libsrc/core/taskmanager.cpp @@ -201,14 +201,14 @@ namespace ngcore ; } - static size_t calibrate_init_tsc = __rdtsc(); + static size_t calibrate_init_tsc = GetTimeCounter(); typedef std::chrono::system_clock TClock; static TClock::time_point calibrate_init_clock = TClock::now(); void TaskManager :: StopWorkers() { done = true; - double delta_tsc = __rdtsc()-calibrate_init_tsc; + double delta_tsc = GetTimeCounter()-calibrate_init_tsc; double delta_sec = std::chrono::duration(TClock::now()-calibrate_init_clock).count(); double frequ = (delta_sec != 0) ? delta_tsc/delta_sec : 2.7e9; @@ -421,7 +421,11 @@ namespace ngcore if (workers_on_node[j]) { while (complete[j] != jobnr) + { +#ifdef NETGEN_ARCH_AMD64 _mm_pause(); +#endif // NETGEN_ARCH_AMD64 + } } func = nullptr; diff --git a/libsrc/core/utils.hpp b/libsrc/core/utils.hpp index 81b0073f..ca015ae3 100644 --- a/libsrc/core/utils.hpp +++ b/libsrc/core/utils.hpp @@ -8,13 +8,19 @@ #include #include +#include "ngcore_api.hpp" // for NGCORE_API and CPU arch macros + +#if defined(__APPLE__) && defined(NETGEN_ARCH_ARM) +#include +#endif + +#ifdef NETGEN_ARCH_AMD64 #ifdef WIN32 #include // for __rdtsc() CPU time step counter #else #include // for __rdtsc() CPU time step counter #endif // WIN32 - -#include "ngcore_api.hpp" // for NGCORE_API +#endif // NETGEN_ARCH_AMD64 namespace ngcore { @@ -52,7 +58,16 @@ namespace ngcore inline TTimePoint GetTimeCounter() noexcept { - return TTimePoint(__rdtsc()); +#if defined(__APPLE__) && defined(NETGEN_ARCH_ARM) + return mach_absolute_time(); +#elif defined(NETGEN_ARCH_AMD64) + return __rdtsc(); +#elif defined(NETGEN_ARCH_ARM) + return __builtin_readcyclecounter(); +#else +#warning "Unsupported CPU architecture" + return 0; +#endif } template @@ -161,7 +176,9 @@ namespace ngcore while (!m.compare_exchange_weak(should, true)) { should = false; +#ifdef NETGEN_ARCH_AMD64 _mm_pause(); +#endif // NETGEN_ARCH_AMD64 } } void unlock()