Merge branch 'apple_silicon' into 'master'

Support for Apple M1

See merge request jschoeberl/netgen!359
This commit is contained in:
Joachim Schöberl 2020-12-16 20:47:21 +00:00
commit d30accdc1a
6 changed files with 107 additions and 41 deletions

View File

@ -67,6 +67,14 @@
#endif #endif
#endif #endif
#if defined(__amd64__) || defined(_M_AMD64)
#define NETGEN_ARCH_AMD64
#endif
#if defined(__arm64__) || defined(_M_ARM64)
#define NETGEN_ARCH_ARM
#endif
#ifdef __MAC_OS_X_VERSION_MIN_REQUIRED #ifdef __MAC_OS_X_VERSION_MIN_REQUIRED
#if __MAC_OS_X_VERSION_MIN_REQUIRED < 101400 #if __MAC_OS_X_VERSION_MIN_REQUIRED < 101400
// The c++ standard library on MacOS 10.13 and earlier has no aligned new operator, // The c++ standard library on MacOS 10.13 and earlier has no aligned new operator,

View File

@ -11,7 +11,7 @@
#include "simd_generic.hpp" #include "simd_generic.hpp"
#if (defined(_M_AMD64) || defined(_M_X64) || defined(__SSE__)) #ifdef NETGEN_ARCH_AMD64
#ifndef __SSE__ #ifndef __SSE__
#define __SSE__ #define __SSE__
#endif #endif
@ -28,6 +28,7 @@
namespace ngcore namespace ngcore
{ {
#ifdef NETGEN_ARCH_AMD64
NETGEN_INLINE auto HSum (SIMD<double,2> v1, SIMD<double,2> v2, SIMD<double,2> v3, SIMD<double,2> v4) NETGEN_INLINE auto HSum (SIMD<double,2> v1, SIMD<double,2> v2, SIMD<double,2> v3, SIMD<double,2> v4)
{ {
SIMD<double,2> hsum1 = my_mm_hadd_pd (v1.Data(), v2.Data()); SIMD<double,2> hsum1 = my_mm_hadd_pd (v1.Data(), v2.Data());
@ -35,6 +36,12 @@ namespace ngcore
return SIMD<double,4> (hsum1, hsum2); return SIMD<double,4> (hsum1, hsum2);
} }
NETGEN_INLINE auto GetMaskFromBits( unsigned int i )
{
return SIMD<mask64>::GetMaskFromBits(i);
}
#endif
NETGEN_INLINE void SIMDTranspose (SIMD<double,4> a1, SIMD<double,4> a2, SIMD <double,4> a3, SIMD<double,4> a4, NETGEN_INLINE void SIMDTranspose (SIMD<double,4> a1, SIMD<double,4> a2, SIMD <double,4> a3, SIMD<double,4> a4,
SIMD<double,4> & b1, SIMD<double,4> & b2, SIMD<double,4> & b3, SIMD<double,4> & b4) SIMD<double,4> & b1, SIMD<double,4> & b2, SIMD<double,4> & b3, SIMD<double,4> & b4)
@ -59,11 +66,6 @@ namespace ngcore
{ {
return SIMD<double,4>(HSum(s1), HSum(s2), HSum(s3), HSum(s4)); return SIMD<double,4>(HSum(s1), HSum(s2), HSum(s3), HSum(s4));
} }
NETGEN_INLINE auto GetMaskFromBits( unsigned int i )
{
return SIMD<mask64>::GetMaskFromBits(i);
}
} }
#endif // NETGEN_CORE_SIMD_HPP #endif // NETGEN_CORE_SIMD_HPP

View File

@ -42,7 +42,7 @@ namespace ngcore
{ {
__m256i mask; __m256i mask;
public: public:
SIMD (size_t i) SIMD (int64_t i)
: mask(my_mm256_cmpgt_epi64(_mm256_set1_epi64x(i), : mask(my_mm256_cmpgt_epi64(_mm256_set1_epi64x(i),
_mm256_set_epi64x(3, 2, 1, 0))) _mm256_set_epi64x(3, 2, 1, 0)))
{ ; } { ; }

View File

@ -21,10 +21,10 @@ namespace ngcore
return 8; return 8;
#elif defined __AVX__ #elif defined __AVX__
return 4; return 4;
#elif (defined(_M_AMD64) || defined(_M_X64) || defined(__SSE__)) #elif defined NETGEN_ARCH_AMD64
return 2; return 2;
#else #else
return 1; return 2;
#endif #endif
} }
@ -58,7 +58,7 @@ namespace ngcore
{ {
int64_t mask; int64_t mask;
public: public:
SIMD (size_t i) SIMD (int64_t i)
: mask(i > 0 ? -1 : 0) { ; } : mask(i > 0 ? -1 : 0) { ; }
bool Data() const { return mask; } bool Data() const { return mask; }
static constexpr int Size() { return 1; } static constexpr int Size() { return 1; }
@ -67,7 +67,7 @@ namespace ngcore
template <int N> template <int N>
class SIMD<mask64,N> class alignas(GetDefaultSIMDSize()*sizeof(int64_t)) SIMD<mask64,N>
{ {
static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2); static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
static constexpr int N2 = N-N1; static constexpr int N2 = N-N1;
@ -76,7 +76,7 @@ namespace ngcore
SIMD<mask64,N2> hi; SIMD<mask64,N2> hi;
public: public:
SIMD (int i) : lo(i), hi(i-N1) { ; } SIMD (int64_t i) : lo(i), hi(i-N1 ) { ; }
SIMD (SIMD<mask64,N1> lo_, SIMD<mask64,N2> hi_) : lo(lo_), hi(hi_) { ; } SIMD (SIMD<mask64,N1> lo_, SIMD<mask64,N2> hi_) : lo(lo_), hi(hi_) { ; }
SIMD<mask64,N1> Lo() const { return lo; } SIMD<mask64,N1> Lo() const { return lo; }
SIMD<mask64,N2> Hi() const { return hi; } SIMD<mask64,N2> Hi() const { return hi; }
@ -104,8 +104,10 @@ namespace ngcore
SIMD () {} SIMD () {}
SIMD (const SIMD &) = default; SIMD (const SIMD &) = default;
SIMD & operator= (const SIMD &) = default; SIMD & operator= (const SIMD &) = default;
SIMD (int64_t val) { data = val; } SIMD (int val) : data{val} {}
SIMD (std::array<int64_t, 1> arr) SIMD (int64_t val) : data{val} {}
SIMD (size_t val) : data(val) {}
explicit SIMD (std::array<int64_t, 1> arr)
: data{arr[0]} : data{arr[0]}
{} {}
@ -121,7 +123,7 @@ namespace ngcore
}; };
template<int N> template<int N>
class SIMD<int64_t,N> class alignas(GetDefaultSIMDSize()*sizeof(int64_t)) SIMD<int64_t,N>
{ {
static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2); static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
static constexpr int N2 = N-N1; static constexpr int N2 = N-N1;
@ -136,16 +138,18 @@ namespace ngcore
SIMD (const SIMD &) = default; SIMD (const SIMD &) = default;
SIMD & operator= (const SIMD &) = default; SIMD & operator= (const SIMD &) = default;
SIMD (int val) : lo{val}, high{val} { ; }
SIMD (int64_t val) : lo{val}, high{val} { ; } SIMD (int64_t val) : lo{val}, high{val} { ; }
SIMD (size_t val) : lo{val}, high{val} { ; }
SIMD (SIMD<int64_t,N1> lo_, SIMD<int64_t,N2> high_) : lo(lo_), high(high_) { ; } SIMD (SIMD<int64_t,N1> lo_, SIMD<int64_t,N2> high_) : lo(lo_), high(high_) { ; }
SIMD( std::array<int64_t, N> arr ) explicit SIMD( std::array<int64_t, N> arr )
: lo(detail::array_range<N1>(arr, 0)), : lo(detail::array_range<N1>(arr, 0)),
high(detail::array_range<N2>(arr, N1)) high(detail::array_range<N2>(arr, N1))
{} {}
template<typename ...T> template<typename ...T>
SIMD(const T... vals) explicit SIMD(const T... vals)
: lo(detail::array_range<N1>(std::array<int64_t, N>{vals...}, 0)), : lo(detail::array_range<N1>(std::array<int64_t, N>{vals...}, 0)),
high(detail::array_range<N2>(std::array<int64_t, N>{vals...}, N1)) high(detail::array_range<N2>(std::array<int64_t, N>{vals...}, N1))
{ {
@ -204,7 +208,7 @@ namespace ngcore
SIMD (size_t val) { data = val; } SIMD (size_t val) { data = val; }
SIMD (double const * p) { data = *p; } SIMD (double const * p) { data = *p; }
SIMD (double const * p, SIMD<mask64,1> mask) { data = mask.Data() ? *p : 0.0; } SIMD (double const * p, SIMD<mask64,1> mask) { data = mask.Data() ? *p : 0.0; }
SIMD (std::array<double, 1> arr) explicit SIMD (std::array<double, 1> arr)
: data{arr[0]} : data{arr[0]}
{} {}
@ -236,7 +240,7 @@ namespace ngcore
template<int N> template<int N>
class SIMD<double, N> class alignas(GetDefaultSIMDSize()*sizeof(double)) SIMD<double, N>
{ {
static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2); static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
static constexpr int N2 = N-N1; static constexpr int N2 = N-N1;
@ -253,19 +257,17 @@ namespace ngcore
template <typename T, typename std::enable_if<std::is_convertible<T,std::function<double(int)>>::value,int>::type = 0> template <typename T, typename std::enable_if<std::is_convertible<T,std::function<double(int)>>::value,int>::type = 0>
SIMD (const T & func) SIMD (const T & func)
{ {
for(auto i : IntRange(N1)) double *p = (double*)this;
lo[i] = func(i); for(auto i : IntRange(N))
for(auto i : IntRange(N2)) p[i] = func(i);
high[i] = func(N1+i);
} }
template <typename T, typename std::enable_if<std::is_convertible<T,std::function<double(int)>>::value,int>::type = 0> template <typename T, typename std::enable_if<std::is_convertible<T,std::function<double(int)>>::value,int>::type = 0>
SIMD & operator= (const T & func) SIMD & operator= (const T & func)
{ {
for(auto i : IntRange(N1)) double *p = (double*)this;
lo[i] = func(i); for(auto i : IntRange(N))
for(auto i : IntRange(N2)) p[i] = func(i);
high[i] = func(N1+i);
return *this; return *this;
} }
@ -285,13 +287,13 @@ namespace ngcore
: lo{p, mask.Lo()}, high{p+N1, mask.Hi()} : lo{p, mask.Lo()}, high{p+N1, mask.Hi()}
{ } { }
SIMD( std::array<double, N> arr ) explicit SIMD( std::array<double, N> arr )
: lo(detail::array_range<N1>(arr, 0)), : lo(detail::array_range<N1>(arr, 0)),
high(detail::array_range<N2>(arr, N1)) high(detail::array_range<N2>(arr, N1))
{} {}
template<typename ...T> template<typename ...T>
SIMD(const T... vals) explicit SIMD(const T... vals)
: lo(detail::array_range<N1>(std::array<double, N>{vals...}, 0)), : lo(detail::array_range<N1>(std::array<double, N>{vals...}, 0)),
high(detail::array_range<N2>(std::array<double, N>{vals...}, N1)) high(detail::array_range<N2>(std::array<double, N>{vals...}, N1))
{ {
@ -312,7 +314,10 @@ namespace ngcore
template<typename=std::enable_if<N==2>> template<typename=std::enable_if<N==2>>
operator std::tuple<double&,double&> () operator std::tuple<double&,double&> ()
{ return std::tuple<double&,double&>((*this)[0], (*this)[1]); } {
double *p = (double*)this;
return std::tuple<double&,double&>(p[0], p[1]);
}
template<typename=std::enable_if<N==4>> template<typename=std::enable_if<N==4>>
operator std::tuple<double&,double&,double&,double&> () operator std::tuple<double&,double&,double&,double&> ()
@ -325,6 +330,7 @@ namespace ngcore
if constexpr(I<N1) return lo.template Get<I>(); if constexpr(I<N1) return lo.template Get<I>();
else return high.template Get<I-N1>(); else return high.template Get<I-N1>();
} }
auto Data() const { return *this; }
}; };
@ -359,42 +365,42 @@ namespace ngcore
} }
template <typename T, int N> template <typename T, int N>
NETGEN_INLINE SIMD<mask64,N> operator< (SIMD<T,N> & a, SIMD<T,N> b) NETGEN_INLINE SIMD<mask64,N> operator< (SIMD<T,N> a, SIMD<T,N> b)
{ {
if constexpr(N==1) return a.Data() < b.Data(); if constexpr(N==1) return a.Data() < b.Data();
else return { a.Lo()<b.Lo(), a.Hi()<b.Hi() }; else return { a.Lo()<b.Lo(), a.Hi()<b.Hi() };
} }
template <typename T, int N> template <typename T, int N>
NETGEN_INLINE SIMD<mask64,N> operator<= (SIMD<T,N> & a, SIMD<T,N> b) NETGEN_INLINE SIMD<mask64,N> operator<= (SIMD<T,N> a, SIMD<T,N> b)
{ {
if constexpr(N==1) return a.Data() <= b.Data(); if constexpr(N==1) return a.Data() <= b.Data();
else return { a.Lo()<=b.Lo(), a.Hi()<=b.Hi() }; else return { a.Lo()<=b.Lo(), a.Hi()<=b.Hi() };
} }
template <typename T, int N> template <typename T, int N>
NETGEN_INLINE SIMD<mask64,N> operator> (SIMD<T,N> & a, SIMD<T,N> b) NETGEN_INLINE SIMD<mask64,N> operator> (SIMD<T,N> a, SIMD<T,N> b)
{ {
if constexpr(N==1) return a.Data() > b.Data(); if constexpr(N==1) return a.Data() > b.Data();
else return { a.Lo()>b.Lo(), a.Hi()>b.Hi() }; else return { a.Lo()>b.Lo(), a.Hi()>b.Hi() };
} }
template <typename T, int N> template <typename T, int N>
NETGEN_INLINE SIMD<mask64,N> operator>= (SIMD<T,N> & a, SIMD<T,N> b) NETGEN_INLINE SIMD<mask64,N> operator>= (SIMD<T,N> a, SIMD<T,N> b)
{ {
if constexpr(N==1) return a.Data() >= b.Data(); if constexpr(N==1) return a.Data() >= b.Data();
else return { a.Lo()>=b.Lo(), a.Hi()>=b.Hi() }; else return { a.Lo()>=b.Lo(), a.Hi()>=b.Hi() };
} }
template <typename T, int N> template <typename T, int N>
NETGEN_INLINE SIMD<mask64,N> operator== (SIMD<T,N> & a, SIMD<T,N> b) NETGEN_INLINE SIMD<mask64,N> operator== (SIMD<T,N> a, SIMD<T,N> b)
{ {
if constexpr(N==1) return a.Data() == b.Data(); if constexpr(N==1) return a.Data() == b.Data();
else return { a.Lo()==b.Lo(), a.Hi()==b.Hi() }; else return { a.Lo()==b.Lo(), a.Hi()==b.Hi() };
} }
template <typename T, int N> template <typename T, int N>
NETGEN_INLINE SIMD<mask64,N> operator!= (SIMD<T,N> & a, SIMD<T,N> b) NETGEN_INLINE SIMD<mask64,N> operator!= (SIMD<T,N> a, SIMD<T,N> b)
{ {
if constexpr(N==1) return a.Data() != b.Data(); if constexpr(N==1) return a.Data() != b.Data();
else return { a.Lo()!=b.Lo(), a.Hi()!=b.Hi() }; else return { a.Lo()!=b.Lo(), a.Hi()!=b.Hi() };
@ -547,6 +553,30 @@ namespace ngcore
return ost; return ost;
} }
using std::sqrt;
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> sqrt (ngcore::SIMD<double,N> a) {
return ngcore::SIMD<double>([a](int i)->double { return sqrt(a[i]); } );
}
using std::fabs;
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> fabs (ngcore::SIMD<double,N> a) {
return ngcore::SIMD<double>([a](int i)->double { return fabs(a[i]); } );
}
using std::floor;
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> floor (ngcore::SIMD<double,N> a) {
return ngcore::SIMD<double>([a](int i)->double { return floor(a[i]); } );
}
using std::ceil;
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> ceil (ngcore::SIMD<double,N> a) {
return ngcore::SIMD<double>([a](int i)->double { return ceil(a[i]); } );
}
using std::exp; using std::exp;
template <int N> template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> exp (ngcore::SIMD<double,N> a) { NETGEN_INLINE ngcore::SIMD<double,N> exp (ngcore::SIMD<double,N> a) {
@ -634,6 +664,11 @@ namespace ngcore
{ {
return std::make_tuple(SIMD<double,N>{a.Data()}, SIMD<double,N>{b.Data()} ); return std::make_tuple(SIMD<double,N>{a.Data()}, SIMD<double,N>{b.Data()} );
} }
else if constexpr(N==2)
{
return std::make_tuple(SIMD<double,N>{ a.Lo(), b.Lo() },
SIMD<double,N>{ a.Hi(), b.Hi() });
}
else else
{ {
auto [a1,b1] = Unpack(a.Lo(), b.Lo()); auto [a1,b1] = Unpack(a.Lo(), b.Lo());

View File

@ -201,14 +201,14 @@ namespace ngcore
; ;
} }
static size_t calibrate_init_tsc = __rdtsc(); static size_t calibrate_init_tsc = GetTimeCounter();
typedef std::chrono::system_clock TClock; typedef std::chrono::system_clock TClock;
static TClock::time_point calibrate_init_clock = TClock::now(); static TClock::time_point calibrate_init_clock = TClock::now();
void TaskManager :: StopWorkers() void TaskManager :: StopWorkers()
{ {
done = true; done = true;
double delta_tsc = __rdtsc()-calibrate_init_tsc; double delta_tsc = GetTimeCounter()-calibrate_init_tsc;
double delta_sec = std::chrono::duration<double>(TClock::now()-calibrate_init_clock).count(); double delta_sec = std::chrono::duration<double>(TClock::now()-calibrate_init_clock).count();
double frequ = (delta_sec != 0) ? delta_tsc/delta_sec : 2.7e9; double frequ = (delta_sec != 0) ? delta_tsc/delta_sec : 2.7e9;
@ -421,7 +421,11 @@ namespace ngcore
if (workers_on_node[j]) if (workers_on_node[j])
{ {
while (complete[j] != jobnr) while (complete[j] != jobnr)
{
#ifdef NETGEN_ARCH_AMD64
_mm_pause(); _mm_pause();
#endif // NETGEN_ARCH_AMD64
}
} }
func = nullptr; func = nullptr;

View File

@ -8,13 +8,19 @@
#include <sstream> #include <sstream>
#include <string> #include <string>
#include "ngcore_api.hpp" // for NGCORE_API and CPU arch macros
#if defined(__APPLE__) && defined(NETGEN_ARCH_ARM)
#include <mach/mach_time.h>
#endif
#ifdef NETGEN_ARCH_AMD64
#ifdef WIN32 #ifdef WIN32
#include <intrin.h> // for __rdtsc() CPU time step counter #include <intrin.h> // for __rdtsc() CPU time step counter
#else #else
#include <x86intrin.h> // for __rdtsc() CPU time step counter #include <x86intrin.h> // for __rdtsc() CPU time step counter
#endif // WIN32 #endif // WIN32
#endif // NETGEN_ARCH_AMD64
#include "ngcore_api.hpp" // for NGCORE_API
namespace ngcore namespace ngcore
{ {
@ -52,7 +58,16 @@ namespace ngcore
inline TTimePoint GetTimeCounter() noexcept inline TTimePoint GetTimeCounter() noexcept
{ {
return TTimePoint(__rdtsc()); #if defined(__APPLE__) && defined(NETGEN_ARCH_ARM)
return mach_absolute_time();
#elif defined(NETGEN_ARCH_AMD64)
return __rdtsc();
#elif defined(NETGEN_ARCH_ARM)
return __builtin_readcyclecounter();
#else
#warning "Unsupported CPU architecture"
return 0;
#endif
} }
template <class T> template <class T>
@ -161,7 +176,9 @@ namespace ngcore
while (!m.compare_exchange_weak(should, true)) while (!m.compare_exchange_weak(should, true))
{ {
should = false; should = false;
#ifdef NETGEN_ARCH_AMD64
_mm_pause(); _mm_pause();
#endif // NETGEN_ARCH_AMD64
} }
} }
void unlock() void unlock()