#ifndef NETGEN_CORE_SIMD_GENERIC_HPP #define NETGEN_CORE_SIMD_GENERIC_HPP /**************************************************************************/ /* File: simd_base.hpp */ /* Author: Joachim Schoeberl, Matthias Hochsteger */ /* Date: 25. Mar. 16 */ /**************************************************************************/ #include #include #include #include #include "array.hpp" namespace ngcore { #if defined __AVX512F__ #define NETGEN_DEFAULT_SIMD_SIZE 8 #define NETGEN_NATIVE_SIMD_SIZE 8 #elif defined __AVX__ #define NETGEN_DEFAULT_SIMD_SIZE 4 #define NETGEN_NATIVE_SIMD_SIZE 4 #elif defined NETGEN_ARCH_AMD64 #define NETGEN_DEFAULT_SIMD_SIZE 2 #define NETGEN_NATIVE_SIMD_SIZE 2 #else #define NETGEN_DEFAULT_SIMD_SIZE 2 #define NETGEN_NATIVE_SIMD_SIZE 1 #endif constexpr int GetDefaultSIMDSize() { return NETGEN_DEFAULT_SIMD_SIZE; } constexpr bool IsNativeSIMDSize(int n) { if(n==1) return true; #if defined NETGEN_ARCH_AMD64 || defined __SSE__ || defined __aarch64__ if(n==2) return true; #endif #if defined __AVX__ if(n==4) return true; #endif #if defined __AVX512F__ if(n==8) return true; #endif return false; } // split n = k+l such that k is the largest natively supported simd size < n constexpr int GetLargestNativeSIMDPart(int n) { int k = n-1; while(!IsNativeSIMDSize(k)) k--; return k; } template class SIMD; class mask64; //////////////////////////////////////////////////////////////////////////// namespace detail { template auto array_range_impl(std::array const& arr, size_t first, std::index_sequence) -> std::array { return {arr[first + I]...}; } template auto array_range(std::array const& arr, size_t first) { return array_range_impl(arr, first, std::make_index_sequence{}); } } // namespace detail //////////////////////////////////////////////////////////////////////////// // mask template <> class SIMD { int64_t mask; public: SIMD (int64_t i) : mask(i > 0 ? -1 : 0) { ; } bool Data() const { return mask; } static constexpr int Size() { return 1; } auto operator[] (int /* i */) const { return mask; } }; template class alignas(GetLargestNativeSIMDPart(N)*sizeof(int64_t)) SIMD { static constexpr int N1 = GetLargestNativeSIMDPart(N); static constexpr int N2 = N-N1; SIMD lo; SIMD hi; public: SIMD (int64_t i) : lo(i), hi(i-N1 ) { ; } SIMD (SIMD lo_, SIMD hi_) : lo(lo_), hi(hi_) { ; } SIMD Lo() const { return lo; } SIMD Hi() const { return hi; } static constexpr int Size() { return N; } }; template NETGEN_INLINE SIMD operator&& (SIMD a, SIMD b) { if constexpr(N==1) return a.Data() && b.Data(); else return { a.Lo() && b.Lo(), a.Hi() && b.Hi() }; } //////////////////////////////////////////////////////////////////////////// // int64 template<> class SIMD { int64_t data; public: static constexpr int Size() { return 1; } SIMD () {} SIMD (const SIMD &) = default; SIMD & operator= (const SIMD &) = default; SIMD (int val) : data{val} {} SIMD (int64_t val) : data{val} {} SIMD (size_t val) : data(val) {} explicit SIMD (std::array arr) : data{arr[0]} {} int64_t operator[] (int i) const { return ((int64_t*)(&data))[i]; } auto Data() const { return data; } static SIMD FirstInt(int64_t n0=0) { return {n0}; } template int64_t Get() { static_assert(I==0); return data; } }; template class alignas(GetLargestNativeSIMDPart(N)*sizeof(int64_t)) SIMD { static constexpr int N1 = GetLargestNativeSIMDPart(N); static constexpr int N2 = N-N1; SIMD lo; SIMD high; public: static constexpr int Size() { return N; } SIMD () {} SIMD (const SIMD &) = default; SIMD & operator= (const SIMD &) = default; SIMD (int val) : lo{val}, high{val} { ; } SIMD (int64_t val) : lo{val}, high{val} { ; } SIMD (size_t val) : lo{val}, high{val} { ; } SIMD (SIMD lo_, SIMD high_) : lo(lo_), high(high_) { ; } explicit SIMD( std::array arr ) : lo(detail::array_range(arr, 0)), high(detail::array_range(arr, N1)) {} template explicit SIMD(const T... vals) : lo(detail::array_range(std::array{vals...}, 0)), high(detail::array_range(std::array{vals...}, N1)) { static_assert(sizeof...(vals)==N, "wrong number of arguments"); } template>::value, int>::type = 0> SIMD (const T & func) { for(auto i : IntRange(N1)) lo[i] = func(i); for(auto i : IntRange(N2)) high[i] = func(N1+i); } auto Lo() const { return lo; } auto Hi() const { return high; } int64_t operator[] (int i) const { return ((int64_t*)(&lo))[i]; } /* operator tuple () { return tuple((*this)[0], (*this)[1], (*this)[2], (*this)[3]); } */ /* static SIMD FirstInt() { return { 0, 1, 2, 3 }; } */ static SIMD FirstInt(int64_t n0=0) { return {SIMD::FirstInt(n0), SIMD::FirstInt(n0+N1)}; } template int64_t Get() { static_assert(I>=0 && I(); else return high.template Get(); } }; //////////////////////////////////////////////////////////////////////////// // double template<> class SIMD { double data; public: static constexpr int Size() { return 1; } SIMD () {} SIMD (const SIMD &) = default; SIMD & operator= (const SIMD &) = default; SIMD (double val) { data = val; } SIMD (int val) { data = val; } SIMD (size_t val) { data = val; } SIMD (double const * p) { data = *p; } SIMD (double const * p, SIMD mask) { data = mask.Data() ? *p : 0.0; } explicit SIMD (std::array arr) : data{arr[0]} {} template >::value,int>::type = 0> SIMD (const T & func) { data = func(0); } template >::value,int>::type = 0> SIMD & operator= (const T & func) { data = func(0); return *this; } void Store (double * p) { *p = data; } void Store (double * p, SIMD mask) { if (mask.Data()) *p = data; } double operator[] (int i) const { return ((double*)(&data))[i]; } double Data() const { return data; } template double Get() { static_assert(I==0); return data; } }; template class alignas(GetLargestNativeSIMDPart(N)*sizeof(double)) SIMD { static constexpr int N1 = GetLargestNativeSIMDPart(N); static constexpr int N2 = N-N1; SIMD lo; SIMD high; public: static constexpr int Size() { return N; } SIMD () {} SIMD (const SIMD &) = default; SIMD (SIMD lo_, SIMD hi_) : lo(lo_), high(hi_) { ; } template >::value,int>::type = 0> SIMD (const T & func) { double *p = (double*)this; for(auto i : IntRange(N)) p[i] = func(i); } template >::value,int>::type = 0> SIMD & operator= (const T & func) { double *p = (double*)this; for(auto i : IntRange(N)) p[i] = func(i); return *this; } SIMD & operator= (const SIMD &) = default; SIMD (double val) : lo{val}, high{val} { ; } SIMD (int val) : lo{val}, high{val} { ; } SIMD (size_t val) : lo{val}, high{val} { ; } SIMD (double const * p) : lo{p}, high{p+N1} { ; } SIMD (double const * p, SIMD mask) : lo{p, mask.Lo()}, high{p+N1, mask.Hi()} { } SIMD (double * p) : lo{p}, high{p+N1} { ; } SIMD (double * p, SIMD mask) : lo{p, mask.Lo()}, high{p+N1, mask.Hi()} { } explicit SIMD( std::array arr ) : lo(detail::array_range(arr, 0)), high(detail::array_range(arr, N1)) {} template explicit SIMD(const T... vals) : lo(detail::array_range(std::array{vals...}, 0)), high(detail::array_range(std::array{vals...}, N1)) { static_assert(sizeof...(vals)==N, "wrong number of arguments"); } void Store (double * p) { lo.Store(p); high.Store(p+N1); } void Store (double * p, SIMD mask) { lo.Store(p, mask.Lo()); high.Store(p+N1, mask.Hi()); } auto Lo() const { return lo; } auto Hi() const { return high; } double operator[] (int i) const { return ((double*)(&lo))[i]; } template> operator std::tuple () { double *p = (double*)this; return std::tuple(p[0], p[1]); } template> operator std::tuple () { return std::tuple((*this)[0], (*this)[1], (*this)[2], (*this)[3]); } template double Get() { static_assert(I>=0 && I(); else return high.template Get(); } auto Data() const { return *this; } }; // Generic operators for any arithmetic type/simd width template NETGEN_INLINE SIMD operator+ (SIMD a, SIMD b) { if constexpr(N==1) return a.Data()+b.Data(); else return { a.Lo()+b.Lo(), a.Hi()+b.Hi() }; } template NETGEN_INLINE SIMD operator- (SIMD a, SIMD b) { if constexpr(N==1) return a.Data()-b.Data(); else return { a.Lo()-b.Lo(), a.Hi()-b.Hi() }; } template NETGEN_INLINE SIMD operator- (SIMD a) { if constexpr(N==1) return -a.Data(); else return { -a.Lo(), -a.Hi() }; } template NETGEN_INLINE SIMD operator* (SIMD a, SIMD b) { if constexpr(N==1) return a.Data()*b.Data(); else return { a.Lo()*b.Lo(), a.Hi()*b.Hi() }; } template NETGEN_INLINE SIMD operator/ (SIMD a, SIMD b) { if constexpr(N==1) return a.Data()/b.Data(); else return { a.Lo()/b.Lo(), a.Hi()/b.Hi() }; } template NETGEN_INLINE SIMD operator< (SIMD a, SIMD b) { if constexpr(N==1) return a.Data() < b.Data(); else return { a.Lo() NETGEN_INLINE SIMD operator<= (SIMD a, SIMD b) { if constexpr(N==1) return a.Data() <= b.Data(); else return { a.Lo()<=b.Lo(), a.Hi()<=b.Hi() }; } template NETGEN_INLINE SIMD operator> (SIMD a, SIMD b) { if constexpr(N==1) return a.Data() > b.Data(); else return { a.Lo()>b.Lo(), a.Hi()>b.Hi() }; } template NETGEN_INLINE SIMD operator>= (SIMD a, SIMD b) { if constexpr(N==1) return a.Data() >= b.Data(); else return { a.Lo()>=b.Lo(), a.Hi()>=b.Hi() }; } template NETGEN_INLINE SIMD operator== (SIMD a, SIMD b) { if constexpr(N==1) return a.Data() == b.Data(); else return { a.Lo()==b.Lo(), a.Hi()==b.Hi() }; } template NETGEN_INLINE SIMD operator!= (SIMD a, SIMD b) { if constexpr(N==1) return a.Data() != b.Data(); else return { a.Lo()!=b.Lo(), a.Hi()!=b.Hi() }; } // int64_t operators with scalar operand (implement overloads to allow implicit casts for second operand) template NETGEN_INLINE SIMD operator+ (SIMD a, int64_t b) { return a+SIMD(b); } template NETGEN_INLINE SIMD operator+ (int64_t a, SIMD b) { return SIMD(a)+b; } template NETGEN_INLINE SIMD operator- (int64_t a, SIMD b) { return SIMD(a)-b; } template NETGEN_INLINE SIMD operator- (SIMD a, int64_t b) { return a-SIMD(b); } template NETGEN_INLINE SIMD operator* (int64_t a, SIMD b) { return SIMD(a)*b; } template NETGEN_INLINE SIMD operator* (SIMD b, int64_t a) { return SIMD(a)*b; } template NETGEN_INLINE SIMD operator/ (SIMD a, int64_t b) { return a/SIMD(b); } template NETGEN_INLINE SIMD operator/ (int64_t a, SIMD b) { return SIMD(a)/b; } template NETGEN_INLINE SIMD & operator+= (SIMD & a, SIMD b) { a=a+b; return a; } template NETGEN_INLINE SIMD & operator+= (SIMD & a, int64_t b) { a+=SIMD(b); return a; } template NETGEN_INLINE SIMD & operator-= (SIMD & a, SIMD b) { a = a-b; return a; } template NETGEN_INLINE SIMD & operator-= (SIMD & a, int64_t b) { a-=SIMD(b); return a; } template NETGEN_INLINE SIMD & operator*= (SIMD & a, SIMD b) { a=a*b; return a; } template NETGEN_INLINE SIMD & operator*= (SIMD & a, int64_t b) { a*=SIMD(b); return a; } template NETGEN_INLINE SIMD & operator/= (SIMD & a, SIMD b) { a = a/b; return a; } // double operators with scalar operand (implement overloads to allow implicit casts for second operand) template NETGEN_INLINE SIMD operator+ (SIMD a, double b) { return a+SIMD(b); } template NETGEN_INLINE SIMD operator+ (double a, SIMD b) { return SIMD(a)+b; } template NETGEN_INLINE SIMD operator- (double a, SIMD b) { return SIMD(a)-b; } template NETGEN_INLINE SIMD operator- (SIMD a, double b) { return a-SIMD(b); } template NETGEN_INLINE SIMD operator* (double a, SIMD b) { return SIMD(a)*b; } template NETGEN_INLINE SIMD operator* (SIMD b, double a) { return SIMD(a)*b; } template NETGEN_INLINE SIMD operator/ (SIMD a, double b) { return a/SIMD(b); } template NETGEN_INLINE SIMD operator/ (double a, SIMD b) { return SIMD(a)/b; } template NETGEN_INLINE SIMD & operator+= (SIMD & a, SIMD b) { a=a+b; return a; } template NETGEN_INLINE SIMD & operator+= (SIMD & a, double b) { a+=SIMD(b); return a; } template NETGEN_INLINE SIMD & operator-= (SIMD & a, SIMD b) { a = a-b; return a; } template NETGEN_INLINE SIMD & operator-= (SIMD & a, double b) { a-=SIMD(b); return a; } template NETGEN_INLINE SIMD & operator*= (SIMD & a, SIMD b) { a=a*b; return a; } template NETGEN_INLINE SIMD & operator*= (SIMD & a, double b) { a*=SIMD(b); return a; } template NETGEN_INLINE SIMD & operator/= (SIMD & a, SIMD b) { a = a/b; return a; } // double functions template NETGEN_INLINE SIMD L2Norm2 (SIMD a) { return a*a; } template NETGEN_INLINE SIMD Trans (SIMD a) { return a; } template NETGEN_INLINE double HSum (SIMD a) { if constexpr(N==1) return a.Data(); else return HSum(a.Lo()) + HSum(a.Hi()); } template NETGEN_INLINE SIMD IfPos (SIMD a, SIMD b, SIMD c) { if constexpr(N==1) return a.Data()>0.0 ? b : c; else return { IfPos(a.Lo(), b.Lo(), c.Lo()), IfPos(a.Hi(), b.Hi(), c.Hi())}; } template NETGEN_INLINE SIMD IfZero (SIMD a, SIMD b, SIMD c) { if constexpr(N==1) return a.Data()==0.0 ? b : c; else return { IfZero(a.Lo(), b.Lo(), c.Lo()), IfZero(a.Hi(), b.Hi(), c.Hi())}; } template NETGEN_INLINE SIMD If (SIMD a, SIMD b, SIMD c) { if constexpr(N==1) return a.Data() ? b : c; else return { If(a.Lo(), b.Lo(), c.Lo()), If(a.Hi(), b.Hi(), c.Hi())}; } // a*b+c template NETGEN_INLINE auto FMA(T1 a, T2 b, T3 c) { return c+a*b; } template NETGEN_INLINE auto FNMA(T1 a, T2 b, T3 c) { return c-a*b; } // update form of fma template void FMAasm (SIMD a, SIMD b, SIMD & sum) { sum = FMA(a,b,sum); } // update form of fms template void FNMAasm (SIMD a, SIMD b, SIMD & sum) { // sum -= a*b; sum = FNMA(a,b,sum); } template T get(SIMD a) { return a.template Get(); } template NETGEN_INLINE void Iterate2 (FUNC f) { if constexpr (NUM > 1) Iterate2 (f); if constexpr (NUM >= 1) f(std::integral_constant()); } template ostream & operator<< (ostream & ost, SIMD simd) { /* ost << simd[0]; for (int i = 1; i < simd.Size(); i++) ost << " " << simd[i]; */ Iterate2 ([&] (auto I) { if (I.value != 0) ost << " "; ost << get(simd); }); return ost; } using std::sqrt; template NETGEN_INLINE ngcore::SIMD sqrt (ngcore::SIMD a) { return ngcore::SIMD([a](int i)->double { return sqrt(a[i]); } ); } using std::fabs; template NETGEN_INLINE ngcore::SIMD fabs (ngcore::SIMD a) { return ngcore::SIMD([a](int i)->double { return fabs(a[i]); } ); } using std::floor; template NETGEN_INLINE ngcore::SIMD floor (ngcore::SIMD a) { return ngcore::SIMD([a](int i)->double { return floor(a[i]); } ); } using std::ceil; template NETGEN_INLINE ngcore::SIMD ceil (ngcore::SIMD a) { return ngcore::SIMD([a](int i)->double { return ceil(a[i]); } ); } using std::exp; template NETGEN_INLINE ngcore::SIMD exp (ngcore::SIMD a) { return ngcore::SIMD([a](int i)->double { return exp(a[i]); } ); } using std::log; template NETGEN_INLINE ngcore::SIMD log (ngcore::SIMD a) { return ngcore::SIMD([a](int i)->double { return log(a[i]); } ); } using std::erf; template NETGEN_INLINE ngcore::SIMD erf (ngcore::SIMD a) { return ngcore::SIMD([a](int i)->double { return erf(a[i]); } ); } using std::pow; template NETGEN_INLINE ngcore::SIMD pow (ngcore::SIMD a, double x) { return ngcore::SIMD([a,x](int i)->double { return pow(a[i],x); } ); } template NETGEN_INLINE ngcore::SIMD pow (ngcore::SIMD a, ngcore::SIMD b) { return ngcore::SIMD([a,b](int i)->double { return pow(a[i],b[i]); } ); } using std::sin; template NETGEN_INLINE ngcore::SIMD sin (ngcore::SIMD a) { return ngcore::SIMD([a](int i)->double { return sin(a[i]); } ); } using std::cos; template NETGEN_INLINE ngcore::SIMD cos (ngcore::SIMD a) { return ngcore::SIMD([a](int i)->double { return cos(a[i]); } ); } using std::tan; template NETGEN_INLINE ngcore::SIMD tan (ngcore::SIMD a) { return ngcore::SIMD([a](int i)->double { return tan(a[i]); } ); } using std::atan; template NETGEN_INLINE ngcore::SIMD atan (ngcore::SIMD a) { return ngcore::SIMD([a](int i)->double { return atan(a[i]); } ); } using std::atan2; template NETGEN_INLINE ngcore::SIMD atan2 (ngcore::SIMD y, ngcore::SIMD x) { return ngcore::SIMD([y,x](int i)->double { return atan2(y[i], x[i]); } ); } using std::acos; template NETGEN_INLINE ngcore::SIMD acos (ngcore::SIMD a) { return ngcore::SIMD([a](int i)->double { return acos(a[i]); } ); } using std::asin; template NETGEN_INLINE ngcore::SIMD asin (ngcore::SIMD a) { return ngcore::SIMD([a](int i)->double { return asin(a[i]); } ); } using std::sinh; template NETGEN_INLINE ngcore::SIMD sinh (ngcore::SIMD a) { return ngcore::SIMD([a](int i)->double { return sinh(a[i]); } ); } using std::cosh; template NETGEN_INLINE ngcore::SIMD cosh (ngcore::SIMD a) { return ngcore::SIMD([a](int i)->double { return cosh(a[i]); } ); } template using MultiSIMD = SIMD; template NETGEN_INLINE auto Unpack (SIMD a, SIMD b) { if constexpr(N==1) { return std::make_tuple(SIMD{a.Data()}, SIMD{b.Data()} ); } else if constexpr(N==2) { return std::make_tuple(SIMD{ a.Lo(), b.Lo() }, SIMD{ a.Hi(), b.Hi() }); } else { auto [a1,b1] = Unpack(a.Lo(), b.Lo()); auto [a2,b2] = Unpack(a.Hi(), b.Hi()); return std::make_tuple(SIMD{ a1, a2 }, SIMD{ b1, b2 }); } } // TODO: specialize for AVX, ... template NETGEN_INLINE auto SwapPairs (SIMD a) { if constexpr(N==1) { // static_assert(false); return a; } else if constexpr(N==2) { return SIMD (a.Hi(), a.Lo()); } else { return SIMD (SwapPairs(a.Lo()), SwapPairs(a.Hi())); } } template NETGEN_INLINE auto HSum128 (SIMD a) { if constexpr(N==1) { // static_assert(false); return a; } else if constexpr(N==2) { return a; } else { return HSum128(a.Lo()) + HSum128(a.Hi()); } } // TODO: specialize for AVX, ... // a*b+-c (even: -, odd: +) template NETGEN_INLINE auto FMAddSub (SIMD a, SIMD b, SIMD c) { if constexpr(N==1) { // static_assert(false); return a*b-c; } else if constexpr(N==2) { return SIMD (a.Lo()*b.Lo()-c.Lo(), a.Hi()*b.Hi()+c.Hi()); } else { return SIMD (FMAddSub(a.Lo(), b.Lo(), c.Lo()), FMAddSub(a.Hi(), b.Hi(), c.Hi())); } } } namespace std { // structured binding support template struct tuple_size> : std::integral_constant {}; template struct tuple_element> { using type = T; }; } #endif // NETGEN_CORE_SIMD_GENERIC_HPP