#include "arm_neon.h"

#include <array>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <tuple>
#include <type_traits>

// NOTE(review): the reviewed text had every template-argument list removed
// (e.g. `template <> class SIMD {`). The arguments below were reconstructed
// from the member types and from how the lanes are used; confirm them against
// the primary SIMD template and the mask tag type declared elsewhere.

namespace ngcore
{

  /// Two-lane mask. Each lane is stored as a 64-bit integer that the
  /// constructors below set to -1 (lane selected) or 0 (lane not selected).
  template <>
  class SIMD<mask64,2>
  {
    int64x2_t mask;

  public:
    /// Selects the first `i` lanes: lane 0 if i > 0, lane 1 if i > 1.
    SIMD (int i)
    {
      mask[0] = i > 0 ? -1 : 0;
      mask[1] = i > 1 ? -1 : 0;
    }

    /// Builds the mask lane by lane from two booleans.
    SIMD (bool i0, bool i1)
    {
      mask[0] = i0 ? -1 : 0;
      mask[1] = i1 ? -1 : 0;
    }

    /// Concatenates two single-lane masks (element 0 of each).
    // NOTE(review): parameter type reconstructed as SIMD<mask64,1> - confirm.
    SIMD (SIMD<mask64,1> i0, SIMD<mask64,1> i1)
    {
      mask[0] = i0[0];
      mask[1] = i1[0];
    }

    /// Adopts the raw bits of a NEON register as the mask.
    /// The reinterpretation is spelled out so that no implicit
    /// float-vector to integer-vector conversion is needed.
    SIMD (float64x2_t _data) : mask(vreinterpretq_s64_f64(_data)) { }

    /// Returns the underlying int64x2_t register.
    auto Data() const { return mask; }

    /// Number of lanes.
    static constexpr int Size() { return 2; }

    // static NETGEN_INLINE SIMD<mask64,2> GetMaskFromBits (unsigned int i);

    /// Returns lane `i` as a 64-bit integer (no bounds check).
    int64_t operator[] (int i) const { return mask[i]; }

    /// Lane 0.
    auto Lo() const { return mask[0]; }
    /// Lane 1.
    auto Hi() const { return mask[1]; }
  };


  /// Two doubles held in one float64x2_t register.
  template<>
  class SIMD<double,2>
  {
    float64x2_t data;

  public:
    /// Number of lanes.
    static constexpr int Size() { return 2; }

    /// Leaves the register uninitialized.
    SIMD () {}
    SIMD (const SIMD &) = default;
    SIMD & operator= (const SIMD &) = default;

    /// Lane 0 = v0, lane 1 = v1.
    SIMD (double v0, double v1)
      : data{vcombine_f64(float64x1_t{v0}, float64x1_t{v1})} { }

    /// Lane i = arr[i].
    SIMD (std::array<double,2> arr) : data{arr[0], arr[1]} { }

    /// Broadcasts a scalar to both lanes.
    SIMD (double val) : data{val, val} { }
    SIMD (int val)    : data{double(val), double(val)} { }
    SIMD (size_t val) : data{double(val), double(val)} { }

    /// Loads p[0] and p[1].
    SIMD (double const * p)
    {
      data = vld1q_f64(p);
    }

    /// Masked load: p[i] is read only when lane i of `mask` is non-zero;
    /// other lanes are set to 0.
    SIMD (double const * p, SIMD<mask64,2> mask)
    {
      data[0] = mask[0] ? p[0] : 0;
      data[1] = mask[1] ? p[1] : 0;
    }

    /// Wraps an existing NEON register.
    SIMD (float64x2_t _data) { data = _data; }

    /// Fills lane i with func(i).
    // NOTE(review): the enable_if guard was truncated in the reviewed text;
    // reconstructed as "T is convertible to std::function<double(int)>" - confirm.
    template<typename T,
             typename std::enable_if<
               std::is_convertible<T, std::function<double(int)>>::value,
               int>::type = 0>
    SIMD (const T & func)
    {
      data[0] = func(0);
      data[1] = func(1);
    }

    /// Stores both lanes to p[0], p[1].
    void Store (double * p)
    {
      vst1q_f64(p, data);
    }

    /// Masked store: p[i] is written only when lane i of `mask` is non-zero.
    void Store (double * p, SIMD<mask64,2> mask)
    {
      if (mask[0]) p[0] = data[0];
      if (mask[1]) p[1] = data[1];
    }

    /// Returns lane `i` by value (no bounds check).
    NETGEN_INLINE double operator[] (int i) const { return data[i]; }

    /// Underlying register, by value and by reference.
    NETGEN_INLINE auto Data() const { return data; }
    NETGEN_INLINE auto & Data() { return data; }

    /// Exposes both lanes as a tuple built from the register's storage.
    // NOTE(review): tuple element types reconstructed as double& - confirm.
    operator std::tuple<double&,double&> ()
    {
      // Goes through a pointer to the register's storage, as the original did.
      auto pdata = reinterpret_cast<double*>(&data);
      return std::tuple<double&,double&>(pdata[0], pdata[1]);
    }

    /// Lane 0.
    double Lo() const { return data[0]; }
    /// Lane 1.
    double Hi() const { return data[1]; }
  };


  /// Sum of the two lanes of `sd`.
  NETGEN_INLINE double HSum (SIMD<double,2> sd)
  {
    return sd[0] + sd[1];
  }

  /// Lane 0 = horizontal sum of `a`, lane 1 = horizontal sum of `b`.
  NETGEN_INLINE SIMD<double,2> HSum (SIMD<double,2> a, SIMD<double,2> b)
  {
    return SIMD<double,2> (a[0] + a[1], b[0] + b[1]);
  }

  /// a*b+c
  // NOTE(review): vmlaq_f64 is the multiply-accumulate intrinsic; vfmaq_f64 is
  // the explicitly fused variant. Left unchanged to keep results identical -
  // confirm which rounding behaviour is intended.
  NETGEN_INLINE SIMD<double,2> FMA (SIMD<double,2> a, SIMD<double,2> b, SIMD<double,2> c)
  {
    return vmlaq_f64(c.Data(), a.Data(), b.Data());
  }

  /// a*b+c with the scalar `a` broadcast to both lanes.
  NETGEN_INLINE SIMD<double,2> FMA (const double & a, SIMD<double,2> b, SIMD<double,2> c)
  {
    return FMA(SIMD<double,2> (a), b, c);
  }

  /// -a*b+c
  NETGEN_INLINE SIMD<double,2> FNMA (SIMD<double,2> a, SIMD<double,2> b, SIMD<double,2> c)
  {
    return vmlsq_f64(c.Data(), a.Data(), b.Data());
  }

  /// -a*b+c with the scalar `a` broadcast to both lanes.
  NETGEN_INLINE SIMD<double,2> FNMA (const double & a, SIMD<double,2> b, SIMD<double,2> c)
  {
    return FNMA(SIMD<double,2> (a), b, c);
  }

  /// Lane-wise arithmetic on the underlying registers.
  NETGEN_INLINE SIMD<double,2> operator+ (SIMD<double,2> a, SIMD<double,2> b)
  { return a.Data() + b.Data(); }

  NETGEN_INLINE SIMD<double,2> operator- (SIMD<double,2> a, SIMD<double,2> b)
  { return a.Data() - b.Data(); }

  NETGEN_INLINE SIMD<double,2> operator- (SIMD<double,2> a)
  { return -a.Data(); }

  NETGEN_INLINE SIMD<double,2> operator* (SIMD<double,2> a, SIMD<double,2> b)
  { return a.Data() * b.Data(); }

  NETGEN_INLINE SIMD<double,2> operator/ (SIMD<double,2> a, SIMD<double,2> b)
  { return a.Data() / b.Data(); }

  /// Bitwise select: bits set in `a` take the bit from `b`, clear bits take it from `c`.
  NETGEN_INLINE SIMD<double,2> If (SIMD<mask64,2> a, SIMD<double,2> b, SIMD<double,2> c)
  {
    // vbslq_f64 takes an unsigned selector; reinterpret the signed mask
    // explicitly instead of relying on an implicit vector conversion.
    return vbslq_f64(vreinterpretq_u64_s64(a.Data()), b.Data(), c.Data());
  }

  /// Per-lane select on integer lanes: b[i] if a[i] is non-zero, else c[i].
  // NOTE(review): element type reconstructed as int64_t (this overload must
  // differ from the double one above) - confirm.
  NETGEN_INLINE SIMD<int64_t,2> If (SIMD<mask64,2> a, SIMD<int64_t,2> b, SIMD<int64_t,2> c)
  {
    return SIMD<int64_t,2> (a[0] ? b[0] : c[0], a[1] ? b[1] : c[1]);
  }

  /// Bitwise AND of two masks.
  NETGEN_INLINE SIMD<mask64,2> operator&& (SIMD<mask64,2> a, SIMD<mask64,2> b)
  {
    // AND in the signed domain the mask is stored in, then hand the bits to
    // the float64x2_t constructor through an explicit reinterpret.
    return SIMD<mask64,2> (vreinterpretq_f64_s64(vandq_s64(a.Data(), b.Data())));
  }

}