diff --git a/libsrc/core/CMakeLists.txt b/libsrc/core/CMakeLists.txt index c3eba6a5..8c8506fe 100644 --- a/libsrc/core/CMakeLists.txt +++ b/libsrc/core/CMakeLists.txt @@ -73,6 +73,7 @@ install(FILES ngcore.hpp archive.hpp type_traits.hpp version.hpp ngcore_api.hpp exception.hpp symboltable.hpp paje_trace.hpp utils.hpp profiler.hpp mpi_wrapper.hpp array.hpp taskmanager.hpp concurrentqueue.h localheap.hpp python_ngcore.hpp flags.hpp xbool.hpp signal.hpp bitarray.hpp table.hpp hashtable.hpp ranges.hpp + simd.hpp simd_avx.hpp simd_avx512.hpp simd_generic.hpp simd_sse.hpp DESTINATION ${NG_INSTALL_DIR_INCLUDE}/core COMPONENT netgen_devel) if(ENABLE_CPP_CORE_GUIDELINES_CHECK) diff --git a/libsrc/core/ngcore.hpp b/libsrc/core/ngcore.hpp index 72ebde25..37ce696a 100644 --- a/libsrc/core/ngcore.hpp +++ b/libsrc/core/ngcore.hpp @@ -13,6 +13,7 @@ #include "mpi_wrapper.hpp" #include "profiler.hpp" #include "signal.hpp" +#include "simd.hpp" #include "symboltable.hpp" #include "taskmanager.hpp" #include "version.hpp" diff --git a/libsrc/core/simd.hpp b/libsrc/core/simd.hpp new file mode 100644 index 00000000..277dd851 --- /dev/null +++ b/libsrc/core/simd.hpp @@ -0,0 +1,69 @@ +#ifndef NETGEN_CORE_SIMD_HPP +#define NETGEN_CORE_SIMD_HPP + +/**************************************************************************/ +/* File: simd.hpp */ +/* Author: Joachim Schoeberl, Matthias Hochsteger */ +/* Date: 25. Mar. 16 */ +/**************************************************************************/ + +#include "ngcore_api.hpp" + +#include "simd_generic.hpp" + +#if (defined(_M_AMD64) || defined(_M_X64) || defined(__SSE__)) +#ifndef __SSE__ +#define __SSE__ +#endif +#include "simd_sse.hpp" +#endif + +#ifdef __AVX__ +#include "simd_avx.hpp" +#endif + +#ifdef __AVX512F__ +#include "simd_avx512.hpp" +#endif + +namespace ngcore +{ + NETGEN_INLINE auto HSum (SIMD v1, SIMD v2, SIMD v3, SIMD v4) + { + SIMD hsum1 = my_mm_hadd_pd (v1.Data(), v2.Data()); + SIMD hsum2 = my_mm_hadd_pd (v3.Data(), v4.Data()); + return SIMD (hsum1, hsum2); + } + + + NETGEN_INLINE void SIMDTranspose (SIMD a1, SIMD a2, SIMD a3, SIMD a4, + SIMD & b1, SIMD & b2, SIMD & b3, SIMD & b4) + { + SIMD h1,h2,h3,h4; + std::tie(h1,h2) = Unpack(a1,a2); + std::tie(h3,h4) = Unpack(a3,a4); + b1 = SIMD (h1.Lo(), h3.Lo()); + b2 = SIMD (h2.Lo(), h4.Lo()); + b3 = SIMD (h1.Hi(), h3.Hi()); + b4 = SIMD (h2.Hi(), h4.Hi()); + } + + template + NETGEN_INLINE auto HSum (SIMD s1, SIMD s2) + { + return SIMD(HSum(s1), HSum(s2)); + } + + template + NETGEN_INLINE auto HSum (SIMD s1, SIMD s2, SIMD s3, SIMD s4 ) + { + return SIMD(HSum(s1), HSum(s2), HSum(s3), HSum(s4)); + } + + NETGEN_INLINE auto GetMaskFromBits( unsigned int i ) + { + return SIMD::GetMaskFromBits(i); + } +} + +#endif // NETGEN_CORE_SIMD_HPP diff --git a/libsrc/core/simd_avx.hpp b/libsrc/core/simd_avx.hpp new file mode 100644 index 00000000..f089a0b2 --- /dev/null +++ b/libsrc/core/simd_avx.hpp @@ -0,0 +1,256 @@ +#ifndef NETGEN_CORE_SIMD_AVX_HPP +#define NETGEN_CORE_SIMD_AVX_HPP + +/**************************************************************************/ +/* File: simd_avx.hpp */ +/* Author: Joachim Schoeberl, Matthias Hochsteger */ +/* Date: 25. Mar. 16 */ +/**************************************************************************/ + +#include + +namespace ngcore +{ +#if defined(__AVX2__) + NETGEN_INLINE __m256i my_mm256_cmpgt_epi64 (__m256i a, __m256i b) + { + return _mm256_cmpgt_epi64 (a,b); + } +#else + NETGEN_INLINE __m256i my_mm256_cmpgt_epi64 (__m256i a, __m256i b) + { + __m128i rlo = _mm_cmpgt_epi64(_mm256_extractf128_si256(a, 0), + _mm256_extractf128_si256(b, 0)); + __m128i rhi = _mm_cmpgt_epi64(_mm256_extractf128_si256(a, 1), + _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256 (_mm256_castsi128_si256(rlo), rhi, 1); + } +#endif + + + template <> + class SIMD + { + __m256i mask; + public: + SIMD (size_t i) + : mask(my_mm256_cmpgt_epi64(_mm256_set1_epi64x(i), + _mm256_set_epi64x(3, 2, 1, 0))) + { ; } + SIMD (__m256i _mask) : mask(_mask) { ; } + SIMD (__m256d _mask) : mask(_mm256_castpd_si256(_mask)) { ; } + __m256i Data() const { return mask; } + static constexpr int Size() { return 4; } + static SIMD GetMaskFromBits (unsigned int i); + }; + + static SIMD masks_from_4bits[16] = { + _mm256_set_epi64x (0,0,0,0), _mm256_set_epi64x (0,0,0,-1), + _mm256_set_epi64x (0,0,-1,0), _mm256_set_epi64x (0,0,-1,-1), + _mm256_set_epi64x (0,-1,0,0), _mm256_set_epi64x (0,-1,0,-1), + _mm256_set_epi64x (0,-1,-1,0), _mm256_set_epi64x (0,-1,-1,-1), + _mm256_set_epi64x (-1,0,0,0), _mm256_set_epi64x (-1,0,0,-1), + _mm256_set_epi64x (-1,0,-1,0), _mm256_set_epi64x (-1,0,-1,-1), + _mm256_set_epi64x (-1,-1,0,0), _mm256_set_epi64x (-1,-1,0,-1), + _mm256_set_epi64x (-1,-1,-1,0), _mm256_set_epi64x (-1,-1,-1,-1) + }; + + NETGEN_INLINE SIMD SIMD :: GetMaskFromBits (unsigned int i) + { + return masks_from_4bits[i & 15]; + } + + template<> + class alignas(32) SIMD + { + __m256i data; + + public: + static constexpr int Size() { return 4; } + SIMD () {} + SIMD (const SIMD &) = default; + SIMD & operator= (const SIMD &) = default; + + SIMD (int64_t val) { data = _mm256_set1_epi64x(val); } + SIMD (int64_t v0, int64_t v1, int64_t v2, int64_t v3) { data = _mm256_set_epi64x(v3,v2,v1,v0); } + SIMD (std::array a) + : data{_mm256_set_epi64x(a[3],a[2],a[1],a[0])} + {} + SIMD (SIMD v0, SIMD v1) + : data(_mm256_set_m128i(v0.Data(),v1.Data())) + {} + SIMD (__m256i _data) { data = _data; } + + NETGEN_INLINE auto operator[] (int i) const { return ((int64_t*)(&data))[i]; } + NETGEN_INLINE __m256i Data() const { return data; } + NETGEN_INLINE __m256i & Data() { return data; } + + SIMD Lo() const { return _mm256_extractf128_si256(data, 0); } + SIMD Hi() const { return _mm256_extractf128_si256(data, 1); } + static SIMD FirstInt(int n0=0) { return { n0+0, n0+1, n0+2, n0+3 }; } + }; + + + NETGEN_INLINE SIMD operator-(SIMD a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a.Data()); } + +#ifdef __AVX2__ + NETGEN_INLINE SIMD operator+ (SIMD a, SIMD b) { return _mm256_add_epi64(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator- (SIMD a, SIMD b) { return _mm256_sub_epi64(a.Data(),b.Data()); } +#endif // __AVX2__ + + template<> + class alignas(32) SIMD + { + __m256d data; + + public: + static constexpr int Size() { return 4; } + SIMD () {} + SIMD (const SIMD &) = default; + SIMD & operator= (const SIMD &) = default; + + SIMD (double val) { data = _mm256_set1_pd(val); } + SIMD (int val) { data = _mm256_set1_pd(val); } + SIMD (size_t val) { data = _mm256_set1_pd(val); } + SIMD (double v0, double v1, double v2, double v3) { data = _mm256_set_pd(v3,v2,v1,v0); } + SIMD (SIMD v0, SIMD v1) : SIMD(v0[0], v0[1], v1[0], v1[1]) { ; } + SIMD (double const * p) { data = _mm256_loadu_pd(p); } + SIMD (double const * p, SIMD mask) { data = _mm256_maskload_pd(p, mask.Data()); } + SIMD (__m256d _data) { data = _data; } + SIMD (std::array a) + : data{_mm256_set_pd(a[3],a[2],a[1],a[0])} + {} + + void Store (double * p) { _mm256_storeu_pd(p, data); } + void Store (double * p, SIMD mask) { _mm256_maskstore_pd(p, mask.Data(), data); } + + template>::value, int>::type = 0> + SIMD (const T & func) + { + data = _mm256_set_pd(func(3), func(2), func(1), func(0)); + } + + NETGEN_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; } + NETGEN_INLINE double & operator[] (int i) { return ((double*)(&data))[i]; } + // [[deprecated("don't write to individual elments of SIMD")]] + // NETGEN_INLINE double & operator[] (int i) { return ((double*)(&data))[i]; } + template + double Get() const { return ((double*)(&data))[I]; } + NETGEN_INLINE __m256d Data() const { return data; } + NETGEN_INLINE __m256d & Data() { return data; } + + SIMD Lo() const { return _mm256_extractf128_pd(data, 0); } + SIMD Hi() const { return _mm256_extractf128_pd(data, 1); } + + operator std::tuple () + { return std::tuple((*this)[0], (*this)[1], (*this)[2], (*this)[3]); } + }; + + NETGEN_INLINE auto Unpack (SIMD a, SIMD b) + { + return std::make_tuple(SIMD(_mm256_unpacklo_pd(a.Data(),b.Data())), + SIMD(_mm256_unpackhi_pd(a.Data(),b.Data()))); + } + + NETGEN_INLINE SIMD operator- (SIMD a) { return _mm256_xor_pd(a.Data(), _mm256_set1_pd(-0.0)); } + NETGEN_INLINE SIMD operator+ (SIMD a, SIMD b) { return _mm256_add_pd(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator- (SIMD a, SIMD b) { return _mm256_sub_pd(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator* (SIMD a, SIMD b) { return _mm256_mul_pd(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator/ (SIMD a, SIMD b) { return _mm256_div_pd(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator* (double a, SIMD b) { return _mm256_set1_pd(a)*b.Data(); } + NETGEN_INLINE SIMD operator* (SIMD b, double a) { return _mm256_set1_pd(a)*b.Data(); } + + NETGEN_INLINE SIMD sqrt (SIMD a) { return _mm256_sqrt_pd(a.Data()); } + NETGEN_INLINE SIMD floor (SIMD a) { return _mm256_floor_pd(a.Data()); } + NETGEN_INLINE SIMD ceil (SIMD a) { return _mm256_ceil_pd(a.Data()); } + NETGEN_INLINE SIMD fabs (SIMD a) { return _mm256_max_pd(a.Data(), (-a).Data()); } + + NETGEN_INLINE SIMD operator<= (SIMD a , SIMD b) + { return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_LE_OQ); } + NETGEN_INLINE SIMD operator< (SIMD a , SIMD b) + { return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_LT_OQ); } + NETGEN_INLINE SIMD operator>= (SIMD a , SIMD b) + { return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_GE_OQ); } + NETGEN_INLINE SIMD operator> (SIMD a , SIMD b) + { return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_GT_OQ); } + NETGEN_INLINE SIMD operator== (SIMD a , SIMD b) + { return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_EQ_OQ); } + NETGEN_INLINE SIMD operator!= (SIMD a , SIMD b) + { return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_NEQ_OQ); } + + NETGEN_INLINE SIMD operator<= (SIMD a , SIMD b) + { return _mm256_xor_si256(_mm256_cmpgt_epi64(a.Data(),b.Data()),_mm256_set1_epi32(-1)); } + NETGEN_INLINE SIMD operator< (SIMD a , SIMD b) + { return my_mm256_cmpgt_epi64(b.Data(),a.Data()); } + NETGEN_INLINE SIMD operator>= (SIMD a , SIMD b) + { return _mm256_xor_si256(_mm256_cmpgt_epi64(b.Data(),a.Data()),_mm256_set1_epi32(-1)); } + NETGEN_INLINE SIMD operator> (SIMD a , SIMD b) + { return my_mm256_cmpgt_epi64(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator== (SIMD a , SIMD b) + { return _mm256_cmpeq_epi64(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator!= (SIMD a , SIMD b) + { return _mm256_xor_si256(_mm256_cmpeq_epi64(a.Data(),b.Data()),_mm256_set1_epi32(-1)); } + +#ifdef __AVX2__ + NETGEN_INLINE SIMD operator&& (SIMD a, SIMD b) + { return _mm256_and_si256 (a.Data(), b.Data()); } + NETGEN_INLINE SIMD operator|| (SIMD a, SIMD b) + { return _mm256_or_si256 (a.Data(), b.Data()); } + NETGEN_INLINE SIMD operator! (SIMD a) + { return _mm256_xor_si256 (a.Data(), _mm256_cmpeq_epi64(a.Data(),a.Data())); } +#else //AVX2 is a superset of AVX. Without it, it is necessary to reinterpret the types + NETGEN_INLINE SIMD operator&& (SIMD a, SIMD b) + { return _mm256_castpd_si256(_mm256_and_pd (_mm256_castsi256_pd(a.Data()),_mm256_castsi256_pd( b.Data()))); } + NETGEN_INLINE SIMD operator|| (SIMD a, SIMD b) + { return _mm256_castpd_si256(_mm256_or_pd (_mm256_castsi256_pd(a.Data()), _mm256_castsi256_pd(b.Data()))); } + NETGEN_INLINE SIMD operator! (SIMD a) + { return _mm256_castpd_si256(_mm256_xor_pd (_mm256_castsi256_pd(a.Data()),_mm256_castsi256_pd( _mm256_cmpeq_epi64(a.Data(),a.Data())))); } +#endif + NETGEN_INLINE SIMD If (SIMD a, SIMD b, SIMD c) + { return _mm256_blendv_pd(c.Data(), b.Data(), _mm256_castsi256_pd(a.Data())); } + + NETGEN_INLINE SIMD IfPos (SIMD a, SIMD b, SIMD c) + { + auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_GT_OS); + return _mm256_blendv_pd(c.Data(), b.Data(), cp); + } + + NETGEN_INLINE SIMD IfZero (SIMD a, SIMD b, SIMD c) + { + auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_EQ_OS); + return _mm256_blendv_pd(c.Data(), b.Data(), cp); + } + + NETGEN_INLINE double HSum (SIMD sd) + { + // __m128d hv = _mm_add_pd (_mm256_extractf128_pd(sd.Data(),0), _mm256_extractf128_pd(sd.Data(),1)); + __m128d hv = (sd.Lo()+sd.Hi()).Data(); + return _mm_cvtsd_f64 (_mm_hadd_pd (hv, hv)); + } + + NETGEN_INLINE auto HSum (SIMD sd1, SIMD sd2) + { + __m256d hv = _mm256_hadd_pd(sd1.Data(), sd2.Data()); + __m128d hv2 = _mm_add_pd (_mm256_extractf128_pd(hv,0), _mm256_extractf128_pd(hv,1)); + return SIMD(_mm_cvtsd_f64 (hv2), _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3))); + } + + NETGEN_INLINE auto HSum (SIMD v1, SIMD v2, SIMD v3, SIMD v4) + { + __m256d hsum1 = _mm256_hadd_pd (v1.Data(), v2.Data()); + __m256d hsum2 = _mm256_hadd_pd (v3.Data(), v4.Data()); + SIMD hsum = _mm256_add_pd (_mm256_permute2f128_pd (hsum1, hsum2, 1+2*16), + _mm256_blend_pd (hsum1, hsum2, 12)); + return hsum; + // return make_tuple(hsum[0], hsum[1], hsum[2], hsum[3]); + } + + + NETGEN_INLINE SIMD If (SIMD a, SIMD b, SIMD c) + { return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(c.Data()), _mm256_castsi256_pd(b.Data()), + _mm256_castsi256_pd(a.Data()))); } + +} + +#endif // NETGEN_CORE_SIMD_AVX_HPP + diff --git a/libsrc/core/simd_avx512.hpp b/libsrc/core/simd_avx512.hpp new file mode 100644 index 00000000..e453b7e4 --- /dev/null +++ b/libsrc/core/simd_avx512.hpp @@ -0,0 +1,239 @@ +#ifndef NETGEN_CORE_SIMD_AVX512_HPP +#define NETGEN_CORE_SIMD_AVX512_HPP + +/**************************************************************************/ +/* File: simd_avx512.hpp */ +/* Author: Joachim Schoeberl, Matthias Hochsteger */ +/* Date: 25. Mar. 16 */ +/**************************************************************************/ + +#include + +namespace ngcore +{ + + template <> + class SIMD + { + __mmask8 mask; + public: + SIMD (size_t i) + : mask(_mm512_cmpgt_epi64_mask(_mm512_set1_epi64(i), + _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0))) + { ; } + SIMD (int i) + : mask(_mm512_cmpgt_epi64_mask(_mm512_set1_epi64(i), + _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0))) + { ; } + SIMD (int64_t i) + : mask(_mm512_cmpgt_epi64_mask(_mm512_set1_epi64(i), + _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0))) + { ; } + SIMD (__mmask8 _mask) : mask(_mask) { ; } + __mmask8 Data() const { return mask; } + static constexpr int Size() { return 8; } + static NETGEN_INLINE SIMD GetMaskFromBits (unsigned int i) + { + return SIMD(__mmask8(i)); + } + }; + + template<> + class alignas(64) SIMD + { + __m512i data; + + public: + static constexpr int Size() { return 8; } + SIMD () {} + SIMD (const SIMD &) = default; + SIMD & operator= (const SIMD &) = default; + + SIMD (int64_t val) { data = _mm512_set1_epi64(val); } + SIMD (int64_t v0, int64_t v1, int64_t v2, int64_t v3, int64_t v4, int64_t v5, int64_t v6, int64_t v7) { data = _mm512_set_epi64(v7,v6,v5,v4,v3,v2,v1,v0); } + SIMD (__m512i _data) { data = _data; } + + template>::value, int>::type = 0> + SIMD (const T & func) + { + data = _mm512_set_epi64(func(7), func(6), func(5), func(4), func(3), func(2), func(1), func(0)); + } + + + NETGEN_INLINE auto operator[] (int i) const { return ((int64_t*)(&data))[i]; } + NETGEN_INLINE __m512i Data() const { return data; } + NETGEN_INLINE __m512i & Data() { return data; } + static SIMD FirstInt() { return { 0, 1, 2, 3, 4, 5, 6, 7 }; } + }; + + NETGEN_INLINE SIMD operator-(SIMD a) { return _mm512_sub_epi64(_mm512_setzero_si512(), a.Data()); } + + NETGEN_INLINE SIMD operator+ (SIMD a, SIMD b) { return _mm512_add_epi64(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator- (SIMD a, SIMD b) { return _mm512_sub_epi64(a.Data(),b.Data()); } + + NETGEN_INLINE SIMD If (SIMD a, SIMD b, SIMD c) + { return _mm512_mask_blend_epi64(a.Data(), c.Data(), b.Data()); } + + + template<> + class alignas(64) SIMD + { + __m512d data; + public: + static constexpr int Size() { return 8; } + SIMD () {} + SIMD (const SIMD &) = default; + SIMD & operator= (const SIMD &) = default; + + SIMD (double val) { data = _mm512_set1_pd(val); } + SIMD (int val) { data = _mm512_set1_pd(val); } + SIMD (size_t val) { data = _mm512_set1_pd(val); } + SIMD (double const * p) { data = _mm512_loadu_pd(p); } + SIMD (double const * p, SIMD mask) + { data = _mm512_mask_loadu_pd(_mm512_setzero_pd(), mask.Data(), p); } + SIMD (__m512d _data) { data = _data; } + + template>::value, int>::type = 0> + SIMD (const T & func) + { + data = _mm512_set_pd(func(7), func(6), func(5), func(4), func(3), func(2), func(1), func(0)); + } + + void Store (double * p) { _mm512_storeu_pd(p, data); } + void Store (double * p, SIMD mask) { _mm512_mask_storeu_pd(p, mask.Data(), data); } + + template + void SIMD_function (const Function & func, std::true_type) + { + data = (__m512){ func(7), func(6), func(5), func(4), + func(3), func(2), func(1), func(0) }; + } + + // not a function + void SIMD_function (double const * p, std::false_type) + { + data = _mm512_loadu_pd(p); + } + + void SIMD_function (double val, std::false_type) + { + data = _mm512_set1_pd(val); + } + + void SIMD_function (__m512d _data, std::false_type) + { + data = _data; + } + + NETGEN_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; } + NETGEN_INLINE __m512d Data() const { return data; } + NETGEN_INLINE __m512d & Data() { return data; } + + }; + + NETGEN_INLINE SIMD operator- (SIMD a) { return -a.Data(); } + NETGEN_INLINE SIMD operator+ (SIMD a, SIMD b) { return _mm512_add_pd(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator- (SIMD a, SIMD b) { return _mm512_sub_pd(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator* (SIMD a, SIMD b) { return _mm512_mul_pd(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator/ (SIMD a, SIMD b) { return _mm512_div_pd(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator* (double a, SIMD b) { return _mm512_set1_pd(a)*b.Data(); } + NETGEN_INLINE SIMD operator* (SIMD b, double a) { return _mm512_set1_pd(a)*b.Data(); } + + NETGEN_INLINE SIMD sqrt (SIMD a) { return _mm512_sqrt_pd(a.Data()); } + NETGEN_INLINE SIMD floor (SIMD a) { return _mm512_floor_pd(a.Data()); } + NETGEN_INLINE SIMD ceil (SIMD a) { return _mm512_ceil_pd(a.Data()); } + NETGEN_INLINE SIMD fabs (SIMD a) { return _mm512_max_pd(a.Data(), -a.Data()); } + + NETGEN_INLINE SIMD operator<= (SIMD a , SIMD b) + { return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_LE_OQ); } + NETGEN_INLINE SIMD operator< (SIMD a , SIMD b) + { return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_LT_OQ); } + NETGEN_INLINE SIMD operator>= (SIMD a , SIMD b) + { return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_GE_OQ); } + NETGEN_INLINE SIMD operator> (SIMD a , SIMD b) + { return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_GT_OQ); } + NETGEN_INLINE SIMD operator== (SIMD a , SIMD b) + { return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_EQ_OQ); } + NETGEN_INLINE SIMD operator!= (SIMD a , SIMD b) + { return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_NEQ_OQ); } + + NETGEN_INLINE SIMD operator<= (SIMD a , SIMD b) + { return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_LE); } + NETGEN_INLINE SIMD operator< (SIMD a , SIMD b) + { return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_LT); } + NETGEN_INLINE SIMD operator>= (SIMD a , SIMD b) + { return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_NLT); } + NETGEN_INLINE SIMD operator> (SIMD a , SIMD b) + { return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_NLE); } + NETGEN_INLINE SIMD operator== (SIMD a , SIMD b) + { return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_EQ); } + NETGEN_INLINE SIMD operator!= (SIMD a , SIMD b) + { return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_NE); } + + NETGEN_INLINE SIMD operator&& (SIMD a, SIMD b) + { return (__mmask8)(a.Data() & b.Data()); } + NETGEN_INLINE SIMD operator|| (SIMD a, SIMD b) + { return (__mmask8)(a.Data() | b.Data()); } + NETGEN_INLINE SIMD operator! (SIMD a) + { return (__mmask8)(~a.Data()); } + + NETGEN_INLINE SIMD If (SIMD a, SIMD b, SIMD c) + { return _mm512_mask_blend_pd(a.Data(), c.Data(), b.Data()); } + + NETGEN_INLINE SIMD IfPos (SIMD a, SIMD b, SIMD c) + { + auto k = _mm512_cmp_pd_mask(a.Data(),_mm512_setzero_pd(), _CMP_GT_OS); + return _mm512_mask_blend_pd(k,c.Data(),b.Data()); + } + NETGEN_INLINE SIMD IfZero (SIMD a, SIMD b, SIMD c) + { + auto k = _mm512_cmp_pd_mask(a.Data(),_mm512_setzero_pd(), _CMP_EQ_OS); + return _mm512_mask_blend_pd(k,c.Data(),b.Data()); + } + + + NETGEN_INLINE auto Unpack (SIMD a, SIMD b) + { + return std::make_tuple(SIMD(_mm512_unpacklo_pd(a.Data(),b.Data())), + SIMD(_mm512_unpackhi_pd(a.Data(),b.Data()))); + } + + + NETGEN_INLINE double HSum (SIMD sd) + { + SIMD low = _mm512_extractf64x4_pd(sd.Data(),0); + SIMD high = _mm512_extractf64x4_pd(sd.Data(),1); + return HSum(low)+HSum(high); + } + + NETGEN_INLINE auto HSum (SIMD sd1, SIMD sd2) + { + return SIMD(HSum(sd1), HSum(sd2)); + } + + NETGEN_INLINE SIMD HSum (SIMD v1, SIMD v2, SIMD v3, SIMD v4) + { + SIMD lo,hi; + std::tie(lo,hi) = Unpack(v1, v2); + SIMD sum01 = lo+hi; + std::tie(lo,hi) = Unpack(v3, v4); + SIMD sum23 = lo+hi; + // sum01 b a b a b a b a + // sum23 d c d c d c d c + // __m512 perm = _mm512_permutex2var_pd (sum01.Data(), _mm512_set_epi64(1,2,3,4,5,6,7,8), sum23.Data()); + __m256d ab = _mm512_extractf64x4_pd(sum01.Data(),0) + _mm512_extractf64x4_pd(sum01.Data(),1); + __m256d cd = _mm512_extractf64x4_pd(sum23.Data(),0) + _mm512_extractf64x4_pd(sum23.Data(),1); + return _mm256_add_pd (_mm256_permute2f128_pd (ab, cd, 1+2*16), _mm256_blend_pd (ab, cd, 12)); + } + + NETGEN_INLINE SIMD FMA (SIMD a, SIMD b, SIMD c) + { + return _mm512_fmadd_pd (a.Data(), b.Data(), c.Data()); + } + NETGEN_INLINE SIMD FMA (const double & a, SIMD b, SIMD c) + { + return _mm512_fmadd_pd (_mm512_set1_pd(a), b.Data(), c.Data()); + } +} + +#endif // NETGEN_CORE_SIMD_AVX512_HPP diff --git a/libsrc/core/simd_generic.hpp b/libsrc/core/simd_generic.hpp new file mode 100644 index 00000000..8ebd399a --- /dev/null +++ b/libsrc/core/simd_generic.hpp @@ -0,0 +1,657 @@ +#ifndef NETGEN_CORE_SIMD_GENERIC_HPP +#define NETGEN_CORE_SIMD_GENERIC_HPP + +/**************************************************************************/ +/* File: simd_base.hpp */ +/* Author: Joachim Schoeberl, Matthias Hochsteger */ +/* Date: 25. Mar. 16 */ +/**************************************************************************/ + +#include +#include +#include + +#include "array.hpp" + +namespace ngcore +{ + + constexpr int GetDefaultSIMDSize() { +#if defined __AVX512F__ + return 8; +#elif defined __AVX__ + return 4; +#elif (defined(_M_AMD64) || defined(_M_X64) || defined(__SSE__)) + return 2; +#else + return 1; +#endif + } + + + template class SIMD; + + class mask64; + + //////////////////////////////////////////////////////////////////////////// + namespace detail { + template + auto array_range_impl(std::array const& arr, + size_t first, + std::index_sequence) + -> std::array { + return {arr[first + I]...}; + } + + template + auto array_range(std::array const& arr, size_t first) { + return array_range_impl(arr, first, std::make_index_sequence{}); + } + + } // namespace detail + + //////////////////////////////////////////////////////////////////////////// + // mask + + template <> + class SIMD + { + int64_t mask; + public: + SIMD (size_t i) + : mask(i > 0 ? -1 : 0) { ; } + bool Data() const { return mask; } + static constexpr int Size() { return 1; } + auto operator[] (int /* i */) const { return mask; } + }; + + + template + class SIMD + { + static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2); + static constexpr int N2 = N-N1; + + SIMD lo; + SIMD hi; + public: + + SIMD (int i) : lo(i), hi(i-N1) { ; } + SIMD (SIMD lo_, SIMD hi_) : lo(lo_), hi(hi_) { ; } + SIMD Lo() const { return lo; } + SIMD Hi() const { return hi; } + static constexpr int Size() { return N; } + }; + + template + NETGEN_INLINE SIMD operator&& (SIMD a, SIMD b) + { + if constexpr(N==1) return a.Data() && b.Data(); + else return { a.Lo() && b.Lo(), a.Hi() && b.Hi() }; + } + + + //////////////////////////////////////////////////////////////////////////// + // int64 + + template<> + class SIMD + { + int64_t data; + + public: + static constexpr int Size() { return 1; } + SIMD () {} + SIMD (const SIMD &) = default; + SIMD & operator= (const SIMD &) = default; + SIMD (int64_t val) { data = val; } + SIMD (std::array arr) + : data{arr[0]} + {} + + int64_t operator[] (int i) const { return ((int64_t*)(&data))[i]; } + auto Data() const { return data; } + static SIMD FirstInt(int64_t n0=0) { return {n0}; } + template + int64_t Get() + { + static_assert(I==0); + return data; + } + }; + + template + class SIMD + { + static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2); + static constexpr int N2 = N-N1; + + SIMD lo; + SIMD high; + + public: + static constexpr int Size() { return N; } + + SIMD () {} + SIMD (const SIMD &) = default; + SIMD & operator= (const SIMD &) = default; + + SIMD (int64_t val) : lo{val}, high{val} { ; } + SIMD (SIMD lo_, SIMD high_) : lo(lo_), high(high_) { ; } + + SIMD( std::array arr ) + : lo(detail::array_range(arr, 0)), + high(detail::array_range(arr, N1)) + {} + + template + SIMD(const T... vals) + : lo(detail::array_range(std::array{vals...}, 0)), + high(detail::array_range(std::array{vals...}, N1)) + { + static_assert(sizeof...(vals)==N, "wrong number of arguments"); + } + + + template>::value, int>::type = 0> + SIMD (const T & func) + { + for(auto i : IntRange(N1)) + lo[i] = func(i); + for(auto i : IntRange(N2)) + high[i] = func(N1+i); + } + + auto Lo() const { return lo; } + auto Hi() const { return high; } + + int64_t operator[] (int i) const { return ((int64_t*)(&lo))[i]; } + + /* + operator tuple () + { return tuple((*this)[0], (*this)[1], (*this)[2], (*this)[3]); } + */ + + /* + static SIMD FirstInt() { return { 0, 1, 2, 3 }; } + */ + static SIMD FirstInt(int64_t n0=0) { return {SIMD::FirstInt(n0), SIMD::FirstInt(n0+N1)}; } + template + int64_t Get() + { + static_assert(I>=0 && I(); + else return high.template Get(); + } + }; + + + //////////////////////////////////////////////////////////////////////////// + // double + + template<> + class SIMD + { + double data; + + public: + static constexpr int Size() { return 1; } + SIMD () {} + SIMD (const SIMD &) = default; + SIMD & operator= (const SIMD &) = default; + SIMD (double val) { data = val; } + SIMD (int val) { data = val; } + SIMD (size_t val) { data = val; } + SIMD (double const * p) { data = *p; } + SIMD (double const * p, SIMD mask) { data = mask.Data() ? *p : 0.0; } + SIMD (std::array arr) + : data{arr[0]} + {} + + template >::value,int>::type = 0> + SIMD (const T & func) + { + data = func(0); + } + + template >::value,int>::type = 0> + SIMD & operator= (const T & func) + { + data = func(0); + return *this; + } + + void Store (double * p) { *p = data; } + void Store (double * p, SIMD mask) { if (mask.Data()) *p = data; } + + double operator[] (int i) const { return ((double*)(&data))[i]; } + double Data() const { return data; } + template + double Get() + { + static_assert(I==0); + return data; + } + }; + + + template + class SIMD + { + static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2); + static constexpr int N2 = N-N1; + + SIMD lo; + SIMD high; + + public: + static constexpr int Size() { return N; } + SIMD () {} + SIMD (const SIMD &) = default; + SIMD (SIMD lo_, SIMD hi_) : lo(lo_), high(hi_) { ; } + + template >::value,int>::type = 0> + SIMD (const T & func) + { + for(auto i : IntRange(N1)) + lo[i] = func(i); + for(auto i : IntRange(N2)) + high[i] = func(N1+i); + } + + template >::value,int>::type = 0> + SIMD & operator= (const T & func) + { + for(auto i : IntRange(N1)) + lo[i] = func(i); + for(auto i : IntRange(N2)) + high[i] = func(N1+i); + return *this; + } + + + SIMD & operator= (const SIMD &) = default; + + SIMD (double val) : lo{val}, high{val} { ; } + SIMD (int val) : lo{val}, high{val} { ; } + SIMD (size_t val) : lo{val}, high{val} { ; } + + SIMD (double const * p) : lo{p}, high{p+N1} { ; } + SIMD (double const * p, SIMD mask) + : lo{p, mask.Lo()}, high{p+N1, mask.Hi()} + { } + SIMD (double * p) : lo{p}, high{p+N1} { ; } + SIMD (double * p, SIMD mask) + : lo{p, mask.Lo()}, high{p+N1, mask.Hi()} + { } + + SIMD( std::array arr ) + : lo(detail::array_range(arr, 0)), + high(detail::array_range(arr, N1)) + {} + + template + SIMD(const T... vals) + : lo(detail::array_range(std::array{vals...}, 0)), + high(detail::array_range(std::array{vals...}, N1)) + { + static_assert(sizeof...(vals)==N, "wrong number of arguments"); + } + + void Store (double * p) { lo.Store(p); high.Store(p+N1); } + void Store (double * p, SIMD mask) + { + lo.Store(p, mask.Lo()); + high.Store(p+N1, mask.Hi()); + } + + auto Lo() const { return lo; } + auto Hi() const { return high; } + + double operator[] (int i) const { return ((double*)(&lo))[i]; } + + template> + operator std::tuple () + { return std::tuple((*this)[0], (*this)[1]); } + + template> + operator std::tuple () + { return std::tuple((*this)[0], (*this)[1], (*this)[2], (*this)[3]); } + + template + double Get() + { + static_assert(I>=0 && I(); + else return high.template Get(); + } + }; + + + // Generic operators for any arithmetic type/simd width + template + NETGEN_INLINE SIMD operator+ (SIMD a, SIMD b) { + if constexpr(N==1) return a.Data()+b.Data(); + else return { a.Lo()+b.Lo(), a.Hi()+b.Hi() }; + } + + template + NETGEN_INLINE SIMD operator- (SIMD a, SIMD b) { + if constexpr(N==1) return a.Data()-b.Data(); + else return { a.Lo()-b.Lo(), a.Hi()-b.Hi() }; + } + template + NETGEN_INLINE SIMD operator- (SIMD a) { + if constexpr(N==1) return -a.Data(); + else return { -a.Lo(), -a.Hi() }; + } + + template + NETGEN_INLINE SIMD operator* (SIMD a, SIMD b) { + if constexpr(N==1) return a.Data()*b.Data(); + else return { a.Lo()*b.Lo(), a.Hi()*b.Hi() }; + } + + template + NETGEN_INLINE SIMD operator/ (SIMD a, SIMD b) { + if constexpr(N==1) return a.Data()/b.Data(); + else return { a.Lo()/b.Lo(), a.Hi()/b.Hi() }; + } + + template + NETGEN_INLINE SIMD operator< (SIMD & a, SIMD b) + { + if constexpr(N==1) return a.Data() < b.Data(); + else return { a.Lo() + NETGEN_INLINE SIMD operator<= (SIMD & a, SIMD b) + { + if constexpr(N==1) return a.Data() <= b.Data(); + else return { a.Lo()<=b.Lo(), a.Hi()<=b.Hi() }; + } + + template + NETGEN_INLINE SIMD operator> (SIMD & a, SIMD b) + { + if constexpr(N==1) return a.Data() > b.Data(); + else return { a.Lo()>b.Lo(), a.Hi()>b.Hi() }; + } + + template + NETGEN_INLINE SIMD operator>= (SIMD & a, SIMD b) + { + if constexpr(N==1) return a.Data() >= b.Data(); + else return { a.Lo()>=b.Lo(), a.Hi()>=b.Hi() }; + } + + template + NETGEN_INLINE SIMD operator== (SIMD & a, SIMD b) + { + if constexpr(N==1) return a.Data() == b.Data(); + else return { a.Lo()==b.Lo(), a.Hi()==b.Hi() }; + } + + template + NETGEN_INLINE SIMD operator!= (SIMD & a, SIMD b) + { + if constexpr(N==1) return a.Data() != b.Data(); + else return { a.Lo()!=b.Lo(), a.Hi()!=b.Hi() }; + } + + // int64_t operators with scalar operand (implement overloads to allow implicit casts for second operand) + template + NETGEN_INLINE SIMD operator+ (SIMD a, int64_t b) { return a+SIMD(b); } + template + NETGEN_INLINE SIMD operator+ (int64_t a, SIMD b) { return SIMD(a)+b; } + template + NETGEN_INLINE SIMD operator- (int64_t a, SIMD b) { return SIMD(a)-b; } + template + NETGEN_INLINE SIMD operator- (SIMD a, int64_t b) { return a-SIMD(b); } + template + NETGEN_INLINE SIMD operator* (int64_t a, SIMD b) { return SIMD(a)*b; } + template + NETGEN_INLINE SIMD operator* (SIMD b, int64_t a) { return SIMD(a)*b; } + template + NETGEN_INLINE SIMD operator/ (SIMD a, int64_t b) { return a/SIMD(b); } + template + NETGEN_INLINE SIMD operator/ (int64_t a, SIMD b) { return SIMD(a)/b; } + template + NETGEN_INLINE SIMD & operator+= (SIMD & a, SIMD b) { a=a+b; return a; } + template + NETGEN_INLINE SIMD & operator+= (SIMD & a, int64_t b) { a+=SIMD(b); return a; } + template + NETGEN_INLINE SIMD & operator-= (SIMD & a, SIMD b) { a = a-b; return a; } + template + NETGEN_INLINE SIMD & operator-= (SIMD & a, int64_t b) { a-=SIMD(b); return a; } + template + NETGEN_INLINE SIMD & operator*= (SIMD & a, SIMD b) { a=a*b; return a; } + template + NETGEN_INLINE SIMD & operator*= (SIMD & a, int64_t b) { a*=SIMD(b); return a; } + template + NETGEN_INLINE SIMD & operator/= (SIMD & a, SIMD b) { a = a/b; return a; } + + // double operators with scalar operand (implement overloads to allow implicit casts for second operand) + template + NETGEN_INLINE SIMD operator+ (SIMD a, double b) { return a+SIMD(b); } + template + NETGEN_INLINE SIMD operator+ (double a, SIMD b) { return SIMD(a)+b; } + template + NETGEN_INLINE SIMD operator- (double a, SIMD b) { return SIMD(a)-b; } + template + NETGEN_INLINE SIMD operator- (SIMD a, double b) { return a-SIMD(b); } + template + NETGEN_INLINE SIMD operator* (double a, SIMD b) { return SIMD(a)*b; } + template + NETGEN_INLINE SIMD operator* (SIMD b, double a) { return SIMD(a)*b; } + template + NETGEN_INLINE SIMD operator/ (SIMD a, double b) { return a/SIMD(b); } + template + NETGEN_INLINE SIMD operator/ (double a, SIMD b) { return SIMD(a)/b; } + template + NETGEN_INLINE SIMD & operator+= (SIMD & a, SIMD b) { a=a+b; return a; } + template + NETGEN_INLINE SIMD & operator+= (SIMD & a, double b) { a+=SIMD(b); return a; } + template + NETGEN_INLINE SIMD & operator-= (SIMD & a, SIMD b) { a = a-b; return a; } + template + NETGEN_INLINE SIMD & operator-= (SIMD & a, double b) { a-=SIMD(b); return a; } + template + NETGEN_INLINE SIMD & operator*= (SIMD & a, SIMD b) { a=a*b; return a; } + template + NETGEN_INLINE SIMD & operator*= (SIMD & a, double b) { a*=SIMD(b); return a; } + template + NETGEN_INLINE SIMD & operator/= (SIMD & a, SIMD b) { a = a/b; return a; } + + // double functions + + template + NETGEN_INLINE SIMD L2Norm2 (SIMD a) { return a*a; } + template + NETGEN_INLINE SIMD Trans (SIMD a) { return a; } + + template + NETGEN_INLINE double HSum (SIMD a) + { + if constexpr(N==1) + return a.Data(); + else + return HSum(a.Lo()) + HSum(a.Hi()); + } + + NETGEN_INLINE double IfPos (double a, double b, double c) { return a>0 ? b : c; } + NETGEN_INLINE double IfZero (double a, double b, double c) { return a==0. ? b : c; } + + template + NETGEN_INLINE SIMD IfPos (SIMD a, SIMD b, SIMD c) + { + if constexpr(N==1) return a.Data()>0.0 ? b : c; + else return { IfPos(a.Lo(), b.Lo(), c.Lo()), IfPos(a.Hi(), b.Hi(), c.Hi())}; + + } + + template + NETGEN_INLINE SIMD IfZero (SIMD a, SIMD b, SIMD c) + { + if constexpr(N==1) return a.Data()==0.0 ? b : c; + else return { IfZero(a.Lo(), b.Lo(), c.Lo()), IfZero(a.Hi(), b.Hi(), c.Hi())}; + + } + + template + NETGEN_INLINE SIMD If (SIMD a, SIMD b, SIMD c) + { + if constexpr(N==1) return a.Data() ? b : c; + else return { If(a.Lo(), b.Lo(), c.Lo()), If(a.Hi(), b.Hi(), c.Hi())}; + + } + + template + // a*b+c + NETGEN_INLINE auto FMA(T1 a, T2 b, T3 c) + { + return a*b+c; + } + + // update form of fma + template + void FMAasm (SIMD a, SIMD b, SIMD & sum) + { + sum = FMA(a,b,sum); + } + + template + T get(SIMD a) { return a[i]; } + + template + NETGEN_INLINE void Iterate2 (FUNC f) + { + if constexpr (NUM > 1) Iterate2 (f); + if constexpr (NUM >= 1) f(std::integral_constant()); + } + + + template + ostream & operator<< (ostream & ost, SIMD simd) + { + /* + ost << simd[0]; + for (int i = 1; i < simd.Size(); i++) + ost << " " << simd[i]; + */ + Iterate2 ([&] (auto I) { + if (I.value != 0) ost << " "; + ost << get(simd); + }); + return ost; + } + + using std::exp; + template + NETGEN_INLINE ngcore::SIMD exp (ngcore::SIMD a) { + return ngcore::SIMD([a](int i)->double { return exp(a[i]); } ); + } + + using std::log; + template + NETGEN_INLINE ngcore::SIMD log (ngcore::SIMD a) { + return ngcore::SIMD([a](int i)->double { return log(a[i]); } ); + } + + using std::pow; + template + NETGEN_INLINE ngcore::SIMD pow (ngcore::SIMD a, double x) { + return ngcore::SIMD([a,x](int i)->double { return pow(a[i],x); } ); + } + + template + NETGEN_INLINE ngcore::SIMD pow (ngcore::SIMD a, ngcore::SIMD b) { + return ngcore::SIMD([a,b](int i)->double { return pow(a[i],b[i]); } ); + } + + using std::sin; + template + NETGEN_INLINE ngcore::SIMD sin (ngcore::SIMD a) { + return ngcore::SIMD([a](int i)->double { return sin(a[i]); } ); + } + + using std::cos; + template + NETGEN_INLINE ngcore::SIMD cos (ngcore::SIMD a) { + return ngcore::SIMD([a](int i)->double { return cos(a[i]); } ); + } + + using std::tan; + template + NETGEN_INLINE ngcore::SIMD tan (ngcore::SIMD a) { + return ngcore::SIMD([a](int i)->double { return tan(a[i]); } ); + } + + using std::atan; + template + NETGEN_INLINE ngcore::SIMD atan (ngcore::SIMD a) { + return ngcore::SIMD([a](int i)->double { return atan(a[i]); } ); + } + + using std::atan2; + template + NETGEN_INLINE ngcore::SIMD atan2 (ngcore::SIMD y, ngcore::SIMD x) { + return ngcore::SIMD([y,x](int i)->double { return atan2(y[i], x[i]); } ); + } + + using std::acos; + template + NETGEN_INLINE ngcore::SIMD acos (ngcore::SIMD a) { + return ngcore::SIMD([a](int i)->double { return acos(a[i]); } ); + } + + using std::asin; + template + NETGEN_INLINE ngcore::SIMD asin (ngcore::SIMD a) { + return ngcore::SIMD([a](int i)->double { return asin(a[i]); } ); + } + + using std::sinh; + template + NETGEN_INLINE ngcore::SIMD sinh (ngcore::SIMD a) { + return ngcore::SIMD([a](int i)->double { return sinh(a[i]); } ); + } + + using std::cosh; + template + NETGEN_INLINE ngcore::SIMD cosh (ngcore::SIMD a) { + return ngcore::SIMD([a](int i)->double { return cosh(a[i]); } ); + } + + template + using MultiSIMD = SIMD; + + template + NETGEN_INLINE auto Unpack (SIMD a, SIMD b) + { + if constexpr(N==1) + { + return std::make_tuple(SIMD{a.Data()}, SIMD{b.Data()} ); + } + else + { + auto [a1,b1] = Unpack(a.Lo(), b.Lo()); + auto [a2,b2] = Unpack(a.Hi(), b.Hi()); + return std::make_tuple(SIMD{ a1, a2 }, + SIMD{ b1, b2 }); + } + } + +} + + +namespace std +{ + // structured binding support + template + struct tuple_size> : std::integral_constant {}; + template struct tuple_element> { using type = T; }; +} + +#endif // NETGEN_CORE_SIMD_GENERIC_HPP diff --git a/libsrc/core/simd_sse.hpp b/libsrc/core/simd_sse.hpp new file mode 100644 index 00000000..b6f9c61e --- /dev/null +++ b/libsrc/core/simd_sse.hpp @@ -0,0 +1,266 @@ +#ifndef NETGEN_CORE_SIMD_SSE_HPP +#define NETGEN_CORE_SIMD_SSE_HPP + +/**************************************************************************/ +/* File: simd_sse.hpp */ +/* Author: Joachim Schoeberl, Matthias Hochsteger */ +/* Date: 25. Mar. 16 */ +/**************************************************************************/ + +#include + +namespace ngcore +{ + + template <> + class SIMD + { + __m128i mask; + public: + SIMD (int i) + : mask(_mm_cmpgt_epi32(_mm_set1_epi32(i), + _mm_set_epi32(1, 1, 0, 0))) + { ; } + SIMD (__m128i _mask) : mask(_mask) { ; } + __m128i Data() const { return mask; } + static constexpr int Size() { return 2; } + static NETGEN_INLINE SIMD GetMaskFromBits (unsigned int i); + }; + + static SIMD masks_from_2bits[4] = { + _mm_set_epi32 (0,0,0,0), _mm_set_epi32 (0,0,-1,0), + _mm_set_epi32 (-1,0,0,0), _mm_set_epi32 (-1,0,-1,0), + }; + + NETGEN_INLINE SIMD SIMD :: GetMaskFromBits (unsigned int i) + { + return masks_from_2bits[i & 3]; + } + + + template<> + class alignas(16) SIMD + { + __m128i data; + + public: + static constexpr int Size() { return 2; } + SIMD () {} + SIMD (const SIMD &) = default; + SIMD (int64_t v0, int64_t v1) { data = _mm_set_epi64x(v1,v0); } + SIMD (std::array arr) + : data{_mm_set_epi64x(arr[1],arr[0])} + {} + + SIMD & operator= (const SIMD &) = default; + + SIMD (int64_t val) { data = _mm_set1_epi64x(val); } + SIMD (__m128i _data) { data = _data; } + + template>::value, int>::type = 0> + SIMD (const T & func) + { + data = _mm_set_epi64(func(1), func(0)); + } + + NETGEN_INLINE auto operator[] (int i) const { return ((int64_t*)(&data))[i]; } + NETGEN_INLINE __m128i Data() const { return data; } + NETGEN_INLINE __m128i & Data() { return data; } + static SIMD FirstInt(int n0=0) { return { n0, n0+1 }; } + }; + + + +NETGEN_INLINE SIMD operator-(SIMD a) { return _mm_sub_epi64(_mm_setzero_si128(), a.Data()); } +NETGEN_INLINE SIMD operator+ (SIMD a, SIMD b) { return _mm_add_epi64(a.Data(),b.Data()); } +NETGEN_INLINE SIMD operator- (SIMD a, SIMD b) { return _mm_sub_epi64(a.Data(),b.Data()); } + + + template<> + class alignas(16) SIMD + { + __m128d data; + + public: + static constexpr int Size() { return 2; } + SIMD () {} + SIMD (const SIMD &) = default; + SIMD (double v0, double v1) { data = _mm_set_pd(v1,v0); } + SIMD (std::array arr) + : data{_mm_set_pd(arr[1], arr[0])} + {} + + SIMD & operator= (const SIMD &) = default; + + SIMD (double val) { data = _mm_set1_pd(val); } + SIMD (int val) { data = _mm_set1_pd(val); } + SIMD (size_t val) { data = _mm_set1_pd(val); } + + SIMD (double const * p) { data = _mm_loadu_pd(p); } + SIMD (double const * p, SIMD mask) + { +#ifdef __AVX__ + data = _mm_maskload_pd(p, mask.Data()); +#else + // this versions segfaults if p points to the last allowed element + // happened on Mac with the new SparseCholesky-factorization + // data = _mm_and_pd(_mm_castsi128_pd(mask.Data()), _mm_loadu_pd(p)); + auto pmask = (int64_t*)&mask; + data = _mm_set_pd (pmask[1] ? p[1] : 0.0, pmask[0] ? p[0] : 0.0); +#endif + } + SIMD (__m128d _data) { data = _data; } + + void Store (double * p) { _mm_storeu_pd(p, data); } + void Store (double * p, SIMD mask) + { +#ifdef __AVX__ + _mm_maskstore_pd(p, mask.Data(), data); +#else + /* + _mm_storeu_pd (p, _mm_or_pd (_mm_and_pd(_mm_castsi128_pd(mask.Data()), data), + _mm_andnot_pd(_mm_castsi128_pd(mask.Data()), _mm_loadu_pd(p)))); + */ + auto pmask = (int64_t*)&mask; + if (pmask[0]) p[0] = (*this)[0]; + if (pmask[1]) p[1] = (*this)[1]; +#endif + } + + template>::value, int>::type = 0> + SIMD (const T & func) + { + data = _mm_set_pd(func(1), func(0)); + } + + NETGEN_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; } + NETGEN_INLINE __m128d Data() const { return data; } + NETGEN_INLINE __m128d & Data() { return data; } + + operator std::tuple () + { + auto pdata = (double*)&data; + return std::tuple(pdata[0], pdata[1]); + } + }; + + NETGEN_INLINE SIMD operator- (SIMD a) { return _mm_xor_pd(a.Data(), _mm_set1_pd(-0.0)); } + NETGEN_INLINE SIMD operator+ (SIMD a, SIMD b) { return _mm_add_pd(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator- (SIMD a, SIMD b) { return _mm_sub_pd(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator* (SIMD a, SIMD b) { return _mm_mul_pd(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator/ (SIMD a, SIMD b) { return _mm_div_pd(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator* (double a, SIMD b) { return _mm_set1_pd(a)*b; } + NETGEN_INLINE SIMD operator* (SIMD b, double a) { return _mm_set1_pd(a)*b; } + + template<> + NETGEN_INLINE auto Unpack (SIMD a, SIMD b) + { + return std::make_tuple(SIMD(_mm_unpacklo_pd(a.Data(),b.Data())), + SIMD(_mm_unpackhi_pd(a.Data(),b.Data()))); + } + + NETGEN_INLINE __m128d my_mm_hadd_pd(__m128d a, __m128d b) { +#if defined(__SSE3__) || defined(__AVX__) + return _mm_hadd_pd(a,b); +#else + return _mm_add_pd( _mm_unpacklo_pd(a,b), _mm_unpackhi_pd(a,b) ); +#endif + } + +#ifndef __AVX__ + NETGEN_INLINE __m128i my_mm_cmpgt_epi64(__m128i a, __m128i b) { + auto res_lo = _mm_cvtsi128_si64(a) > _mm_cvtsi128_si64(b) ? -1:0; + auto res_hi = _mm_cvtsi128_si64(_mm_srli_si128(a,8)) > _mm_cvtsi128_si64(_mm_srli_si128(b,8)) ? -1 : 0; + return _mm_set_epi64x(res_hi,res_lo); + } +#else + NETGEN_INLINE __m128i my_mm_cmpgt_epi64(__m128i a, __m128i b) { + return _mm_cmpgt_epi64(a,b); + } +#endif + + + NETGEN_INLINE SIMD sqrt (SIMD a) { return _mm_sqrt_pd(a.Data()); } + NETGEN_INLINE SIMD fabs (SIMD a) { return _mm_max_pd(a.Data(), (-a).Data()); } + using std::floor; + NETGEN_INLINE SIMD floor (SIMD a) + { return ngcore::SIMD([&](int i)->double { return floor(a[i]); } ); } + using std::ceil; + NETGEN_INLINE SIMD ceil (SIMD a) + { return ngcore::SIMD([&](int i)->double { return ceil(a[i]); } ); } + + NETGEN_INLINE SIMD operator<= (SIMD a , SIMD b) + { return _mm_castpd_si128( _mm_cmple_pd(a.Data(),b.Data())); } + NETGEN_INLINE SIMD operator< (SIMD a , SIMD b) + { return _mm_castpd_si128( _mm_cmplt_pd(a.Data(),b.Data())); } + NETGEN_INLINE SIMD operator>= (SIMD a , SIMD b) + { return _mm_castpd_si128( _mm_cmpge_pd(a.Data(),b.Data())); } + NETGEN_INLINE SIMD operator> (SIMD a , SIMD b) + { return _mm_castpd_si128( _mm_cmpgt_pd(a.Data(),b.Data())); } + NETGEN_INLINE SIMD operator== (SIMD a , SIMD b) + { return _mm_castpd_si128( _mm_cmpeq_pd(a.Data(),b.Data())); } + NETGEN_INLINE SIMD operator!= (SIMD a , SIMD b) + { return _mm_castpd_si128( _mm_cmpneq_pd(a.Data(),b.Data())); } + + NETGEN_INLINE SIMD operator<= (SIMD a , SIMD b) + { return _mm_xor_si128(_mm_cmpgt_epi64(a.Data(),b.Data()),_mm_set1_epi32(-1)); } + NETGEN_INLINE SIMD operator< (SIMD a , SIMD b) + { return my_mm_cmpgt_epi64(b.Data(),a.Data()); } + NETGEN_INLINE SIMD operator>= (SIMD a , SIMD b) + { return _mm_xor_si128(_mm_cmpgt_epi64(b.Data(),a.Data()),_mm_set1_epi32(-1)); } + NETGEN_INLINE SIMD operator> (SIMD a , SIMD b) + { return my_mm_cmpgt_epi64(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator== (SIMD a , SIMD b) + { return _mm_cmpeq_epi64(a.Data(),b.Data()); } + NETGEN_INLINE SIMD operator!= (SIMD a , SIMD b) + { return _mm_xor_si128(_mm_cmpeq_epi64(a.Data(),b.Data()),_mm_set1_epi32(-1)); } + + + + NETGEN_INLINE SIMD operator&& (SIMD a, SIMD b) + { return _mm_castpd_si128(_mm_and_pd (_mm_castsi128_pd(a.Data()),_mm_castsi128_pd( b.Data()))); } + NETGEN_INLINE SIMD operator|| (SIMD a, SIMD b) + { return _mm_castpd_si128(_mm_or_pd (_mm_castsi128_pd(a.Data()), _mm_castsi128_pd(b.Data()))); } + NETGEN_INLINE SIMD operator! (SIMD a) + { return _mm_castpd_si128(_mm_xor_pd (_mm_castsi128_pd(a.Data()),_mm_castsi128_pd( _mm_cmpeq_epi64(a.Data(),a.Data())))); } +#ifdef __SSE4_1__ + NETGEN_INLINE SIMD If (SIMD a, SIMD b, SIMD c) + { return _mm_blendv_pd(c.Data(), b.Data(), _mm_castsi128_pd(a.Data())); } +#else + NETGEN_INLINE SIMD If (SIMD a, SIMD b, SIMD c) + { + return _mm_or_pd( + _mm_andnot_pd(_mm_castsi128_pd(a.Data()),c.Data()), + _mm_and_pd(b.Data(),_mm_castsi128_pd(a.Data())) + );} +#endif // __SSE4_1__ + + NETGEN_INLINE SIMD IfPos (SIMD a, SIMD b, SIMD c) + { return ngcore::SIMD([&](int i)->double { return a[i]>0 ? b[i] : c[i]; }); } + NETGEN_INLINE SIMD IfZero (SIMD a, SIMD b, SIMD c) + { return ngcore::SIMD([&](int i)->double { return a[i]==0. ? b[i] : c[i]; }); } + + + NETGEN_INLINE double HSum (SIMD sd) + { + return _mm_cvtsd_f64 (my_mm_hadd_pd (sd.Data(), sd.Data())); + } + + NETGEN_INLINE auto HSum (SIMD sd1, SIMD sd2) + { + __m128d hv2 = my_mm_hadd_pd(sd1.Data(), sd2.Data()); + return SIMD (hv2); + // return SIMD(_mm_cvtsd_f64 (hv2), _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3))); + } + + NETGEN_INLINE SIMD If(SIMD a, SIMD b, + SIMD c) { + return _mm_or_si128( + _mm_andnot_si128(a.Data(),c.Data()), + _mm_and_si128(b.Data(),a.Data()) + ); + } + +} + +#endif // NETGEN_CORE_SIMD_SSE_HPP diff --git a/libsrc/general/CMakeLists.txt b/libsrc/general/CMakeLists.txt index 50cd53ce..cf1f4c63 100644 --- a/libsrc/general/CMakeLists.txt +++ b/libsrc/general/CMakeLists.txt @@ -11,7 +11,7 @@ target_sources(gen INTERFACE install(FILES ngarray.hpp autodiff.hpp autoptr.hpp ngbitarray.hpp dynamicmem.hpp hashtabl.hpp mpi_interface.hpp myadt.hpp - ngsimd.hpp mystring.hpp netgenout.hpp ngpython.hpp + mystring.hpp netgenout.hpp ngpython.hpp optmem.hpp parthreads.hpp seti.hpp sort.hpp spbita2d.hpp stack.hpp table.hpp template.hpp gzstream.h diff --git a/libsrc/general/myadt.hpp b/libsrc/general/myadt.hpp index 123bd04c..ad6fd1c4 100644 --- a/libsrc/general/myadt.hpp +++ b/libsrc/general/myadt.hpp @@ -47,7 +47,5 @@ namespace netgen #include "netgenout.hpp" #include "gzstream.h" -#include "ngsimd.hpp" - #endif diff --git a/libsrc/general/ngsimd.hpp b/libsrc/general/ngsimd.hpp deleted file mode 100644 index 9170e402..00000000 --- a/libsrc/general/ngsimd.hpp +++ /dev/null @@ -1,672 +0,0 @@ -#ifndef FILE_NGSIMD -#define FILE_NGSIMD -/**************************************************************************/ -/* File: ngsimd.hpp */ -/* Author: Joachim Schoeberl */ -/* Date: 25. Mar. 16 */ -/**************************************************************************/ - -#include -#include -#include -#include -#include -#include - -#include - -#ifdef WIN32 -#ifndef AVX_OPERATORS_DEFINED -#define AVX_OPERATORS_DEFINED -NG_INLINE __m128d operator- (__m128d a) { return _mm_xor_pd(a, _mm_set1_pd(-0.0)); } -NG_INLINE __m128d operator+ (__m128d a, __m128d b) { return _mm_add_pd(a,b); } -NG_INLINE __m128d operator- (__m128d a, __m128d b) { return _mm_sub_pd(a,b); } -NG_INLINE __m128d operator* (__m128d a, __m128d b) { return _mm_mul_pd(a,b); } -NG_INLINE __m128d operator/ (__m128d a, __m128d b) { return _mm_div_pd(a,b); } -NG_INLINE __m128d operator* (double a, __m128d b) { return _mm_set1_pd(a)*b; } -NG_INLINE __m128d operator* (__m128d b, double a) { return _mm_set1_pd(a)*b; } - -NG_INLINE __m128d operator+= (__m128d &a, __m128d b) { return a = a+b; } -NG_INLINE __m128d operator-= (__m128d &a, __m128d b) { return a = a-b; } -NG_INLINE __m128d operator*= (__m128d &a, __m128d b) { return a = a*b; } -NG_INLINE __m128d operator/= (__m128d &a, __m128d b) { return a = a/b; } - -NG_INLINE __m256d operator- (__m256d a) { return _mm256_xor_pd(a, _mm256_set1_pd(-0.0)); } -NG_INLINE __m256d operator+ (__m256d a, __m256d b) { return _mm256_add_pd(a,b); } -NG_INLINE __m256d operator- (__m256d a, __m256d b) { return _mm256_sub_pd(a,b); } -NG_INLINE __m256d operator* (__m256d a, __m256d b) { return _mm256_mul_pd(a,b); } -NG_INLINE __m256d operator/ (__m256d a, __m256d b) { return _mm256_div_pd(a,b); } -NG_INLINE __m256d operator* (double a, __m256d b) { return _mm256_set1_pd(a)*b; } -NG_INLINE __m256d operator* (__m256d b, double a) { return _mm256_set1_pd(a)*b; } - -NG_INLINE __m256d operator+= (__m256d &a, __m256d b) { return a = a+b; } -NG_INLINE __m256d operator-= (__m256d &a, __m256d b) { return a = a-b; } -NG_INLINE __m256d operator*= (__m256d &a, __m256d b) { return a = a*b; } -NG_INLINE __m256d operator/= (__m256d &a, __m256d b) { return a = a/b; } -#endif -#endif - - - -namespace ngsimd -{ - // MSVC does not define SSE. It's always present on 64bit cpus -#if (defined(_M_AMD64) || defined(_M_X64) || defined(__AVX__)) -#ifndef __SSE__ -#define __SSE__ -#endif -#ifndef __SSE2__ -#define __SSE2__ -#endif -#endif - - - constexpr int GetDefaultSIMDSize() { -#if defined __AVX512F__ - return 8; -#elif defined __AVX__ - return 4; -#elif defined __SSE__ - return 2; -#else - return 1; -#endif - } - -#if defined __AVX512F__ - typedef __m512 tAVX; - typedef __m512d tAVXd; -#elif defined __AVX__ - typedef __m256 tAVX; - typedef __m256d tAVXd; -#elif defined __SSE__ - typedef __m128 tAVX; - typedef __m128d tAVXd; -#endif - - - - template class SIMD; - - template - struct has_call_operator - { - template static std::true_type check( decltype( sizeof(&C::operator() )) ) { return std::true_type(); } - template static std::false_type check(...) { return std::false_type(); } - typedef decltype( check(sizeof(char)) ) type; - static constexpr type value = type(); - }; - - template - // a*b+c - NG_INLINE auto FMA(T1 a, T2 b, T3 c) - { - return a*b+c; - } - - template::value, int>::type = 0> - NG_INLINE SIMD operator+ (T a, SIMD b) { return SIMD(a) + b; } - template::value, int>::type = 0> - NG_INLINE SIMD operator- (T a, SIMD b) { return SIMD(a) - b; } - template::value, int>::type = 0> - NG_INLINE SIMD operator* (T a, SIMD b) { return SIMD(a) * b; } - template::value, int>::type = 0> - NG_INLINE SIMD operator/ (T a, SIMD b) { return SIMD(a) / b; } - template::value, int>::type = 0> - NG_INLINE SIMD operator+ (SIMD a, T b) { return a + SIMD(b); } - template::value, int>::type = 0> - NG_INLINE SIMD operator- (SIMD a, T b) { return a - SIMD(b); } - template::value, int>::type = 0> - NG_INLINE SIMD operator* (SIMD a, T b) { return a * SIMD(b); } - template::value, int>::type = 0> - NG_INLINE SIMD operator/ (SIMD a, T b) { return a / SIMD(b); } - - -using std::sqrt; -using std::fabs; - - class ExceptionNOSIMD : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - std::string What() { return what(); } - }; - - using std::exp; - template NG_INLINE SIMD exp (SIMD a) - { - return SIMD([&](int i)->double { return exp(a[i]); } ); - } - - using std::log; - template NG_INLINE SIMD log (SIMD a) - { - return SIMD([&](int i)->double { return log(a[i]); } ); - } - - using std::pow; - template NG_INLINE SIMD pow (SIMD a, double x) - { - return SIMD([&](int i)->double { return pow(a[i],x); } ); - } - - using std::sin; - template NG_INLINE SIMD sin (SIMD a) - { - return SIMD([&](int i)->double { return sin(a[i]); } ); - } - - using std::cos; - template NG_INLINE SIMD cos (SIMD a) - { - return SIMD([&](int i)->double { return cos(a[i]); } ); - } - - using std::tan; - template NG_INLINE SIMD tan (SIMD a) - { - return SIMD([&](int i)->double { return tan(a[i]); } ); - } - - using std::atan; - template NG_INLINE SIMD atan (SIMD a) - { - return SIMD([&](int i)->double { return atan(a[i]); } ); - } - - -///////////////////////////////////////////////////////////////////////////// -// SIMD width 1 (in case no AVX support is available) -///////////////////////////////////////////////////////////////////////////// - template<> - class SIMD - { - double data; - - public: - static constexpr int Size() { return 1; } - SIMD () = default; - SIMD (const SIMD &) = default; - SIMD & operator= (const SIMD &) = default; - - // only called if T has a call operator of appropriate type - template>::value, int>::type = 0> - SIMD (const T & func) - { - data = func(0); - } - - // only called if T is arithmetic (integral or floating point types) - template::value, int>::type = 0> - SIMD (const T & val) - { - data = val; - } - - SIMD (double const * p) - { - data = *p; - } - - NG_INLINE operator double() const { return data; } - NG_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; } - NG_INLINE double Data() const { return data; } - NG_INLINE double & Data() { return data; } - - NG_INLINE SIMD &operator+= (SIMD b) { data+=b.Data(); return *this; } - NG_INLINE SIMD &operator-= (SIMD b) { data-=b.Data(); return *this; } - NG_INLINE SIMD &operator*= (SIMD b) { data*=b.Data(); return *this; } - NG_INLINE SIMD &operator/= (SIMD b) { data/=b.Data(); return *this; } - - }; - - NG_INLINE SIMD operator+ (SIMD a, SIMD b) { return a.Data()+b.Data(); } - NG_INLINE SIMD operator- (SIMD a, SIMD b) { return a.Data()-b.Data(); } - NG_INLINE SIMD operator- (SIMD a) { return -a.Data(); } - NG_INLINE SIMD operator* (SIMD a, SIMD b) { return a.Data()*b.Data(); } - NG_INLINE SIMD operator/ (SIMD a, SIMD b) { return a.Data()/b.Data(); } - - NG_INLINE SIMD sqrt (SIMD a) { return std::sqrt(a.Data()); } - NG_INLINE SIMD floor (SIMD a) { return std::floor(a.Data()); } - NG_INLINE SIMD ceil (SIMD a) { return std::ceil(a.Data()); } - NG_INLINE SIMD fabs (SIMD a) { return std::fabs(a.Data()); } - NG_INLINE SIMD L2Norm2 (SIMD a) { return a.Data()*a.Data(); } - NG_INLINE SIMD Trans (SIMD a) { return a; } - NG_INLINE SIMD IfPos (SIMD a, SIMD b, SIMD c) - { - return (a.Data() > 0) ? b : c; - } - - NG_INLINE double HSum (SIMD sd) - { - return sd.Data(); - } - - NG_INLINE auto HSum (SIMD sd1, SIMD sd2) - { - return std::make_tuple(sd1.Data(), sd2.Data()); - } - - NG_INLINE auto HSum (SIMD sd1, SIMD sd2, SIMD sd3, SIMD sd4) - { - return std::make_tuple(sd1.Data(), sd2.Data(), sd3.Data(), sd4.Data()); - } - - -///////////////////////////////////////////////////////////////////////////// -// SSE - Simd width 2 -///////////////////////////////////////////////////////////////////////////// -#ifdef __SSE__ - template<> - class alignas(16) SIMD - { - __m128d data; - - public: - static constexpr int Size() { return 2; } - SIMD () = default; - SIMD (const SIMD &) = default; - SIMD & operator= (const SIMD &) = default; - - SIMD (__m128d adata) - : data(adata) - { ; } - - // only called if T has a call operator of appropriate type - template>::value, int>::type = 0> - SIMD (const T & func) - { - data = _mm_set_pd(func(1), func(0)); - } - - // only called if T is arithmetic (integral or floating point types) - template::value, int>::type = 0> - SIMD (const T & val) - { - data = _mm_set1_pd(val); - } - - SIMD (double const * p) - { - data = _mm_loadu_pd(p); - } - - NG_INLINE operator __m128d() const { return data; } - NG_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; } - NG_INLINE double& operator[] (int i) { return ((double*)(&data))[i]; } - NG_INLINE __m128d Data() const { return data; } - NG_INLINE __m128d & Data() { return data; } - - // NG_INLINE operator std::tuple () - // { return std::tuple((*this)[0], (*this)[1], (*this)[2], (*this)[3]); } - - - NG_INLINE SIMD &operator+= (SIMD b) { data+=b.Data(); return *this; } - NG_INLINE SIMD &operator-= (SIMD b) { data-=b.Data(); return *this; } - NG_INLINE SIMD &operator*= (SIMD b) { data*=b.Data(); return *this; } - NG_INLINE SIMD &operator/= (SIMD b) { data/=b.Data(); return *this; } - - }; - - NG_INLINE SIMD operator+ (SIMD a, SIMD b) { return a.Data()+b.Data(); } - NG_INLINE SIMD operator- (SIMD a, SIMD b) { return a.Data()-b.Data(); } - NG_INLINE SIMD operator- (SIMD a) { return -a.Data(); } - NG_INLINE SIMD operator* (SIMD a, SIMD b) { return a.Data()*b.Data(); } - NG_INLINE SIMD operator/ (SIMD a, SIMD b) { return a.Data()/b.Data(); } - - /* - NG_INLINE SIMD sqrt (SIMD a) { return _mm256_sqrt_pd(a.Data()); } - NG_INLINE SIMD floor (SIMD a) { return _mm256_floor_pd(a.Data()); } - NG_INLINE SIMD ceil (SIMD a) { return _mm256_ceil_pd(a.Data()); } - NG_INLINE SIMD fabs (SIMD a) { return _mm256_max_pd(a.Data(), -a.Data()); } - NG_INLINE SIMD L2Norm2 (SIMD a) { return a.Data()*a.Data(); } - NG_INLINE SIMD Trans (SIMD a) { return a; } - NG_INLINE SIMD IfPos (SIMD a, SIMD b, SIMD c) - { - auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_GT_OS); - return _mm256_blendv_pd(c.Data(), b.Data(), cp); - } - - NG_INLINE double HSum (SIMD sd) - { - __m128d hv = _mm_add_pd (_mm256_extractf128_pd(sd.Data(),0), _mm256_extractf128_pd(sd.Data(),1)); - return _mm_cvtsd_f64 (_mm_hadd_pd (hv, hv)); - } - - NG_INLINE auto HSum (SIMD sd1, SIMD sd2) - { - __m256d hv = _mm256_hadd_pd(sd1.Data(), sd2.Data()); - __m128d hv2 = _mm_add_pd (_mm256_extractf128_pd(hv,0), _mm256_extractf128_pd(hv,1)); - return std::make_tuple(_mm_cvtsd_f64 (hv2), _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3))); - } - - NG_INLINE auto HSum (SIMD v1, SIMD v2, SIMD v3, SIMD v4) - { - __m256d hsum1 = _mm256_hadd_pd (v1.Data(), v2.Data()); - __m256d hsum2 = _mm256_hadd_pd (v3.Data(), v4.Data()); - __m256d hsum = _mm256_add_pd (_mm256_permute2f128_pd (hsum1, hsum2, 1+2*16), - _mm256_blend_pd (hsum1, hsum2, 12)); - return SIMD(hsum); - } - */ -#endif // __SSE__ - - - -///////////////////////////////////////////////////////////////////////////// -// AVX - Simd width 4 -///////////////////////////////////////////////////////////////////////////// -#ifdef __AVX__ - template<> - class alignas(32) SIMD - { - __m256d data; - - public: - static constexpr int Size() { return 4; } - SIMD () = default; - SIMD (const SIMD &) = default; - SIMD & operator= (const SIMD &) = default; - - SIMD (__m256d adata) - : data(adata) - { ; } - - // only called if T has a call operator of appropriate type - template>::value, int>::type = 0> - SIMD (const T & func) - { - data = _mm256_set_pd(func(3), func(2), func(1), func(0)); - } - - // only called if T is arithmetic (integral or floating point types) - template::value, int>::type = 0> - SIMD (const T & val) - { - data = _mm256_set1_pd(val); - } - - SIMD (double const * p) - { - data = _mm256_loadu_pd(p); - } - - NG_INLINE operator __m256d() const { return data; } - NG_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; } - NG_INLINE double& operator[] (int i) { return ((double*)(&data))[i]; } - NG_INLINE __m256d Data() const { return data; } - NG_INLINE __m256d & Data() { return data; } - - NG_INLINE operator std::tuple () - { return std::tuple((*this)[0], (*this)[1], (*this)[2], (*this)[3]); } - - - NG_INLINE SIMD &operator+= (SIMD b) { data+=b.Data(); return *this; } - NG_INLINE SIMD &operator-= (SIMD b) { data-=b.Data(); return *this; } - NG_INLINE SIMD &operator*= (SIMD b) { data*=b.Data(); return *this; } - NG_INLINE SIMD &operator/= (SIMD b) { data/=b.Data(); return *this; } - - }; - - NG_INLINE SIMD operator+ (SIMD a, SIMD b) { return a.Data()+b.Data(); } - NG_INLINE SIMD operator- (SIMD a, SIMD b) { return a.Data()-b.Data(); } - NG_INLINE SIMD operator- (SIMD a) { return -a.Data(); } - NG_INLINE SIMD operator* (SIMD a, SIMD b) { return a.Data()*b.Data(); } - NG_INLINE SIMD operator/ (SIMD a, SIMD b) { return a.Data()/b.Data(); } - - NG_INLINE SIMD sqrt (SIMD a) { return _mm256_sqrt_pd(a.Data()); } - NG_INLINE SIMD floor (SIMD a) { return _mm256_floor_pd(a.Data()); } - NG_INLINE SIMD ceil (SIMD a) { return _mm256_ceil_pd(a.Data()); } - NG_INLINE SIMD fabs (SIMD a) { return _mm256_max_pd(a.Data(), -a.Data()); } - NG_INLINE SIMD L2Norm2 (SIMD a) { return a.Data()*a.Data(); } - NG_INLINE SIMD Trans (SIMD a) { return a; } - NG_INLINE SIMD IfPos (SIMD a, SIMD b, SIMD c) - { - auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_GT_OS); - return _mm256_blendv_pd(c.Data(), b.Data(), cp); - } - - NG_INLINE double HSum (SIMD sd) - { - __m128d hv = _mm_add_pd (_mm256_extractf128_pd(sd.Data(),0), _mm256_extractf128_pd(sd.Data(),1)); - return _mm_cvtsd_f64 (_mm_hadd_pd (hv, hv)); - } - - NG_INLINE auto HSum (SIMD sd1, SIMD sd2) - { - __m256d hv = _mm256_hadd_pd(sd1.Data(), sd2.Data()); - __m128d hv2 = _mm_add_pd (_mm256_extractf128_pd(hv,0), _mm256_extractf128_pd(hv,1)); - return std::make_tuple(_mm_cvtsd_f64 (hv2), _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3))); - } - - NG_INLINE auto HSum (SIMD v1, SIMD v2, SIMD v3, SIMD v4) - { - __m256d hsum1 = _mm256_hadd_pd (v1.Data(), v2.Data()); - __m256d hsum2 = _mm256_hadd_pd (v3.Data(), v4.Data()); - __m256d hsum = _mm256_add_pd (_mm256_permute2f128_pd (hsum1, hsum2, 1+2*16), - _mm256_blend_pd (hsum1, hsum2, 12)); - return SIMD(hsum); - } - -#endif // __AVX__ - -///////////////////////////////////////////////////////////////////////////// -// AVX512 - Simd width 8 -///////////////////////////////////////////////////////////////////////////// -#ifdef __AVX512F__ - template<> - class alignas(64) SIMD - { - __m512d data; - - public: - static constexpr int Size() { return 8; } - SIMD () = default; - SIMD (const SIMD &) = default; - SIMD & operator= (const SIMD &) = default; - - SIMD (__m512d adata) - : data(adata) - { ; } - - // only called if T has a call operator of appropriate type - template>::value, int>::type = 0> - SIMD (const T & func) - { - data = _mm512_set_pd(func(7), func(6), func(5), func(4), - func(3), func(2), func(1), func(0)); - } - - // only called if T is arithmetic (integral or floating point types) - template::value, int>::type = 0> - SIMD (const T & val) - { - data = _mm512_set1_pd(val); - } - - SIMD (double const * p) - { - data = _mm512_loadu_pd(p); - } - - NG_INLINE operator __m512d() const { return data; } - NG_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; } - NG_INLINE __m512d Data() const { return data; } - NG_INLINE __m512d & Data() { return data; } - - NG_INLINE SIMD &operator+= (SIMD b) { data+=b.Data(); return *this; } - NG_INLINE SIMD &operator-= (SIMD b) { data-=b.Data(); return *this; } - NG_INLINE SIMD &operator*= (SIMD b) { data*=b.Data(); return *this; } - NG_INLINE SIMD &operator/= (SIMD b) { data/=b.Data(); return *this; } - - }; - - NG_INLINE SIMD operator- (SIMD a) { return _mm512_sub_pd(_mm512_setzero_pd(), a.Data()); } - - NG_INLINE SIMD operator+ (SIMD a, SIMD b) { return _mm512_add_pd(a.Data(),b.Data()); } - NG_INLINE SIMD operator- (SIMD a, SIMD b) { return _mm512_sub_pd(a.Data(),b.Data()); } - NG_INLINE SIMD operator* (SIMD a, SIMD b) { return _mm512_mul_pd(a.Data(),b.Data()); } - NG_INLINE SIMD operator/ (SIMD a, SIMD b) { return _mm512_div_pd(a.Data(),b.Data()); } - - NG_INLINE SIMD sqrt (SIMD a) { return _mm512_sqrt_pd(a.Data()); } - NG_INLINE SIMD floor (SIMD a) { return _mm512_floor_pd(a.Data()); } - NG_INLINE SIMD ceil (SIMD a) { return _mm512_ceil_pd(a.Data()); } - NG_INLINE SIMD fabs (SIMD a) { return _mm512_max_pd(a.Data(), -a.Data()); } - NG_INLINE SIMD L2Norm2 (SIMD a) { return a.Data()*a.Data(); } - NG_INLINE SIMD Trans (SIMD a) { return a; } - NG_INLINE SIMD IfPos (SIMD a, SIMD b, SIMD c) - { - auto cp = _mm512_cmp_pd_mask (a.Data(), _mm512_setzero_pd(), _MM_CMPINT_GT); - return _mm512_mask_blend_pd(cp, c.Data(), b.Data()); - } - - - template<> NG_INLINE auto FMA (SIMD a, SIMD b, SIMD c) - { - return _mm512_fmadd_pd (a.Data(), b.Data(), c.Data()); - } - - NG_INLINE double HSum (SIMD sd) - { - SIMD low = _mm512_extractf64x4_pd(sd.Data(),0); - SIMD high = _mm512_extractf64x4_pd(sd.Data(),1); - return HSum(low)+HSum(high); - } - - NG_INLINE auto HSum (SIMD sd1, SIMD sd2) - { - return std::make_tuple(HSum(sd1), HSum(sd2)); - } - - NG_INLINE SIMD HSum (SIMD v1, SIMD v2, SIMD v3, SIMD v4) - { - SIMD high1 = _mm512_extractf64x4_pd(v1.Data(),1); - SIMD high2 = _mm512_extractf64x4_pd(v2.Data(),1); - SIMD high3 = _mm512_extractf64x4_pd(v3.Data(),1); - SIMD high4 = _mm512_extractf64x4_pd(v4.Data(),1); - SIMD low1 = _mm512_extractf64x4_pd(v1.Data(),0); - SIMD low2 = _mm512_extractf64x4_pd(v2.Data(),0); - SIMD low3 = _mm512_extractf64x4_pd(v3.Data(),0); - SIMD low4 = _mm512_extractf64x4_pd(v4.Data(),0); - return HSum(low1,low2,low3,low4) + HSum(high1,high2,high3,high4); - } -#endif // __AVX512F__ - - -//////////////////////////////////////////////////////////////////////////////// -// MultiSIMD - Multiple SIMD values in one struct using head-tail implementation -//////////////////////////////////////////////////////////////////////////////// - template - class MultiSIMD - { - SIMD head; - MultiSIMD tail; - public: - MultiSIMD () = default; - MultiSIMD (const MultiSIMD & ) = default; - MultiSIMD (T v) : head(v), tail(v) { ; } - MultiSIMD (SIMD _head, MultiSIMD _tail) - : head(_head), tail(_tail) { ; } - SIMD Head() const { return head; } - MultiSIMD Tail() const { return tail; } - SIMD & Head() { return head; } - MultiSIMD & Tail() { return tail; } - - template - SIMD Get() const { return NR==0 ? head : tail.template Get(); } - template - SIMD & Get() { return NR==0 ? head : tail.template Get(); } - auto MakeTuple() { return std::tuple_cat(std::tuple&> (head), tail.MakeTuple()); } - // not yet possible for MSVC - // operator auto () { return MakeTuple(); } - }; - - template - class MultiSIMD<2,T> - { - SIMD v0, v1; - public: - MultiSIMD () = default; - MultiSIMD (const MultiSIMD & ) = default; - MultiSIMD (T v) : v0(v), v1(v) { ; } - MultiSIMD (SIMD _v0, SIMD _v1) : v0(_v0), v1(_v1) { ; } - - SIMD Head() const { return v0; } - SIMD Tail() const { return v1; } - SIMD & Head() { return v0; } - SIMD & Tail() { return v1; } - - template - SIMD Get() const { return NR==0 ? v0 : v1; } - template - SIMD & Get() { return NR==0 ? v0 : v1; } - auto MakeTuple() { return std::tuple&, SIMD&> (v0, v1); } - operator std::tuple&, SIMD&>() { return MakeTuple(); } - }; - - template NG_INLINE MultiSIMD operator+ (MultiSIMD a, MultiSIMD b) - { return MultiSIMD (a.Head()+b.Head(), a.Tail()+b.Tail()); } - template NG_INLINE MultiSIMD operator+ (double a, MultiSIMD b) - { return MultiSIMD (a+b.Head(), a+b.Tail()); } - template NG_INLINE MultiSIMD operator+ (MultiSIMD b, double a) - { return MultiSIMD (a+b.Head(), a+b.Tail()); } - - template NG_INLINE MultiSIMD operator- (MultiSIMD a, MultiSIMD b) - { return MultiSIMD (a.Head()-b.Head(), a.Tail()-b.Tail()); } - template NG_INLINE MultiSIMD operator- (double a, MultiSIMD b) - { return MultiSIMD (a-b.Head(), a-b.Tail()); } - template NG_INLINE MultiSIMD operator- (MultiSIMD b, double a) - { return MultiSIMD (b.Head()-a, b.Tail()-a); } - template NG_INLINE MultiSIMD operator- (MultiSIMD a) - { return MultiSIMD (-a.Head(), -a.Tail()); } - template NG_INLINE MultiSIMD operator* (MultiSIMD a, MultiSIMD b) - { return MultiSIMD (a.Head()*b.Head(), a.Tail()*b.Tail()); } - template NG_INLINE MultiSIMD operator/ (MultiSIMD a, MultiSIMD b) - { return MultiSIMD (a.Head()/b.Head(), a.Tail()/b.Tail()); } - template NG_INLINE MultiSIMD operator* (double a, MultiSIMD b) - { return MultiSIMD ( a*b.Head(), a*b.Tail()); } - template NG_INLINE MultiSIMD operator* (MultiSIMD b, double a) - { return MultiSIMD ( a*b.Head(), a*b.Tail()); } - - template NG_INLINE MultiSIMD & operator+= (MultiSIMD & a, MultiSIMD b) - { a.Head()+=b.Head(); a.Tail()+=b.Tail(); return a; } - template NG_INLINE MultiSIMD operator-= (MultiSIMD & a, double b) - { a.Head()-=b; a.Tail()-=b; return a; } - template NG_INLINE MultiSIMD operator-= (MultiSIMD & a, MultiSIMD b) - { a.Head()-=b.Head(); a.Tail()-=b.Tail(); return a; } - template NG_INLINE MultiSIMD & operator*= (MultiSIMD & a, MultiSIMD b) - { a.Head()*=b.Head(); a.Tail()*=b.Tail(); return a; } - template NG_INLINE MultiSIMD & operator*= (MultiSIMD & a, double b) - { a.Head()*=b; a.Tail()*=b; return a; } - // NG_INLINE MultiSIMD operator/= (MultiSIMD & a, MultiSIMD b) { return a.Data()/=b.Data(); } - - NG_INLINE SIMD HVSum (SIMD a) { return a; } - template - NG_INLINE SIMD HVSum (MultiSIMD a) { return a.Head() + HVSum(a.Tail()); } - - template NG_INLINE double HSum (MultiSIMD a) { return HSum(HVSum(a)); } - template NG_INLINE auto HSum (MultiSIMD a, MultiSIMD b) - { return HSum(HVSum(a), HVSum(b)); } - - template - std::ostream & operator<< (std::ostream & ost, MultiSIMD multi) - { - ost << multi.Head() << " " << multi.Tail(); - return ost; - } - - template - std::ostream & operator<< (std::ostream & ost, SIMD simd) - { - ost << simd[0]; - for (int i = 1; i < simd.Size(); i++) - ost << " " << simd[i]; - return ost; - } -} - -namespace netgen -{ - using namespace ngsimd; -} -#endif diff --git a/libsrc/interface/nginterface_v2.cpp b/libsrc/interface/nginterface_v2.cpp index a276f376..18563f91 100644 --- a/libsrc/interface/nginterface_v2.cpp +++ b/libsrc/interface/nginterface_v2.cpp @@ -779,29 +779,23 @@ namespace netgen -#ifdef __SSE__ -#include - template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<1,1> (int elnr, int npts, - const tAVXd * xi, size_t sxi, - tAVXd * x, size_t sx, - tAVXd * dxdxi, size_t sdxdxi) const + const SIMD * xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { cout << "multi-eltrafo simd called, 1,1,simd" << endl; } template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<2,2> (int elnr, int npts, - const tAVXd * xi, size_t sxi, - tAVXd * x, size_t sx, - tAVXd * dxdxi, size_t sdxdxi) const + const SIMD * xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { mesh->GetCurvedElements().CalcMultiPointSurfaceTransformation<2> - (elnr, npts, - reinterpret_cast*> (xi), sxi, - reinterpret_cast*> (x), sx, - reinterpret_cast*> (dxdxi), sdxdxi); + (elnr, npts, xi, sxi, x, sx, dxdxi, sdxdxi); /* for (int i = 0; i < npts; i++) { @@ -828,15 +822,15 @@ namespace netgen template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<3,3> (int elnr, int npts, - const tAVXd * xi, size_t sxi, - tAVXd * x, size_t sx, - tAVXd * dxdxi, size_t sdxdxi) const + const SIMD * xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { mesh->GetCurvedElements().CalcMultiPointElementTransformation (elnr, npts, - reinterpret_cast*> (xi), sxi, - reinterpret_cast*> (x), sx, - reinterpret_cast*> (dxdxi), sdxdxi); + xi, sxi, + x, sx, + dxdxi, sdxdxi); /* for (int i = 0; i < npts; i++) { @@ -863,33 +857,30 @@ namespace netgen template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<0,2> (int elnr, int npts, - const tAVXd *xi, size_t sxi, - tAVXd * x, size_t sx, - tAVXd * dxdxi, size_t sdxdxi) const + const SIMD *xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { cout << "MultiElementtransformation<0,2> simd not implemented" << endl; } template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<0,1> (int elnr, int npts, - const tAVXd * xi, size_t sxi, - tAVXd * x, size_t sx, - tAVXd * dxdxi, size_t sdxdxi) const + const SIMD * xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { cout << "multi-eltrafo simd called, 0,1,simd" << endl; } template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<1,3> (int elnr, int npts, - const tAVXd * xi, size_t sxi, - tAVXd * x, size_t sx, - tAVXd * dxdxi, size_t sdxdxi) const + const SIMD * xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { mesh->GetCurvedElements().CalcMultiPointSegmentTransformation<3> - (elnr, npts, - reinterpret_cast*> (xi), sxi, - reinterpret_cast*> (x), sx, - reinterpret_cast*> (dxdxi), sdxdxi); + (elnr, npts, xi, sxi, x, sx, dxdxi, sdxdxi); /* double hxi[4][1]; double hx[4][3]; @@ -912,15 +903,12 @@ namespace netgen template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<1,2> (int elnr, int npts, - const tAVXd * xi, size_t sxi, - tAVXd * x, size_t sx, - tAVXd * dxdxi, size_t sdxdxi) const + const SIMD * xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { mesh->GetCurvedElements().CalcMultiPointSegmentTransformation<2> - (elnr, npts, - reinterpret_cast*> (xi), sxi, - reinterpret_cast*> (x), sx, - reinterpret_cast*> (dxdxi), sdxdxi); + (elnr, npts, xi, sxi, x, sx, dxdxi, sdxdxi); /* for (int i = 0; i < npts; i++) { @@ -947,15 +935,12 @@ namespace netgen template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<2,3> (int elnr, int npts, - const tAVXd * xi, size_t sxi, - tAVXd * x, size_t sx, - tAVXd * dxdxi, size_t sdxdxi) const + const SIMD * xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { mesh->GetCurvedElements().CalcMultiPointSurfaceTransformation<3> - (elnr, npts, - reinterpret_cast*> (xi), sxi, - reinterpret_cast*> (x), sx, - reinterpret_cast*> (dxdxi), sdxdxi); + (elnr, npts, xi, sxi, x, sx, dxdxi, sdxdxi); /* for (int i = 0; i < npts; i++) { @@ -982,9 +967,9 @@ namespace netgen template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<0,3> (int elnr, int npts, - const tAVXd * xi, size_t sxi, - tAVXd * x, size_t sx, - tAVXd * dxdxi, size_t sdxdxi) const + const SIMD * xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { for (int i = 0; i < npts; i++) { @@ -1003,10 +988,6 @@ namespace netgen } } - -#endif - - diff --git a/libsrc/visualization/soldata.hpp b/libsrc/visualization/soldata.hpp index c522f1a3..6a5c5626 100644 --- a/libsrc/visualization/soldata.hpp +++ b/libsrc/visualization/soldata.hpp @@ -6,15 +6,6 @@ namespace netgen { using namespace std; - /* -#if defined __AVX512F__ - typedef __m512 tAVX; - typedef __m512d tAVXd; -#elif defined __AVX__ - typedef __m256 tAVX; - typedef __m256d tAVXd; -#endif - */ class SolutionData { @@ -101,17 +92,15 @@ namespace netgen return res; } -#ifdef __SSE__ virtual bool GetMultiSurfValue (size_t selnr, size_t facetnr, size_t npts, - const ngsimd::tAVXd * xref, - const ngsimd::tAVXd * x, - const ngsimd::tAVXd * dxdxref, - ngsimd::tAVXd * values) + const SIMD * xref, + const SIMD * x, + const SIMD * dxdxref, + SIMD * values) { cerr << "GetMultiSurfVaue not overloaded for SIMD" << endl; return false; } -#endif virtual bool GetSegmentValue (int segnr, double xref, double * values) { return false; }