From 643c89538d41935dac18e8f44105ac7def81f9ae Mon Sep 17 00:00:00 2001 From: Matthias Hochsteger Date: Wed, 19 Apr 2017 18:02:27 +0200 Subject: [PATCH] Add support for AVX512 Restructure mysimd.hpp and add support for AVX512. Changes include - Move mysimd.hpp to ngsimd.hpp - Expose ngsimd.hpp to NGSolve - New namespace ngsimd - Second template parameter (width) for SIMD class, default to the largest width available - Avoid raw avx register types in the interface, use SIMD<> instead --- libsrc/general/CMakeLists.txt | 2 +- libsrc/general/myadt.hpp | 2 +- libsrc/general/mysimd.hpp | 403 -------------------- libsrc/general/ngsimd.hpp | 553 ++++++++++++++++++++++++++++ libsrc/include/CMakeLists.txt | 2 +- libsrc/include/ngsimd.hpp | 1 + libsrc/interface/nginterface_v2.cpp | 70 ++-- libsrc/visualization/soldata.hpp | 12 +- libsrc/visualization/vssolution.cpp | 8 +- 9 files changed, 597 insertions(+), 456 deletions(-) delete mode 100644 libsrc/general/mysimd.hpp create mode 100644 libsrc/general/ngsimd.hpp create mode 100644 libsrc/include/ngsimd.hpp diff --git a/libsrc/general/CMakeLists.txt b/libsrc/general/CMakeLists.txt index f0c5b97f..811bc80d 100644 --- a/libsrc/general/CMakeLists.txt +++ b/libsrc/general/CMakeLists.txt @@ -13,7 +13,7 @@ install( FILES ngexception.hpp DESTINATION ${INCDIR} COMPONENT netgen_devel ) install(FILES archive_base.hpp array.hpp autodiff.hpp autoptr.hpp bitarray.hpp dynamicmem.hpp flags.hpp hashtabl.hpp mpi_interface.hpp myadt.hpp - mysimd.hpp mystring.hpp netgenout.hpp ngexception.hpp ngpython.hpp + ngsimd.hpp mystring.hpp netgenout.hpp ngexception.hpp ngpython.hpp optmem.hpp parthreads.hpp profiler.hpp seti.hpp sort.hpp spbita2d.hpp stack.hpp symbolta.hpp table.hpp template.hpp gzstream.h diff --git a/libsrc/general/myadt.hpp b/libsrc/general/myadt.hpp index 8fa8b95d..40279563 100644 --- a/libsrc/general/myadt.hpp +++ b/libsrc/general/myadt.hpp @@ -46,7 +46,7 @@ #include "gzstream.h" #include "archive_base.hpp" -#include "mysimd.hpp" +#include "ngsimd.hpp" #endif diff --git a/libsrc/general/mysimd.hpp b/libsrc/general/mysimd.hpp deleted file mode 100644 index 124ff91b..00000000 --- a/libsrc/general/mysimd.hpp +++ /dev/null @@ -1,403 +0,0 @@ -#ifndef FILE_MYSIMD -#define FILE_MYSIMD - -/**************************************************************************/ -/* File: mysimd.hpp */ -/* Author: Joachim Schoeberl */ -/* Date: 25. Mar. 16 */ -/**************************************************************************/ - -#include - -#ifdef WIN32 -#ifndef AVX_OPERATORS_DEFINED -#define AVX_OPERATORS_DEFINED -inline __m128d operator- (__m128d a) { return _mm_xor_pd(a, _mm_set1_pd(-0.0)); } -inline __m128d operator+ (__m128d a, __m128d b) { return _mm_add_pd(a,b); } -inline __m128d operator- (__m128d a, __m128d b) { return _mm_sub_pd(a,b); } -inline __m128d operator* (__m128d a, __m128d b) { return _mm_mul_pd(a,b); } -inline __m128d operator/ (__m128d a, __m128d b) { return _mm_div_pd(a,b); } -inline __m128d operator* (double a, __m128d b) { return _mm_set1_pd(a)*b; } -inline __m128d operator* (__m128d b, double a) { return _mm_set1_pd(a)*b; } - -inline __m128d operator+= (__m128d &a, __m128d b) { return a = a+b; } -inline __m128d operator-= (__m128d &a, __m128d b) { return a = a-b; } -inline __m128d operator*= (__m128d &a, __m128d b) { return a = a*b; } -inline __m128d operator/= (__m128d &a, __m128d b) { return a = a/b; } - -inline __m256d operator- (__m256d a) { return _mm256_xor_pd(a, _mm256_set1_pd(-0.0)); } -inline __m256d operator+ (__m256d a, __m256d b) { return _mm256_add_pd(a,b); } -inline __m256d operator- (__m256d a, __m256d b) { return _mm256_sub_pd(a,b); } -inline __m256d operator* (__m256d a, __m256d b) { return _mm256_mul_pd(a,b); } -inline __m256d operator/ (__m256d a, __m256d b) { return _mm256_div_pd(a,b); } -inline __m256d operator* (double a, __m256d b) { return _mm256_set1_pd(a)*b; } -inline __m256d operator* (__m256d b, double a) { return _mm256_set1_pd(a)*b; } - -inline __m256d operator+= (__m256d &a, __m256d b) { return a = a+b; } -inline __m256d operator-= (__m256d &a, __m256d b) { return a = a-b; } -inline __m256d operator*= (__m256d &a, __m256d b) { return a = a*b; } -inline __m256d operator/= (__m256d &a, __m256d b) { return a = a/b; } -#endif -#endif - - - -namespace netgen -{ - template class SIMD; - - template - struct has_call_operator - { - template static std::true_type check( decltype( sizeof(&C::operator() )) ) { return std::true_type(); } - template static std::false_type check(...) { return std::false_type(); } - typedef decltype( check(sizeof(char)) ) type; - static constexpr type value = type(); - }; - -#ifdef __AVX__ - - template - class AlignedAlloc - { - protected: - static void * aligned_malloc(size_t s) - { - // Assume 16 byte alignment of standard library - if(alignof(T)<=16) - return malloc(s); - else - return _mm_malloc(s, alignof(T)); - } - - static void aligned_free(void *p) - { - if(alignof(T)<=16) - free(p); - else - _mm_free(p); - } - - public: - void * operator new (size_t s, void *p) { return p; } - void * operator new (size_t s) { return aligned_malloc(s); } - void * operator new[] (size_t s) { return aligned_malloc(s); } - void operator delete (void * p) { aligned_free(p); } - void operator delete[] (void * p) { aligned_free(p); } - }; - - template<> - class alignas(32) SIMD : public AlignedAlloc> - { - __m256d data; - - public: - static constexpr int Size() { return 4; } - SIMD () = default; - SIMD (const SIMD &) = default; - SIMD & operator= (const SIMD &) = default; - - SIMD (double val) - { - data = _mm256_set1_pd(val); - } - - SIMD (__m256d adata) - : data(adata) - { ; } - - /* - template - SIMD (const T & val) - { -// SIMD_function(val, std::is_convertible>()); - SIMD_function(val, has_call_operator::value); - } - */ - - /* - template - SIMD & operator= (const T & val) - { -// SIMD_function(val, std::is_convertible>()); - SIMD_function(val, has_call_operator::value); - return *this; - } - */ - - - template - void SIMD_function (const Function & func, std::true_type) - { - data = _mm256_set_pd(func(3), func(2), func(1), func(0)); - } - - // not a function - void SIMD_function (double const * p, std::false_type) - { - data = _mm256_loadu_pd(p); - } - - void SIMD_function (double val, std::false_type) - { - data = _mm256_set1_pd(val); - } - - void SIMD_function (__m256d _data, std::false_type) - { - data = _data; - } - - inline double operator[] (int i) const { return ((double*)(&data))[i]; } - inline __m256d Data() const { return data; } - inline __m256d & Data() { return data; } - }; - - - inline SIMD operator+ (SIMD a, SIMD b) { return a.Data()+b.Data(); } - inline SIMD operator- (SIMD a, SIMD b) { return a.Data()-b.Data(); } - inline SIMD operator- (SIMD a) { return -a.Data(); } - inline SIMD operator* (SIMD a, SIMD b) { return a.Data()*b.Data(); } - inline SIMD operator/ (SIMD a, SIMD b) { return a.Data()/b.Data(); } - inline SIMD operator* (double a, SIMD b) { return SIMD(a)*b; } - inline SIMD operator* (SIMD b, double a) { return SIMD(a)*b; } - inline SIMD operator+= (SIMD & a, SIMD b) { return a.Data()+=b.Data(); } - inline SIMD operator-= (SIMD & a, SIMD b) { return a.Data()-=b.Data(); } - inline SIMD operator*= (SIMD & a, SIMD b) { return a.Data()*=b.Data(); } - inline SIMD operator/= (SIMD & a, SIMD b) { return a.Data()/=b.Data(); } - - using std::sqrt; - using std::fabs; - - inline SIMD sqrt (SIMD a) { return _mm256_sqrt_pd(a.Data()); } - inline SIMD fabs (SIMD a) { return _mm256_max_pd(a.Data(), -a.Data()); } - inline SIMD L2Norm2 (SIMD a) { return a.Data()*a.Data(); } - inline SIMD Trans (SIMD a) { return a; } - inline SIMD IfPos (SIMD a, SIMD b, SIMD c) - { - auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_GT_OS); - return _mm256_blendv_pd(c.Data(), b.Data(), cp); - } - - inline double HSum (SIMD sd) - { - __m128d hv = _mm_add_pd (_mm256_extractf128_pd(sd.Data(),0), _mm256_extractf128_pd(sd.Data(),1)); - return _mm_cvtsd_f64 (_mm_hadd_pd (hv, hv)); - } - - - - -#else - - // it's only a dummy without AVX - template - class AlignedAlloc { ; }; - - template<> - class SIMD - { - double data; - - public: - static constexpr int Size() { return 1; } - SIMD () = default; - SIMD (const SIMD &) = default; - SIMD & operator= (const SIMD &) = default; - - SIMD (double val) - : data(val) { ; } - - /* - template - SIMD (const T & val) - { -// SIMD_function(val, std::is_convertible>()); - SIMD_function(val, has_call_operator::value); - } - */ - - - template - SIMD & operator= (const T & val) - { -// SIMD_function(val, std::is_convertible>()); - SIMD_function(val, has_call_operator::value); - return *this; - } - - template - void SIMD_function (const Function & func, std::true_type) - { - data = func(0); - } - - // not a function - void SIMD_function (double const * p, std::false_type) - { - data = *p; - } - - void SIMD_function (double val, std::false_type) - { - data = val; - } - - double operator[] (int i) const { return ((double*)(&data))[i]; } - double Data() const { return data; } - double & Data() { return data; } - }; - - - inline SIMD operator+ (SIMD a, SIMD b) { return a.Data()+b.Data(); } - inline SIMD operator- (SIMD a, SIMD b) { return a.Data()-b.Data(); } - inline SIMD operator- (SIMD a) { return -a.Data(); } - inline SIMD operator* (SIMD a, SIMD b) { return a.Data()*b.Data(); } - inline SIMD operator/ (SIMD a, SIMD b) { return a.Data()/b.Data(); } - inline SIMD operator* (double a, SIMD b) { return SIMD(a)*b; } - inline SIMD operator* (SIMD b, double a) { return SIMD(a)*b; } - inline SIMD operator+= (SIMD & a, SIMD b) { return a.Data()+=b.Data(); } - inline SIMD operator-= (SIMD & a, SIMD b) { return a.Data()-=b.Data(); } - inline SIMD operator*= (SIMD & a, SIMD b) { return a.Data()*=b.Data(); } - inline SIMD operator/= (SIMD & a, SIMD b) { return a.Data()/=b.Data(); } - - using std::sqrt; - using std::fabs; - - inline SIMD sqrt (SIMD a) { return std::sqrt(a.Data()); } - inline SIMD fabs (SIMD a) { return std::fabs(a.Data()); } - inline SIMD L2Norm2 (SIMD a) { return a.Data()*a.Data(); } - inline SIMD Trans (SIMD a) { return a; } - inline SIMD IfPos (SIMD a, SIMD b, SIMD c) - { - return (a.Data() > 0) ? b : c; - } - - inline double HSum (SIMD sd) - { return sd.Data(); } - -#endif - - - - - - - - template - ostream & operator<< (ostream & ost, SIMD simd) - { - ost << simd[0]; - for (int i = 1; i < simd.Size(); i++) - ost << " " << simd[i]; - return ost; - } - - /* - using std::exp; - inline netgen::SIMD exp (netgen::SIMD a) { - return netgen::SIMD([&](int i)->double { return exp(a[i]); } ); -} - - using std::log; -inline netgen::SIMD log (netgen::SIMD a) { - return netgen::SIMD([&](int i)->double { return log(a[i]); } ); -} - - using std::pow; -inline netgen::SIMD pow (netgen::SIMD a, double x) { - return netgen::SIMD([&](int i)->double { return pow(a[i],x); } ); -} - */ - - - template - class MultiSIMD - { - SIMD head; - MultiSIMD tail; - public: - MultiSIMD () = default; - MultiSIMD (const MultiSIMD & ) = default; - MultiSIMD (T v) : head(v), tail(v) { ; } - MultiSIMD (SIMD _head, MultiSIMD _tail) - : head(_head), tail(_tail) { ; } - SIMD Head() const { return head; } - MultiSIMD Tail() const { return tail; } - SIMD & Head() { return head; } - MultiSIMD & Tail() { return tail; } - - template - SIMD Get() const { return NR==0 ? head : tail.template Get(); } - template - SIMD & Get() { return NR==0 ? head : tail.template Get(); } - }; - - template - class MultiSIMD<2,T> - { - SIMD v0, v1; - public: - MultiSIMD () = default; - MultiSIMD (const MultiSIMD & ) = default; - MultiSIMD (T v) : v0(v), v1(v) { ; } - MultiSIMD (SIMD _v0, SIMD _v1) : v0(_v0), v1(_v1) { ; } - - SIMD Head() const { return v0; } - SIMD Tail() const { return v1; } - SIMD & Head() { return v0; } - SIMD & Tail() { return v1; } - - template - SIMD Get() const { return NR==0 ? v0 : v1; } - template - SIMD & Get() { return NR==0 ? v0 : v1; } - }; - - template inline MultiSIMD operator+ (MultiSIMD a, MultiSIMD b) - { return MultiSIMD (a.Head()+b.Head(), a.Tail()+b.Tail()); } - template inline MultiSIMD operator+ (double a, MultiSIMD b) - { return MultiSIMD (a+b.Head(), a+b.Tail()); } - template inline MultiSIMD operator+ (MultiSIMD b, double a) - { return MultiSIMD (a+b.Head(), a+b.Tail()); } - - template inline MultiSIMD operator- (MultiSIMD a, MultiSIMD b) - { return MultiSIMD (a.Head()-b.Head(), a.Tail()-b.Tail()); } - template inline MultiSIMD operator- (double a, MultiSIMD b) - { return MultiSIMD (a-b.Head(), a-b.Tail()); } - template inline MultiSIMD operator- (MultiSIMD b, double a) - { return MultiSIMD (b.Head()-a, b.Tail()-a); } - template inline MultiSIMD operator- (MultiSIMD a) - { return MultiSIMD (-a.Head(), -a.Tail()); } - template inline MultiSIMD operator* (MultiSIMD a, MultiSIMD b) - { return MultiSIMD (a.Head()*b.Head(), a.Tail()*b.Tail()); } - template inline MultiSIMD operator/ (MultiSIMD a, MultiSIMD b) - { return MultiSIMD (a.Head()/b.Head(), a.Tail()/b.Tail()); } - template inline MultiSIMD operator* (double a, MultiSIMD b) - { return MultiSIMD ( a*b.Head(), a*b.Tail()); } - template inline MultiSIMD operator* (MultiSIMD b, double a) - { return MultiSIMD ( a*b.Head(), a*b.Tail()); } - - template inline MultiSIMD & operator+= (MultiSIMD & a, MultiSIMD b) - { a.Head()+=b.Head(); a.Tail()+=b.Tail(); return a; } - template inline MultiSIMD operator-= (MultiSIMD & a, double b) - { a.Head()-=b; a.Tail()-=b; return a; } - template inline MultiSIMD operator-= (MultiSIMD & a, MultiSIMD b) - { a.Head()-=b.Head(); a.Tail()-=b.Tail(); return a; } - template inline MultiSIMD & operator*= (MultiSIMD & a, MultiSIMD b) - { a.Head()*=b.Head(); a.Tail()*=b.Tail(); return a; } - template inline MultiSIMD & operator*= (MultiSIMD & a, double b) - { a.Head()*=b; a.Tail()*=b; return a; } - // inline MultiSIMD operator/= (MultiSIMD & a, MultiSIMD b) { return a.Data()/=b.Data(); } - - - template - ostream & operator<< (ostream & ost, MultiSIMD multi) - { - ost << multi.Head() << " " << multi.Tail(); - return ost; - } - -} - -#endif diff --git a/libsrc/general/ngsimd.hpp b/libsrc/general/ngsimd.hpp new file mode 100644 index 00000000..feb0c046 --- /dev/null +++ b/libsrc/general/ngsimd.hpp @@ -0,0 +1,553 @@ +#ifndef FILE_NGSIMD +#define FILE_NGSIMD +/**************************************************************************/ +/* File: ngsimd.hpp */ +/* Author: Joachim Schoeberl */ +/* Date: 25. Mar. 16 */ +/**************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#ifdef WIN32 +#ifndef AVX_OPERATORS_DEFINED +#define AVX_OPERATORS_DEFINED +inline __m128d operator- (__m128d a) { return _mm_xor_pd(a, _mm_set1_pd(-0.0)); } +inline __m128d operator+ (__m128d a, __m128d b) { return _mm_add_pd(a,b); } +inline __m128d operator- (__m128d a, __m128d b) { return _mm_sub_pd(a,b); } +inline __m128d operator* (__m128d a, __m128d b) { return _mm_mul_pd(a,b); } +inline __m128d operator/ (__m128d a, __m128d b) { return _mm_div_pd(a,b); } +inline __m128d operator* (double a, __m128d b) { return _mm_set1_pd(a)*b; } +inline __m128d operator* (__m128d b, double a) { return _mm_set1_pd(a)*b; } + +inline __m128d operator+= (__m128d &a, __m128d b) { return a = a+b; } +inline __m128d operator-= (__m128d &a, __m128d b) { return a = a-b; } +inline __m128d operator*= (__m128d &a, __m128d b) { return a = a*b; } +inline __m128d operator/= (__m128d &a, __m128d b) { return a = a/b; } + +inline __m256d operator- (__m256d a) { return _mm256_xor_pd(a, _mm256_set1_pd(-0.0)); } +inline __m256d operator+ (__m256d a, __m256d b) { return _mm256_add_pd(a,b); } +inline __m256d operator- (__m256d a, __m256d b) { return _mm256_sub_pd(a,b); } +inline __m256d operator* (__m256d a, __m256d b) { return _mm256_mul_pd(a,b); } +inline __m256d operator/ (__m256d a, __m256d b) { return _mm256_div_pd(a,b); } +inline __m256d operator* (double a, __m256d b) { return _mm256_set1_pd(a)*b; } +inline __m256d operator* (__m256d b, double a) { return _mm256_set1_pd(a)*b; } + +inline __m256d operator+= (__m256d &a, __m256d b) { return a = a+b; } +inline __m256d operator-= (__m256d &a, __m256d b) { return a = a-b; } +inline __m256d operator*= (__m256d &a, __m256d b) { return a = a*b; } +inline __m256d operator/= (__m256d &a, __m256d b) { return a = a/b; } +#endif +#endif + + + +namespace ngsimd +{ + constexpr int GetDefaultSIMDSize() { +#if defined __AVX512F__ + return 8; +#elif defined __AVX__ + return 4; +#else + return 1; +#endif + } + + template class SIMD; + + template + struct has_call_operator + { + template static std::true_type check( decltype( sizeof(&C::operator() )) ) { return std::true_type(); } + template static std::false_type check(...) { return std::false_type(); } + typedef decltype( check(sizeof(char)) ) type; + static constexpr type value = type(); + }; + + template + // a*b+c + inline auto FMA(T1 a, T2 b, T3 c) + { + return a*b+c; + } + + template::value, int>::type = 0> + inline SIMD operator+ (T a, SIMD b) { return SIMD(a) + b; } + template::value, int>::type = 0> + inline SIMD operator- (T a, SIMD b) { return SIMD(a) - b; } + template::value, int>::type = 0> + inline SIMD operator* (T a, SIMD b) { return SIMD(a) * b; } + template::value, int>::type = 0> + inline SIMD operator/ (T a, SIMD b) { return SIMD(a) / b; } + template::value, int>::type = 0> + inline SIMD operator+ (SIMD a, T b) { return a + SIMD(b); } + template::value, int>::type = 0> + inline SIMD operator- (SIMD a, T b) { return a - SIMD(b); } + template::value, int>::type = 0> + inline SIMD operator* (SIMD a, T b) { return a * SIMD(b); } + template::value, int>::type = 0> + inline SIMD operator/ (SIMD a, T b) { return a / SIMD(b); } + + +#ifdef __AVX__ + template + class AlignedAlloc + { + protected: + static void * aligned_malloc(size_t s) + { + // Assume 16 byte alignment of standard library + if(alignof(T)<=16) + return malloc(s); + else + return _mm_malloc(s, alignof(T)); + } + + static void aligned_free(void *p) + { + if(alignof(T)<=16) + free(p); + else + _mm_free(p); + } + + public: + void * operator new (size_t s, void *p) { return p; } + void * operator new (size_t s) { return aligned_malloc(s); } + void * operator new[] (size_t s) { return aligned_malloc(s); } + void operator delete (void * p) { aligned_free(p); } + void operator delete[] (void * p) { aligned_free(p); } + }; +#else + // it's only a dummy without AVX + template + class AlignedAlloc { ; }; + +#endif + +using std::sqrt; +using std::fabs; + + class ExceptionNOSIMD : public std::runtime_error + { + public: + using std::runtime_error::runtime_error; + std::string What() { return what(); } + }; + + using std::exp; + template inline SIMD exp (SIMD a) + { + return SIMD([&](int i)->double { return exp(a[i]); } ); + } + + using std::log; + template inline SIMD log (SIMD a) + { + return SIMD([&](int i)->double { return log(a[i]); } ); + } + + using std::pow; + template inline SIMD pow (SIMD a, double x) + { + return SIMD([&](int i)->double { return pow(a[i],x); } ); + } + + using std::sin; + template inline SIMD sin (SIMD a) + { + return SIMD([&](int i)->double { return sin(a[i]); } ); + } + + using std::cos; + template inline SIMD cos (SIMD a) + { + return SIMD([&](int i)->double { return cos(a[i]); } ); + } + + using std::tan; + template inline SIMD tan (SIMD a) + { + return SIMD([&](int i)->double { return tan(a[i]); } ); + } + + using std::atan; + template inline SIMD atan (SIMD a) + { + return SIMD([&](int i)->double { return atan(a[i]); } ); + } + + +///////////////////////////////////////////////////////////////////////////// +// SIMD width 1 (in case no AVX support is available) +///////////////////////////////////////////////////////////////////////////// + template<> + class SIMD + { + double data; + + public: + static constexpr int Size() { return 1; } + SIMD () = default; + SIMD (const SIMD &) = default; + SIMD & operator= (const SIMD &) = default; + + // only called if T has a call operator of appropriate type + template>::value, int>::type = 0> + SIMD (const T & func) + { + data = func(0); + } + + // only called if T is arithmetic (integral or floating point types) + template::value, int>::type = 0> + SIMD (const T & val) + { + data = val; + } + + SIMD (double const * p) + { + data = *p; + } + + inline operator double() const { return data; } + inline double operator[] (int i) const { return ((double*)(&data))[i]; } + inline double Data() const { return data; } + inline double & Data() { return data; } + + inline SIMD &operator+= (SIMD b) { data+=b.Data(); return *this; } + inline SIMD &operator-= (SIMD b) { data-=b.Data(); return *this; } + inline SIMD &operator*= (SIMD b) { data*=b.Data(); return *this; } + inline SIMD &operator/= (SIMD b) { data/=b.Data(); return *this; } + + }; + + inline SIMD operator+ (SIMD a, SIMD b) { return a.Data()+b.Data(); } + inline SIMD operator- (SIMD a, SIMD b) { return a.Data()-b.Data(); } + inline SIMD operator- (SIMD a) { return -a.Data(); } + inline SIMD operator* (SIMD a, SIMD b) { return a.Data()*b.Data(); } + inline SIMD operator/ (SIMD a, SIMD b) { return a.Data()/b.Data(); } + + inline SIMD sqrt (SIMD a) { return std::sqrt(a.Data()); } + inline SIMD fabs (SIMD a) { return std::fabs(a.Data()); } + inline SIMD L2Norm2 (SIMD a) { return a.Data()*a.Data(); } + inline SIMD Trans (SIMD a) { return a; } + inline SIMD IfPos (SIMD a, SIMD b, SIMD c) + { + return (a.Data() > 0) ? b : c; + } + + inline double HSum (SIMD sd) + { + return sd.Data(); + } + +///////////////////////////////////////////////////////////////////////////// +// AVX - Simd width 4 +///////////////////////////////////////////////////////////////////////////// +#ifdef __AVX__ + template<> + class alignas(32) SIMD : public AlignedAlloc> + { + __m256d data; + + public: + static constexpr int Size() { return 4; } + SIMD () = default; + SIMD (const SIMD &) = default; + SIMD & operator= (const SIMD &) = default; + + SIMD (__m256d adata) + : data(adata) + { ; } + + // only called if T has a call operator of appropriate type + template>::value, int>::type = 0> + SIMD (const T & func) + { + data = _mm256_set_pd(func(3), func(2), func(1), func(0)); + } + + // only called if T is arithmetic (integral or floating point types) + template::value, int>::type = 0> + SIMD (const T & val) + { + data = _mm256_set1_pd(val); + } + + SIMD (double const * p) + { + data = _mm256_loadu_pd(p); + } + + inline operator __m256d() const { return data; } + inline double operator[] (int i) const { return ((double*)(&data))[i]; } + inline __m256d Data() const { return data; } + inline __m256d & Data() { return data; } + + inline SIMD &operator+= (SIMD b) { data+=b.Data(); return *this; } + inline SIMD &operator-= (SIMD b) { data-=b.Data(); return *this; } + inline SIMD &operator*= (SIMD b) { data*=b.Data(); return *this; } + inline SIMD &operator/= (SIMD b) { data/=b.Data(); return *this; } + + }; + + inline SIMD operator+ (SIMD a, SIMD b) { return a.Data()+b.Data(); } + inline SIMD operator- (SIMD a, SIMD b) { return a.Data()-b.Data(); } + inline SIMD operator- (SIMD a) { return -a.Data(); } + inline SIMD operator* (SIMD a, SIMD b) { return a.Data()*b.Data(); } + inline SIMD operator/ (SIMD a, SIMD b) { return a.Data()/b.Data(); } + + inline SIMD sqrt (SIMD a) { return _mm256_sqrt_pd(a.Data()); } + inline SIMD fabs (SIMD a) { return _mm256_max_pd(a.Data(), -a.Data()); } + inline SIMD L2Norm2 (SIMD a) { return a.Data()*a.Data(); } + inline SIMD Trans (SIMD a) { return a; } + inline SIMD IfPos (SIMD a, SIMD b, SIMD c) + { + auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_GT_OS); + return _mm256_blendv_pd(c.Data(), b.Data(), cp); + } + + inline double HSum (SIMD sd) + { + __m128d hv = _mm_add_pd (_mm256_extractf128_pd(sd.Data(),0), _mm256_extractf128_pd(sd.Data(),1)); + return _mm_cvtsd_f64 (_mm_hadd_pd (hv, hv)); + } + + inline auto HSum (SIMD sd1, SIMD sd2) + { + __m256d hv = _mm256_hadd_pd(sd1.Data(), sd2.Data()); + __m128d hv2 = _mm_add_pd (_mm256_extractf128_pd(hv,0), _mm256_extractf128_pd(hv,1)); + return std::make_tuple(_mm_cvtsd_f64 (hv2), _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3))); + } + + inline SIMD HSum (SIMD v1, SIMD v2, SIMD v3, SIMD v4) + { + __m256d hsum1 = _mm256_hadd_pd (v1.Data(), v2.Data()); + __m256d hsum2 = _mm256_hadd_pd (v3.Data(), v4.Data()); + __m256d hsum = _mm256_add_pd (_mm256_permute2f128_pd (hsum1, hsum2, 1+2*16), + _mm256_blend_pd (hsum1, hsum2, 12)); + return hsum; + } + +#endif // __AVX__ + +///////////////////////////////////////////////////////////////////////////// +// AVX512 - Simd width 8 +///////////////////////////////////////////////////////////////////////////// +#ifdef __AVX512F__ + template<> + class alignas(64) SIMD : public AlignedAlloc> + { + __m512d data; + + public: + static constexpr int Size() { return 8; } + SIMD () = default; + SIMD (const SIMD &) = default; + SIMD & operator= (const SIMD &) = default; + + SIMD (__m512d adata) + : data(adata) + { ; } + + // only called if T has a call operator of appropriate type + template>::value, int>::type = 0> + SIMD (const T & func) + { + data = _mm512_set_pd(func(7), func(6), func(5), func(4), + func(3), func(2), func(1), func(0)); + } + + // only called if T is arithmetic (integral or floating point types) + template::value, int>::type = 0> + SIMD (const T & val) + { + data = _mm512_set1_pd(val); + } + + SIMD (double const * p) + { + data = _mm512_loadu_pd(p); + } + + inline operator __m512d() const { return data; } + inline double operator[] (int i) const { return ((double*)(&data))[i]; } + inline __m512d Data() const { return data; } + inline __m512d & Data() { return data; } + + inline SIMD &operator+= (SIMD b) { data+=b.Data(); return *this; } + inline SIMD &operator-= (SIMD b) { data-=b.Data(); return *this; } + inline SIMD &operator*= (SIMD b) { data*=b.Data(); return *this; } + inline SIMD &operator/= (SIMD b) { data/=b.Data(); return *this; } + + }; + + inline SIMD operator- (SIMD a) { return _mm512_sub_pd(_mm512_setzero_pd(), a.Data()); } + + inline SIMD operator+ (SIMD a, SIMD b) { return _mm512_add_pd(a.Data(),b.Data()); } + inline SIMD operator- (SIMD a, SIMD b) { return _mm512_sub_pd(a.Data(),b.Data()); } + inline SIMD operator* (SIMD a, SIMD b) { return _mm512_mul_pd(a.Data(),b.Data()); } + inline SIMD operator/ (SIMD a, SIMD b) { return _mm512_div_pd(a.Data(),b.Data()); } + + inline SIMD sqrt (SIMD a) { return _mm512_sqrt_pd(a.Data()); } + inline SIMD fabs (SIMD a) { return _mm512_max_pd(a.Data(), -a.Data()); } + inline SIMD L2Norm2 (SIMD a) { return a.Data()*a.Data(); } + inline SIMD Trans (SIMD a) { return a; } + inline SIMD IfPos (SIMD a, SIMD b, SIMD c) + { + auto cp = _mm512_cmp_pd_mask (a.Data(), _mm512_setzero_pd(), _MM_CMPINT_GT); + return _mm512_mask_blend_pd(cp, c.Data(), b.Data()); + } + + + template<> inline auto FMA (SIMD a, SIMD b, SIMD c) + { + return _mm512_fmadd_pd (a.Data(), b.Data(), c.Data()); + } + + inline double HSum (SIMD sd) + { + SIMD low = _mm512_extractf64x4_pd(sd.Data(),0); + SIMD high = _mm512_extractf64x4_pd(sd.Data(),1); + return HSum(low)+HSum(high); + } + + inline auto HSum (SIMD sd1, SIMD sd2) + { + return std::make_tuple(HSum(sd1), HSum(sd2)); + } + + inline SIMD HSum (SIMD v1, SIMD v2, SIMD v3, SIMD v4) + { + SIMD high1 = _mm512_extractf64x4_pd(v1.Data(),1); + SIMD high2 = _mm512_extractf64x4_pd(v2.Data(),1); + SIMD high3 = _mm512_extractf64x4_pd(v3.Data(),1); + SIMD high4 = _mm512_extractf64x4_pd(v4.Data(),1); + SIMD low1 = _mm512_extractf64x4_pd(v1.Data(),0); + SIMD low2 = _mm512_extractf64x4_pd(v2.Data(),0); + SIMD low3 = _mm512_extractf64x4_pd(v3.Data(),0); + SIMD low4 = _mm512_extractf64x4_pd(v4.Data(),0); + return HSum(low1,low2,low3,low4) + HSum(high1,high2,high3,high4); + } +#endif // __AVX512F__ + + +//////////////////////////////////////////////////////////////////////////////// +// MultiSIMD - Multiple SIMD values in one struct using head-tail implementation +//////////////////////////////////////////////////////////////////////////////// + template + class MultiSIMD : public AlignedAlloc> + { + SIMD head; + MultiSIMD tail; + public: + MultiSIMD () = default; + MultiSIMD (const MultiSIMD & ) = default; + MultiSIMD (T v) : head(v), tail(v) { ; } + MultiSIMD (SIMD _head, MultiSIMD _tail) + : head(_head), tail(_tail) { ; } + SIMD Head() const { return head; } + MultiSIMD Tail() const { return tail; } + SIMD & Head() { return head; } + MultiSIMD & Tail() { return tail; } + + template + SIMD Get() const { return NR==0 ? head : tail.template Get(); } + template + SIMD & Get() { return NR==0 ? head : tail.template Get(); } + }; + + template + class MultiSIMD<2,T> : public AlignedAlloc> + { + SIMD v0, v1; + public: + MultiSIMD () = default; + MultiSIMD (const MultiSIMD & ) = default; + MultiSIMD (T v) : v0(v), v1(v) { ; } + MultiSIMD (SIMD _v0, SIMD _v1) : v0(_v0), v1(_v1) { ; } + + SIMD Head() const { return v0; } + SIMD Tail() const { return v1; } + SIMD & Head() { return v0; } + SIMD & Tail() { return v1; } + + template + SIMD Get() const { return NR==0 ? v0 : v1; } + template + SIMD & Get() { return NR==0 ? v0 : v1; } + }; + + template inline MultiSIMD operator+ (MultiSIMD a, MultiSIMD b) + { return MultiSIMD (a.Head()+b.Head(), a.Tail()+b.Tail()); } + template inline MultiSIMD operator+ (double a, MultiSIMD b) + { return MultiSIMD (a+b.Head(), a+b.Tail()); } + template inline MultiSIMD operator+ (MultiSIMD b, double a) + { return MultiSIMD (a+b.Head(), a+b.Tail()); } + + template inline MultiSIMD operator- (MultiSIMD a, MultiSIMD b) + { return MultiSIMD (a.Head()-b.Head(), a.Tail()-b.Tail()); } + template inline MultiSIMD operator- (double a, MultiSIMD b) + { return MultiSIMD (a-b.Head(), a-b.Tail()); } + template inline MultiSIMD operator- (MultiSIMD b, double a) + { return MultiSIMD (b.Head()-a, b.Tail()-a); } + template inline MultiSIMD operator- (MultiSIMD a) + { return MultiSIMD (-a.Head(), -a.Tail()); } + template inline MultiSIMD operator* (MultiSIMD a, MultiSIMD b) + { return MultiSIMD (a.Head()*b.Head(), a.Tail()*b.Tail()); } + template inline MultiSIMD operator/ (MultiSIMD a, MultiSIMD b) + { return MultiSIMD (a.Head()/b.Head(), a.Tail()/b.Tail()); } + template inline MultiSIMD operator* (double a, MultiSIMD b) + { return MultiSIMD ( a*b.Head(), a*b.Tail()); } + template inline MultiSIMD operator* (MultiSIMD b, double a) + { return MultiSIMD ( a*b.Head(), a*b.Tail()); } + + template inline MultiSIMD & operator+= (MultiSIMD & a, MultiSIMD b) + { a.Head()+=b.Head(); a.Tail()+=b.Tail(); return a; } + template inline MultiSIMD operator-= (MultiSIMD & a, double b) + { a.Head()-=b; a.Tail()-=b; return a; } + template inline MultiSIMD operator-= (MultiSIMD & a, MultiSIMD b) + { a.Head()-=b.Head(); a.Tail()-=b.Tail(); return a; } + template inline MultiSIMD & operator*= (MultiSIMD & a, MultiSIMD b) + { a.Head()*=b.Head(); a.Tail()*=b.Tail(); return a; } + template inline MultiSIMD & operator*= (MultiSIMD & a, double b) + { a.Head()*=b; a.Tail()*=b; return a; } + // inline MultiSIMD operator/= (MultiSIMD & a, MultiSIMD b) { return a.Data()/=b.Data(); } + + inline SIMD HVSum (SIMD a) { return a; } + template + inline SIMD HVSum (MultiSIMD a) { return a.Head() + HVSum(a.Tail()); } + + template inline double HSum (MultiSIMD a) { return HSum(HVSum(a)); } + template inline auto HSum (MultiSIMD a, MultiSIMD b) + { return HSum(HVSum(a), HVSum(b)); } + + template + std::ostream & operator<< (std::ostream & ost, MultiSIMD multi) + { + ost << multi.Head() << " " << multi.Tail(); + return ost; + } + + template + std::ostream & operator<< (std::ostream & ost, SIMD simd) + { + ost << simd[0]; + for (int i = 1; i < simd.Size(); i++) + ost << " " << simd[i]; + return ost; + } +} + +namespace netgen +{ + using namespace ngsimd; +} +#endif diff --git a/libsrc/include/CMakeLists.txt b/libsrc/include/CMakeLists.txt index dfb2c416..1bd3dc2f 100644 --- a/libsrc/include/CMakeLists.txt +++ b/libsrc/include/CMakeLists.txt @@ -3,7 +3,7 @@ install(FILES nginterface.h nginterface_v2.hpp DESTINATION ${INCDIR} COMPONENT n install(FILES acisgeom.hpp csg.hpp geometry2d.hpp gprim.hpp incopengl.hpp inctcl.hpp incvis.hpp linalg.hpp meshing.hpp myadt.hpp mydefs.hpp - mystdlib.h nginterface_v2_impl.hpp occgeom.hpp + mystdlib.h nginterface_v2_impl.hpp occgeom.hpp ngsimd.hpp opti.hpp parallel.hpp parallelinterface.hpp stlgeom.hpp visual.hpp DESTINATION ${INCDIR}/include COMPONENT netgen_devel ) diff --git a/libsrc/include/ngsimd.hpp b/libsrc/include/ngsimd.hpp new file mode 100644 index 00000000..933f002a --- /dev/null +++ b/libsrc/include/ngsimd.hpp @@ -0,0 +1 @@ +#include <../general/ngsimd.hpp> diff --git a/libsrc/interface/nginterface_v2.cpp b/libsrc/interface/nginterface_v2.cpp index d95f71e1..ff5acc79 100644 --- a/libsrc/interface/nginterface_v2.cpp +++ b/libsrc/interface/nginterface_v2.cpp @@ -646,29 +646,26 @@ namespace netgen -#ifdef __AVX__ -#include - template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<1,1> (int elnr, int npts, - const __m256d * xi, size_t sxi, - __m256d * x, size_t sx, - __m256d * dxdxi, size_t sdxdxi) const + const SIMD * xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { cout << "multi-eltrafo simd called, 1,1,simd" << endl; } template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<2,2> (int elnr, int npts, - const __m256d * xi, size_t sxi, - __m256d * x, size_t sx, - __m256d * dxdxi, size_t sdxdxi) const + const SIMD * xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { mesh->GetCurvedElements().CalcMultiPointSurfaceTransformation<2> (elnr, npts, - reinterpret_cast*> (xi), sxi, - reinterpret_cast*> (x), sx, - reinterpret_cast*> (dxdxi), sdxdxi); + xi, sxi, + x, sx, + dxdxi, sdxdxi); /* for (int i = 0; i < npts; i++) { @@ -695,15 +692,15 @@ namespace netgen template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<3,3> (int elnr, int npts, - const __m256d * xi, size_t sxi, - __m256d * x, size_t sx, - __m256d * dxdxi, size_t sdxdxi) const + const SIMD * xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { mesh->GetCurvedElements().CalcMultiPointElementTransformation (elnr, npts, - reinterpret_cast*> (xi), sxi, - reinterpret_cast*> (x), sx, - reinterpret_cast*> (dxdxi), sdxdxi); + xi, sxi, + x, sx, + dxdxi, sdxdxi); /* for (int i = 0; i < npts; i++) { @@ -730,27 +727,27 @@ namespace netgen template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<0,2> (int elnr, int npts, - const __m256d *xi, size_t sxi, - __m256d * x, size_t sx, - __m256d * dxdxi, size_t sdxdxi) const + const SIMD *xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { cout << "MultiElementtransformation<0,2> simd not implemented" << endl; } template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<0,1> (int elnr, int npts, - const __m256d * xi, size_t sxi, - __m256d * x, size_t sx, - __m256d * dxdxi, size_t sdxdxi) const + const SIMD * xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { cout << "multi-eltrafo simd called, 0,1,simd" << endl; } template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<1,3> (int elnr, int npts, - const __m256d * xi, size_t sxi, - __m256d * x, size_t sx, - __m256d * dxdxi, size_t sdxdxi) const + const SIMD * xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { double hxi[4][1]; double hx[4][3]; @@ -772,9 +769,9 @@ namespace netgen template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<1,2> (int elnr, int npts, - const __m256d * xi, size_t sxi, - __m256d * x, size_t sx, - __m256d * dxdxi, size_t sdxdxi) const + const SIMD * xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { for (int i = 0; i < npts; i++) { @@ -801,15 +798,15 @@ namespace netgen template<> DLL_HEADER void Ngx_Mesh :: MultiElementTransformation<2,3> (int elnr, int npts, - const __m256d * xi, size_t sxi, - __m256d * x, size_t sx, - __m256d * dxdxi, size_t sdxdxi) const + const SIMD * xi, size_t sxi, + SIMD * x, size_t sx, + SIMD * dxdxi, size_t sdxdxi) const { mesh->GetCurvedElements().CalcMultiPointSurfaceTransformation<3> (elnr, npts, - reinterpret_cast*> (xi), sxi, - reinterpret_cast*> (x), sx, - reinterpret_cast*> (dxdxi), sdxdxi); + xi, sxi, + x, sx, + dxdxi, sdxdxi); /* for (int i = 0; i < npts; i++) { @@ -834,7 +831,6 @@ namespace netgen */ } -#endif diff --git a/libsrc/visualization/soldata.hpp b/libsrc/visualization/soldata.hpp index 20028762..0ae4b453 100644 --- a/libsrc/visualization/soldata.hpp +++ b/libsrc/visualization/soldata.hpp @@ -92,17 +92,15 @@ namespace netgen return res; } -#ifdef __AVX__ virtual bool GetMultiSurfValue (size_t selnr, size_t facetnr, size_t npts, - const __m256d * xref, - const __m256d * x, - const __m256d * dxdxref, - __m256d * values) + const SIMD * xref, + const SIMD * x, + const SIMD * dxdxref, + SIMD * values) { - cerr << "GetMultiSurfVaue not overloaded" << endl; + cerr << "GetMultiSurfVaue not overloaded for SIMD" << endl; return false; } -#endif virtual bool GetSegmentValue (int segnr, double xref, double * values) { return false; } diff --git a/libsrc/visualization/vssolution.cpp b/libsrc/visualization/vssolution.cpp index 457b3e6f..a61036e6 100644 --- a/libsrc/visualization/vssolution.cpp +++ b/libsrc/visualization/vssolution.cpp @@ -1279,7 +1279,6 @@ namespace netgen Array mvalues(npt); int sol_comp = (sol && sol->draw_surface) ? sol->components : 0; -#ifdef __AVX__ Array> > simd_pref ( (npt+SIMD::Size()-1)/SIMD::Size() ); Array> > simd_points ( (npt+SIMD::Size()-1)/SIMD::Size() ); Array> > simd_dxdxis ( (npt+SIMD::Size()-1)/SIMD::Size() ); @@ -1287,7 +1286,6 @@ namespace netgen Array> simd_values( (npt+SIMD::Size()-1)/SIMD::Size() * sol_comp); -#endif // Array> glob_pnts; // Array> glob_nvs; @@ -1486,7 +1484,6 @@ namespace netgen NgProfiler::StartTimer(timerloops); size_t base_pi = 0; -#ifdef __AVX__ for (int iy = 0, ii = 0; iy <= n; iy++) for (int ix = 0; ix <= n-iy; ix++, ii++) pref[ii] = Point<2> (ix*invn, iy*invn); @@ -1496,10 +1493,9 @@ namespace netgen for (size_t i = 0; i < simd_npt; i++) { - simd_pref[i](0).SIMD_function ([&] (size_t j) { size_t ii = i*simd_size+j; return (ii < npt) ? pref[ii](0) : 0; }, std::true_type()); - simd_pref[i](1).SIMD_function ([&] (size_t j) { size_t ii = i*simd_size+j; return (ii < npt) ? pref[ii](1) : 0; }, std::true_type()); + simd_pref[i](0) = [&] (size_t j) { size_t ii = i*simd_size+j; return (ii < npt) ? pref[ii](0) : 0; }; + simd_pref[i](1) = [&] (size_t j) { size_t ii = i*simd_size+j; return (ii < npt) ? pref[ii](1) : 0; }; } -#endif Array ind_reftrig; for (int iy = 0, ii = 0; iy < n; iy++,ii++)