From 25ba2f7a541538d666ff148037d5748b5af213dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joachim=20Sch=C3=B6berl?= Date: Mon, 27 Nov 2017 10:19:43 +0100 Subject: [PATCH] SSE - vectorization --- libsrc/general/ngsimd.hpp | 120 +++++++++++++++++++++++++++- libsrc/interface/nginterface_v2.cpp | 2 +- libsrc/visualization/soldata.hpp | 6 +- 3 files changed, 123 insertions(+), 5 deletions(-) diff --git a/libsrc/general/ngsimd.hpp b/libsrc/general/ngsimd.hpp index f78a33ad..e013397a 100644 --- a/libsrc/general/ngsimd.hpp +++ b/libsrc/general/ngsimd.hpp @@ -48,11 +48,25 @@ NG_INLINE __m256d operator/= (__m256d &a, __m256d b) { return a = a/b; } namespace ngsimd { + + // MSVC does not define SSE. It's always present on 64bit cpus +#if (defined(_M_AMD64) || defined(_M_X64) || defined(__AVX__)) +#ifndef __SSE__ +#define __SSE__ +#endif +#ifndef __SSE2__ +#define __SSE2__ +#endif +#endif + + constexpr int GetDefaultSIMDSize() { #if defined __AVX512F__ return 8; #elif defined __AVX__ return 4; +#elif defined __SSE__ + return 2; #else return 1; #endif @@ -63,7 +77,10 @@ namespace ngsimd typedef __m512d tAVXd; #elif defined __AVX__ typedef __m256 tAVX; - typedef __m256d tAVXd; + typedef __m256d tAVXd; +#elif defined __SSE__ + typedef __m128 tAVX; + typedef __m128d tAVXd; #endif @@ -270,6 +287,107 @@ using std::fabs; return std::make_tuple(sd1.Data(), sd2.Data(), sd3.Data(), sd4.Data()); } + +///////////////////////////////////////////////////////////////////////////// +// SSE - Simd width 2 +///////////////////////////////////////////////////////////////////////////// +#ifdef __SSE__ + template<> + class alignas(16) SIMD // : public AlignedAlloc> + { + __m128d data; + + public: + static constexpr int Size() { return 2; } + SIMD () = default; + SIMD (const SIMD &) = default; + SIMD & operator= (const SIMD &) = default; + + SIMD (__m128d adata) + : data(adata) + { ; } + + // only called if T has a call operator of appropriate type + template>::value, int>::type = 0> + SIMD (const T & func) + { + data = _mm_set_pd(func(3), func(2), func(1), func(0)); + } + + // only called if T is arithmetic (integral or floating point types) + template::value, int>::type = 0> + SIMD (const T & val) + { + data = _mm_set1_pd(val); + } + + SIMD (double const * p) + { + data = _mm_loadu_pd(p); + } + + NG_INLINE operator __m128d() const { return data; } + NG_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; } + NG_INLINE double& operator[] (int i) { return ((double*)(&data))[i]; } + NG_INLINE __m128d Data() const { return data; } + NG_INLINE __m128d & Data() { return data; } + + // NG_INLINE operator std::tuple () + // { return std::tuple((*this)[0], (*this)[1], (*this)[2], (*this)[3]); } + + + NG_INLINE SIMD &operator+= (SIMD b) { data+=b.Data(); return *this; } + NG_INLINE SIMD &operator-= (SIMD b) { data-=b.Data(); return *this; } + NG_INLINE SIMD &operator*= (SIMD b) { data*=b.Data(); return *this; } + NG_INLINE SIMD &operator/= (SIMD b) { data/=b.Data(); return *this; } + + }; + + NG_INLINE SIMD operator+ (SIMD a, SIMD b) { return a.Data()+b.Data(); } + NG_INLINE SIMD operator- (SIMD a, SIMD b) { return a.Data()-b.Data(); } + NG_INLINE SIMD operator- (SIMD a) { return -a.Data(); } + NG_INLINE SIMD operator* (SIMD a, SIMD b) { return a.Data()*b.Data(); } + NG_INLINE SIMD operator/ (SIMD a, SIMD b) { return a.Data()/b.Data(); } + + /* + NG_INLINE SIMD sqrt (SIMD a) { return _mm256_sqrt_pd(a.Data()); } + NG_INLINE SIMD floor (SIMD a) { return _mm256_floor_pd(a.Data()); } + NG_INLINE SIMD ceil (SIMD a) { return _mm256_ceil_pd(a.Data()); } + NG_INLINE SIMD fabs (SIMD a) { return _mm256_max_pd(a.Data(), -a.Data()); } + NG_INLINE SIMD L2Norm2 (SIMD a) { return a.Data()*a.Data(); } + NG_INLINE SIMD Trans (SIMD a) { return a; } + NG_INLINE SIMD IfPos (SIMD a, SIMD b, SIMD c) + { + auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_GT_OS); + return _mm256_blendv_pd(c.Data(), b.Data(), cp); + } + + NG_INLINE double HSum (SIMD sd) + { + __m128d hv = _mm_add_pd (_mm256_extractf128_pd(sd.Data(),0), _mm256_extractf128_pd(sd.Data(),1)); + return _mm_cvtsd_f64 (_mm_hadd_pd (hv, hv)); + } + + NG_INLINE auto HSum (SIMD sd1, SIMD sd2) + { + __m256d hv = _mm256_hadd_pd(sd1.Data(), sd2.Data()); + __m128d hv2 = _mm_add_pd (_mm256_extractf128_pd(hv,0), _mm256_extractf128_pd(hv,1)); + return std::make_tuple(_mm_cvtsd_f64 (hv2), _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3))); + } + + NG_INLINE auto HSum (SIMD v1, SIMD v2, SIMD v3, SIMD v4) + { + __m256d hsum1 = _mm256_hadd_pd (v1.Data(), v2.Data()); + __m256d hsum2 = _mm256_hadd_pd (v3.Data(), v4.Data()); + __m256d hsum = _mm256_add_pd (_mm256_permute2f128_pd (hsum1, hsum2, 1+2*16), + _mm256_blend_pd (hsum1, hsum2, 12)); + return SIMD(hsum); + } + */ +#endif // __SSE__ + + + ///////////////////////////////////////////////////////////////////////////// // AVX - Simd width 4 ///////////////////////////////////////////////////////////////////////////// diff --git a/libsrc/interface/nginterface_v2.cpp b/libsrc/interface/nginterface_v2.cpp index 64492434..51161796 100644 --- a/libsrc/interface/nginterface_v2.cpp +++ b/libsrc/interface/nginterface_v2.cpp @@ -671,7 +671,7 @@ namespace netgen -#ifdef __AVX__ +#ifdef __SSE__ #include template<> DLL_HEADER void Ngx_Mesh :: diff --git a/libsrc/visualization/soldata.hpp b/libsrc/visualization/soldata.hpp index d852de9b..56df5d65 100644 --- a/libsrc/visualization/soldata.hpp +++ b/libsrc/visualization/soldata.hpp @@ -6,7 +6,7 @@ namespace netgen { using namespace std; - + /* #if defined __AVX512F__ typedef __m512 tAVX; typedef __m512d tAVXd; @@ -14,7 +14,7 @@ namespace netgen typedef __m256 tAVX; typedef __m256d tAVXd; #endif - + */ class SolutionData { @@ -101,7 +101,7 @@ namespace netgen return res; } -#ifdef __AVX__ +#ifdef __SSE__ virtual bool GetMultiSurfValue (size_t selnr, size_t facetnr, size_t npts, const tAVXd * xref, const tAVXd * x,