mirror of
https://github.com/NGSolve/netgen.git
synced 2025-01-24 03:40:34 +05:00
Merge branch 'refactor_simd' into 'master'
Move SIMD headers from ngsolve to ngcore See merge request jschoeberl/netgen!358
This commit is contained in:
commit
f626255ceb
@ -73,6 +73,7 @@ install(FILES ngcore.hpp archive.hpp type_traits.hpp version.hpp ngcore_api.hpp
|
||||
exception.hpp symboltable.hpp paje_trace.hpp utils.hpp profiler.hpp mpi_wrapper.hpp
|
||||
array.hpp taskmanager.hpp concurrentqueue.h localheap.hpp python_ngcore.hpp flags.hpp
|
||||
xbool.hpp signal.hpp bitarray.hpp table.hpp hashtable.hpp ranges.hpp
|
||||
simd.hpp simd_avx.hpp simd_avx512.hpp simd_generic.hpp simd_sse.hpp
|
||||
DESTINATION ${NG_INSTALL_DIR_INCLUDE}/core COMPONENT netgen_devel)
|
||||
|
||||
if(ENABLE_CPP_CORE_GUIDELINES_CHECK)
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include "mpi_wrapper.hpp"
|
||||
#include "profiler.hpp"
|
||||
#include "signal.hpp"
|
||||
#include "simd.hpp"
|
||||
#include "symboltable.hpp"
|
||||
#include "taskmanager.hpp"
|
||||
#include "version.hpp"
|
||||
|
69
libsrc/core/simd.hpp
Normal file
69
libsrc/core/simd.hpp
Normal file
@ -0,0 +1,69 @@
|
||||
#ifndef NETGEN_CORE_SIMD_HPP
|
||||
#define NETGEN_CORE_SIMD_HPP
|
||||
|
||||
/**************************************************************************/
|
||||
/* File: simd.hpp */
|
||||
/* Author: Joachim Schoeberl, Matthias Hochsteger */
|
||||
/* Date: 25. Mar. 16 */
|
||||
/**************************************************************************/
|
||||
|
||||
#include "ngcore_api.hpp"
|
||||
|
||||
#include "simd_generic.hpp"
|
||||
|
||||
#if (defined(_M_AMD64) || defined(_M_X64) || defined(__SSE__))
|
||||
#ifndef __SSE__
|
||||
#define __SSE__
|
||||
#endif
|
||||
#include "simd_sse.hpp"
|
||||
#endif
|
||||
|
||||
#ifdef __AVX__
|
||||
#include "simd_avx.hpp"
|
||||
#endif
|
||||
|
||||
#ifdef __AVX512F__
|
||||
#include "simd_avx512.hpp"
|
||||
#endif
|
||||
|
||||
namespace ngcore
|
||||
{
|
||||
NETGEN_INLINE auto HSum (SIMD<double,2> v1, SIMD<double,2> v2, SIMD<double,2> v3, SIMD<double,2> v4)
|
||||
{
|
||||
SIMD<double,2> hsum1 = my_mm_hadd_pd (v1.Data(), v2.Data());
|
||||
SIMD<double,2> hsum2 = my_mm_hadd_pd (v3.Data(), v4.Data());
|
||||
return SIMD<double,4> (hsum1, hsum2);
|
||||
}
|
||||
|
||||
|
||||
NETGEN_INLINE void SIMDTranspose (SIMD<double,4> a1, SIMD<double,4> a2, SIMD <double,4> a3, SIMD<double,4> a4,
|
||||
SIMD<double,4> & b1, SIMD<double,4> & b2, SIMD<double,4> & b3, SIMD<double,4> & b4)
|
||||
{
|
||||
SIMD<double,4> h1,h2,h3,h4;
|
||||
std::tie(h1,h2) = Unpack(a1,a2);
|
||||
std::tie(h3,h4) = Unpack(a3,a4);
|
||||
b1 = SIMD<double,4> (h1.Lo(), h3.Lo());
|
||||
b2 = SIMD<double,4> (h2.Lo(), h4.Lo());
|
||||
b3 = SIMD<double,4> (h1.Hi(), h3.Hi());
|
||||
b4 = SIMD<double,4> (h2.Hi(), h4.Hi());
|
||||
}
|
||||
|
||||
template<int N>
|
||||
NETGEN_INLINE auto HSum (SIMD<double,N> s1, SIMD<double,N> s2)
|
||||
{
|
||||
return SIMD<double,2>(HSum(s1), HSum(s2));
|
||||
}
|
||||
|
||||
template<int N>
|
||||
NETGEN_INLINE auto HSum (SIMD<double,N> s1, SIMD<double,N> s2, SIMD<double,N> s3, SIMD<double,N> s4 )
|
||||
{
|
||||
return SIMD<double,4>(HSum(s1), HSum(s2), HSum(s3), HSum(s4));
|
||||
}
|
||||
|
||||
NETGEN_INLINE auto GetMaskFromBits( unsigned int i )
|
||||
{
|
||||
return SIMD<mask64>::GetMaskFromBits(i);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // NETGEN_CORE_SIMD_HPP
|
256
libsrc/core/simd_avx.hpp
Normal file
256
libsrc/core/simd_avx.hpp
Normal file
@ -0,0 +1,256 @@
|
||||
#ifndef NETGEN_CORE_SIMD_AVX_HPP
|
||||
#define NETGEN_CORE_SIMD_AVX_HPP
|
||||
|
||||
/**************************************************************************/
|
||||
/* File: simd_avx.hpp */
|
||||
/* Author: Joachim Schoeberl, Matthias Hochsteger */
|
||||
/* Date: 25. Mar. 16 */
|
||||
/**************************************************************************/
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
namespace ngcore
|
||||
{
|
||||
#if defined(__AVX2__)
|
||||
NETGEN_INLINE __m256i my_mm256_cmpgt_epi64 (__m256i a, __m256i b)
|
||||
{
|
||||
return _mm256_cmpgt_epi64 (a,b);
|
||||
}
|
||||
#else
|
||||
NETGEN_INLINE __m256i my_mm256_cmpgt_epi64 (__m256i a, __m256i b)
|
||||
{
|
||||
__m128i rlo = _mm_cmpgt_epi64(_mm256_extractf128_si256(a, 0),
|
||||
_mm256_extractf128_si256(b, 0));
|
||||
__m128i rhi = _mm_cmpgt_epi64(_mm256_extractf128_si256(a, 1),
|
||||
_mm256_extractf128_si256(b, 1));
|
||||
return _mm256_insertf128_si256 (_mm256_castsi128_si256(rlo), rhi, 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
template <>
|
||||
class SIMD<mask64,4>
|
||||
{
|
||||
__m256i mask;
|
||||
public:
|
||||
SIMD (size_t i)
|
||||
: mask(my_mm256_cmpgt_epi64(_mm256_set1_epi64x(i),
|
||||
_mm256_set_epi64x(3, 2, 1, 0)))
|
||||
{ ; }
|
||||
SIMD (__m256i _mask) : mask(_mask) { ; }
|
||||
SIMD (__m256d _mask) : mask(_mm256_castpd_si256(_mask)) { ; }
|
||||
__m256i Data() const { return mask; }
|
||||
static constexpr int Size() { return 4; }
|
||||
static SIMD<mask64, 4> GetMaskFromBits (unsigned int i);
|
||||
};
|
||||
|
||||
static SIMD<mask64, 4> masks_from_4bits[16] = {
|
||||
_mm256_set_epi64x (0,0,0,0), _mm256_set_epi64x (0,0,0,-1),
|
||||
_mm256_set_epi64x (0,0,-1,0), _mm256_set_epi64x (0,0,-1,-1),
|
||||
_mm256_set_epi64x (0,-1,0,0), _mm256_set_epi64x (0,-1,0,-1),
|
||||
_mm256_set_epi64x (0,-1,-1,0), _mm256_set_epi64x (0,-1,-1,-1),
|
||||
_mm256_set_epi64x (-1,0,0,0), _mm256_set_epi64x (-1,0,0,-1),
|
||||
_mm256_set_epi64x (-1,0,-1,0), _mm256_set_epi64x (-1,0,-1,-1),
|
||||
_mm256_set_epi64x (-1,-1,0,0), _mm256_set_epi64x (-1,-1,0,-1),
|
||||
_mm256_set_epi64x (-1,-1,-1,0), _mm256_set_epi64x (-1,-1,-1,-1)
|
||||
};
|
||||
|
||||
NETGEN_INLINE SIMD<mask64, 4> SIMD<mask64, 4> :: GetMaskFromBits (unsigned int i)
|
||||
{
|
||||
return masks_from_4bits[i & 15];
|
||||
}
|
||||
|
||||
template<>
|
||||
class alignas(32) SIMD<int64_t,4>
|
||||
{
|
||||
__m256i data;
|
||||
|
||||
public:
|
||||
static constexpr int Size() { return 4; }
|
||||
SIMD () {}
|
||||
SIMD (const SIMD &) = default;
|
||||
SIMD & operator= (const SIMD &) = default;
|
||||
|
||||
SIMD (int64_t val) { data = _mm256_set1_epi64x(val); }
|
||||
SIMD (int64_t v0, int64_t v1, int64_t v2, int64_t v3) { data = _mm256_set_epi64x(v3,v2,v1,v0); }
|
||||
SIMD (std::array<int64_t,4> a)
|
||||
: data{_mm256_set_epi64x(a[3],a[2],a[1],a[0])}
|
||||
{}
|
||||
SIMD (SIMD<int64_t,2> v0, SIMD<int64_t,2> v1)
|
||||
: data(_mm256_set_m128i(v0.Data(),v1.Data()))
|
||||
{}
|
||||
SIMD (__m256i _data) { data = _data; }
|
||||
|
||||
NETGEN_INLINE auto operator[] (int i) const { return ((int64_t*)(&data))[i]; }
|
||||
NETGEN_INLINE __m256i Data() const { return data; }
|
||||
NETGEN_INLINE __m256i & Data() { return data; }
|
||||
|
||||
SIMD<int64_t,2> Lo() const { return _mm256_extractf128_si256(data, 0); }
|
||||
SIMD<int64_t,2> Hi() const { return _mm256_extractf128_si256(data, 1); }
|
||||
static SIMD FirstInt(int n0=0) { return { n0+0, n0+1, n0+2, n0+3 }; }
|
||||
};
|
||||
|
||||
|
||||
NETGEN_INLINE SIMD<int64_t,4> operator-(SIMD<int64_t,4> a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a.Data()); }
|
||||
|
||||
#ifdef __AVX2__
|
||||
NETGEN_INLINE SIMD<int64_t,4> operator+ (SIMD<int64_t,4> a, SIMD<int64_t,4> b) { return _mm256_add_epi64(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<int64_t,4> operator- (SIMD<int64_t,4> a, SIMD<int64_t,4> b) { return _mm256_sub_epi64(a.Data(),b.Data()); }
|
||||
#endif // __AVX2__
|
||||
|
||||
template<>
|
||||
class alignas(32) SIMD<double,4>
|
||||
{
|
||||
__m256d data;
|
||||
|
||||
public:
|
||||
static constexpr int Size() { return 4; }
|
||||
SIMD () {}
|
||||
SIMD (const SIMD &) = default;
|
||||
SIMD & operator= (const SIMD &) = default;
|
||||
|
||||
SIMD (double val) { data = _mm256_set1_pd(val); }
|
||||
SIMD (int val) { data = _mm256_set1_pd(val); }
|
||||
SIMD (size_t val) { data = _mm256_set1_pd(val); }
|
||||
SIMD (double v0, double v1, double v2, double v3) { data = _mm256_set_pd(v3,v2,v1,v0); }
|
||||
SIMD (SIMD<double,2> v0, SIMD<double,2> v1) : SIMD(v0[0], v0[1], v1[0], v1[1]) { ; }
|
||||
SIMD (double const * p) { data = _mm256_loadu_pd(p); }
|
||||
SIMD (double const * p, SIMD<mask64,4> mask) { data = _mm256_maskload_pd(p, mask.Data()); }
|
||||
SIMD (__m256d _data) { data = _data; }
|
||||
SIMD (std::array<double,4> a)
|
||||
: data{_mm256_set_pd(a[3],a[2],a[1],a[0])}
|
||||
{}
|
||||
|
||||
void Store (double * p) { _mm256_storeu_pd(p, data); }
|
||||
void Store (double * p, SIMD<mask64,4> mask) { _mm256_maskstore_pd(p, mask.Data(), data); }
|
||||
|
||||
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
|
||||
SIMD (const T & func)
|
||||
{
|
||||
data = _mm256_set_pd(func(3), func(2), func(1), func(0));
|
||||
}
|
||||
|
||||
NETGEN_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
|
||||
NETGEN_INLINE double & operator[] (int i) { return ((double*)(&data))[i]; }
|
||||
// [[deprecated("don't write to individual elments of SIMD")]]
|
||||
// NETGEN_INLINE double & operator[] (int i) { return ((double*)(&data))[i]; }
|
||||
template <int I>
|
||||
double Get() const { return ((double*)(&data))[I]; }
|
||||
NETGEN_INLINE __m256d Data() const { return data; }
|
||||
NETGEN_INLINE __m256d & Data() { return data; }
|
||||
|
||||
SIMD<double,2> Lo() const { return _mm256_extractf128_pd(data, 0); }
|
||||
SIMD<double,2> Hi() const { return _mm256_extractf128_pd(data, 1); }
|
||||
|
||||
operator std::tuple<double&,double&,double&,double&> ()
|
||||
{ return std::tuple<double&,double&,double&,double&>((*this)[0], (*this)[1], (*this)[2], (*this)[3]); }
|
||||
};
|
||||
|
||||
NETGEN_INLINE auto Unpack (SIMD<double,4> a, SIMD<double,4> b)
|
||||
{
|
||||
return std::make_tuple(SIMD<double,4>(_mm256_unpacklo_pd(a.Data(),b.Data())),
|
||||
SIMD<double,4>(_mm256_unpackhi_pd(a.Data(),b.Data())));
|
||||
}
|
||||
|
||||
NETGEN_INLINE SIMD<double,4> operator- (SIMD<double,4> a) { return _mm256_xor_pd(a.Data(), _mm256_set1_pd(-0.0)); }
|
||||
NETGEN_INLINE SIMD<double,4> operator+ (SIMD<double,4> a, SIMD<double,4> b) { return _mm256_add_pd(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<double,4> operator- (SIMD<double,4> a, SIMD<double,4> b) { return _mm256_sub_pd(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<double,4> operator* (SIMD<double,4> a, SIMD<double,4> b) { return _mm256_mul_pd(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<double,4> operator/ (SIMD<double,4> a, SIMD<double,4> b) { return _mm256_div_pd(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<double,4> operator* (double a, SIMD<double,4> b) { return _mm256_set1_pd(a)*b.Data(); }
|
||||
NETGEN_INLINE SIMD<double,4> operator* (SIMD<double,4> b, double a) { return _mm256_set1_pd(a)*b.Data(); }
|
||||
|
||||
NETGEN_INLINE SIMD<double,4> sqrt (SIMD<double,4> a) { return _mm256_sqrt_pd(a.Data()); }
|
||||
NETGEN_INLINE SIMD<double,4> floor (SIMD<double,4> a) { return _mm256_floor_pd(a.Data()); }
|
||||
NETGEN_INLINE SIMD<double,4> ceil (SIMD<double,4> a) { return _mm256_ceil_pd(a.Data()); }
|
||||
NETGEN_INLINE SIMD<double,4> fabs (SIMD<double,4> a) { return _mm256_max_pd(a.Data(), (-a).Data()); }
|
||||
|
||||
NETGEN_INLINE SIMD<mask64,4> operator<= (SIMD<double,4> a , SIMD<double,4> b)
|
||||
{ return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_LE_OQ); }
|
||||
NETGEN_INLINE SIMD<mask64,4> operator< (SIMD<double,4> a , SIMD<double,4> b)
|
||||
{ return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_LT_OQ); }
|
||||
NETGEN_INLINE SIMD<mask64,4> operator>= (SIMD<double,4> a , SIMD<double,4> b)
|
||||
{ return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_GE_OQ); }
|
||||
NETGEN_INLINE SIMD<mask64,4> operator> (SIMD<double,4> a , SIMD<double,4> b)
|
||||
{ return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_GT_OQ); }
|
||||
NETGEN_INLINE SIMD<mask64,4> operator== (SIMD<double,4> a , SIMD<double,4> b)
|
||||
{ return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_EQ_OQ); }
|
||||
NETGEN_INLINE SIMD<mask64,4> operator!= (SIMD<double,4> a , SIMD<double,4> b)
|
||||
{ return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_NEQ_OQ); }
|
||||
|
||||
NETGEN_INLINE SIMD<mask64,4> operator<= (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
|
||||
{ return _mm256_xor_si256(_mm256_cmpgt_epi64(a.Data(),b.Data()),_mm256_set1_epi32(-1)); }
|
||||
NETGEN_INLINE SIMD<mask64,4> operator< (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
|
||||
{ return my_mm256_cmpgt_epi64(b.Data(),a.Data()); }
|
||||
NETGEN_INLINE SIMD<mask64,4> operator>= (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
|
||||
{ return _mm256_xor_si256(_mm256_cmpgt_epi64(b.Data(),a.Data()),_mm256_set1_epi32(-1)); }
|
||||
NETGEN_INLINE SIMD<mask64,4> operator> (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
|
||||
{ return my_mm256_cmpgt_epi64(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<mask64,4> operator== (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
|
||||
{ return _mm256_cmpeq_epi64(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<mask64,4> operator!= (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
|
||||
{ return _mm256_xor_si256(_mm256_cmpeq_epi64(a.Data(),b.Data()),_mm256_set1_epi32(-1)); }
|
||||
|
||||
#ifdef __AVX2__
|
||||
NETGEN_INLINE SIMD<mask64,4> operator&& (SIMD<mask64,4> a, SIMD<mask64,4> b)
|
||||
{ return _mm256_and_si256 (a.Data(), b.Data()); }
|
||||
NETGEN_INLINE SIMD<mask64,4> operator|| (SIMD<mask64,4> a, SIMD<mask64,4> b)
|
||||
{ return _mm256_or_si256 (a.Data(), b.Data()); }
|
||||
NETGEN_INLINE SIMD<mask64,4> operator! (SIMD<mask64,4> a)
|
||||
{ return _mm256_xor_si256 (a.Data(), _mm256_cmpeq_epi64(a.Data(),a.Data())); }
|
||||
#else //AVX2 is a superset of AVX. Without it, it is necessary to reinterpret the types
|
||||
NETGEN_INLINE SIMD<mask64,4> operator&& (SIMD<mask64,4> a, SIMD<mask64,4> b)
|
||||
{ return _mm256_castpd_si256(_mm256_and_pd (_mm256_castsi256_pd(a.Data()),_mm256_castsi256_pd( b.Data()))); }
|
||||
NETGEN_INLINE SIMD<mask64,4> operator|| (SIMD<mask64,4> a, SIMD<mask64,4> b)
|
||||
{ return _mm256_castpd_si256(_mm256_or_pd (_mm256_castsi256_pd(a.Data()), _mm256_castsi256_pd(b.Data()))); }
|
||||
NETGEN_INLINE SIMD<mask64,4> operator! (SIMD<mask64,4> a)
|
||||
{ return _mm256_castpd_si256(_mm256_xor_pd (_mm256_castsi256_pd(a.Data()),_mm256_castsi256_pd( _mm256_cmpeq_epi64(a.Data(),a.Data())))); }
|
||||
#endif
|
||||
NETGEN_INLINE SIMD<double,4> If (SIMD<mask64,4> a, SIMD<double,4> b, SIMD<double,4> c)
|
||||
{ return _mm256_blendv_pd(c.Data(), b.Data(), _mm256_castsi256_pd(a.Data())); }
|
||||
|
||||
NETGEN_INLINE SIMD<double,4> IfPos (SIMD<double,4> a, SIMD<double,4> b, SIMD<double,4> c)
|
||||
{
|
||||
auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_GT_OS);
|
||||
return _mm256_blendv_pd(c.Data(), b.Data(), cp);
|
||||
}
|
||||
|
||||
NETGEN_INLINE SIMD<double,4> IfZero (SIMD<double,4> a, SIMD<double,4> b, SIMD<double,4> c)
|
||||
{
|
||||
auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_EQ_OS);
|
||||
return _mm256_blendv_pd(c.Data(), b.Data(), cp);
|
||||
}
|
||||
|
||||
NETGEN_INLINE double HSum (SIMD<double,4> sd)
|
||||
{
|
||||
// __m128d hv = _mm_add_pd (_mm256_extractf128_pd(sd.Data(),0), _mm256_extractf128_pd(sd.Data(),1));
|
||||
__m128d hv = (sd.Lo()+sd.Hi()).Data();
|
||||
return _mm_cvtsd_f64 (_mm_hadd_pd (hv, hv));
|
||||
}
|
||||
|
||||
NETGEN_INLINE auto HSum (SIMD<double,4> sd1, SIMD<double,4> sd2)
|
||||
{
|
||||
__m256d hv = _mm256_hadd_pd(sd1.Data(), sd2.Data());
|
||||
__m128d hv2 = _mm_add_pd (_mm256_extractf128_pd(hv,0), _mm256_extractf128_pd(hv,1));
|
||||
return SIMD<double,2>(_mm_cvtsd_f64 (hv2), _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3)));
|
||||
}
|
||||
|
||||
NETGEN_INLINE auto HSum (SIMD<double,4> v1, SIMD<double,4> v2, SIMD<double,4> v3, SIMD<double,4> v4)
|
||||
{
|
||||
__m256d hsum1 = _mm256_hadd_pd (v1.Data(), v2.Data());
|
||||
__m256d hsum2 = _mm256_hadd_pd (v3.Data(), v4.Data());
|
||||
SIMD<double,4> hsum = _mm256_add_pd (_mm256_permute2f128_pd (hsum1, hsum2, 1+2*16),
|
||||
_mm256_blend_pd (hsum1, hsum2, 12));
|
||||
return hsum;
|
||||
// return make_tuple(hsum[0], hsum[1], hsum[2], hsum[3]);
|
||||
}
|
||||
|
||||
|
||||
NETGEN_INLINE SIMD<int64_t,4> If (SIMD<mask64,4> a, SIMD<int64_t,4> b, SIMD<int64_t,4> c)
|
||||
{ return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(c.Data()), _mm256_castsi256_pd(b.Data()),
|
||||
_mm256_castsi256_pd(a.Data()))); }
|
||||
|
||||
}
|
||||
|
||||
#endif // NETGEN_CORE_SIMD_AVX_HPP
|
||||
|
239
libsrc/core/simd_avx512.hpp
Normal file
239
libsrc/core/simd_avx512.hpp
Normal file
@ -0,0 +1,239 @@
|
||||
#ifndef NETGEN_CORE_SIMD_AVX512_HPP
|
||||
#define NETGEN_CORE_SIMD_AVX512_HPP
|
||||
|
||||
/**************************************************************************/
|
||||
/* File: simd_avx512.hpp */
|
||||
/* Author: Joachim Schoeberl, Matthias Hochsteger */
|
||||
/* Date: 25. Mar. 16 */
|
||||
/**************************************************************************/
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
namespace ngcore
|
||||
{
|
||||
|
||||
template <>
|
||||
class SIMD<mask64,8>
|
||||
{
|
||||
__mmask8 mask;
|
||||
public:
|
||||
SIMD (size_t i)
|
||||
: mask(_mm512_cmpgt_epi64_mask(_mm512_set1_epi64(i),
|
||||
_mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0)))
|
||||
{ ; }
|
||||
SIMD (int i)
|
||||
: mask(_mm512_cmpgt_epi64_mask(_mm512_set1_epi64(i),
|
||||
_mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0)))
|
||||
{ ; }
|
||||
SIMD (int64_t i)
|
||||
: mask(_mm512_cmpgt_epi64_mask(_mm512_set1_epi64(i),
|
||||
_mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0)))
|
||||
{ ; }
|
||||
SIMD (__mmask8 _mask) : mask(_mask) { ; }
|
||||
__mmask8 Data() const { return mask; }
|
||||
static constexpr int Size() { return 8; }
|
||||
static NETGEN_INLINE SIMD<mask64, 8> GetMaskFromBits (unsigned int i)
|
||||
{
|
||||
return SIMD<mask64, 8>(__mmask8(i));
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
class alignas(64) SIMD<int64_t,8>
|
||||
{
|
||||
__m512i data;
|
||||
|
||||
public:
|
||||
static constexpr int Size() { return 8; }
|
||||
SIMD () {}
|
||||
SIMD (const SIMD &) = default;
|
||||
SIMD & operator= (const SIMD &) = default;
|
||||
|
||||
SIMD (int64_t val) { data = _mm512_set1_epi64(val); }
|
||||
SIMD (int64_t v0, int64_t v1, int64_t v2, int64_t v3, int64_t v4, int64_t v5, int64_t v6, int64_t v7) { data = _mm512_set_epi64(v7,v6,v5,v4,v3,v2,v1,v0); }
|
||||
SIMD (__m512i _data) { data = _data; }
|
||||
|
||||
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<int64_t(int)>>::value, int>::type = 0>
|
||||
SIMD (const T & func)
|
||||
{
|
||||
data = _mm512_set_epi64(func(7), func(6), func(5), func(4), func(3), func(2), func(1), func(0));
|
||||
}
|
||||
|
||||
|
||||
NETGEN_INLINE auto operator[] (int i) const { return ((int64_t*)(&data))[i]; }
|
||||
NETGEN_INLINE __m512i Data() const { return data; }
|
||||
NETGEN_INLINE __m512i & Data() { return data; }
|
||||
static SIMD FirstInt() { return { 0, 1, 2, 3, 4, 5, 6, 7 }; }
|
||||
};
|
||||
|
||||
NETGEN_INLINE SIMD<int64_t,8> operator-(SIMD<int64_t,8> a) { return _mm512_sub_epi64(_mm512_setzero_si512(), a.Data()); }
|
||||
|
||||
NETGEN_INLINE SIMD<int64_t,8> operator+ (SIMD<int64_t,8> a, SIMD<int64_t,8> b) { return _mm512_add_epi64(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<int64_t,8> operator- (SIMD<int64_t,8> a, SIMD<int64_t,8> b) { return _mm512_sub_epi64(a.Data(),b.Data()); }
|
||||
|
||||
NETGEN_INLINE SIMD<int64_t,8> If (SIMD<mask64,8> a, SIMD<int64_t,8> b, SIMD<int64_t,8> c)
|
||||
{ return _mm512_mask_blend_epi64(a.Data(), c.Data(), b.Data()); }
|
||||
|
||||
|
||||
template<>
|
||||
class alignas(64) SIMD<double,8>
|
||||
{
|
||||
__m512d data;
|
||||
public:
|
||||
static constexpr int Size() { return 8; }
|
||||
SIMD () {}
|
||||
SIMD (const SIMD &) = default;
|
||||
SIMD & operator= (const SIMD &) = default;
|
||||
|
||||
SIMD (double val) { data = _mm512_set1_pd(val); }
|
||||
SIMD (int val) { data = _mm512_set1_pd(val); }
|
||||
SIMD (size_t val) { data = _mm512_set1_pd(val); }
|
||||
SIMD (double const * p) { data = _mm512_loadu_pd(p); }
|
||||
SIMD (double const * p, SIMD<mask64,8> mask)
|
||||
{ data = _mm512_mask_loadu_pd(_mm512_setzero_pd(), mask.Data(), p); }
|
||||
SIMD (__m512d _data) { data = _data; }
|
||||
|
||||
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
|
||||
SIMD (const T & func)
|
||||
{
|
||||
data = _mm512_set_pd(func(7), func(6), func(5), func(4), func(3), func(2), func(1), func(0));
|
||||
}
|
||||
|
||||
void Store (double * p) { _mm512_storeu_pd(p, data); }
|
||||
void Store (double * p, SIMD<mask64,8> mask) { _mm512_mask_storeu_pd(p, mask.Data(), data); }
|
||||
|
||||
template <typename Function>
|
||||
void SIMD_function (const Function & func, std::true_type)
|
||||
{
|
||||
data = (__m512){ func(7), func(6), func(5), func(4),
|
||||
func(3), func(2), func(1), func(0) };
|
||||
}
|
||||
|
||||
// not a function
|
||||
void SIMD_function (double const * p, std::false_type)
|
||||
{
|
||||
data = _mm512_loadu_pd(p);
|
||||
}
|
||||
|
||||
void SIMD_function (double val, std::false_type)
|
||||
{
|
||||
data = _mm512_set1_pd(val);
|
||||
}
|
||||
|
||||
void SIMD_function (__m512d _data, std::false_type)
|
||||
{
|
||||
data = _data;
|
||||
}
|
||||
|
||||
NETGEN_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
|
||||
NETGEN_INLINE __m512d Data() const { return data; }
|
||||
NETGEN_INLINE __m512d & Data() { return data; }
|
||||
|
||||
};
|
||||
|
||||
NETGEN_INLINE SIMD<double,8> operator- (SIMD<double,8> a) { return -a.Data(); }
|
||||
NETGEN_INLINE SIMD<double,8> operator+ (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_add_pd(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<double,8> operator- (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_sub_pd(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<double,8> operator* (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_mul_pd(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<double,8> operator/ (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_div_pd(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<double,8> operator* (double a, SIMD<double,8> b) { return _mm512_set1_pd(a)*b.Data(); }
|
||||
NETGEN_INLINE SIMD<double,8> operator* (SIMD<double,8> b, double a) { return _mm512_set1_pd(a)*b.Data(); }
|
||||
|
||||
NETGEN_INLINE SIMD<double,8> sqrt (SIMD<double,8> a) { return _mm512_sqrt_pd(a.Data()); }
|
||||
NETGEN_INLINE SIMD<double,8> floor (SIMD<double,8> a) { return _mm512_floor_pd(a.Data()); }
|
||||
NETGEN_INLINE SIMD<double,8> ceil (SIMD<double,8> a) { return _mm512_ceil_pd(a.Data()); }
|
||||
NETGEN_INLINE SIMD<double,8> fabs (SIMD<double,8> a) { return _mm512_max_pd(a.Data(), -a.Data()); }
|
||||
|
||||
NETGEN_INLINE SIMD<mask64,8> operator<= (SIMD<double,8> a , SIMD<double,8> b)
|
||||
{ return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_LE_OQ); }
|
||||
NETGEN_INLINE SIMD<mask64,8> operator< (SIMD<double,8> a , SIMD<double,8> b)
|
||||
{ return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_LT_OQ); }
|
||||
NETGEN_INLINE SIMD<mask64,8> operator>= (SIMD<double,8> a , SIMD<double,8> b)
|
||||
{ return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_GE_OQ); }
|
||||
NETGEN_INLINE SIMD<mask64,8> operator> (SIMD<double,8> a , SIMD<double,8> b)
|
||||
{ return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_GT_OQ); }
|
||||
NETGEN_INLINE SIMD<mask64,8> operator== (SIMD<double,8> a , SIMD<double,8> b)
|
||||
{ return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_EQ_OQ); }
|
||||
NETGEN_INLINE SIMD<mask64,8> operator!= (SIMD<double,8> a , SIMD<double,8> b)
|
||||
{ return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_NEQ_OQ); }
|
||||
|
||||
NETGEN_INLINE SIMD<mask64,8> operator<= (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
|
||||
{ return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_LE); }
|
||||
NETGEN_INLINE SIMD<mask64,8> operator< (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
|
||||
{ return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_LT); }
|
||||
NETGEN_INLINE SIMD<mask64,8> operator>= (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
|
||||
{ return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_NLT); }
|
||||
NETGEN_INLINE SIMD<mask64,8> operator> (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
|
||||
{ return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_NLE); }
|
||||
NETGEN_INLINE SIMD<mask64,8> operator== (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
|
||||
{ return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_EQ); }
|
||||
NETGEN_INLINE SIMD<mask64,8> operator!= (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
|
||||
{ return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_NE); }
|
||||
|
||||
NETGEN_INLINE SIMD<mask64,8> operator&& (SIMD<mask64,8> a, SIMD<mask64,8> b)
|
||||
{ return (__mmask8)(a.Data() & b.Data()); }
|
||||
NETGEN_INLINE SIMD<mask64,8> operator|| (SIMD<mask64,8> a, SIMD<mask64,8> b)
|
||||
{ return (__mmask8)(a.Data() | b.Data()); }
|
||||
NETGEN_INLINE SIMD<mask64,8> operator! (SIMD<mask64,8> a)
|
||||
{ return (__mmask8)(~a.Data()); }
|
||||
|
||||
NETGEN_INLINE SIMD<double,8> If (SIMD<mask64,8> a, SIMD<double,8> b, SIMD<double,8> c)
|
||||
{ return _mm512_mask_blend_pd(a.Data(), c.Data(), b.Data()); }
|
||||
|
||||
NETGEN_INLINE SIMD<double,8> IfPos (SIMD<double,8> a, SIMD<double> b, SIMD<double> c)
|
||||
{
|
||||
auto k = _mm512_cmp_pd_mask(a.Data(),_mm512_setzero_pd(), _CMP_GT_OS);
|
||||
return _mm512_mask_blend_pd(k,c.Data(),b.Data());
|
||||
}
|
||||
NETGEN_INLINE SIMD<double,8> IfZero (SIMD<double,8> a, SIMD<double,8> b, SIMD<double,8> c)
|
||||
{
|
||||
auto k = _mm512_cmp_pd_mask(a.Data(),_mm512_setzero_pd(), _CMP_EQ_OS);
|
||||
return _mm512_mask_blend_pd(k,c.Data(),b.Data());
|
||||
}
|
||||
|
||||
|
||||
NETGEN_INLINE auto Unpack (SIMD<double,8> a, SIMD<double,8> b)
|
||||
{
|
||||
return std::make_tuple(SIMD<double,8>(_mm512_unpacklo_pd(a.Data(),b.Data())),
|
||||
SIMD<double,8>(_mm512_unpackhi_pd(a.Data(),b.Data())));
|
||||
}
|
||||
|
||||
|
||||
NETGEN_INLINE double HSum (SIMD<double,8> sd)
|
||||
{
|
||||
SIMD<double,4> low = _mm512_extractf64x4_pd(sd.Data(),0);
|
||||
SIMD<double,4> high = _mm512_extractf64x4_pd(sd.Data(),1);
|
||||
return HSum(low)+HSum(high);
|
||||
}
|
||||
|
||||
NETGEN_INLINE auto HSum (SIMD<double,8> sd1, SIMD<double,8> sd2)
|
||||
{
|
||||
return SIMD<double,2>(HSum(sd1), HSum(sd2));
|
||||
}
|
||||
|
||||
NETGEN_INLINE SIMD<double,4> HSum (SIMD<double,8> v1, SIMD<double,8> v2, SIMD<double,8> v3, SIMD<double,8> v4)
|
||||
{
|
||||
SIMD<double> lo,hi;
|
||||
std::tie(lo,hi) = Unpack(v1, v2);
|
||||
SIMD<double> sum01 = lo+hi;
|
||||
std::tie(lo,hi) = Unpack(v3, v4);
|
||||
SIMD<double> sum23 = lo+hi;
|
||||
// sum01 b a b a b a b a
|
||||
// sum23 d c d c d c d c
|
||||
// __m512 perm = _mm512_permutex2var_pd (sum01.Data(), _mm512_set_epi64(1,2,3,4,5,6,7,8), sum23.Data());
|
||||
__m256d ab = _mm512_extractf64x4_pd(sum01.Data(),0) + _mm512_extractf64x4_pd(sum01.Data(),1);
|
||||
__m256d cd = _mm512_extractf64x4_pd(sum23.Data(),0) + _mm512_extractf64x4_pd(sum23.Data(),1);
|
||||
return _mm256_add_pd (_mm256_permute2f128_pd (ab, cd, 1+2*16), _mm256_blend_pd (ab, cd, 12));
|
||||
}
|
||||
|
||||
NETGEN_INLINE SIMD<double,8> FMA (SIMD<double,8> a, SIMD<double,8> b, SIMD<double,8> c)
|
||||
{
|
||||
return _mm512_fmadd_pd (a.Data(), b.Data(), c.Data());
|
||||
}
|
||||
NETGEN_INLINE SIMD<double,8> FMA (const double & a, SIMD<double,8> b, SIMD<double,8> c)
|
||||
{
|
||||
return _mm512_fmadd_pd (_mm512_set1_pd(a), b.Data(), c.Data());
|
||||
}
|
||||
}
|
||||
|
||||
#endif // NETGEN_CORE_SIMD_AVX512_HPP
|
657
libsrc/core/simd_generic.hpp
Normal file
657
libsrc/core/simd_generic.hpp
Normal file
@ -0,0 +1,657 @@
|
||||
#ifndef NETGEN_CORE_SIMD_GENERIC_HPP
|
||||
#define NETGEN_CORE_SIMD_GENERIC_HPP
|
||||
|
||||
/**************************************************************************/
|
||||
/* File: simd_base.hpp */
|
||||
/* Author: Joachim Schoeberl, Matthias Hochsteger */
|
||||
/* Date: 25. Mar. 16 */
|
||||
/**************************************************************************/
|
||||
|
||||
#include <type_traits>
|
||||
#include <functional>
|
||||
#include <tuple>
|
||||
|
||||
#include "array.hpp"
|
||||
|
||||
namespace ngcore
|
||||
{
|
||||
|
||||
constexpr int GetDefaultSIMDSize() {
|
||||
#if defined __AVX512F__
|
||||
return 8;
|
||||
#elif defined __AVX__
|
||||
return 4;
|
||||
#elif (defined(_M_AMD64) || defined(_M_X64) || defined(__SSE__))
|
||||
return 2;
|
||||
#else
|
||||
return 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template <typename T, int N=GetDefaultSIMDSize()> class SIMD;
|
||||
|
||||
class mask64;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
namespace detail {
|
||||
template <typename T, size_t N, size_t... I>
|
||||
auto array_range_impl(std::array<T, N> const& arr,
|
||||
size_t first,
|
||||
std::index_sequence<I...>)
|
||||
-> std::array<T, sizeof...(I)> {
|
||||
return {arr[first + I]...};
|
||||
}
|
||||
|
||||
template <size_t S, typename T, size_t N>
|
||||
auto array_range(std::array<T, N> const& arr, size_t first) {
|
||||
return array_range_impl(arr, first, std::make_index_sequence<S>{});
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// mask
|
||||
|
||||
template <>
|
||||
class SIMD<mask64,1>
|
||||
{
|
||||
int64_t mask;
|
||||
public:
|
||||
SIMD (size_t i)
|
||||
: mask(i > 0 ? -1 : 0) { ; }
|
||||
bool Data() const { return mask; }
|
||||
static constexpr int Size() { return 1; }
|
||||
auto operator[] (int /* i */) const { return mask; }
|
||||
};
|
||||
|
||||
|
||||
template <int N>
|
||||
class SIMD<mask64,N>
|
||||
{
|
||||
static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
|
||||
static constexpr int N2 = N-N1;
|
||||
|
||||
SIMD<mask64,N1> lo;
|
||||
SIMD<mask64,N2> hi;
|
||||
public:
|
||||
|
||||
SIMD (int i) : lo(i), hi(i-N1) { ; }
|
||||
SIMD (SIMD<mask64,N1> lo_, SIMD<mask64,N2> hi_) : lo(lo_), hi(hi_) { ; }
|
||||
SIMD<mask64,N1> Lo() const { return lo; }
|
||||
SIMD<mask64,N2> Hi() const { return hi; }
|
||||
static constexpr int Size() { return N; }
|
||||
};
|
||||
|
||||
template<int N>
|
||||
NETGEN_INLINE SIMD<mask64,N> operator&& (SIMD<mask64,N> a, SIMD<mask64,N> b)
|
||||
{
|
||||
if constexpr(N==1) return a.Data() && b.Data();
|
||||
else return { a.Lo() && b.Lo(), a.Hi() && b.Hi() };
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// int64
|
||||
|
||||
template<>
|
||||
class SIMD<int64_t,1>
|
||||
{
|
||||
int64_t data;
|
||||
|
||||
public:
|
||||
static constexpr int Size() { return 1; }
|
||||
SIMD () {}
|
||||
SIMD (const SIMD &) = default;
|
||||
SIMD & operator= (const SIMD &) = default;
|
||||
SIMD (int64_t val) { data = val; }
|
||||
SIMD (std::array<int64_t, 1> arr)
|
||||
: data{arr[0]}
|
||||
{}
|
||||
|
||||
int64_t operator[] (int i) const { return ((int64_t*)(&data))[i]; }
|
||||
auto Data() const { return data; }
|
||||
static SIMD FirstInt(int64_t n0=0) { return {n0}; }
|
||||
template <int I>
|
||||
int64_t Get()
|
||||
{
|
||||
static_assert(I==0);
|
||||
return data;
|
||||
}
|
||||
};
|
||||
|
||||
template<int N>
|
||||
class SIMD<int64_t,N>
|
||||
{
|
||||
static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
|
||||
static constexpr int N2 = N-N1;
|
||||
|
||||
SIMD<int64_t,N1> lo;
|
||||
SIMD<int64_t,N2> high;
|
||||
|
||||
public:
|
||||
static constexpr int Size() { return N; }
|
||||
|
||||
SIMD () {}
|
||||
SIMD (const SIMD &) = default;
|
||||
SIMD & operator= (const SIMD &) = default;
|
||||
|
||||
SIMD (int64_t val) : lo{val}, high{val} { ; }
|
||||
SIMD (SIMD<int64_t,N1> lo_, SIMD<int64_t,N2> high_) : lo(lo_), high(high_) { ; }
|
||||
|
||||
SIMD( std::array<int64_t, N> arr )
|
||||
: lo(detail::array_range<N1>(arr, 0)),
|
||||
high(detail::array_range<N2>(arr, N1))
|
||||
{}
|
||||
|
||||
template<typename ...T>
|
||||
SIMD(const T... vals)
|
||||
: lo(detail::array_range<N1>(std::array<int64_t, N>{vals...}, 0)),
|
||||
high(detail::array_range<N2>(std::array<int64_t, N>{vals...}, N1))
|
||||
{
|
||||
static_assert(sizeof...(vals)==N, "wrong number of arguments");
|
||||
}
|
||||
|
||||
|
||||
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<int64_t(int)>>::value, int>::type = 0>
|
||||
SIMD (const T & func)
|
||||
{
|
||||
for(auto i : IntRange(N1))
|
||||
lo[i] = func(i);
|
||||
for(auto i : IntRange(N2))
|
||||
high[i] = func(N1+i);
|
||||
}
|
||||
|
||||
auto Lo() const { return lo; }
|
||||
auto Hi() const { return high; }
|
||||
|
||||
int64_t operator[] (int i) const { return ((int64_t*)(&lo))[i]; }
|
||||
|
||||
/*
|
||||
operator tuple<int64_t&,int64_t&,int64_t&,int64_t&> ()
|
||||
{ return tuple<int64_t&,int64_t&,int64_t&,int64_t&>((*this)[0], (*this)[1], (*this)[2], (*this)[3]); }
|
||||
*/
|
||||
|
||||
/*
|
||||
static SIMD FirstInt() { return { 0, 1, 2, 3 }; }
|
||||
*/
|
||||
static SIMD FirstInt(int64_t n0=0) { return {SIMD<int64_t,N1>::FirstInt(n0), SIMD<int64_t,N2>::FirstInt(n0+N1)}; }
|
||||
template <int I>
|
||||
int64_t Get()
|
||||
{
|
||||
static_assert(I>=0 && I<N, "Index out of range");
|
||||
if constexpr(I<N1) return lo.template Get<I>();
|
||||
else return high.template Get<I-N1>();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// double
|
||||
|
||||
template<>
|
||||
class SIMD<double,1>
|
||||
{
|
||||
double data;
|
||||
|
||||
public:
|
||||
static constexpr int Size() { return 1; }
|
||||
SIMD () {}
|
||||
SIMD (const SIMD &) = default;
|
||||
SIMD & operator= (const SIMD &) = default;
|
||||
SIMD (double val) { data = val; }
|
||||
SIMD (int val) { data = val; }
|
||||
SIMD (size_t val) { data = val; }
|
||||
SIMD (double const * p) { data = *p; }
|
||||
SIMD (double const * p, SIMD<mask64,1> mask) { data = mask.Data() ? *p : 0.0; }
|
||||
SIMD (std::array<double, 1> arr)
|
||||
: data{arr[0]}
|
||||
{}
|
||||
|
||||
template <typename T, typename std::enable_if<std::is_convertible<T,std::function<double(int)>>::value,int>::type = 0>
|
||||
SIMD (const T & func)
|
||||
{
|
||||
data = func(0);
|
||||
}
|
||||
|
||||
template <typename T, typename std::enable_if<std::is_convertible<T,std::function<double(int)>>::value,int>::type = 0>
|
||||
SIMD & operator= (const T & func)
|
||||
{
|
||||
data = func(0);
|
||||
return *this;
|
||||
}
|
||||
|
||||
void Store (double * p) { *p = data; }
|
||||
void Store (double * p, SIMD<mask64,1> mask) { if (mask.Data()) *p = data; }
|
||||
|
||||
double operator[] (int i) const { return ((double*)(&data))[i]; }
|
||||
double Data() const { return data; }
|
||||
template <int I>
|
||||
double Get()
|
||||
{
|
||||
static_assert(I==0);
|
||||
return data;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<int N>
|
||||
class SIMD<double, N>
|
||||
{
|
||||
static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
|
||||
static constexpr int N2 = N-N1;
|
||||
|
||||
SIMD<double, N1> lo;
|
||||
SIMD<double, N2> high;
|
||||
|
||||
public:
|
||||
static constexpr int Size() { return N; }
|
||||
SIMD () {}
|
||||
SIMD (const SIMD &) = default;
|
||||
SIMD (SIMD<double,N1> lo_, SIMD<double,N2> hi_) : lo(lo_), high(hi_) { ; }
|
||||
|
||||
template <typename T, typename std::enable_if<std::is_convertible<T,std::function<double(int)>>::value,int>::type = 0>
|
||||
SIMD (const T & func)
|
||||
{
|
||||
for(auto i : IntRange(N1))
|
||||
lo[i] = func(i);
|
||||
for(auto i : IntRange(N2))
|
||||
high[i] = func(N1+i);
|
||||
}
|
||||
|
||||
template <typename T, typename std::enable_if<std::is_convertible<T,std::function<double(int)>>::value,int>::type = 0>
|
||||
SIMD & operator= (const T & func)
|
||||
{
|
||||
for(auto i : IntRange(N1))
|
||||
lo[i] = func(i);
|
||||
for(auto i : IntRange(N2))
|
||||
high[i] = func(N1+i);
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
SIMD & operator= (const SIMD &) = default;
|
||||
|
||||
SIMD (double val) : lo{val}, high{val} { ; }
|
||||
SIMD (int val) : lo{val}, high{val} { ; }
|
||||
SIMD (size_t val) : lo{val}, high{val} { ; }
|
||||
|
||||
SIMD (double const * p) : lo{p}, high{p+N1} { ; }
|
||||
SIMD (double const * p, SIMD<mask64,N> mask)
|
||||
: lo{p, mask.Lo()}, high{p+N1, mask.Hi()}
|
||||
{ }
|
||||
SIMD (double * p) : lo{p}, high{p+N1} { ; }
|
||||
SIMD (double * p, SIMD<mask64,N> mask)
|
||||
: lo{p, mask.Lo()}, high{p+N1, mask.Hi()}
|
||||
{ }
|
||||
|
||||
SIMD( std::array<double, N> arr )
|
||||
: lo(detail::array_range<N1>(arr, 0)),
|
||||
high(detail::array_range<N2>(arr, N1))
|
||||
{}
|
||||
|
||||
template<typename ...T>
|
||||
SIMD(const T... vals)
|
||||
: lo(detail::array_range<N1>(std::array<double, N>{vals...}, 0)),
|
||||
high(detail::array_range<N2>(std::array<double, N>{vals...}, N1))
|
||||
{
|
||||
static_assert(sizeof...(vals)==N, "wrong number of arguments");
|
||||
}
|
||||
|
||||
void Store (double * p) { lo.Store(p); high.Store(p+N1); }
|
||||
void Store (double * p, SIMD<mask64,N> mask)
|
||||
{
|
||||
lo.Store(p, mask.Lo());
|
||||
high.Store(p+N1, mask.Hi());
|
||||
}
|
||||
|
||||
auto Lo() const { return lo; }
|
||||
auto Hi() const { return high; }
|
||||
|
||||
double operator[] (int i) const { return ((double*)(&lo))[i]; }
|
||||
|
||||
template<typename=std::enable_if<N==2>>
|
||||
operator std::tuple<double&,double&> ()
|
||||
{ return std::tuple<double&,double&>((*this)[0], (*this)[1]); }
|
||||
|
||||
template<typename=std::enable_if<N==4>>
|
||||
operator std::tuple<double&,double&,double&,double&> ()
|
||||
{ return std::tuple<double&,double&,double&,double&>((*this)[0], (*this)[1], (*this)[2], (*this)[3]); }
|
||||
|
||||
template <int I>
|
||||
double Get()
|
||||
{
|
||||
static_assert(I>=0 && I<N, "Index out of range");
|
||||
if constexpr(I<N1) return lo.template Get<I>();
|
||||
else return high.template Get<I-N1>();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Generic operators for any arithmetic type/simd width
|
||||
template <typename T, int N>
|
||||
NETGEN_INLINE SIMD<T,N> operator+ (SIMD<T,N> a, SIMD<T,N> b) {
|
||||
if constexpr(N==1) return a.Data()+b.Data();
|
||||
else return { a.Lo()+b.Lo(), a.Hi()+b.Hi() };
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
NETGEN_INLINE SIMD<T,N> operator- (SIMD<T,N> a, SIMD<T,N> b) {
|
||||
if constexpr(N==1) return a.Data()-b.Data();
|
||||
else return { a.Lo()-b.Lo(), a.Hi()-b.Hi() };
|
||||
}
|
||||
template <typename T, int N>
|
||||
NETGEN_INLINE SIMD<T,N> operator- (SIMD<T,N> a) {
|
||||
if constexpr(N==1) return -a.Data();
|
||||
else return { -a.Lo(), -a.Hi() };
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
NETGEN_INLINE SIMD<T,N> operator* (SIMD<T,N> a, SIMD<T,N> b) {
|
||||
if constexpr(N==1) return a.Data()*b.Data();
|
||||
else return { a.Lo()*b.Lo(), a.Hi()*b.Hi() };
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
NETGEN_INLINE SIMD<T,N> operator/ (SIMD<T,N> a, SIMD<T,N> b) {
|
||||
if constexpr(N==1) return a.Data()/b.Data();
|
||||
else return { a.Lo()/b.Lo(), a.Hi()/b.Hi() };
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
NETGEN_INLINE SIMD<mask64,N> operator< (SIMD<T,N> & a, SIMD<T,N> b)
|
||||
{
|
||||
if constexpr(N==1) return a.Data() < b.Data();
|
||||
else return { a.Lo()<b.Lo(), a.Hi()<b.Hi() };
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
NETGEN_INLINE SIMD<mask64,N> operator<= (SIMD<T,N> & a, SIMD<T,N> b)
|
||||
{
|
||||
if constexpr(N==1) return a.Data() <= b.Data();
|
||||
else return { a.Lo()<=b.Lo(), a.Hi()<=b.Hi() };
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
NETGEN_INLINE SIMD<mask64,N> operator> (SIMD<T,N> & a, SIMD<T,N> b)
|
||||
{
|
||||
if constexpr(N==1) return a.Data() > b.Data();
|
||||
else return { a.Lo()>b.Lo(), a.Hi()>b.Hi() };
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
NETGEN_INLINE SIMD<mask64,N> operator>= (SIMD<T,N> & a, SIMD<T,N> b)
|
||||
{
|
||||
if constexpr(N==1) return a.Data() >= b.Data();
|
||||
else return { a.Lo()>=b.Lo(), a.Hi()>=b.Hi() };
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
NETGEN_INLINE SIMD<mask64,N> operator== (SIMD<T,N> & a, SIMD<T,N> b)
|
||||
{
|
||||
if constexpr(N==1) return a.Data() == b.Data();
|
||||
else return { a.Lo()==b.Lo(), a.Hi()==b.Hi() };
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
NETGEN_INLINE SIMD<mask64,N> operator!= (SIMD<T,N> & a, SIMD<T,N> b)
|
||||
{
|
||||
if constexpr(N==1) return a.Data() != b.Data();
|
||||
else return { a.Lo()!=b.Lo(), a.Hi()!=b.Hi() };
|
||||
}
|
||||
|
||||
// int64_t operators with scalar operand (implement overloads to allow implicit casts for second operand)
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<int64_t,N> operator+ (SIMD<int64_t,N> a, int64_t b) { return a+SIMD<int64_t,N>(b); }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<int64_t,N> operator+ (int64_t a, SIMD<int64_t,N> b) { return SIMD<int64_t,N>(a)+b; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<int64_t,N> operator- (int64_t a, SIMD<int64_t,N> b) { return SIMD<int64_t,N>(a)-b; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<int64_t,N> operator- (SIMD<int64_t,N> a, int64_t b) { return a-SIMD<int64_t,N>(b); }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<int64_t,N> operator* (int64_t a, SIMD<int64_t,N> b) { return SIMD<int64_t,N>(a)*b; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<int64_t,N> operator* (SIMD<int64_t,N> b, int64_t a) { return SIMD<int64_t,N>(a)*b; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<int64_t,N> operator/ (SIMD<int64_t,N> a, int64_t b) { return a/SIMD<int64_t,N>(b); }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<int64_t,N> operator/ (int64_t a, SIMD<int64_t,N> b) { return SIMD<int64_t,N>(a)/b; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<int64_t,N> & operator+= (SIMD<int64_t,N> & a, SIMD<int64_t,N> b) { a=a+b; return a; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<int64_t,N> & operator+= (SIMD<int64_t,N> & a, int64_t b) { a+=SIMD<int64_t,N>(b); return a; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<int64_t,N> & operator-= (SIMD<int64_t,N> & a, SIMD<int64_t,N> b) { a = a-b; return a; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<int64_t,N> & operator-= (SIMD<int64_t,N> & a, int64_t b) { a-=SIMD<int64_t,N>(b); return a; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<int64_t,N> & operator*= (SIMD<int64_t,N> & a, SIMD<int64_t,N> b) { a=a*b; return a; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<int64_t,N> & operator*= (SIMD<int64_t,N> & a, int64_t b) { a*=SIMD<int64_t,N>(b); return a; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<int64_t,N> & operator/= (SIMD<int64_t,N> & a, SIMD<int64_t,N> b) { a = a/b; return a; }
|
||||
|
||||
// double operators with scalar operand (implement overloads to allow implicit casts for second operand)
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> operator+ (SIMD<double,N> a, double b) { return a+SIMD<double,N>(b); }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> operator+ (double a, SIMD<double,N> b) { return SIMD<double,N>(a)+b; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> operator- (double a, SIMD<double,N> b) { return SIMD<double,N>(a)-b; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> operator- (SIMD<double,N> a, double b) { return a-SIMD<double,N>(b); }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> operator* (double a, SIMD<double,N> b) { return SIMD<double,N>(a)*b; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> operator* (SIMD<double,N> b, double a) { return SIMD<double,N>(a)*b; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> operator/ (SIMD<double,N> a, double b) { return a/SIMD<double,N>(b); }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> operator/ (double a, SIMD<double,N> b) { return SIMD<double,N>(a)/b; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> & operator+= (SIMD<double,N> & a, SIMD<double,N> b) { a=a+b; return a; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> & operator+= (SIMD<double,N> & a, double b) { a+=SIMD<double,N>(b); return a; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> & operator-= (SIMD<double,N> & a, SIMD<double,N> b) { a = a-b; return a; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> & operator-= (SIMD<double,N> & a, double b) { a-=SIMD<double,N>(b); return a; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> & operator*= (SIMD<double,N> & a, SIMD<double,N> b) { a=a*b; return a; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> & operator*= (SIMD<double,N> & a, double b) { a*=SIMD<double,N>(b); return a; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> & operator/= (SIMD<double,N> & a, SIMD<double,N> b) { a = a/b; return a; }
|
||||
|
||||
// double functions
|
||||
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> L2Norm2 (SIMD<double,N> a) { return a*a; }
|
||||
template <int N>
|
||||
NETGEN_INLINE SIMD<double,N> Trans (SIMD<double,N> a) { return a; }
|
||||
|
||||
template <int N>
|
||||
NETGEN_INLINE double HSum (SIMD<double,N> a)
|
||||
{
|
||||
if constexpr(N==1)
|
||||
return a.Data();
|
||||
else
|
||||
return HSum(a.Lo()) + HSum(a.Hi());
|
||||
}
|
||||
|
||||
NETGEN_INLINE double IfPos (double a, double b, double c) { return a>0 ? b : c; }
|
||||
NETGEN_INLINE double IfZero (double a, double b, double c) { return a==0. ? b : c; }
|
||||
|
||||
template<typename T, int N>
|
||||
NETGEN_INLINE SIMD<T,N> IfPos (SIMD<T,N> a, SIMD<T,N> b, SIMD<T,N> c)
|
||||
{
|
||||
if constexpr(N==1) return a.Data()>0.0 ? b : c;
|
||||
else return { IfPos(a.Lo(), b.Lo(), c.Lo()), IfPos(a.Hi(), b.Hi(), c.Hi())};
|
||||
|
||||
}
|
||||
|
||||
template<typename T, int N>
|
||||
NETGEN_INLINE SIMD<T,N> IfZero (SIMD<T,N> a, SIMD<T,N> b, SIMD<T,N> c)
|
||||
{
|
||||
if constexpr(N==1) return a.Data()==0.0 ? b : c;
|
||||
else return { IfZero(a.Lo(), b.Lo(), c.Lo()), IfZero(a.Hi(), b.Hi(), c.Hi())};
|
||||
|
||||
}
|
||||
|
||||
template<typename T, int N>
|
||||
NETGEN_INLINE SIMD<T,N> If (SIMD<mask64,N> a, SIMD<T,N> b, SIMD<T,N> c)
|
||||
{
|
||||
if constexpr(N==1) return a.Data() ? b : c;
|
||||
else return { If(a.Lo(), b.Lo(), c.Lo()), If(a.Hi(), b.Hi(), c.Hi())};
|
||||
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename T3>
|
||||
// a*b+c
|
||||
NETGEN_INLINE auto FMA(T1 a, T2 b, T3 c)
|
||||
{
|
||||
return a*b+c;
|
||||
}
|
||||
|
||||
// update form of fma
|
||||
template <int N>
|
||||
void FMAasm (SIMD<double,N> a, SIMD<double,N> b, SIMD<double,N> & sum)
|
||||
{
|
||||
sum = FMA(a,b,sum);
|
||||
}
|
||||
|
||||
template <int i, typename T, int N>
|
||||
T get(SIMD<T,N> a) { return a[i]; }
|
||||
|
||||
template <int NUM, typename FUNC>
|
||||
NETGEN_INLINE void Iterate2 (FUNC f)
|
||||
{
|
||||
if constexpr (NUM > 1) Iterate2<NUM-1> (f);
|
||||
if constexpr (NUM >= 1) f(std::integral_constant<int,NUM-1>());
|
||||
}
|
||||
|
||||
|
||||
template <typename T, int N>
|
||||
ostream & operator<< (ostream & ost, SIMD<T,N> simd)
|
||||
{
|
||||
/*
|
||||
ost << simd[0];
|
||||
for (int i = 1; i < simd.Size(); i++)
|
||||
ost << " " << simd[i];
|
||||
*/
|
||||
Iterate2<simd.Size()> ([&] (auto I) {
|
||||
if (I.value != 0) ost << " ";
|
||||
ost << get<I.value>(simd);
|
||||
});
|
||||
return ost;
|
||||
}
|
||||
|
||||
using std::exp;
|
||||
template <int N>
|
||||
NETGEN_INLINE ngcore::SIMD<double,N> exp (ngcore::SIMD<double,N> a) {
|
||||
return ngcore::SIMD<double>([a](int i)->double { return exp(a[i]); } );
|
||||
}
|
||||
|
||||
using std::log;
|
||||
template <int N>
|
||||
NETGEN_INLINE ngcore::SIMD<double,N> log (ngcore::SIMD<double,N> a) {
|
||||
return ngcore::SIMD<double,N>([a](int i)->double { return log(a[i]); } );
|
||||
}
|
||||
|
||||
using std::pow;
|
||||
template <int N>
|
||||
NETGEN_INLINE ngcore::SIMD<double,N> pow (ngcore::SIMD<double,N> a, double x) {
|
||||
return ngcore::SIMD<double,N>([a,x](int i)->double { return pow(a[i],x); } );
|
||||
}
|
||||
|
||||
template <int N>
|
||||
NETGEN_INLINE ngcore::SIMD<double,N> pow (ngcore::SIMD<double,N> a, ngcore::SIMD<double,N> b) {
|
||||
return ngcore::SIMD<double,N>([a,b](int i)->double { return pow(a[i],b[i]); } );
|
||||
}
|
||||
|
||||
using std::sin;
|
||||
template <int N>
|
||||
NETGEN_INLINE ngcore::SIMD<double,N> sin (ngcore::SIMD<double,N> a) {
|
||||
return ngcore::SIMD<double,N>([a](int i)->double { return sin(a[i]); } );
|
||||
}
|
||||
|
||||
using std::cos;
|
||||
template <int N>
|
||||
NETGEN_INLINE ngcore::SIMD<double,N> cos (ngcore::SIMD<double,N> a) {
|
||||
return ngcore::SIMD<double,N>([a](int i)->double { return cos(a[i]); } );
|
||||
}
|
||||
|
||||
using std::tan;
|
||||
template <int N>
|
||||
NETGEN_INLINE ngcore::SIMD<double,N> tan (ngcore::SIMD<double,N> a) {
|
||||
return ngcore::SIMD<double,N>([a](int i)->double { return tan(a[i]); } );
|
||||
}
|
||||
|
||||
using std::atan;
|
||||
template <int N>
|
||||
NETGEN_INLINE ngcore::SIMD<double,N> atan (ngcore::SIMD<double,N> a) {
|
||||
return ngcore::SIMD<double,N>([a](int i)->double { return atan(a[i]); } );
|
||||
}
|
||||
|
||||
using std::atan2;
|
||||
template <int N>
|
||||
NETGEN_INLINE ngcore::SIMD<double,N> atan2 (ngcore::SIMD<double,N> y, ngcore::SIMD<double,N> x) {
|
||||
return ngcore::SIMD<double,N>([y,x](int i)->double { return atan2(y[i], x[i]); } );
|
||||
}
|
||||
|
||||
using std::acos;
|
||||
template <int N>
|
||||
NETGEN_INLINE ngcore::SIMD<double,N> acos (ngcore::SIMD<double,N> a) {
|
||||
return ngcore::SIMD<double,N>([a](int i)->double { return acos(a[i]); } );
|
||||
}
|
||||
|
||||
using std::asin;
|
||||
template <int N>
|
||||
NETGEN_INLINE ngcore::SIMD<double,N> asin (ngcore::SIMD<double,N> a) {
|
||||
return ngcore::SIMD<double,N>([a](int i)->double { return asin(a[i]); } );
|
||||
}
|
||||
|
||||
using std::sinh;
|
||||
template <int N>
|
||||
NETGEN_INLINE ngcore::SIMD<double,N> sinh (ngcore::SIMD<double,N> a) {
|
||||
return ngcore::SIMD<double,N>([a](int i)->double { return sinh(a[i]); } );
|
||||
}
|
||||
|
||||
using std::cosh;
|
||||
template <int N>
|
||||
NETGEN_INLINE ngcore::SIMD<double,N> cosh (ngcore::SIMD<double,N> a) {
|
||||
return ngcore::SIMD<double,N>([a](int i)->double { return cosh(a[i]); } );
|
||||
}
|
||||
|
||||
template<int N, typename T>
|
||||
using MultiSIMD = SIMD<T, N*GetDefaultSIMDSize()>;
|
||||
|
||||
template<int N>
|
||||
NETGEN_INLINE auto Unpack (SIMD<double,N> a, SIMD<double,N> b)
|
||||
{
|
||||
if constexpr(N==1)
|
||||
{
|
||||
return std::make_tuple(SIMD<double,N>{a.Data()}, SIMD<double,N>{b.Data()} );
|
||||
}
|
||||
else
|
||||
{
|
||||
auto [a1,b1] = Unpack(a.Lo(), b.Lo());
|
||||
auto [a2,b2] = Unpack(a.Hi(), b.Hi());
|
||||
return std::make_tuple(SIMD<double,N>{ a1, a2 },
|
||||
SIMD<double,N>{ b1, b2 });
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
namespace std
|
||||
{
|
||||
// structured binding support
|
||||
template <typename T, int N >
|
||||
struct tuple_size<ngcore::SIMD<T,N>> : std::integral_constant<std::size_t, N> {};
|
||||
template<size_t N, typename T, int M> struct tuple_element<N,ngcore::SIMD<T,M>> { using type = T; };
|
||||
}
|
||||
|
||||
#endif // NETGEN_CORE_SIMD_GENERIC_HPP
|
266
libsrc/core/simd_sse.hpp
Normal file
266
libsrc/core/simd_sse.hpp
Normal file
@ -0,0 +1,266 @@
|
||||
#ifndef NETGEN_CORE_SIMD_SSE_HPP
|
||||
#define NETGEN_CORE_SIMD_SSE_HPP
|
||||
|
||||
/**************************************************************************/
|
||||
/* File: simd_sse.hpp */
|
||||
/* Author: Joachim Schoeberl, Matthias Hochsteger */
|
||||
/* Date: 25. Mar. 16 */
|
||||
/**************************************************************************/
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
namespace ngcore
|
||||
{
|
||||
|
||||
template <>
|
||||
class SIMD<mask64,2>
|
||||
{
|
||||
__m128i mask;
|
||||
public:
|
||||
SIMD (int i)
|
||||
: mask(_mm_cmpgt_epi32(_mm_set1_epi32(i),
|
||||
_mm_set_epi32(1, 1, 0, 0)))
|
||||
{ ; }
|
||||
SIMD (__m128i _mask) : mask(_mask) { ; }
|
||||
__m128i Data() const { return mask; }
|
||||
static constexpr int Size() { return 2; }
|
||||
static NETGEN_INLINE SIMD<mask64, 2> GetMaskFromBits (unsigned int i);
|
||||
};
|
||||
|
||||
static SIMD<mask64, 2> masks_from_2bits[4] = {
|
||||
_mm_set_epi32 (0,0,0,0), _mm_set_epi32 (0,0,-1,0),
|
||||
_mm_set_epi32 (-1,0,0,0), _mm_set_epi32 (-1,0,-1,0),
|
||||
};
|
||||
|
||||
NETGEN_INLINE SIMD<mask64, 2> SIMD<mask64, 2> :: GetMaskFromBits (unsigned int i)
|
||||
{
|
||||
return masks_from_2bits[i & 3];
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
class alignas(16) SIMD<int64_t,2>
|
||||
{
|
||||
__m128i data;
|
||||
|
||||
public:
|
||||
static constexpr int Size() { return 2; }
|
||||
SIMD () {}
|
||||
SIMD (const SIMD &) = default;
|
||||
SIMD (int64_t v0, int64_t v1) { data = _mm_set_epi64x(v1,v0); }
|
||||
SIMD (std::array<int64_t, 2> arr)
|
||||
: data{_mm_set_epi64x(arr[1],arr[0])}
|
||||
{}
|
||||
|
||||
SIMD & operator= (const SIMD &) = default;
|
||||
|
||||
SIMD (int64_t val) { data = _mm_set1_epi64x(val); }
|
||||
SIMD (__m128i _data) { data = _data; }
|
||||
|
||||
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<int64_t(int)>>::value, int>::type = 0>
|
||||
SIMD (const T & func)
|
||||
{
|
||||
data = _mm_set_epi64(func(1), func(0));
|
||||
}
|
||||
|
||||
NETGEN_INLINE auto operator[] (int i) const { return ((int64_t*)(&data))[i]; }
|
||||
NETGEN_INLINE __m128i Data() const { return data; }
|
||||
NETGEN_INLINE __m128i & Data() { return data; }
|
||||
static SIMD FirstInt(int n0=0) { return { n0, n0+1 }; }
|
||||
};
|
||||
|
||||
|
||||
|
||||
NETGEN_INLINE SIMD<int64_t,2> operator-(SIMD<int64_t,2> a) { return _mm_sub_epi64(_mm_setzero_si128(), a.Data()); }
|
||||
NETGEN_INLINE SIMD<int64_t,2> operator+ (SIMD<int64_t,2> a, SIMD<int64_t,2> b) { return _mm_add_epi64(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<int64_t,2> operator- (SIMD<int64_t,2> a, SIMD<int64_t,2> b) { return _mm_sub_epi64(a.Data(),b.Data()); }
|
||||
|
||||
|
||||
template<>
|
||||
class alignas(16) SIMD<double,2>
|
||||
{
|
||||
__m128d data;
|
||||
|
||||
public:
|
||||
static constexpr int Size() { return 2; }
|
||||
SIMD () {}
|
||||
SIMD (const SIMD &) = default;
|
||||
SIMD (double v0, double v1) { data = _mm_set_pd(v1,v0); }
|
||||
SIMD (std::array<double, 2> arr)
|
||||
: data{_mm_set_pd(arr[1], arr[0])}
|
||||
{}
|
||||
|
||||
SIMD & operator= (const SIMD &) = default;
|
||||
|
||||
SIMD (double val) { data = _mm_set1_pd(val); }
|
||||
SIMD (int val) { data = _mm_set1_pd(val); }
|
||||
SIMD (size_t val) { data = _mm_set1_pd(val); }
|
||||
|
||||
SIMD (double const * p) { data = _mm_loadu_pd(p); }
|
||||
SIMD (double const * p, SIMD<mask64,2> mask)
|
||||
{
|
||||
#ifdef __AVX__
|
||||
data = _mm_maskload_pd(p, mask.Data());
|
||||
#else
|
||||
// this versions segfaults if p points to the last allowed element
|
||||
// happened on Mac with the new SparseCholesky-factorization
|
||||
// data = _mm_and_pd(_mm_castsi128_pd(mask.Data()), _mm_loadu_pd(p));
|
||||
auto pmask = (int64_t*)&mask;
|
||||
data = _mm_set_pd (pmask[1] ? p[1] : 0.0, pmask[0] ? p[0] : 0.0);
|
||||
#endif
|
||||
}
|
||||
SIMD (__m128d _data) { data = _data; }
|
||||
|
||||
void Store (double * p) { _mm_storeu_pd(p, data); }
|
||||
void Store (double * p, SIMD<mask64,2> mask)
|
||||
{
|
||||
#ifdef __AVX__
|
||||
_mm_maskstore_pd(p, mask.Data(), data);
|
||||
#else
|
||||
/*
|
||||
_mm_storeu_pd (p, _mm_or_pd (_mm_and_pd(_mm_castsi128_pd(mask.Data()), data),
|
||||
_mm_andnot_pd(_mm_castsi128_pd(mask.Data()), _mm_loadu_pd(p))));
|
||||
*/
|
||||
auto pmask = (int64_t*)&mask;
|
||||
if (pmask[0]) p[0] = (*this)[0];
|
||||
if (pmask[1]) p[1] = (*this)[1];
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
|
||||
SIMD (const T & func)
|
||||
{
|
||||
data = _mm_set_pd(func(1), func(0));
|
||||
}
|
||||
|
||||
NETGEN_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
|
||||
NETGEN_INLINE __m128d Data() const { return data; }
|
||||
NETGEN_INLINE __m128d & Data() { return data; }
|
||||
|
||||
operator std::tuple<double&,double&> ()
|
||||
{
|
||||
auto pdata = (double*)&data;
|
||||
return std::tuple<double&,double&>(pdata[0], pdata[1]);
|
||||
}
|
||||
};
|
||||
|
||||
NETGEN_INLINE SIMD<double,2> operator- (SIMD<double,2> a) { return _mm_xor_pd(a.Data(), _mm_set1_pd(-0.0)); }
|
||||
NETGEN_INLINE SIMD<double,2> operator+ (SIMD<double,2> a, SIMD<double,2> b) { return _mm_add_pd(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<double,2> operator- (SIMD<double,2> a, SIMD<double,2> b) { return _mm_sub_pd(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<double,2> operator* (SIMD<double,2> a, SIMD<double,2> b) { return _mm_mul_pd(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<double,2> operator/ (SIMD<double,2> a, SIMD<double,2> b) { return _mm_div_pd(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<double,2> operator* (double a, SIMD<double,2> b) { return _mm_set1_pd(a)*b; }
|
||||
NETGEN_INLINE SIMD<double,2> operator* (SIMD<double,2> b, double a) { return _mm_set1_pd(a)*b; }
|
||||
|
||||
template<>
|
||||
NETGEN_INLINE auto Unpack (SIMD<double,2> a, SIMD<double,2> b)
|
||||
{
|
||||
return std::make_tuple(SIMD<double,2>(_mm_unpacklo_pd(a.Data(),b.Data())),
|
||||
SIMD<double,2>(_mm_unpackhi_pd(a.Data(),b.Data())));
|
||||
}
|
||||
|
||||
NETGEN_INLINE __m128d my_mm_hadd_pd(__m128d a, __m128d b) {
|
||||
#if defined(__SSE3__) || defined(__AVX__)
|
||||
return _mm_hadd_pd(a,b);
|
||||
#else
|
||||
return _mm_add_pd( _mm_unpacklo_pd(a,b), _mm_unpackhi_pd(a,b) );
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef __AVX__
|
||||
NETGEN_INLINE __m128i my_mm_cmpgt_epi64(__m128i a, __m128i b) {
|
||||
auto res_lo = _mm_cvtsi128_si64(a) > _mm_cvtsi128_si64(b) ? -1:0;
|
||||
auto res_hi = _mm_cvtsi128_si64(_mm_srli_si128(a,8)) > _mm_cvtsi128_si64(_mm_srli_si128(b,8)) ? -1 : 0;
|
||||
return _mm_set_epi64x(res_hi,res_lo);
|
||||
}
|
||||
#else
|
||||
NETGEN_INLINE __m128i my_mm_cmpgt_epi64(__m128i a, __m128i b) {
|
||||
return _mm_cmpgt_epi64(a,b);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
NETGEN_INLINE SIMD<double,2> sqrt (SIMD<double,2> a) { return _mm_sqrt_pd(a.Data()); }
|
||||
NETGEN_INLINE SIMD<double,2> fabs (SIMD<double,2> a) { return _mm_max_pd(a.Data(), (-a).Data()); }
|
||||
using std::floor;
|
||||
NETGEN_INLINE SIMD<double,2> floor (SIMD<double,2> a)
|
||||
{ return ngcore::SIMD<double,2>([&](int i)->double { return floor(a[i]); } ); }
|
||||
using std::ceil;
|
||||
NETGEN_INLINE SIMD<double,2> ceil (SIMD<double,2> a)
|
||||
{ return ngcore::SIMD<double,2>([&](int i)->double { return ceil(a[i]); } ); }
|
||||
|
||||
NETGEN_INLINE SIMD<mask64,2> operator<= (SIMD<double,2> a , SIMD<double,2> b)
|
||||
{ return _mm_castpd_si128( _mm_cmple_pd(a.Data(),b.Data())); }
|
||||
NETGEN_INLINE SIMD<mask64,2> operator< (SIMD<double,2> a , SIMD<double,2> b)
|
||||
{ return _mm_castpd_si128( _mm_cmplt_pd(a.Data(),b.Data())); }
|
||||
NETGEN_INLINE SIMD<mask64,2> operator>= (SIMD<double,2> a , SIMD<double,2> b)
|
||||
{ return _mm_castpd_si128( _mm_cmpge_pd(a.Data(),b.Data())); }
|
||||
NETGEN_INLINE SIMD<mask64,2> operator> (SIMD<double,2> a , SIMD<double,2> b)
|
||||
{ return _mm_castpd_si128( _mm_cmpgt_pd(a.Data(),b.Data())); }
|
||||
NETGEN_INLINE SIMD<mask64,2> operator== (SIMD<double,2> a , SIMD<double,2> b)
|
||||
{ return _mm_castpd_si128( _mm_cmpeq_pd(a.Data(),b.Data())); }
|
||||
NETGEN_INLINE SIMD<mask64,2> operator!= (SIMD<double,2> a , SIMD<double,2> b)
|
||||
{ return _mm_castpd_si128( _mm_cmpneq_pd(a.Data(),b.Data())); }
|
||||
|
||||
NETGEN_INLINE SIMD<mask64,2> operator<= (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
|
||||
{ return _mm_xor_si128(_mm_cmpgt_epi64(a.Data(),b.Data()),_mm_set1_epi32(-1)); }
|
||||
NETGEN_INLINE SIMD<mask64,2> operator< (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
|
||||
{ return my_mm_cmpgt_epi64(b.Data(),a.Data()); }
|
||||
NETGEN_INLINE SIMD<mask64,2> operator>= (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
|
||||
{ return _mm_xor_si128(_mm_cmpgt_epi64(b.Data(),a.Data()),_mm_set1_epi32(-1)); }
|
||||
NETGEN_INLINE SIMD<mask64,2> operator> (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
|
||||
{ return my_mm_cmpgt_epi64(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<mask64,2> operator== (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
|
||||
{ return _mm_cmpeq_epi64(a.Data(),b.Data()); }
|
||||
NETGEN_INLINE SIMD<mask64,2> operator!= (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
|
||||
{ return _mm_xor_si128(_mm_cmpeq_epi64(a.Data(),b.Data()),_mm_set1_epi32(-1)); }
|
||||
|
||||
|
||||
|
||||
NETGEN_INLINE SIMD<mask64,2> operator&& (SIMD<mask64,2> a, SIMD<mask64,2> b)
|
||||
{ return _mm_castpd_si128(_mm_and_pd (_mm_castsi128_pd(a.Data()),_mm_castsi128_pd( b.Data()))); }
|
||||
NETGEN_INLINE SIMD<mask64,2> operator|| (SIMD<mask64,2> a, SIMD<mask64,2> b)
|
||||
{ return _mm_castpd_si128(_mm_or_pd (_mm_castsi128_pd(a.Data()), _mm_castsi128_pd(b.Data()))); }
|
||||
NETGEN_INLINE SIMD<mask64,2> operator! (SIMD<mask64,2> a)
|
||||
{ return _mm_castpd_si128(_mm_xor_pd (_mm_castsi128_pd(a.Data()),_mm_castsi128_pd( _mm_cmpeq_epi64(a.Data(),a.Data())))); }
|
||||
#ifdef __SSE4_1__
|
||||
NETGEN_INLINE SIMD<double,2> If (SIMD<mask64,2> a, SIMD<double,2> b, SIMD<double,2> c)
|
||||
{ return _mm_blendv_pd(c.Data(), b.Data(), _mm_castsi128_pd(a.Data())); }
|
||||
#else
|
||||
NETGEN_INLINE SIMD<double,2> If (SIMD<mask64,2> a, SIMD<double,2> b, SIMD<double,2> c)
|
||||
{
|
||||
return _mm_or_pd(
|
||||
_mm_andnot_pd(_mm_castsi128_pd(a.Data()),c.Data()),
|
||||
_mm_and_pd(b.Data(),_mm_castsi128_pd(a.Data()))
|
||||
);}
|
||||
#endif // __SSE4_1__
|
||||
|
||||
NETGEN_INLINE SIMD<double,2> IfPos (SIMD<double,2> a, SIMD<double,2> b, SIMD<double,2> c)
|
||||
{ return ngcore::SIMD<double,2>([&](int i)->double { return a[i]>0 ? b[i] : c[i]; }); }
|
||||
NETGEN_INLINE SIMD<double,2> IfZero (SIMD<double,2> a, SIMD<double,2> b, SIMD<double,2> c)
|
||||
{ return ngcore::SIMD<double,2>([&](int i)->double { return a[i]==0. ? b[i] : c[i]; }); }
|
||||
|
||||
|
||||
NETGEN_INLINE double HSum (SIMD<double,2> sd)
|
||||
{
|
||||
return _mm_cvtsd_f64 (my_mm_hadd_pd (sd.Data(), sd.Data()));
|
||||
}
|
||||
|
||||
NETGEN_INLINE auto HSum (SIMD<double,2> sd1, SIMD<double,2> sd2)
|
||||
{
|
||||
__m128d hv2 = my_mm_hadd_pd(sd1.Data(), sd2.Data());
|
||||
return SIMD<double,2> (hv2);
|
||||
// return SIMD<double,2>(_mm_cvtsd_f64 (hv2), _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3)));
|
||||
}
|
||||
|
||||
NETGEN_INLINE SIMD<int64_t, 2> If(SIMD<mask64, 2> a, SIMD<int64_t, 2> b,
|
||||
SIMD<int64_t, 2> c) {
|
||||
return _mm_or_si128(
|
||||
_mm_andnot_si128(a.Data(),c.Data()),
|
||||
_mm_and_si128(b.Data(),a.Data())
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif // NETGEN_CORE_SIMD_SSE_HPP
|
@ -11,7 +11,7 @@ target_sources(gen INTERFACE
|
||||
install(FILES
|
||||
ngarray.hpp autodiff.hpp autoptr.hpp ngbitarray.hpp
|
||||
dynamicmem.hpp hashtabl.hpp mpi_interface.hpp myadt.hpp
|
||||
ngsimd.hpp mystring.hpp netgenout.hpp ngpython.hpp
|
||||
mystring.hpp netgenout.hpp ngpython.hpp
|
||||
optmem.hpp parthreads.hpp seti.hpp sort.hpp
|
||||
spbita2d.hpp stack.hpp table.hpp template.hpp
|
||||
gzstream.h
|
||||
|
@ -47,7 +47,5 @@ namespace netgen
|
||||
#include "netgenout.hpp"
|
||||
#include "gzstream.h"
|
||||
|
||||
#include "ngsimd.hpp"
|
||||
|
||||
|
||||
#endif
|
||||
|
@ -1,672 +0,0 @@
|
||||
#ifndef FILE_NGSIMD
|
||||
#define FILE_NGSIMD
|
||||
/**************************************************************************/
|
||||
/* File: ngsimd.hpp */
|
||||
/* Author: Joachim Schoeberl */
|
||||
/* Date: 25. Mar. 16 */
|
||||
/**************************************************************************/
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <tuple>
|
||||
#include <ostream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
|
||||
#include <core/utils.hpp>
|
||||
|
||||
#ifdef WIN32
|
||||
#ifndef AVX_OPERATORS_DEFINED
|
||||
#define AVX_OPERATORS_DEFINED
|
||||
NG_INLINE __m128d operator- (__m128d a) { return _mm_xor_pd(a, _mm_set1_pd(-0.0)); }
|
||||
NG_INLINE __m128d operator+ (__m128d a, __m128d b) { return _mm_add_pd(a,b); }
|
||||
NG_INLINE __m128d operator- (__m128d a, __m128d b) { return _mm_sub_pd(a,b); }
|
||||
NG_INLINE __m128d operator* (__m128d a, __m128d b) { return _mm_mul_pd(a,b); }
|
||||
NG_INLINE __m128d operator/ (__m128d a, __m128d b) { return _mm_div_pd(a,b); }
|
||||
NG_INLINE __m128d operator* (double a, __m128d b) { return _mm_set1_pd(a)*b; }
|
||||
NG_INLINE __m128d operator* (__m128d b, double a) { return _mm_set1_pd(a)*b; }
|
||||
|
||||
NG_INLINE __m128d operator+= (__m128d &a, __m128d b) { return a = a+b; }
|
||||
NG_INLINE __m128d operator-= (__m128d &a, __m128d b) { return a = a-b; }
|
||||
NG_INLINE __m128d operator*= (__m128d &a, __m128d b) { return a = a*b; }
|
||||
NG_INLINE __m128d operator/= (__m128d &a, __m128d b) { return a = a/b; }
|
||||
|
||||
NG_INLINE __m256d operator- (__m256d a) { return _mm256_xor_pd(a, _mm256_set1_pd(-0.0)); }
|
||||
NG_INLINE __m256d operator+ (__m256d a, __m256d b) { return _mm256_add_pd(a,b); }
|
||||
NG_INLINE __m256d operator- (__m256d a, __m256d b) { return _mm256_sub_pd(a,b); }
|
||||
NG_INLINE __m256d operator* (__m256d a, __m256d b) { return _mm256_mul_pd(a,b); }
|
||||
NG_INLINE __m256d operator/ (__m256d a, __m256d b) { return _mm256_div_pd(a,b); }
|
||||
NG_INLINE __m256d operator* (double a, __m256d b) { return _mm256_set1_pd(a)*b; }
|
||||
NG_INLINE __m256d operator* (__m256d b, double a) { return _mm256_set1_pd(a)*b; }
|
||||
|
||||
NG_INLINE __m256d operator+= (__m256d &a, __m256d b) { return a = a+b; }
|
||||
NG_INLINE __m256d operator-= (__m256d &a, __m256d b) { return a = a-b; }
|
||||
NG_INLINE __m256d operator*= (__m256d &a, __m256d b) { return a = a*b; }
|
||||
NG_INLINE __m256d operator/= (__m256d &a, __m256d b) { return a = a/b; }
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
namespace ngsimd
|
||||
{
|
||||
// MSVC does not define SSE. It's always present on 64bit cpus
|
||||
#if (defined(_M_AMD64) || defined(_M_X64) || defined(__AVX__))
|
||||
#ifndef __SSE__
|
||||
#define __SSE__
|
||||
#endif
|
||||
#ifndef __SSE2__
|
||||
#define __SSE2__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
constexpr int GetDefaultSIMDSize() {
|
||||
#if defined __AVX512F__
|
||||
return 8;
|
||||
#elif defined __AVX__
|
||||
return 4;
|
||||
#elif defined __SSE__
|
||||
return 2;
|
||||
#else
|
||||
return 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined __AVX512F__
|
||||
typedef __m512 tAVX;
|
||||
typedef __m512d tAVXd;
|
||||
#elif defined __AVX__
|
||||
typedef __m256 tAVX;
|
||||
typedef __m256d tAVXd;
|
||||
#elif defined __SSE__
|
||||
typedef __m128 tAVX;
|
||||
typedef __m128d tAVXd;
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
template <typename T, int N=GetDefaultSIMDSize()> class SIMD;
|
||||
|
||||
template <typename T>
|
||||
struct has_call_operator
|
||||
{
|
||||
template <typename C> static std::true_type check( decltype( sizeof(&C::operator() )) ) { return std::true_type(); }
|
||||
template <typename> static std::false_type check(...) { return std::false_type(); }
|
||||
typedef decltype( check<T>(sizeof(char)) ) type;
|
||||
static constexpr type value = type();
|
||||
};
|
||||
|
||||
template <typename T1, typename T2, typename T3>
|
||||
// a*b+c
|
||||
NG_INLINE auto FMA(T1 a, T2 b, T3 c)
|
||||
{
|
||||
return a*b+c;
|
||||
}
|
||||
|
||||
template<int N, typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
|
||||
NG_INLINE SIMD<double,N> operator+ (T a, SIMD<double,N> b) { return SIMD<double,N>(a) + b; }
|
||||
template<int N, typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
|
||||
NG_INLINE SIMD<double,N> operator- (T a, SIMD<double,N> b) { return SIMD<double,N>(a) - b; }
|
||||
template<int N, typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
|
||||
NG_INLINE SIMD<double,N> operator* (T a, SIMD<double,N> b) { return SIMD<double,N>(a) * b; }
|
||||
template<int N, typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
|
||||
NG_INLINE SIMD<double,N> operator/ (T a, SIMD<double,N> b) { return SIMD<double,N>(a) / b; }
|
||||
template<int N, typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
|
||||
NG_INLINE SIMD<double,N> operator+ (SIMD<double,N> a, T b) { return a + SIMD<double,N>(b); }
|
||||
template<int N, typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
|
||||
NG_INLINE SIMD<double,N> operator- (SIMD<double,N> a, T b) { return a - SIMD<double,N>(b); }
|
||||
template<int N, typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
|
||||
NG_INLINE SIMD<double,N> operator* (SIMD<double,N> a, T b) { return a * SIMD<double,N>(b); }
|
||||
template<int N, typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
|
||||
NG_INLINE SIMD<double,N> operator/ (SIMD<double,N> a, T b) { return a / SIMD<double,N>(b); }
|
||||
|
||||
|
||||
using std::sqrt;
|
||||
using std::fabs;
|
||||
|
||||
class ExceptionNOSIMD : public std::runtime_error
|
||||
{
|
||||
public:
|
||||
using std::runtime_error::runtime_error;
|
||||
std::string What() { return what(); }
|
||||
};
|
||||
|
||||
using std::exp;
|
||||
template<int N> NG_INLINE SIMD<double,N> exp (SIMD<double,N> a)
|
||||
{
|
||||
return SIMD<double,N>([&](int i)->double { return exp(a[i]); } );
|
||||
}
|
||||
|
||||
using std::log;
|
||||
template<int N> NG_INLINE SIMD<double,N> log (SIMD<double,N> a)
|
||||
{
|
||||
return SIMD<double,N>([&](int i)->double { return log(a[i]); } );
|
||||
}
|
||||
|
||||
using std::pow;
|
||||
template<int N> NG_INLINE SIMD<double,N> pow (SIMD<double,N> a, double x)
|
||||
{
|
||||
return SIMD<double,N>([&](int i)->double { return pow(a[i],x); } );
|
||||
}
|
||||
|
||||
using std::sin;
|
||||
template<int N> NG_INLINE SIMD<double,N> sin (SIMD<double,N> a)
|
||||
{
|
||||
return SIMD<double,N>([&](int i)->double { return sin(a[i]); } );
|
||||
}
|
||||
|
||||
using std::cos;
|
||||
template<int N> NG_INLINE SIMD<double,N> cos (SIMD<double,N> a)
|
||||
{
|
||||
return SIMD<double,N>([&](int i)->double { return cos(a[i]); } );
|
||||
}
|
||||
|
||||
using std::tan;
|
||||
template<int N> NG_INLINE SIMD<double,N> tan (SIMD<double,N> a)
|
||||
{
|
||||
return SIMD<double,N>([&](int i)->double { return tan(a[i]); } );
|
||||
}
|
||||
|
||||
using std::atan;
|
||||
template<int N> NG_INLINE SIMD<double,N> atan (SIMD<double,N> a)
|
||||
{
|
||||
return SIMD<double,N>([&](int i)->double { return atan(a[i]); } );
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// SIMD width 1 (in case no AVX support is available)
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
template<>
|
||||
class SIMD<double,1>
|
||||
{
|
||||
double data;
|
||||
|
||||
public:
|
||||
static constexpr int Size() { return 1; }
|
||||
SIMD () = default;
|
||||
SIMD (const SIMD &) = default;
|
||||
SIMD & operator= (const SIMD &) = default;
|
||||
|
||||
// only called if T has a call operator of appropriate type
|
||||
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
|
||||
SIMD (const T & func)
|
||||
{
|
||||
data = func(0);
|
||||
}
|
||||
|
||||
// only called if T is arithmetic (integral or floating point types)
|
||||
template<typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
|
||||
SIMD (const T & val)
|
||||
{
|
||||
data = val;
|
||||
}
|
||||
|
||||
SIMD (double const * p)
|
||||
{
|
||||
data = *p;
|
||||
}
|
||||
|
||||
NG_INLINE operator double() const { return data; }
|
||||
NG_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
|
||||
NG_INLINE double Data() const { return data; }
|
||||
NG_INLINE double & Data() { return data; }
|
||||
|
||||
NG_INLINE SIMD<double,1> &operator+= (SIMD<double,1> b) { data+=b.Data(); return *this; }
|
||||
NG_INLINE SIMD<double,1> &operator-= (SIMD<double,1> b) { data-=b.Data(); return *this; }
|
||||
NG_INLINE SIMD<double,1> &operator*= (SIMD<double,1> b) { data*=b.Data(); return *this; }
|
||||
NG_INLINE SIMD<double,1> &operator/= (SIMD<double,1> b) { data/=b.Data(); return *this; }
|
||||
|
||||
};
|
||||
|
||||
NG_INLINE SIMD<double,1> operator+ (SIMD<double,1> a, SIMD<double,1> b) { return a.Data()+b.Data(); }
|
||||
NG_INLINE SIMD<double,1> operator- (SIMD<double,1> a, SIMD<double,1> b) { return a.Data()-b.Data(); }
|
||||
NG_INLINE SIMD<double,1> operator- (SIMD<double,1> a) { return -a.Data(); }
|
||||
NG_INLINE SIMD<double,1> operator* (SIMD<double,1> a, SIMD<double,1> b) { return a.Data()*b.Data(); }
|
||||
NG_INLINE SIMD<double,1> operator/ (SIMD<double,1> a, SIMD<double,1> b) { return a.Data()/b.Data(); }
|
||||
|
||||
NG_INLINE SIMD<double,1> sqrt (SIMD<double,1> a) { return std::sqrt(a.Data()); }
|
||||
NG_INLINE SIMD<double,1> floor (SIMD<double,1> a) { return std::floor(a.Data()); }
|
||||
NG_INLINE SIMD<double,1> ceil (SIMD<double,1> a) { return std::ceil(a.Data()); }
|
||||
NG_INLINE SIMD<double,1> fabs (SIMD<double,1> a) { return std::fabs(a.Data()); }
|
||||
NG_INLINE SIMD<double,1> L2Norm2 (SIMD<double,1> a) { return a.Data()*a.Data(); }
|
||||
NG_INLINE SIMD<double,1> Trans (SIMD<double,1> a) { return a; }
|
||||
NG_INLINE SIMD<double,1> IfPos (SIMD<double,1> a, SIMD<double,1> b, SIMD<double,1> c)
|
||||
{
|
||||
return (a.Data() > 0) ? b : c;
|
||||
}
|
||||
|
||||
NG_INLINE double HSum (SIMD<double,1> sd)
|
||||
{
|
||||
return sd.Data();
|
||||
}
|
||||
|
||||
NG_INLINE auto HSum (SIMD<double,1> sd1, SIMD<double,1> sd2)
|
||||
{
|
||||
return std::make_tuple(sd1.Data(), sd2.Data());
|
||||
}
|
||||
|
||||
NG_INLINE auto HSum (SIMD<double,1> sd1, SIMD<double,1> sd2, SIMD<double,1> sd3, SIMD<double,1> sd4)
|
||||
{
|
||||
return std::make_tuple(sd1.Data(), sd2.Data(), sd3.Data(), sd4.Data());
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// SSE - Simd width 2
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef __SSE__
|
||||
template<>
|
||||
class alignas(16) SIMD<double,2>
|
||||
{
|
||||
__m128d data;
|
||||
|
||||
public:
|
||||
static constexpr int Size() { return 2; }
|
||||
SIMD () = default;
|
||||
SIMD (const SIMD &) = default;
|
||||
SIMD & operator= (const SIMD &) = default;
|
||||
|
||||
SIMD (__m128d adata)
|
||||
: data(adata)
|
||||
{ ; }
|
||||
|
||||
// only called if T has a call operator of appropriate type
|
||||
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
|
||||
SIMD (const T & func)
|
||||
{
|
||||
data = _mm_set_pd(func(1), func(0));
|
||||
}
|
||||
|
||||
// only called if T is arithmetic (integral or floating point types)
|
||||
template<typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
|
||||
SIMD (const T & val)
|
||||
{
|
||||
data = _mm_set1_pd(val);
|
||||
}
|
||||
|
||||
SIMD (double const * p)
|
||||
{
|
||||
data = _mm_loadu_pd(p);
|
||||
}
|
||||
|
||||
NG_INLINE operator __m128d() const { return data; }
|
||||
NG_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
|
||||
NG_INLINE double& operator[] (int i) { return ((double*)(&data))[i]; }
|
||||
NG_INLINE __m128d Data() const { return data; }
|
||||
NG_INLINE __m128d & Data() { return data; }
|
||||
|
||||
// NG_INLINE operator std::tuple<double&,double&,double&,double&> ()
|
||||
// { return std::tuple<double&,double&,double&,double&>((*this)[0], (*this)[1], (*this)[2], (*this)[3]); }
|
||||
|
||||
|
||||
NG_INLINE SIMD<double,2> &operator+= (SIMD<double,2> b) { data+=b.Data(); return *this; }
|
||||
NG_INLINE SIMD<double,2> &operator-= (SIMD<double,2> b) { data-=b.Data(); return *this; }
|
||||
NG_INLINE SIMD<double,2> &operator*= (SIMD<double,2> b) { data*=b.Data(); return *this; }
|
||||
NG_INLINE SIMD<double,2> &operator/= (SIMD<double,2> b) { data/=b.Data(); return *this; }
|
||||
|
||||
};
|
||||
|
||||
NG_INLINE SIMD<double,2> operator+ (SIMD<double,2> a, SIMD<double,2> b) { return a.Data()+b.Data(); }
|
||||
NG_INLINE SIMD<double,2> operator- (SIMD<double,2> a, SIMD<double,2> b) { return a.Data()-b.Data(); }
|
||||
NG_INLINE SIMD<double,2> operator- (SIMD<double,2> a) { return -a.Data(); }
|
||||
NG_INLINE SIMD<double,2> operator* (SIMD<double,2> a, SIMD<double,2> b) { return a.Data()*b.Data(); }
|
||||
NG_INLINE SIMD<double,2> operator/ (SIMD<double,2> a, SIMD<double,2> b) { return a.Data()/b.Data(); }
|
||||
|
||||
/*
|
||||
NG_INLINE SIMD<double,4> sqrt (SIMD<double,4> a) { return _mm256_sqrt_pd(a.Data()); }
|
||||
NG_INLINE SIMD<double,4> floor (SIMD<double,4> a) { return _mm256_floor_pd(a.Data()); }
|
||||
NG_INLINE SIMD<double,4> ceil (SIMD<double,4> a) { return _mm256_ceil_pd(a.Data()); }
|
||||
NG_INLINE SIMD<double,4> fabs (SIMD<double,4> a) { return _mm256_max_pd(a.Data(), -a.Data()); }
|
||||
NG_INLINE SIMD<double,4> L2Norm2 (SIMD<double,4> a) { return a.Data()*a.Data(); }
|
||||
NG_INLINE SIMD<double,4> Trans (SIMD<double,4> a) { return a; }
|
||||
NG_INLINE SIMD<double,4> IfPos (SIMD<double,4> a, SIMD<double,4> b, SIMD<double,4> c)
|
||||
{
|
||||
auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_GT_OS);
|
||||
return _mm256_blendv_pd(c.Data(), b.Data(), cp);
|
||||
}
|
||||
|
||||
NG_INLINE double HSum (SIMD<double,4> sd)
|
||||
{
|
||||
__m128d hv = _mm_add_pd (_mm256_extractf128_pd(sd.Data(),0), _mm256_extractf128_pd(sd.Data(),1));
|
||||
return _mm_cvtsd_f64 (_mm_hadd_pd (hv, hv));
|
||||
}
|
||||
|
||||
NG_INLINE auto HSum (SIMD<double,4> sd1, SIMD<double,4> sd2)
|
||||
{
|
||||
__m256d hv = _mm256_hadd_pd(sd1.Data(), sd2.Data());
|
||||
__m128d hv2 = _mm_add_pd (_mm256_extractf128_pd(hv,0), _mm256_extractf128_pd(hv,1));
|
||||
return std::make_tuple(_mm_cvtsd_f64 (hv2), _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3)));
|
||||
}
|
||||
|
||||
NG_INLINE auto HSum (SIMD<double,4> v1, SIMD<double,4> v2, SIMD<double,4> v3, SIMD<double,4> v4)
|
||||
{
|
||||
__m256d hsum1 = _mm256_hadd_pd (v1.Data(), v2.Data());
|
||||
__m256d hsum2 = _mm256_hadd_pd (v3.Data(), v4.Data());
|
||||
__m256d hsum = _mm256_add_pd (_mm256_permute2f128_pd (hsum1, hsum2, 1+2*16),
|
||||
_mm256_blend_pd (hsum1, hsum2, 12));
|
||||
return SIMD<double,4>(hsum);
|
||||
}
|
||||
*/
|
||||
#endif // __SSE__
|
||||
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// AVX - Simd width 4
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef __AVX__
|
||||
template<>
|
||||
class alignas(32) SIMD<double,4>
|
||||
{
|
||||
__m256d data;
|
||||
|
||||
public:
|
||||
static constexpr int Size() { return 4; }
|
||||
SIMD () = default;
|
||||
SIMD (const SIMD &) = default;
|
||||
SIMD & operator= (const SIMD &) = default;
|
||||
|
||||
SIMD (__m256d adata)
|
||||
: data(adata)
|
||||
{ ; }
|
||||
|
||||
// only called if T has a call operator of appropriate type
|
||||
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
|
||||
SIMD (const T & func)
|
||||
{
|
||||
data = _mm256_set_pd(func(3), func(2), func(1), func(0));
|
||||
}
|
||||
|
||||
// only called if T is arithmetic (integral or floating point types)
|
||||
template<typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
|
||||
SIMD (const T & val)
|
||||
{
|
||||
data = _mm256_set1_pd(val);
|
||||
}
|
||||
|
||||
SIMD (double const * p)
|
||||
{
|
||||
data = _mm256_loadu_pd(p);
|
||||
}
|
||||
|
||||
NG_INLINE operator __m256d() const { return data; }
|
||||
NG_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
|
||||
NG_INLINE double& operator[] (int i) { return ((double*)(&data))[i]; }
|
||||
NG_INLINE __m256d Data() const { return data; }
|
||||
NG_INLINE __m256d & Data() { return data; }
|
||||
|
||||
NG_INLINE operator std::tuple<double&,double&,double&,double&> ()
|
||||
{ return std::tuple<double&,double&,double&,double&>((*this)[0], (*this)[1], (*this)[2], (*this)[3]); }
|
||||
|
||||
|
||||
NG_INLINE SIMD<double,4> &operator+= (SIMD<double,4> b) { data+=b.Data(); return *this; }
|
||||
NG_INLINE SIMD<double,4> &operator-= (SIMD<double,4> b) { data-=b.Data(); return *this; }
|
||||
NG_INLINE SIMD<double,4> &operator*= (SIMD<double,4> b) { data*=b.Data(); return *this; }
|
||||
NG_INLINE SIMD<double,4> &operator/= (SIMD<double,4> b) { data/=b.Data(); return *this; }
|
||||
|
||||
};
|
||||
|
||||
NG_INLINE SIMD<double,4> operator+ (SIMD<double,4> a, SIMD<double,4> b) { return a.Data()+b.Data(); }
|
||||
NG_INLINE SIMD<double,4> operator- (SIMD<double,4> a, SIMD<double,4> b) { return a.Data()-b.Data(); }
|
||||
NG_INLINE SIMD<double,4> operator- (SIMD<double,4> a) { return -a.Data(); }
|
||||
NG_INLINE SIMD<double,4> operator* (SIMD<double,4> a, SIMD<double,4> b) { return a.Data()*b.Data(); }
|
||||
NG_INLINE SIMD<double,4> operator/ (SIMD<double,4> a, SIMD<double,4> b) { return a.Data()/b.Data(); }
|
||||
|
||||
NG_INLINE SIMD<double,4> sqrt (SIMD<double,4> a) { return _mm256_sqrt_pd(a.Data()); }
|
||||
NG_INLINE SIMD<double,4> floor (SIMD<double,4> a) { return _mm256_floor_pd(a.Data()); }
|
||||
NG_INLINE SIMD<double,4> ceil (SIMD<double,4> a) { return _mm256_ceil_pd(a.Data()); }
|
||||
NG_INLINE SIMD<double,4> fabs (SIMD<double,4> a) { return _mm256_max_pd(a.Data(), -a.Data()); }
|
||||
NG_INLINE SIMD<double,4> L2Norm2 (SIMD<double,4> a) { return a.Data()*a.Data(); }
|
||||
NG_INLINE SIMD<double,4> Trans (SIMD<double,4> a) { return a; }
|
||||
NG_INLINE SIMD<double,4> IfPos (SIMD<double,4> a, SIMD<double,4> b, SIMD<double,4> c)
|
||||
{
|
||||
auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_GT_OS);
|
||||
return _mm256_blendv_pd(c.Data(), b.Data(), cp);
|
||||
}
|
||||
|
||||
NG_INLINE double HSum (SIMD<double,4> sd)
|
||||
{
|
||||
__m128d hv = _mm_add_pd (_mm256_extractf128_pd(sd.Data(),0), _mm256_extractf128_pd(sd.Data(),1));
|
||||
return _mm_cvtsd_f64 (_mm_hadd_pd (hv, hv));
|
||||
}
|
||||
|
||||
NG_INLINE auto HSum (SIMD<double,4> sd1, SIMD<double,4> sd2)
|
||||
{
|
||||
__m256d hv = _mm256_hadd_pd(sd1.Data(), sd2.Data());
|
||||
__m128d hv2 = _mm_add_pd (_mm256_extractf128_pd(hv,0), _mm256_extractf128_pd(hv,1));
|
||||
return std::make_tuple(_mm_cvtsd_f64 (hv2), _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3)));
|
||||
}
|
||||
|
||||
NG_INLINE auto HSum (SIMD<double,4> v1, SIMD<double,4> v2, SIMD<double,4> v3, SIMD<double,4> v4)
|
||||
{
|
||||
__m256d hsum1 = _mm256_hadd_pd (v1.Data(), v2.Data());
|
||||
__m256d hsum2 = _mm256_hadd_pd (v3.Data(), v4.Data());
|
||||
__m256d hsum = _mm256_add_pd (_mm256_permute2f128_pd (hsum1, hsum2, 1+2*16),
|
||||
_mm256_blend_pd (hsum1, hsum2, 12));
|
||||
return SIMD<double,4>(hsum);
|
||||
}
|
||||
|
||||
#endif // __AVX__
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// AVX512 - Simd width 8
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef __AVX512F__
|
||||
template<>
|
||||
class alignas(64) SIMD<double,8>
|
||||
{
|
||||
__m512d data;
|
||||
|
||||
public:
|
||||
static constexpr int Size() { return 8; }
|
||||
SIMD () = default;
|
||||
SIMD (const SIMD &) = default;
|
||||
SIMD & operator= (const SIMD &) = default;
|
||||
|
||||
SIMD (__m512d adata)
|
||||
: data(adata)
|
||||
{ ; }
|
||||
|
||||
// only called if T has a call operator of appropriate type
|
||||
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
|
||||
SIMD (const T & func)
|
||||
{
|
||||
data = _mm512_set_pd(func(7), func(6), func(5), func(4),
|
||||
func(3), func(2), func(1), func(0));
|
||||
}
|
||||
|
||||
// only called if T is arithmetic (integral or floating point types)
|
||||
template<typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
|
||||
SIMD (const T & val)
|
||||
{
|
||||
data = _mm512_set1_pd(val);
|
||||
}
|
||||
|
||||
SIMD (double const * p)
|
||||
{
|
||||
data = _mm512_loadu_pd(p);
|
||||
}
|
||||
|
||||
NG_INLINE operator __m512d() const { return data; }
|
||||
NG_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
|
||||
NG_INLINE __m512d Data() const { return data; }
|
||||
NG_INLINE __m512d & Data() { return data; }
|
||||
|
||||
NG_INLINE SIMD<double,8> &operator+= (SIMD<double,8> b) { data+=b.Data(); return *this; }
|
||||
NG_INLINE SIMD<double,8> &operator-= (SIMD<double,8> b) { data-=b.Data(); return *this; }
|
||||
NG_INLINE SIMD<double,8> &operator*= (SIMD<double,8> b) { data*=b.Data(); return *this; }
|
||||
NG_INLINE SIMD<double,8> &operator/= (SIMD<double,8> b) { data/=b.Data(); return *this; }
|
||||
|
||||
};
|
||||
|
||||
NG_INLINE SIMD<double,8> operator- (SIMD<double,8> a) { return _mm512_sub_pd(_mm512_setzero_pd(), a.Data()); }
|
||||
|
||||
NG_INLINE SIMD<double,8> operator+ (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_add_pd(a.Data(),b.Data()); }
|
||||
NG_INLINE SIMD<double,8> operator- (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_sub_pd(a.Data(),b.Data()); }
|
||||
NG_INLINE SIMD<double,8> operator* (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_mul_pd(a.Data(),b.Data()); }
|
||||
NG_INLINE SIMD<double,8> operator/ (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_div_pd(a.Data(),b.Data()); }
|
||||
|
||||
NG_INLINE SIMD<double,8> sqrt (SIMD<double,8> a) { return _mm512_sqrt_pd(a.Data()); }
|
||||
NG_INLINE SIMD<double,8> floor (SIMD<double,8> a) { return _mm512_floor_pd(a.Data()); }
|
||||
NG_INLINE SIMD<double,8> ceil (SIMD<double,8> a) { return _mm512_ceil_pd(a.Data()); }
|
||||
NG_INLINE SIMD<double,8> fabs (SIMD<double,8> a) { return _mm512_max_pd(a.Data(), -a.Data()); }
|
||||
NG_INLINE SIMD<double,8> L2Norm2 (SIMD<double,8> a) { return a.Data()*a.Data(); }
|
||||
NG_INLINE SIMD<double,8> Trans (SIMD<double,8> a) { return a; }
|
||||
NG_INLINE SIMD<double,8> IfPos (SIMD<double,8> a, SIMD<double,8> b, SIMD<double,8> c)
|
||||
{
|
||||
auto cp = _mm512_cmp_pd_mask (a.Data(), _mm512_setzero_pd(), _MM_CMPINT_GT);
|
||||
return _mm512_mask_blend_pd(cp, c.Data(), b.Data());
|
||||
}
|
||||
|
||||
|
||||
template<> NG_INLINE auto FMA (SIMD<double,8> a, SIMD<double,8> b, SIMD<double,8> c)
|
||||
{
|
||||
return _mm512_fmadd_pd (a.Data(), b.Data(), c.Data());
|
||||
}
|
||||
|
||||
NG_INLINE double HSum (SIMD<double,8> sd)
|
||||
{
|
||||
SIMD<double,4> low = _mm512_extractf64x4_pd(sd.Data(),0);
|
||||
SIMD<double,4> high = _mm512_extractf64x4_pd(sd.Data(),1);
|
||||
return HSum(low)+HSum(high);
|
||||
}
|
||||
|
||||
NG_INLINE auto HSum (SIMD<double,8> sd1, SIMD<double,8> sd2)
|
||||
{
|
||||
return std::make_tuple(HSum(sd1), HSum(sd2));
|
||||
}
|
||||
|
||||
NG_INLINE SIMD<double,4> HSum (SIMD<double,8> v1, SIMD<double,8> v2, SIMD<double,8> v3, SIMD<double,8> v4)
|
||||
{
|
||||
SIMD<double,4> high1 = _mm512_extractf64x4_pd(v1.Data(),1);
|
||||
SIMD<double,4> high2 = _mm512_extractf64x4_pd(v2.Data(),1);
|
||||
SIMD<double,4> high3 = _mm512_extractf64x4_pd(v3.Data(),1);
|
||||
SIMD<double,4> high4 = _mm512_extractf64x4_pd(v4.Data(),1);
|
||||
SIMD<double,4> low1 = _mm512_extractf64x4_pd(v1.Data(),0);
|
||||
SIMD<double,4> low2 = _mm512_extractf64x4_pd(v2.Data(),0);
|
||||
SIMD<double,4> low3 = _mm512_extractf64x4_pd(v3.Data(),0);
|
||||
SIMD<double,4> low4 = _mm512_extractf64x4_pd(v4.Data(),0);
|
||||
return HSum(low1,low2,low3,low4) + HSum(high1,high2,high3,high4);
|
||||
}
|
||||
#endif // __AVX512F__
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// MultiSIMD - Multiple SIMD values in one struct using head-tail implementation
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template <int D, typename T>
|
||||
class MultiSIMD
|
||||
{
|
||||
SIMD<T> head;
|
||||
MultiSIMD<D-1,T> tail;
|
||||
public:
|
||||
MultiSIMD () = default;
|
||||
MultiSIMD (const MultiSIMD & ) = default;
|
||||
MultiSIMD (T v) : head(v), tail(v) { ; }
|
||||
MultiSIMD (SIMD<T> _head, MultiSIMD<D-1,T> _tail)
|
||||
: head(_head), tail(_tail) { ; }
|
||||
SIMD<T> Head() const { return head; }
|
||||
MultiSIMD<D-1,T> Tail() const { return tail; }
|
||||
SIMD<T> & Head() { return head; }
|
||||
MultiSIMD<D-1,T> & Tail() { return tail; }
|
||||
|
||||
template <int NR>
|
||||
SIMD<T> Get() const { return NR==0 ? head : tail.template Get<NR-1>(); }
|
||||
template <int NR>
|
||||
SIMD<T> & Get() { return NR==0 ? head : tail.template Get<NR-1>(); }
|
||||
auto MakeTuple() { return std::tuple_cat(std::tuple<SIMD<T>&> (head), tail.MakeTuple()); }
|
||||
// not yet possible for MSVC
|
||||
// operator auto () { return MakeTuple(); }
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
class MultiSIMD<2,T>
|
||||
{
|
||||
SIMD<T> v0, v1;
|
||||
public:
|
||||
MultiSIMD () = default;
|
||||
MultiSIMD (const MultiSIMD & ) = default;
|
||||
MultiSIMD (T v) : v0(v), v1(v) { ; }
|
||||
MultiSIMD (SIMD<T> _v0, SIMD<T> _v1) : v0(_v0), v1(_v1) { ; }
|
||||
|
||||
SIMD<T> Head() const { return v0; }
|
||||
SIMD<T> Tail() const { return v1; }
|
||||
SIMD<T> & Head() { return v0; }
|
||||
SIMD<T> & Tail() { return v1; }
|
||||
|
||||
template <int NR>
|
||||
SIMD<T> Get() const { return NR==0 ? v0 : v1; }
|
||||
template <int NR>
|
||||
SIMD<T> & Get() { return NR==0 ? v0 : v1; }
|
||||
auto MakeTuple() { return std::tuple<SIMD<T>&, SIMD<T>&> (v0, v1); }
|
||||
operator std::tuple<SIMD<T>&, SIMD<T>&>() { return MakeTuple(); }
|
||||
};
|
||||
|
||||
template <int D> NG_INLINE MultiSIMD<D,double> operator+ (MultiSIMD<D,double> a, MultiSIMD<D,double> b)
|
||||
{ return MultiSIMD<D,double> (a.Head()+b.Head(), a.Tail()+b.Tail()); }
|
||||
template <int D> NG_INLINE MultiSIMD<D,double> operator+ (double a, MultiSIMD<D,double> b)
|
||||
{ return MultiSIMD<D,double> (a+b.Head(), a+b.Tail()); }
|
||||
template <int D> NG_INLINE MultiSIMD<D,double> operator+ (MultiSIMD<D,double> b, double a)
|
||||
{ return MultiSIMD<D,double> (a+b.Head(), a+b.Tail()); }
|
||||
|
||||
template <int D> NG_INLINE MultiSIMD<D,double> operator- (MultiSIMD<D,double> a, MultiSIMD<D,double> b)
|
||||
{ return MultiSIMD<D,double> (a.Head()-b.Head(), a.Tail()-b.Tail()); }
|
||||
template <int D> NG_INLINE MultiSIMD<D,double> operator- (double a, MultiSIMD<D,double> b)
|
||||
{ return MultiSIMD<D,double> (a-b.Head(), a-b.Tail()); }
|
||||
template <int D> NG_INLINE MultiSIMD<D,double> operator- (MultiSIMD<D,double> b, double a)
|
||||
{ return MultiSIMD<D,double> (b.Head()-a, b.Tail()-a); }
|
||||
template <int D> NG_INLINE MultiSIMD<D,double> operator- (MultiSIMD<D,double> a)
|
||||
{ return MultiSIMD<D,double> (-a.Head(), -a.Tail()); }
|
||||
template <int D> NG_INLINE MultiSIMD<D,double> operator* (MultiSIMD<D,double> a, MultiSIMD<D,double> b)
|
||||
{ return MultiSIMD<D,double> (a.Head()*b.Head(), a.Tail()*b.Tail()); }
|
||||
template <int D> NG_INLINE MultiSIMD<D,double> operator/ (MultiSIMD<D,double> a, MultiSIMD<D,double> b)
|
||||
{ return MultiSIMD<D,double> (a.Head()/b.Head(), a.Tail()/b.Tail()); }
|
||||
template <int D> NG_INLINE MultiSIMD<D,double> operator* (double a, MultiSIMD<D,double> b)
|
||||
{ return MultiSIMD<D,double> ( a*b.Head(), a*b.Tail()); }
|
||||
template <int D> NG_INLINE MultiSIMD<D,double> operator* (MultiSIMD<D,double> b, double a)
|
||||
{ return MultiSIMD<D,double> ( a*b.Head(), a*b.Tail()); }
|
||||
|
||||
template <int D> NG_INLINE MultiSIMD<D,double> & operator+= (MultiSIMD<D,double> & a, MultiSIMD<D,double> b)
|
||||
{ a.Head()+=b.Head(); a.Tail()+=b.Tail(); return a; }
|
||||
template <int D> NG_INLINE MultiSIMD<D,double> operator-= (MultiSIMD<D,double> & a, double b)
|
||||
{ a.Head()-=b; a.Tail()-=b; return a; }
|
||||
template <int D> NG_INLINE MultiSIMD<D,double> operator-= (MultiSIMD<D,double> & a, MultiSIMD<D,double> b)
|
||||
{ a.Head()-=b.Head(); a.Tail()-=b.Tail(); return a; }
|
||||
template <int D> NG_INLINE MultiSIMD<D,double> & operator*= (MultiSIMD<D,double> & a, MultiSIMD<D,double> b)
|
||||
{ a.Head()*=b.Head(); a.Tail()*=b.Tail(); return a; }
|
||||
template <int D> NG_INLINE MultiSIMD<D,double> & operator*= (MultiSIMD<D,double> & a, double b)
|
||||
{ a.Head()*=b; a.Tail()*=b; return a; }
|
||||
// NG_INLINE MultiSIMD<double> operator/= (MultiSIMD<double> & a, MultiSIMD<double> b) { return a.Data()/=b.Data(); }
|
||||
|
||||
NG_INLINE SIMD<double> HVSum (SIMD<double> a) { return a; }
|
||||
template <int D>
|
||||
NG_INLINE SIMD<double> HVSum (MultiSIMD<D,double> a) { return a.Head() + HVSum(a.Tail()); }
|
||||
|
||||
template <int D> NG_INLINE double HSum (MultiSIMD<D,double> a) { return HSum(HVSum(a)); }
|
||||
template <int D> NG_INLINE auto HSum (MultiSIMD<D,double> a, MultiSIMD<D,double> b)
|
||||
{ return HSum(HVSum(a), HVSum(b)); }
|
||||
|
||||
template <int D, typename T>
|
||||
std::ostream & operator<< (std::ostream & ost, MultiSIMD<D,T> multi)
|
||||
{
|
||||
ost << multi.Head() << " " << multi.Tail();
|
||||
return ost;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::ostream & operator<< (std::ostream & ost, SIMD<T> simd)
|
||||
{
|
||||
ost << simd[0];
|
||||
for (int i = 1; i < simd.Size(); i++)
|
||||
ost << " " << simd[i];
|
||||
return ost;
|
||||
}
|
||||
}
|
||||
|
||||
namespace netgen
|
||||
{
|
||||
using namespace ngsimd;
|
||||
}
|
||||
#endif
|
@ -779,29 +779,23 @@ namespace netgen
|
||||
|
||||
|
||||
|
||||
#ifdef __SSE__
|
||||
#include <immintrin.h>
|
||||
|
||||
template<> DLL_HEADER void Ngx_Mesh ::
|
||||
MultiElementTransformation<1,1> (int elnr, int npts,
|
||||
const tAVXd * xi, size_t sxi,
|
||||
tAVXd * x, size_t sx,
|
||||
tAVXd * dxdxi, size_t sdxdxi) const
|
||||
const SIMD<double> * xi, size_t sxi,
|
||||
SIMD<double> * x, size_t sx,
|
||||
SIMD<double> * dxdxi, size_t sdxdxi) const
|
||||
{
|
||||
cout << "multi-eltrafo simd called, 1,1,simd" << endl;
|
||||
}
|
||||
|
||||
template<> DLL_HEADER void Ngx_Mesh ::
|
||||
MultiElementTransformation<2,2> (int elnr, int npts,
|
||||
const tAVXd * xi, size_t sxi,
|
||||
tAVXd * x, size_t sx,
|
||||
tAVXd * dxdxi, size_t sdxdxi) const
|
||||
const SIMD<double> * xi, size_t sxi,
|
||||
SIMD<double> * x, size_t sx,
|
||||
SIMD<double> * dxdxi, size_t sdxdxi) const
|
||||
{
|
||||
mesh->GetCurvedElements().CalcMultiPointSurfaceTransformation<2>
|
||||
(elnr, npts,
|
||||
reinterpret_cast<const SIMD<double>*> (xi), sxi,
|
||||
reinterpret_cast<SIMD<double>*> (x), sx,
|
||||
reinterpret_cast<SIMD<double>*> (dxdxi), sdxdxi);
|
||||
(elnr, npts, xi, sxi, x, sx, dxdxi, sdxdxi);
|
||||
/*
|
||||
for (int i = 0; i < npts; i++)
|
||||
{
|
||||
@ -828,15 +822,15 @@ namespace netgen
|
||||
|
||||
template<> DLL_HEADER void Ngx_Mesh ::
|
||||
MultiElementTransformation<3,3> (int elnr, int npts,
|
||||
const tAVXd * xi, size_t sxi,
|
||||
tAVXd * x, size_t sx,
|
||||
tAVXd * dxdxi, size_t sdxdxi) const
|
||||
const SIMD<double> * xi, size_t sxi,
|
||||
SIMD<double> * x, size_t sx,
|
||||
SIMD<double> * dxdxi, size_t sdxdxi) const
|
||||
{
|
||||
mesh->GetCurvedElements().CalcMultiPointElementTransformation
|
||||
(elnr, npts,
|
||||
reinterpret_cast<const SIMD<double>*> (xi), sxi,
|
||||
reinterpret_cast<SIMD<double>*> (x), sx,
|
||||
reinterpret_cast<SIMD<double>*> (dxdxi), sdxdxi);
|
||||
xi, sxi,
|
||||
x, sx,
|
||||
dxdxi, sdxdxi);
|
||||
/*
|
||||
for (int i = 0; i < npts; i++)
|
||||
{
|
||||
@ -863,33 +857,30 @@ namespace netgen
|
||||
|
||||
template<> DLL_HEADER void Ngx_Mesh ::
|
||||
MultiElementTransformation<0,2> (int elnr, int npts,
|
||||
const tAVXd *xi, size_t sxi,
|
||||
tAVXd * x, size_t sx,
|
||||
tAVXd * dxdxi, size_t sdxdxi) const
|
||||
const SIMD<double> *xi, size_t sxi,
|
||||
SIMD<double> * x, size_t sx,
|
||||
SIMD<double> * dxdxi, size_t sdxdxi) const
|
||||
{
|
||||
cout << "MultiElementtransformation<0,2> simd not implemented" << endl;
|
||||
}
|
||||
|
||||
template<> DLL_HEADER void Ngx_Mesh ::
|
||||
MultiElementTransformation<0,1> (int elnr, int npts,
|
||||
const tAVXd * xi, size_t sxi,
|
||||
tAVXd * x, size_t sx,
|
||||
tAVXd * dxdxi, size_t sdxdxi) const
|
||||
const SIMD<double> * xi, size_t sxi,
|
||||
SIMD<double> * x, size_t sx,
|
||||
SIMD<double> * dxdxi, size_t sdxdxi) const
|
||||
{
|
||||
cout << "multi-eltrafo simd called, 0,1,simd" << endl;
|
||||
}
|
||||
|
||||
template<> DLL_HEADER void Ngx_Mesh ::
|
||||
MultiElementTransformation<1,3> (int elnr, int npts,
|
||||
const tAVXd * xi, size_t sxi,
|
||||
tAVXd * x, size_t sx,
|
||||
tAVXd * dxdxi, size_t sdxdxi) const
|
||||
const SIMD<double> * xi, size_t sxi,
|
||||
SIMD<double> * x, size_t sx,
|
||||
SIMD<double> * dxdxi, size_t sdxdxi) const
|
||||
{
|
||||
mesh->GetCurvedElements().CalcMultiPointSegmentTransformation<3>
|
||||
(elnr, npts,
|
||||
reinterpret_cast<const SIMD<double>*> (xi), sxi,
|
||||
reinterpret_cast<SIMD<double>*> (x), sx,
|
||||
reinterpret_cast<SIMD<double>*> (dxdxi), sdxdxi);
|
||||
(elnr, npts, xi, sxi, x, sx, dxdxi, sdxdxi);
|
||||
/*
|
||||
double hxi[4][1];
|
||||
double hx[4][3];
|
||||
@ -912,15 +903,12 @@ namespace netgen
|
||||
|
||||
template<> DLL_HEADER void Ngx_Mesh ::
|
||||
MultiElementTransformation<1,2> (int elnr, int npts,
|
||||
const tAVXd * xi, size_t sxi,
|
||||
tAVXd * x, size_t sx,
|
||||
tAVXd * dxdxi, size_t sdxdxi) const
|
||||
const SIMD<double> * xi, size_t sxi,
|
||||
SIMD<double> * x, size_t sx,
|
||||
SIMD<double> * dxdxi, size_t sdxdxi) const
|
||||
{
|
||||
mesh->GetCurvedElements().CalcMultiPointSegmentTransformation<2>
|
||||
(elnr, npts,
|
||||
reinterpret_cast<const SIMD<double>*> (xi), sxi,
|
||||
reinterpret_cast<SIMD<double>*> (x), sx,
|
||||
reinterpret_cast<SIMD<double>*> (dxdxi), sdxdxi);
|
||||
(elnr, npts, xi, sxi, x, sx, dxdxi, sdxdxi);
|
||||
/*
|
||||
for (int i = 0; i < npts; i++)
|
||||
{
|
||||
@ -947,15 +935,12 @@ namespace netgen
|
||||
|
||||
template<> DLL_HEADER void Ngx_Mesh ::
|
||||
MultiElementTransformation<2,3> (int elnr, int npts,
|
||||
const tAVXd * xi, size_t sxi,
|
||||
tAVXd * x, size_t sx,
|
||||
tAVXd * dxdxi, size_t sdxdxi) const
|
||||
const SIMD<double> * xi, size_t sxi,
|
||||
SIMD<double> * x, size_t sx,
|
||||
SIMD<double> * dxdxi, size_t sdxdxi) const
|
||||
{
|
||||
mesh->GetCurvedElements().CalcMultiPointSurfaceTransformation<3>
|
||||
(elnr, npts,
|
||||
reinterpret_cast<const SIMD<double>*> (xi), sxi,
|
||||
reinterpret_cast<SIMD<double>*> (x), sx,
|
||||
reinterpret_cast<SIMD<double>*> (dxdxi), sdxdxi);
|
||||
(elnr, npts, xi, sxi, x, sx, dxdxi, sdxdxi);
|
||||
/*
|
||||
for (int i = 0; i < npts; i++)
|
||||
{
|
||||
@ -982,9 +967,9 @@ namespace netgen
|
||||
|
||||
template<> DLL_HEADER void Ngx_Mesh ::
|
||||
MultiElementTransformation<0,3> (int elnr, int npts,
|
||||
const tAVXd * xi, size_t sxi,
|
||||
tAVXd * x, size_t sx,
|
||||
tAVXd * dxdxi, size_t sdxdxi) const
|
||||
const SIMD<double> * xi, size_t sxi,
|
||||
SIMD<double> * x, size_t sx,
|
||||
SIMD<double> * dxdxi, size_t sdxdxi) const
|
||||
{
|
||||
for (int i = 0; i < npts; i++)
|
||||
{
|
||||
@ -1003,10 +988,6 @@ namespace netgen
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -6,15 +6,6 @@ namespace netgen
|
||||
{
|
||||
|
||||
using namespace std;
|
||||
/*
|
||||
#if defined __AVX512F__
|
||||
typedef __m512 tAVX;
|
||||
typedef __m512d tAVXd;
|
||||
#elif defined __AVX__
|
||||
typedef __m256 tAVX;
|
||||
typedef __m256d tAVXd;
|
||||
#endif
|
||||
*/
|
||||
|
||||
class SolutionData
|
||||
{
|
||||
@ -101,17 +92,15 @@ namespace netgen
|
||||
return res;
|
||||
}
|
||||
|
||||
#ifdef __SSE__
|
||||
virtual bool GetMultiSurfValue (size_t selnr, size_t facetnr, size_t npts,
|
||||
const ngsimd::tAVXd * xref,
|
||||
const ngsimd::tAVXd * x,
|
||||
const ngsimd::tAVXd * dxdxref,
|
||||
ngsimd::tAVXd * values)
|
||||
const SIMD<double> * xref,
|
||||
const SIMD<double> * x,
|
||||
const SIMD<double> * dxdxref,
|
||||
SIMD<double> * values)
|
||||
{
|
||||
cerr << "GetMultiSurfVaue not overloaded for SIMD<double>" << endl;
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
virtual bool GetSegmentValue (int segnr, double xref, double * values)
|
||||
{ return false; }
|
||||
|
Loading…
Reference in New Issue
Block a user