Merge branch 'refactor_simd' into 'master'

Move SIMD headers from ngsolve to ngcore

See merge request jschoeberl/netgen!358
This commit is contained in:
Joachim Schöberl 2020-12-15 10:07:56 +00:00
commit f626255ceb
12 changed files with 1528 additions and 743 deletions

View File

@ -73,6 +73,7 @@ install(FILES ngcore.hpp archive.hpp type_traits.hpp version.hpp ngcore_api.hpp
exception.hpp symboltable.hpp paje_trace.hpp utils.hpp profiler.hpp mpi_wrapper.hpp
array.hpp taskmanager.hpp concurrentqueue.h localheap.hpp python_ngcore.hpp flags.hpp
xbool.hpp signal.hpp bitarray.hpp table.hpp hashtable.hpp ranges.hpp
simd.hpp simd_avx.hpp simd_avx512.hpp simd_generic.hpp simd_sse.hpp
DESTINATION ${NG_INSTALL_DIR_INCLUDE}/core COMPONENT netgen_devel)
if(ENABLE_CPP_CORE_GUIDELINES_CHECK)

View File

@ -13,6 +13,7 @@
#include "mpi_wrapper.hpp"
#include "profiler.hpp"
#include "signal.hpp"
#include "simd.hpp"
#include "symboltable.hpp"
#include "taskmanager.hpp"
#include "version.hpp"

69
libsrc/core/simd.hpp Normal file
View File

@ -0,0 +1,69 @@
#ifndef NETGEN_CORE_SIMD_HPP
#define NETGEN_CORE_SIMD_HPP
/**************************************************************************/
/* File: simd.hpp */
/* Author: Joachim Schoeberl, Matthias Hochsteger */
/* Date: 25. Mar. 16 */
/**************************************************************************/
#include "ngcore_api.hpp"
#include "simd_generic.hpp"
#if (defined(_M_AMD64) || defined(_M_X64) || defined(__SSE__))
#ifndef __SSE__
#define __SSE__
#endif
#include "simd_sse.hpp"
#endif
#ifdef __AVX__
#include "simd_avx.hpp"
#endif
#ifdef __AVX512F__
#include "simd_avx512.hpp"
#endif
namespace ngcore
{
NETGEN_INLINE auto HSum (SIMD<double,2> v1, SIMD<double,2> v2, SIMD<double,2> v3, SIMD<double,2> v4)
{
SIMD<double,2> hsum1 = my_mm_hadd_pd (v1.Data(), v2.Data());
SIMD<double,2> hsum2 = my_mm_hadd_pd (v3.Data(), v4.Data());
return SIMD<double,4> (hsum1, hsum2);
}
NETGEN_INLINE void SIMDTranspose (SIMD<double,4> a1, SIMD<double,4> a2, SIMD <double,4> a3, SIMD<double,4> a4,
SIMD<double,4> & b1, SIMD<double,4> & b2, SIMD<double,4> & b3, SIMD<double,4> & b4)
{
SIMD<double,4> h1,h2,h3,h4;
std::tie(h1,h2) = Unpack(a1,a2);
std::tie(h3,h4) = Unpack(a3,a4);
b1 = SIMD<double,4> (h1.Lo(), h3.Lo());
b2 = SIMD<double,4> (h2.Lo(), h4.Lo());
b3 = SIMD<double,4> (h1.Hi(), h3.Hi());
b4 = SIMD<double,4> (h2.Hi(), h4.Hi());
}
template<int N>
NETGEN_INLINE auto HSum (SIMD<double,N> s1, SIMD<double,N> s2)
{
return SIMD<double,2>(HSum(s1), HSum(s2));
}
template<int N>
NETGEN_INLINE auto HSum (SIMD<double,N> s1, SIMD<double,N> s2, SIMD<double,N> s3, SIMD<double,N> s4 )
{
return SIMD<double,4>(HSum(s1), HSum(s2), HSum(s3), HSum(s4));
}
NETGEN_INLINE auto GetMaskFromBits( unsigned int i )
{
return SIMD<mask64>::GetMaskFromBits(i);
}
}
#endif // NETGEN_CORE_SIMD_HPP

256
libsrc/core/simd_avx.hpp Normal file
View File

@ -0,0 +1,256 @@
#ifndef NETGEN_CORE_SIMD_AVX_HPP
#define NETGEN_CORE_SIMD_AVX_HPP
/**************************************************************************/
/* File: simd_avx.hpp */
/* Author: Joachim Schoeberl, Matthias Hochsteger */
/* Date: 25. Mar. 16 */
/**************************************************************************/
#include <immintrin.h>
namespace ngcore
{
#if defined(__AVX2__)
NETGEN_INLINE __m256i my_mm256_cmpgt_epi64 (__m256i a, __m256i b)
{
return _mm256_cmpgt_epi64 (a,b);
}
#else
NETGEN_INLINE __m256i my_mm256_cmpgt_epi64 (__m256i a, __m256i b)
{
__m128i rlo = _mm_cmpgt_epi64(_mm256_extractf128_si256(a, 0),
_mm256_extractf128_si256(b, 0));
__m128i rhi = _mm_cmpgt_epi64(_mm256_extractf128_si256(a, 1),
_mm256_extractf128_si256(b, 1));
return _mm256_insertf128_si256 (_mm256_castsi128_si256(rlo), rhi, 1);
}
#endif
template <>
class SIMD<mask64,4>
{
__m256i mask;
public:
SIMD (size_t i)
: mask(my_mm256_cmpgt_epi64(_mm256_set1_epi64x(i),
_mm256_set_epi64x(3, 2, 1, 0)))
{ ; }
SIMD (__m256i _mask) : mask(_mask) { ; }
SIMD (__m256d _mask) : mask(_mm256_castpd_si256(_mask)) { ; }
__m256i Data() const { return mask; }
static constexpr int Size() { return 4; }
static SIMD<mask64, 4> GetMaskFromBits (unsigned int i);
};
static SIMD<mask64, 4> masks_from_4bits[16] = {
_mm256_set_epi64x (0,0,0,0), _mm256_set_epi64x (0,0,0,-1),
_mm256_set_epi64x (0,0,-1,0), _mm256_set_epi64x (0,0,-1,-1),
_mm256_set_epi64x (0,-1,0,0), _mm256_set_epi64x (0,-1,0,-1),
_mm256_set_epi64x (0,-1,-1,0), _mm256_set_epi64x (0,-1,-1,-1),
_mm256_set_epi64x (-1,0,0,0), _mm256_set_epi64x (-1,0,0,-1),
_mm256_set_epi64x (-1,0,-1,0), _mm256_set_epi64x (-1,0,-1,-1),
_mm256_set_epi64x (-1,-1,0,0), _mm256_set_epi64x (-1,-1,0,-1),
_mm256_set_epi64x (-1,-1,-1,0), _mm256_set_epi64x (-1,-1,-1,-1)
};
NETGEN_INLINE SIMD<mask64, 4> SIMD<mask64, 4> :: GetMaskFromBits (unsigned int i)
{
return masks_from_4bits[i & 15];
}
template<>
class alignas(32) SIMD<int64_t,4>
{
__m256i data;
public:
static constexpr int Size() { return 4; }
SIMD () {}
SIMD (const SIMD &) = default;
SIMD & operator= (const SIMD &) = default;
SIMD (int64_t val) { data = _mm256_set1_epi64x(val); }
SIMD (int64_t v0, int64_t v1, int64_t v2, int64_t v3) { data = _mm256_set_epi64x(v3,v2,v1,v0); }
SIMD (std::array<int64_t,4> a)
: data{_mm256_set_epi64x(a[3],a[2],a[1],a[0])}
{}
SIMD (SIMD<int64_t,2> v0, SIMD<int64_t,2> v1)
: data(_mm256_set_m128i(v0.Data(),v1.Data()))
{}
SIMD (__m256i _data) { data = _data; }
NETGEN_INLINE auto operator[] (int i) const { return ((int64_t*)(&data))[i]; }
NETGEN_INLINE __m256i Data() const { return data; }
NETGEN_INLINE __m256i & Data() { return data; }
SIMD<int64_t,2> Lo() const { return _mm256_extractf128_si256(data, 0); }
SIMD<int64_t,2> Hi() const { return _mm256_extractf128_si256(data, 1); }
static SIMD FirstInt(int n0=0) { return { n0+0, n0+1, n0+2, n0+3 }; }
};
NETGEN_INLINE SIMD<int64_t,4> operator-(SIMD<int64_t,4> a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a.Data()); }
#ifdef __AVX2__
NETGEN_INLINE SIMD<int64_t,4> operator+ (SIMD<int64_t,4> a, SIMD<int64_t,4> b) { return _mm256_add_epi64(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<int64_t,4> operator- (SIMD<int64_t,4> a, SIMD<int64_t,4> b) { return _mm256_sub_epi64(a.Data(),b.Data()); }
#endif // __AVX2__
template<>
class alignas(32) SIMD<double,4>
{
__m256d data;
public:
static constexpr int Size() { return 4; }
SIMD () {}
SIMD (const SIMD &) = default;
SIMD & operator= (const SIMD &) = default;
SIMD (double val) { data = _mm256_set1_pd(val); }
SIMD (int val) { data = _mm256_set1_pd(val); }
SIMD (size_t val) { data = _mm256_set1_pd(val); }
SIMD (double v0, double v1, double v2, double v3) { data = _mm256_set_pd(v3,v2,v1,v0); }
SIMD (SIMD<double,2> v0, SIMD<double,2> v1) : SIMD(v0[0], v0[1], v1[0], v1[1]) { ; }
SIMD (double const * p) { data = _mm256_loadu_pd(p); }
SIMD (double const * p, SIMD<mask64,4> mask) { data = _mm256_maskload_pd(p, mask.Data()); }
SIMD (__m256d _data) { data = _data; }
SIMD (std::array<double,4> a)
: data{_mm256_set_pd(a[3],a[2],a[1],a[0])}
{}
void Store (double * p) { _mm256_storeu_pd(p, data); }
void Store (double * p, SIMD<mask64,4> mask) { _mm256_maskstore_pd(p, mask.Data(), data); }
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
SIMD (const T & func)
{
data = _mm256_set_pd(func(3), func(2), func(1), func(0));
}
NETGEN_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
NETGEN_INLINE double & operator[] (int i) { return ((double*)(&data))[i]; }
// [[deprecated("don't write to individual elments of SIMD")]]
// NETGEN_INLINE double & operator[] (int i) { return ((double*)(&data))[i]; }
template <int I>
double Get() const { return ((double*)(&data))[I]; }
NETGEN_INLINE __m256d Data() const { return data; }
NETGEN_INLINE __m256d & Data() { return data; }
SIMD<double,2> Lo() const { return _mm256_extractf128_pd(data, 0); }
SIMD<double,2> Hi() const { return _mm256_extractf128_pd(data, 1); }
operator std::tuple<double&,double&,double&,double&> ()
{ return std::tuple<double&,double&,double&,double&>((*this)[0], (*this)[1], (*this)[2], (*this)[3]); }
};
NETGEN_INLINE auto Unpack (SIMD<double,4> a, SIMD<double,4> b)
{
return std::make_tuple(SIMD<double,4>(_mm256_unpacklo_pd(a.Data(),b.Data())),
SIMD<double,4>(_mm256_unpackhi_pd(a.Data(),b.Data())));
}
NETGEN_INLINE SIMD<double,4> operator- (SIMD<double,4> a) { return _mm256_xor_pd(a.Data(), _mm256_set1_pd(-0.0)); }
NETGEN_INLINE SIMD<double,4> operator+ (SIMD<double,4> a, SIMD<double,4> b) { return _mm256_add_pd(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<double,4> operator- (SIMD<double,4> a, SIMD<double,4> b) { return _mm256_sub_pd(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<double,4> operator* (SIMD<double,4> a, SIMD<double,4> b) { return _mm256_mul_pd(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<double,4> operator/ (SIMD<double,4> a, SIMD<double,4> b) { return _mm256_div_pd(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<double,4> operator* (double a, SIMD<double,4> b) { return _mm256_set1_pd(a)*b.Data(); }
NETGEN_INLINE SIMD<double,4> operator* (SIMD<double,4> b, double a) { return _mm256_set1_pd(a)*b.Data(); }
NETGEN_INLINE SIMD<double,4> sqrt (SIMD<double,4> a) { return _mm256_sqrt_pd(a.Data()); }
NETGEN_INLINE SIMD<double,4> floor (SIMD<double,4> a) { return _mm256_floor_pd(a.Data()); }
NETGEN_INLINE SIMD<double,4> ceil (SIMD<double,4> a) { return _mm256_ceil_pd(a.Data()); }
NETGEN_INLINE SIMD<double,4> fabs (SIMD<double,4> a) { return _mm256_max_pd(a.Data(), (-a).Data()); }
NETGEN_INLINE SIMD<mask64,4> operator<= (SIMD<double,4> a , SIMD<double,4> b)
{ return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_LE_OQ); }
NETGEN_INLINE SIMD<mask64,4> operator< (SIMD<double,4> a , SIMD<double,4> b)
{ return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_LT_OQ); }
NETGEN_INLINE SIMD<mask64,4> operator>= (SIMD<double,4> a , SIMD<double,4> b)
{ return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_GE_OQ); }
NETGEN_INLINE SIMD<mask64,4> operator> (SIMD<double,4> a , SIMD<double,4> b)
{ return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_GT_OQ); }
NETGEN_INLINE SIMD<mask64,4> operator== (SIMD<double,4> a , SIMD<double,4> b)
{ return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_EQ_OQ); }
NETGEN_INLINE SIMD<mask64,4> operator!= (SIMD<double,4> a , SIMD<double,4> b)
{ return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_NEQ_OQ); }
NETGEN_INLINE SIMD<mask64,4> operator<= (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
{ return _mm256_xor_si256(_mm256_cmpgt_epi64(a.Data(),b.Data()),_mm256_set1_epi32(-1)); }
NETGEN_INLINE SIMD<mask64,4> operator< (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
{ return my_mm256_cmpgt_epi64(b.Data(),a.Data()); }
NETGEN_INLINE SIMD<mask64,4> operator>= (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
{ return _mm256_xor_si256(_mm256_cmpgt_epi64(b.Data(),a.Data()),_mm256_set1_epi32(-1)); }
NETGEN_INLINE SIMD<mask64,4> operator> (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
{ return my_mm256_cmpgt_epi64(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<mask64,4> operator== (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
{ return _mm256_cmpeq_epi64(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<mask64,4> operator!= (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
{ return _mm256_xor_si256(_mm256_cmpeq_epi64(a.Data(),b.Data()),_mm256_set1_epi32(-1)); }
#ifdef __AVX2__
NETGEN_INLINE SIMD<mask64,4> operator&& (SIMD<mask64,4> a, SIMD<mask64,4> b)
{ return _mm256_and_si256 (a.Data(), b.Data()); }
NETGEN_INLINE SIMD<mask64,4> operator|| (SIMD<mask64,4> a, SIMD<mask64,4> b)
{ return _mm256_or_si256 (a.Data(), b.Data()); }
NETGEN_INLINE SIMD<mask64,4> operator! (SIMD<mask64,4> a)
{ return _mm256_xor_si256 (a.Data(), _mm256_cmpeq_epi64(a.Data(),a.Data())); }
#else //AVX2 is a superset of AVX. Without it, it is necessary to reinterpret the types
NETGEN_INLINE SIMD<mask64,4> operator&& (SIMD<mask64,4> a, SIMD<mask64,4> b)
{ return _mm256_castpd_si256(_mm256_and_pd (_mm256_castsi256_pd(a.Data()),_mm256_castsi256_pd( b.Data()))); }
NETGEN_INLINE SIMD<mask64,4> operator|| (SIMD<mask64,4> a, SIMD<mask64,4> b)
{ return _mm256_castpd_si256(_mm256_or_pd (_mm256_castsi256_pd(a.Data()), _mm256_castsi256_pd(b.Data()))); }
NETGEN_INLINE SIMD<mask64,4> operator! (SIMD<mask64,4> a)
{ return _mm256_castpd_si256(_mm256_xor_pd (_mm256_castsi256_pd(a.Data()),_mm256_castsi256_pd( _mm256_cmpeq_epi64(a.Data(),a.Data())))); }
#endif
NETGEN_INLINE SIMD<double,4> If (SIMD<mask64,4> a, SIMD<double,4> b, SIMD<double,4> c)
{ return _mm256_blendv_pd(c.Data(), b.Data(), _mm256_castsi256_pd(a.Data())); }
NETGEN_INLINE SIMD<double,4> IfPos (SIMD<double,4> a, SIMD<double,4> b, SIMD<double,4> c)
{
auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_GT_OS);
return _mm256_blendv_pd(c.Data(), b.Data(), cp);
}
NETGEN_INLINE SIMD<double,4> IfZero (SIMD<double,4> a, SIMD<double,4> b, SIMD<double,4> c)
{
auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_EQ_OS);
return _mm256_blendv_pd(c.Data(), b.Data(), cp);
}
NETGEN_INLINE double HSum (SIMD<double,4> sd)
{
// __m128d hv = _mm_add_pd (_mm256_extractf128_pd(sd.Data(),0), _mm256_extractf128_pd(sd.Data(),1));
__m128d hv = (sd.Lo()+sd.Hi()).Data();
return _mm_cvtsd_f64 (_mm_hadd_pd (hv, hv));
}
NETGEN_INLINE auto HSum (SIMD<double,4> sd1, SIMD<double,4> sd2)
{
__m256d hv = _mm256_hadd_pd(sd1.Data(), sd2.Data());
__m128d hv2 = _mm_add_pd (_mm256_extractf128_pd(hv,0), _mm256_extractf128_pd(hv,1));
return SIMD<double,2>(_mm_cvtsd_f64 (hv2), _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3)));
}
NETGEN_INLINE auto HSum (SIMD<double,4> v1, SIMD<double,4> v2, SIMD<double,4> v3, SIMD<double,4> v4)
{
__m256d hsum1 = _mm256_hadd_pd (v1.Data(), v2.Data());
__m256d hsum2 = _mm256_hadd_pd (v3.Data(), v4.Data());
SIMD<double,4> hsum = _mm256_add_pd (_mm256_permute2f128_pd (hsum1, hsum2, 1+2*16),
_mm256_blend_pd (hsum1, hsum2, 12));
return hsum;
// return make_tuple(hsum[0], hsum[1], hsum[2], hsum[3]);
}
NETGEN_INLINE SIMD<int64_t,4> If (SIMD<mask64,4> a, SIMD<int64_t,4> b, SIMD<int64_t,4> c)
{ return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(c.Data()), _mm256_castsi256_pd(b.Data()),
_mm256_castsi256_pd(a.Data()))); }
}
#endif // NETGEN_CORE_SIMD_AVX_HPP

239
libsrc/core/simd_avx512.hpp Normal file
View File

@ -0,0 +1,239 @@
#ifndef NETGEN_CORE_SIMD_AVX512_HPP
#define NETGEN_CORE_SIMD_AVX512_HPP
/**************************************************************************/
/* File: simd_avx512.hpp */
/* Author: Joachim Schoeberl, Matthias Hochsteger */
/* Date: 25. Mar. 16 */
/**************************************************************************/
#include <immintrin.h>
namespace ngcore
{
template <>
class SIMD<mask64,8>
{
__mmask8 mask;
public:
SIMD (size_t i)
: mask(_mm512_cmpgt_epi64_mask(_mm512_set1_epi64(i),
_mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0)))
{ ; }
SIMD (int i)
: mask(_mm512_cmpgt_epi64_mask(_mm512_set1_epi64(i),
_mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0)))
{ ; }
SIMD (int64_t i)
: mask(_mm512_cmpgt_epi64_mask(_mm512_set1_epi64(i),
_mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0)))
{ ; }
SIMD (__mmask8 _mask) : mask(_mask) { ; }
__mmask8 Data() const { return mask; }
static constexpr int Size() { return 8; }
static NETGEN_INLINE SIMD<mask64, 8> GetMaskFromBits (unsigned int i)
{
return SIMD<mask64, 8>(__mmask8(i));
}
};
template<>
class alignas(64) SIMD<int64_t,8>
{
__m512i data;
public:
static constexpr int Size() { return 8; }
SIMD () {}
SIMD (const SIMD &) = default;
SIMD & operator= (const SIMD &) = default;
SIMD (int64_t val) { data = _mm512_set1_epi64(val); }
SIMD (int64_t v0, int64_t v1, int64_t v2, int64_t v3, int64_t v4, int64_t v5, int64_t v6, int64_t v7) { data = _mm512_set_epi64(v7,v6,v5,v4,v3,v2,v1,v0); }
SIMD (__m512i _data) { data = _data; }
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<int64_t(int)>>::value, int>::type = 0>
SIMD (const T & func)
{
data = _mm512_set_epi64(func(7), func(6), func(5), func(4), func(3), func(2), func(1), func(0));
}
NETGEN_INLINE auto operator[] (int i) const { return ((int64_t*)(&data))[i]; }
NETGEN_INLINE __m512i Data() const { return data; }
NETGEN_INLINE __m512i & Data() { return data; }
static SIMD FirstInt() { return { 0, 1, 2, 3, 4, 5, 6, 7 }; }
};
NETGEN_INLINE SIMD<int64_t,8> operator-(SIMD<int64_t,8> a) { return _mm512_sub_epi64(_mm512_setzero_si512(), a.Data()); }
NETGEN_INLINE SIMD<int64_t,8> operator+ (SIMD<int64_t,8> a, SIMD<int64_t,8> b) { return _mm512_add_epi64(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<int64_t,8> operator- (SIMD<int64_t,8> a, SIMD<int64_t,8> b) { return _mm512_sub_epi64(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<int64_t,8> If (SIMD<mask64,8> a, SIMD<int64_t,8> b, SIMD<int64_t,8> c)
{ return _mm512_mask_blend_epi64(a.Data(), c.Data(), b.Data()); }
template<>
class alignas(64) SIMD<double,8>
{
__m512d data;
public:
static constexpr int Size() { return 8; }
SIMD () {}
SIMD (const SIMD &) = default;
SIMD & operator= (const SIMD &) = default;
SIMD (double val) { data = _mm512_set1_pd(val); }
SIMD (int val) { data = _mm512_set1_pd(val); }
SIMD (size_t val) { data = _mm512_set1_pd(val); }
SIMD (double const * p) { data = _mm512_loadu_pd(p); }
SIMD (double const * p, SIMD<mask64,8> mask)
{ data = _mm512_mask_loadu_pd(_mm512_setzero_pd(), mask.Data(), p); }
SIMD (__m512d _data) { data = _data; }
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
SIMD (const T & func)
{
data = _mm512_set_pd(func(7), func(6), func(5), func(4), func(3), func(2), func(1), func(0));
}
void Store (double * p) { _mm512_storeu_pd(p, data); }
void Store (double * p, SIMD<mask64,8> mask) { _mm512_mask_storeu_pd(p, mask.Data(), data); }
template <typename Function>
void SIMD_function (const Function & func, std::true_type)
{
data = (__m512){ func(7), func(6), func(5), func(4),
func(3), func(2), func(1), func(0) };
}
// not a function
void SIMD_function (double const * p, std::false_type)
{
data = _mm512_loadu_pd(p);
}
void SIMD_function (double val, std::false_type)
{
data = _mm512_set1_pd(val);
}
void SIMD_function (__m512d _data, std::false_type)
{
data = _data;
}
NETGEN_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
NETGEN_INLINE __m512d Data() const { return data; }
NETGEN_INLINE __m512d & Data() { return data; }
};
NETGEN_INLINE SIMD<double,8> operator- (SIMD<double,8> a) { return -a.Data(); }
NETGEN_INLINE SIMD<double,8> operator+ (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_add_pd(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<double,8> operator- (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_sub_pd(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<double,8> operator* (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_mul_pd(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<double,8> operator/ (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_div_pd(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<double,8> operator* (double a, SIMD<double,8> b) { return _mm512_set1_pd(a)*b.Data(); }
NETGEN_INLINE SIMD<double,8> operator* (SIMD<double,8> b, double a) { return _mm512_set1_pd(a)*b.Data(); }
NETGEN_INLINE SIMD<double,8> sqrt (SIMD<double,8> a) { return _mm512_sqrt_pd(a.Data()); }
NETGEN_INLINE SIMD<double,8> floor (SIMD<double,8> a) { return _mm512_floor_pd(a.Data()); }
NETGEN_INLINE SIMD<double,8> ceil (SIMD<double,8> a) { return _mm512_ceil_pd(a.Data()); }
NETGEN_INLINE SIMD<double,8> fabs (SIMD<double,8> a) { return _mm512_max_pd(a.Data(), -a.Data()); }
NETGEN_INLINE SIMD<mask64,8> operator<= (SIMD<double,8> a , SIMD<double,8> b)
{ return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_LE_OQ); }
NETGEN_INLINE SIMD<mask64,8> operator< (SIMD<double,8> a , SIMD<double,8> b)
{ return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_LT_OQ); }
NETGEN_INLINE SIMD<mask64,8> operator>= (SIMD<double,8> a , SIMD<double,8> b)
{ return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_GE_OQ); }
NETGEN_INLINE SIMD<mask64,8> operator> (SIMD<double,8> a , SIMD<double,8> b)
{ return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_GT_OQ); }
NETGEN_INLINE SIMD<mask64,8> operator== (SIMD<double,8> a , SIMD<double,8> b)
{ return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_EQ_OQ); }
NETGEN_INLINE SIMD<mask64,8> operator!= (SIMD<double,8> a , SIMD<double,8> b)
{ return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_NEQ_OQ); }
NETGEN_INLINE SIMD<mask64,8> operator<= (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
{ return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_LE); }
NETGEN_INLINE SIMD<mask64,8> operator< (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
{ return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_LT); }
NETGEN_INLINE SIMD<mask64,8> operator>= (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
{ return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_NLT); }
NETGEN_INLINE SIMD<mask64,8> operator> (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
{ return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_NLE); }
NETGEN_INLINE SIMD<mask64,8> operator== (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
{ return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_EQ); }
NETGEN_INLINE SIMD<mask64,8> operator!= (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
{ return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_NE); }
NETGEN_INLINE SIMD<mask64,8> operator&& (SIMD<mask64,8> a, SIMD<mask64,8> b)
{ return (__mmask8)(a.Data() & b.Data()); }
NETGEN_INLINE SIMD<mask64,8> operator|| (SIMD<mask64,8> a, SIMD<mask64,8> b)
{ return (__mmask8)(a.Data() | b.Data()); }
NETGEN_INLINE SIMD<mask64,8> operator! (SIMD<mask64,8> a)
{ return (__mmask8)(~a.Data()); }
NETGEN_INLINE SIMD<double,8> If (SIMD<mask64,8> a, SIMD<double,8> b, SIMD<double,8> c)
{ return _mm512_mask_blend_pd(a.Data(), c.Data(), b.Data()); }
NETGEN_INLINE SIMD<double,8> IfPos (SIMD<double,8> a, SIMD<double> b, SIMD<double> c)
{
auto k = _mm512_cmp_pd_mask(a.Data(),_mm512_setzero_pd(), _CMP_GT_OS);
return _mm512_mask_blend_pd(k,c.Data(),b.Data());
}
NETGEN_INLINE SIMD<double,8> IfZero (SIMD<double,8> a, SIMD<double,8> b, SIMD<double,8> c)
{
auto k = _mm512_cmp_pd_mask(a.Data(),_mm512_setzero_pd(), _CMP_EQ_OS);
return _mm512_mask_blend_pd(k,c.Data(),b.Data());
}
NETGEN_INLINE auto Unpack (SIMD<double,8> a, SIMD<double,8> b)
{
return std::make_tuple(SIMD<double,8>(_mm512_unpacklo_pd(a.Data(),b.Data())),
SIMD<double,8>(_mm512_unpackhi_pd(a.Data(),b.Data())));
}
NETGEN_INLINE double HSum (SIMD<double,8> sd)
{
SIMD<double,4> low = _mm512_extractf64x4_pd(sd.Data(),0);
SIMD<double,4> high = _mm512_extractf64x4_pd(sd.Data(),1);
return HSum(low)+HSum(high);
}
NETGEN_INLINE auto HSum (SIMD<double,8> sd1, SIMD<double,8> sd2)
{
return SIMD<double,2>(HSum(sd1), HSum(sd2));
}
NETGEN_INLINE SIMD<double,4> HSum (SIMD<double,8> v1, SIMD<double,8> v2, SIMD<double,8> v3, SIMD<double,8> v4)
{
SIMD<double> lo,hi;
std::tie(lo,hi) = Unpack(v1, v2);
SIMD<double> sum01 = lo+hi;
std::tie(lo,hi) = Unpack(v3, v4);
SIMD<double> sum23 = lo+hi;
// sum01 b a b a b a b a
// sum23 d c d c d c d c
// __m512 perm = _mm512_permutex2var_pd (sum01.Data(), _mm512_set_epi64(1,2,3,4,5,6,7,8), sum23.Data());
__m256d ab = _mm512_extractf64x4_pd(sum01.Data(),0) + _mm512_extractf64x4_pd(sum01.Data(),1);
__m256d cd = _mm512_extractf64x4_pd(sum23.Data(),0) + _mm512_extractf64x4_pd(sum23.Data(),1);
return _mm256_add_pd (_mm256_permute2f128_pd (ab, cd, 1+2*16), _mm256_blend_pd (ab, cd, 12));
}
NETGEN_INLINE SIMD<double,8> FMA (SIMD<double,8> a, SIMD<double,8> b, SIMD<double,8> c)
{
return _mm512_fmadd_pd (a.Data(), b.Data(), c.Data());
}
NETGEN_INLINE SIMD<double,8> FMA (const double & a, SIMD<double,8> b, SIMD<double,8> c)
{
return _mm512_fmadd_pd (_mm512_set1_pd(a), b.Data(), c.Data());
}
}
#endif // NETGEN_CORE_SIMD_AVX512_HPP

View File

@ -0,0 +1,657 @@
#ifndef NETGEN_CORE_SIMD_GENERIC_HPP
#define NETGEN_CORE_SIMD_GENERIC_HPP
/**************************************************************************/
/* File: simd_base.hpp */
/* Author: Joachim Schoeberl, Matthias Hochsteger */
/* Date: 25. Mar. 16 */
/**************************************************************************/
#include <type_traits>
#include <functional>
#include <tuple>
#include "array.hpp"
namespace ngcore
{
constexpr int GetDefaultSIMDSize() {
#if defined __AVX512F__
return 8;
#elif defined __AVX__
return 4;
#elif (defined(_M_AMD64) || defined(_M_X64) || defined(__SSE__))
return 2;
#else
return 1;
#endif
}
template <typename T, int N=GetDefaultSIMDSize()> class SIMD;
class mask64;
////////////////////////////////////////////////////////////////////////////
namespace detail {
template <typename T, size_t N, size_t... I>
auto array_range_impl(std::array<T, N> const& arr,
size_t first,
std::index_sequence<I...>)
-> std::array<T, sizeof...(I)> {
return {arr[first + I]...};
}
template <size_t S, typename T, size_t N>
auto array_range(std::array<T, N> const& arr, size_t first) {
return array_range_impl(arr, first, std::make_index_sequence<S>{});
}
} // namespace detail
////////////////////////////////////////////////////////////////////////////
// mask
template <>
class SIMD<mask64,1>
{
int64_t mask;
public:
SIMD (size_t i)
: mask(i > 0 ? -1 : 0) { ; }
bool Data() const { return mask; }
static constexpr int Size() { return 1; }
auto operator[] (int /* i */) const { return mask; }
};
template <int N>
class SIMD<mask64,N>
{
static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
static constexpr int N2 = N-N1;
SIMD<mask64,N1> lo;
SIMD<mask64,N2> hi;
public:
SIMD (int i) : lo(i), hi(i-N1) { ; }
SIMD (SIMD<mask64,N1> lo_, SIMD<mask64,N2> hi_) : lo(lo_), hi(hi_) { ; }
SIMD<mask64,N1> Lo() const { return lo; }
SIMD<mask64,N2> Hi() const { return hi; }
static constexpr int Size() { return N; }
};
template<int N>
NETGEN_INLINE SIMD<mask64,N> operator&& (SIMD<mask64,N> a, SIMD<mask64,N> b)
{
if constexpr(N==1) return a.Data() && b.Data();
else return { a.Lo() && b.Lo(), a.Hi() && b.Hi() };
}
////////////////////////////////////////////////////////////////////////////
// int64
template<>
class SIMD<int64_t,1>
{
int64_t data;
public:
static constexpr int Size() { return 1; }
SIMD () {}
SIMD (const SIMD &) = default;
SIMD & operator= (const SIMD &) = default;
SIMD (int64_t val) { data = val; }
SIMD (std::array<int64_t, 1> arr)
: data{arr[0]}
{}
int64_t operator[] (int i) const { return ((int64_t*)(&data))[i]; }
auto Data() const { return data; }
static SIMD FirstInt(int64_t n0=0) { return {n0}; }
template <int I>
int64_t Get()
{
static_assert(I==0);
return data;
}
};
template<int N>
class SIMD<int64_t,N>
{
static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
static constexpr int N2 = N-N1;
SIMD<int64_t,N1> lo;
SIMD<int64_t,N2> high;
public:
static constexpr int Size() { return N; }
SIMD () {}
SIMD (const SIMD &) = default;
SIMD & operator= (const SIMD &) = default;
SIMD (int64_t val) : lo{val}, high{val} { ; }
SIMD (SIMD<int64_t,N1> lo_, SIMD<int64_t,N2> high_) : lo(lo_), high(high_) { ; }
SIMD( std::array<int64_t, N> arr )
: lo(detail::array_range<N1>(arr, 0)),
high(detail::array_range<N2>(arr, N1))
{}
template<typename ...T>
SIMD(const T... vals)
: lo(detail::array_range<N1>(std::array<int64_t, N>{vals...}, 0)),
high(detail::array_range<N2>(std::array<int64_t, N>{vals...}, N1))
{
static_assert(sizeof...(vals)==N, "wrong number of arguments");
}
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<int64_t(int)>>::value, int>::type = 0>
SIMD (const T & func)
{
for(auto i : IntRange(N1))
lo[i] = func(i);
for(auto i : IntRange(N2))
high[i] = func(N1+i);
}
auto Lo() const { return lo; }
auto Hi() const { return high; }
int64_t operator[] (int i) const { return ((int64_t*)(&lo))[i]; }
/*
operator tuple<int64_t&,int64_t&,int64_t&,int64_t&> ()
{ return tuple<int64_t&,int64_t&,int64_t&,int64_t&>((*this)[0], (*this)[1], (*this)[2], (*this)[3]); }
*/
/*
static SIMD FirstInt() { return { 0, 1, 2, 3 }; }
*/
static SIMD FirstInt(int64_t n0=0) { return {SIMD<int64_t,N1>::FirstInt(n0), SIMD<int64_t,N2>::FirstInt(n0+N1)}; }
template <int I>
int64_t Get()
{
static_assert(I>=0 && I<N, "Index out of range");
if constexpr(I<N1) return lo.template Get<I>();
else return high.template Get<I-N1>();
}
};
////////////////////////////////////////////////////////////////////////////
// double
template<>
class SIMD<double,1>
{
double data;
public:
static constexpr int Size() { return 1; }
SIMD () {}
SIMD (const SIMD &) = default;
SIMD & operator= (const SIMD &) = default;
SIMD (double val) { data = val; }
SIMD (int val) { data = val; }
SIMD (size_t val) { data = val; }
SIMD (double const * p) { data = *p; }
SIMD (double const * p, SIMD<mask64,1> mask) { data = mask.Data() ? *p : 0.0; }
SIMD (std::array<double, 1> arr)
: data{arr[0]}
{}
template <typename T, typename std::enable_if<std::is_convertible<T,std::function<double(int)>>::value,int>::type = 0>
SIMD (const T & func)
{
data = func(0);
}
template <typename T, typename std::enable_if<std::is_convertible<T,std::function<double(int)>>::value,int>::type = 0>
SIMD & operator= (const T & func)
{
data = func(0);
return *this;
}
void Store (double * p) { *p = data; }
void Store (double * p, SIMD<mask64,1> mask) { if (mask.Data()) *p = data; }
double operator[] (int i) const { return ((double*)(&data))[i]; }
double Data() const { return data; }
template <int I>
double Get()
{
static_assert(I==0);
return data;
}
};
template<int N>
class SIMD<double, N>
{
static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
static constexpr int N2 = N-N1;
SIMD<double, N1> lo;
SIMD<double, N2> high;
public:
static constexpr int Size() { return N; }
SIMD () {}
SIMD (const SIMD &) = default;
SIMD (SIMD<double,N1> lo_, SIMD<double,N2> hi_) : lo(lo_), high(hi_) { ; }
template <typename T, typename std::enable_if<std::is_convertible<T,std::function<double(int)>>::value,int>::type = 0>
SIMD (const T & func)
{
for(auto i : IntRange(N1))
lo[i] = func(i);
for(auto i : IntRange(N2))
high[i] = func(N1+i);
}
template <typename T, typename std::enable_if<std::is_convertible<T,std::function<double(int)>>::value,int>::type = 0>
SIMD & operator= (const T & func)
{
for(auto i : IntRange(N1))
lo[i] = func(i);
for(auto i : IntRange(N2))
high[i] = func(N1+i);
return *this;
}
SIMD & operator= (const SIMD &) = default;
SIMD (double val) : lo{val}, high{val} { ; }
SIMD (int val) : lo{val}, high{val} { ; }
SIMD (size_t val) : lo{val}, high{val} { ; }
SIMD (double const * p) : lo{p}, high{p+N1} { ; }
SIMD (double const * p, SIMD<mask64,N> mask)
: lo{p, mask.Lo()}, high{p+N1, mask.Hi()}
{ }
SIMD (double * p) : lo{p}, high{p+N1} { ; }
SIMD (double * p, SIMD<mask64,N> mask)
: lo{p, mask.Lo()}, high{p+N1, mask.Hi()}
{ }
SIMD( std::array<double, N> arr )
: lo(detail::array_range<N1>(arr, 0)),
high(detail::array_range<N2>(arr, N1))
{}
template<typename ...T>
SIMD(const T... vals)
: lo(detail::array_range<N1>(std::array<double, N>{vals...}, 0)),
high(detail::array_range<N2>(std::array<double, N>{vals...}, N1))
{
static_assert(sizeof...(vals)==N, "wrong number of arguments");
}
void Store (double * p) { lo.Store(p); high.Store(p+N1); }
void Store (double * p, SIMD<mask64,N> mask)
{
lo.Store(p, mask.Lo());
high.Store(p+N1, mask.Hi());
}
auto Lo() const { return lo; }
auto Hi() const { return high; }
double operator[] (int i) const { return ((double*)(&lo))[i]; }
template<typename=std::enable_if<N==2>>
operator std::tuple<double&,double&> ()
{ return std::tuple<double&,double&>((*this)[0], (*this)[1]); }
template<typename=std::enable_if<N==4>>
operator std::tuple<double&,double&,double&,double&> ()
{ return std::tuple<double&,double&,double&,double&>((*this)[0], (*this)[1], (*this)[2], (*this)[3]); }
template <int I>
double Get()
{
static_assert(I>=0 && I<N, "Index out of range");
if constexpr(I<N1) return lo.template Get<I>();
else return high.template Get<I-N1>();
}
};
// Generic operators for any arithmetic type/simd width
template <typename T, int N>
NETGEN_INLINE SIMD<T,N> operator+ (SIMD<T,N> a, SIMD<T,N> b) {
if constexpr(N==1) return a.Data()+b.Data();
else return { a.Lo()+b.Lo(), a.Hi()+b.Hi() };
}
template <typename T, int N>
NETGEN_INLINE SIMD<T,N> operator- (SIMD<T,N> a, SIMD<T,N> b) {
if constexpr(N==1) return a.Data()-b.Data();
else return { a.Lo()-b.Lo(), a.Hi()-b.Hi() };
}
template <typename T, int N>
NETGEN_INLINE SIMD<T,N> operator- (SIMD<T,N> a) {
if constexpr(N==1) return -a.Data();
else return { -a.Lo(), -a.Hi() };
}
template <typename T, int N>
NETGEN_INLINE SIMD<T,N> operator* (SIMD<T,N> a, SIMD<T,N> b) {
if constexpr(N==1) return a.Data()*b.Data();
else return { a.Lo()*b.Lo(), a.Hi()*b.Hi() };
}
template <typename T, int N>
NETGEN_INLINE SIMD<T,N> operator/ (SIMD<T,N> a, SIMD<T,N> b) {
if constexpr(N==1) return a.Data()/b.Data();
else return { a.Lo()/b.Lo(), a.Hi()/b.Hi() };
}
template <typename T, int N>
NETGEN_INLINE SIMD<mask64,N> operator< (SIMD<T,N> & a, SIMD<T,N> b)
{
if constexpr(N==1) return a.Data() < b.Data();
else return { a.Lo()<b.Lo(), a.Hi()<b.Hi() };
}
template <typename T, int N>
NETGEN_INLINE SIMD<mask64,N> operator<= (SIMD<T,N> & a, SIMD<T,N> b)
{
if constexpr(N==1) return a.Data() <= b.Data();
else return { a.Lo()<=b.Lo(), a.Hi()<=b.Hi() };
}
template <typename T, int N>
NETGEN_INLINE SIMD<mask64,N> operator> (SIMD<T,N> & a, SIMD<T,N> b)
{
if constexpr(N==1) return a.Data() > b.Data();
else return { a.Lo()>b.Lo(), a.Hi()>b.Hi() };
}
template <typename T, int N>
NETGEN_INLINE SIMD<mask64,N> operator>= (SIMD<T,N> & a, SIMD<T,N> b)
{
if constexpr(N==1) return a.Data() >= b.Data();
else return { a.Lo()>=b.Lo(), a.Hi()>=b.Hi() };
}
template <typename T, int N>
NETGEN_INLINE SIMD<mask64,N> operator== (SIMD<T,N> & a, SIMD<T,N> b)
{
if constexpr(N==1) return a.Data() == b.Data();
else return { a.Lo()==b.Lo(), a.Hi()==b.Hi() };
}
template <typename T, int N>
NETGEN_INLINE SIMD<mask64,N> operator!= (SIMD<T,N> & a, SIMD<T,N> b)
{
if constexpr(N==1) return a.Data() != b.Data();
else return { a.Lo()!=b.Lo(), a.Hi()!=b.Hi() };
}
// int64_t operators with scalar operand (implement overloads to allow implicit casts for second operand)
template <int N>
NETGEN_INLINE SIMD<int64_t,N> operator+ (SIMD<int64_t,N> a, int64_t b) { return a+SIMD<int64_t,N>(b); }
template <int N>
NETGEN_INLINE SIMD<int64_t,N> operator+ (int64_t a, SIMD<int64_t,N> b) { return SIMD<int64_t,N>(a)+b; }
template <int N>
NETGEN_INLINE SIMD<int64_t,N> operator- (int64_t a, SIMD<int64_t,N> b) { return SIMD<int64_t,N>(a)-b; }
template <int N>
NETGEN_INLINE SIMD<int64_t,N> operator- (SIMD<int64_t,N> a, int64_t b) { return a-SIMD<int64_t,N>(b); }
template <int N>
NETGEN_INLINE SIMD<int64_t,N> operator* (int64_t a, SIMD<int64_t,N> b) { return SIMD<int64_t,N>(a)*b; }
template <int N>
NETGEN_INLINE SIMD<int64_t,N> operator* (SIMD<int64_t,N> b, int64_t a) { return SIMD<int64_t,N>(a)*b; }
template <int N>
NETGEN_INLINE SIMD<int64_t,N> operator/ (SIMD<int64_t,N> a, int64_t b) { return a/SIMD<int64_t,N>(b); }
template <int N>
NETGEN_INLINE SIMD<int64_t,N> operator/ (int64_t a, SIMD<int64_t,N> b) { return SIMD<int64_t,N>(a)/b; }
template <int N>
NETGEN_INLINE SIMD<int64_t,N> & operator+= (SIMD<int64_t,N> & a, SIMD<int64_t,N> b) { a=a+b; return a; }
template <int N>
NETGEN_INLINE SIMD<int64_t,N> & operator+= (SIMD<int64_t,N> & a, int64_t b) { a+=SIMD<int64_t,N>(b); return a; }
template <int N>
NETGEN_INLINE SIMD<int64_t,N> & operator-= (SIMD<int64_t,N> & a, SIMD<int64_t,N> b) { a = a-b; return a; }
template <int N>
NETGEN_INLINE SIMD<int64_t,N> & operator-= (SIMD<int64_t,N> & a, int64_t b) { a-=SIMD<int64_t,N>(b); return a; }
template <int N>
NETGEN_INLINE SIMD<int64_t,N> & operator*= (SIMD<int64_t,N> & a, SIMD<int64_t,N> b) { a=a*b; return a; }
template <int N>
NETGEN_INLINE SIMD<int64_t,N> & operator*= (SIMD<int64_t,N> & a, int64_t b) { a*=SIMD<int64_t,N>(b); return a; }
template <int N>
NETGEN_INLINE SIMD<int64_t,N> & operator/= (SIMD<int64_t,N> & a, SIMD<int64_t,N> b) { a = a/b; return a; }
// double operators with scalar operand (implement overloads to allow implicit casts for second operand)
template <int N>
NETGEN_INLINE SIMD<double,N> operator+ (SIMD<double,N> a, double b) { return a+SIMD<double,N>(b); }
template <int N>
NETGEN_INLINE SIMD<double,N> operator+ (double a, SIMD<double,N> b) { return SIMD<double,N>(a)+b; }
template <int N>
NETGEN_INLINE SIMD<double,N> operator- (double a, SIMD<double,N> b) { return SIMD<double,N>(a)-b; }
template <int N>
NETGEN_INLINE SIMD<double,N> operator- (SIMD<double,N> a, double b) { return a-SIMD<double,N>(b); }
template <int N>
NETGEN_INLINE SIMD<double,N> operator* (double a, SIMD<double,N> b) { return SIMD<double,N>(a)*b; }
template <int N>
NETGEN_INLINE SIMD<double,N> operator* (SIMD<double,N> b, double a) { return SIMD<double,N>(a)*b; }
template <int N>
NETGEN_INLINE SIMD<double,N> operator/ (SIMD<double,N> a, double b) { return a/SIMD<double,N>(b); }
template <int N>
NETGEN_INLINE SIMD<double,N> operator/ (double a, SIMD<double,N> b) { return SIMD<double,N>(a)/b; }
template <int N>
NETGEN_INLINE SIMD<double,N> & operator+= (SIMD<double,N> & a, SIMD<double,N> b) { a=a+b; return a; }
template <int N>
NETGEN_INLINE SIMD<double,N> & operator+= (SIMD<double,N> & a, double b) { a+=SIMD<double,N>(b); return a; }
template <int N>
NETGEN_INLINE SIMD<double,N> & operator-= (SIMD<double,N> & a, SIMD<double,N> b) { a = a-b; return a; }
template <int N>
NETGEN_INLINE SIMD<double,N> & operator-= (SIMD<double,N> & a, double b) { a-=SIMD<double,N>(b); return a; }
template <int N>
NETGEN_INLINE SIMD<double,N> & operator*= (SIMD<double,N> & a, SIMD<double,N> b) { a=a*b; return a; }
template <int N>
NETGEN_INLINE SIMD<double,N> & operator*= (SIMD<double,N> & a, double b) { a*=SIMD<double,N>(b); return a; }
template <int N>
NETGEN_INLINE SIMD<double,N> & operator/= (SIMD<double,N> & a, SIMD<double,N> b) { a = a/b; return a; }
// double functions
template <int N>
NETGEN_INLINE SIMD<double,N> L2Norm2 (SIMD<double,N> a) { return a*a; }
template <int N>
NETGEN_INLINE SIMD<double,N> Trans (SIMD<double,N> a) { return a; }
template <int N>
NETGEN_INLINE double HSum (SIMD<double,N> a)
{
if constexpr(N==1)
return a.Data();
else
return HSum(a.Lo()) + HSum(a.Hi());
}
NETGEN_INLINE double IfPos (double a, double b, double c) { return a>0 ? b : c; }
NETGEN_INLINE double IfZero (double a, double b, double c) { return a==0. ? b : c; }
template<typename T, int N>
NETGEN_INLINE SIMD<T,N> IfPos (SIMD<T,N> a, SIMD<T,N> b, SIMD<T,N> c)
{
if constexpr(N==1) return a.Data()>0.0 ? b : c;
else return { IfPos(a.Lo(), b.Lo(), c.Lo()), IfPos(a.Hi(), b.Hi(), c.Hi())};
}
template<typename T, int N>
NETGEN_INLINE SIMD<T,N> IfZero (SIMD<T,N> a, SIMD<T,N> b, SIMD<T,N> c)
{
if constexpr(N==1) return a.Data()==0.0 ? b : c;
else return { IfZero(a.Lo(), b.Lo(), c.Lo()), IfZero(a.Hi(), b.Hi(), c.Hi())};
}
template<typename T, int N>
NETGEN_INLINE SIMD<T,N> If (SIMD<mask64,N> a, SIMD<T,N> b, SIMD<T,N> c)
{
if constexpr(N==1) return a.Data() ? b : c;
else return { If(a.Lo(), b.Lo(), c.Lo()), If(a.Hi(), b.Hi(), c.Hi())};
}
template <typename T1, typename T2, typename T3>
// a*b+c
NETGEN_INLINE auto FMA(T1 a, T2 b, T3 c)
{
return a*b+c;
}
// update form of fma
template <int N>
void FMAasm (SIMD<double,N> a, SIMD<double,N> b, SIMD<double,N> & sum)
{
sum = FMA(a,b,sum);
}
template <int i, typename T, int N>
T get(SIMD<T,N> a) { return a[i]; }
template <int NUM, typename FUNC>
NETGEN_INLINE void Iterate2 (FUNC f)
{
if constexpr (NUM > 1) Iterate2<NUM-1> (f);
if constexpr (NUM >= 1) f(std::integral_constant<int,NUM-1>());
}
template <typename T, int N>
ostream & operator<< (ostream & ost, SIMD<T,N> simd)
{
/*
ost << simd[0];
for (int i = 1; i < simd.Size(); i++)
ost << " " << simd[i];
*/
Iterate2<simd.Size()> ([&] (auto I) {
if (I.value != 0) ost << " ";
ost << get<I.value>(simd);
});
return ost;
}
using std::exp;
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> exp (ngcore::SIMD<double,N> a) {
return ngcore::SIMD<double>([a](int i)->double { return exp(a[i]); } );
}
using std::log;
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> log (ngcore::SIMD<double,N> a) {
return ngcore::SIMD<double,N>([a](int i)->double { return log(a[i]); } );
}
using std::pow;
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> pow (ngcore::SIMD<double,N> a, double x) {
return ngcore::SIMD<double,N>([a,x](int i)->double { return pow(a[i],x); } );
}
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> pow (ngcore::SIMD<double,N> a, ngcore::SIMD<double,N> b) {
return ngcore::SIMD<double,N>([a,b](int i)->double { return pow(a[i],b[i]); } );
}
using std::sin;
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> sin (ngcore::SIMD<double,N> a) {
return ngcore::SIMD<double,N>([a](int i)->double { return sin(a[i]); } );
}
using std::cos;
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> cos (ngcore::SIMD<double,N> a) {
return ngcore::SIMD<double,N>([a](int i)->double { return cos(a[i]); } );
}
using std::tan;
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> tan (ngcore::SIMD<double,N> a) {
return ngcore::SIMD<double,N>([a](int i)->double { return tan(a[i]); } );
}
using std::atan;
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> atan (ngcore::SIMD<double,N> a) {
return ngcore::SIMD<double,N>([a](int i)->double { return atan(a[i]); } );
}
using std::atan2;
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> atan2 (ngcore::SIMD<double,N> y, ngcore::SIMD<double,N> x) {
return ngcore::SIMD<double,N>([y,x](int i)->double { return atan2(y[i], x[i]); } );
}
using std::acos;
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> acos (ngcore::SIMD<double,N> a) {
return ngcore::SIMD<double,N>([a](int i)->double { return acos(a[i]); } );
}
using std::asin;
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> asin (ngcore::SIMD<double,N> a) {
return ngcore::SIMD<double,N>([a](int i)->double { return asin(a[i]); } );
}
using std::sinh;
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> sinh (ngcore::SIMD<double,N> a) {
return ngcore::SIMD<double,N>([a](int i)->double { return sinh(a[i]); } );
}
using std::cosh;
template <int N>
NETGEN_INLINE ngcore::SIMD<double,N> cosh (ngcore::SIMD<double,N> a) {
return ngcore::SIMD<double,N>([a](int i)->double { return cosh(a[i]); } );
}
template<int N, typename T>
using MultiSIMD = SIMD<T, N*GetDefaultSIMDSize()>;
template<int N>
NETGEN_INLINE auto Unpack (SIMD<double,N> a, SIMD<double,N> b)
{
if constexpr(N==1)
{
return std::make_tuple(SIMD<double,N>{a.Data()}, SIMD<double,N>{b.Data()} );
}
else
{
auto [a1,b1] = Unpack(a.Lo(), b.Lo());
auto [a2,b2] = Unpack(a.Hi(), b.Hi());
return std::make_tuple(SIMD<double,N>{ a1, a2 },
SIMD<double,N>{ b1, b2 });
}
}
}
namespace std
{
// structured binding support
template <typename T, int N >
struct tuple_size<ngcore::SIMD<T,N>> : std::integral_constant<std::size_t, N> {};
template<size_t N, typename T, int M> struct tuple_element<N,ngcore::SIMD<T,M>> { using type = T; };
}
#endif // NETGEN_CORE_SIMD_GENERIC_HPP

266
libsrc/core/simd_sse.hpp Normal file
View File

@ -0,0 +1,266 @@
#ifndef NETGEN_CORE_SIMD_SSE_HPP
#define NETGEN_CORE_SIMD_SSE_HPP
/**************************************************************************/
/* File: simd_sse.hpp */
/* Author: Joachim Schoeberl, Matthias Hochsteger */
/* Date: 25. Mar. 16 */
/**************************************************************************/
#include <immintrin.h>
namespace ngcore
{
template <>
class SIMD<mask64,2>
{
__m128i mask;
public:
SIMD (int i)
: mask(_mm_cmpgt_epi32(_mm_set1_epi32(i),
_mm_set_epi32(1, 1, 0, 0)))
{ ; }
SIMD (__m128i _mask) : mask(_mask) { ; }
__m128i Data() const { return mask; }
static constexpr int Size() { return 2; }
static NETGEN_INLINE SIMD<mask64, 2> GetMaskFromBits (unsigned int i);
};
static SIMD<mask64, 2> masks_from_2bits[4] = {
_mm_set_epi32 (0,0,0,0), _mm_set_epi32 (0,0,-1,0),
_mm_set_epi32 (-1,0,0,0), _mm_set_epi32 (-1,0,-1,0),
};
NETGEN_INLINE SIMD<mask64, 2> SIMD<mask64, 2> :: GetMaskFromBits (unsigned int i)
{
return masks_from_2bits[i & 3];
}
template<>
class alignas(16) SIMD<int64_t,2>
{
__m128i data;
public:
static constexpr int Size() { return 2; }
SIMD () {}
SIMD (const SIMD &) = default;
SIMD (int64_t v0, int64_t v1) { data = _mm_set_epi64x(v1,v0); }
SIMD (std::array<int64_t, 2> arr)
: data{_mm_set_epi64x(arr[1],arr[0])}
{}
SIMD & operator= (const SIMD &) = default;
SIMD (int64_t val) { data = _mm_set1_epi64x(val); }
SIMD (__m128i _data) { data = _data; }
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<int64_t(int)>>::value, int>::type = 0>
SIMD (const T & func)
{
data = _mm_set_epi64(func(1), func(0));
}
NETGEN_INLINE auto operator[] (int i) const { return ((int64_t*)(&data))[i]; }
NETGEN_INLINE __m128i Data() const { return data; }
NETGEN_INLINE __m128i & Data() { return data; }
static SIMD FirstInt(int n0=0) { return { n0, n0+1 }; }
};
NETGEN_INLINE SIMD<int64_t,2> operator-(SIMD<int64_t,2> a) { return _mm_sub_epi64(_mm_setzero_si128(), a.Data()); }
NETGEN_INLINE SIMD<int64_t,2> operator+ (SIMD<int64_t,2> a, SIMD<int64_t,2> b) { return _mm_add_epi64(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<int64_t,2> operator- (SIMD<int64_t,2> a, SIMD<int64_t,2> b) { return _mm_sub_epi64(a.Data(),b.Data()); }
template<>
class alignas(16) SIMD<double,2>
{
__m128d data;
public:
static constexpr int Size() { return 2; }
SIMD () {}
SIMD (const SIMD &) = default;
SIMD (double v0, double v1) { data = _mm_set_pd(v1,v0); }
SIMD (std::array<double, 2> arr)
: data{_mm_set_pd(arr[1], arr[0])}
{}
SIMD & operator= (const SIMD &) = default;
SIMD (double val) { data = _mm_set1_pd(val); }
SIMD (int val) { data = _mm_set1_pd(val); }
SIMD (size_t val) { data = _mm_set1_pd(val); }
SIMD (double const * p) { data = _mm_loadu_pd(p); }
SIMD (double const * p, SIMD<mask64,2> mask)
{
#ifdef __AVX__
data = _mm_maskload_pd(p, mask.Data());
#else
// this versions segfaults if p points to the last allowed element
// happened on Mac with the new SparseCholesky-factorization
// data = _mm_and_pd(_mm_castsi128_pd(mask.Data()), _mm_loadu_pd(p));
auto pmask = (int64_t*)&mask;
data = _mm_set_pd (pmask[1] ? p[1] : 0.0, pmask[0] ? p[0] : 0.0);
#endif
}
SIMD (__m128d _data) { data = _data; }
void Store (double * p) { _mm_storeu_pd(p, data); }
void Store (double * p, SIMD<mask64,2> mask)
{
#ifdef __AVX__
_mm_maskstore_pd(p, mask.Data(), data);
#else
/*
_mm_storeu_pd (p, _mm_or_pd (_mm_and_pd(_mm_castsi128_pd(mask.Data()), data),
_mm_andnot_pd(_mm_castsi128_pd(mask.Data()), _mm_loadu_pd(p))));
*/
auto pmask = (int64_t*)&mask;
if (pmask[0]) p[0] = (*this)[0];
if (pmask[1]) p[1] = (*this)[1];
#endif
}
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
SIMD (const T & func)
{
data = _mm_set_pd(func(1), func(0));
}
NETGEN_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
NETGEN_INLINE __m128d Data() const { return data; }
NETGEN_INLINE __m128d & Data() { return data; }
operator std::tuple<double&,double&> ()
{
auto pdata = (double*)&data;
return std::tuple<double&,double&>(pdata[0], pdata[1]);
}
};
NETGEN_INLINE SIMD<double,2> operator- (SIMD<double,2> a) { return _mm_xor_pd(a.Data(), _mm_set1_pd(-0.0)); }
NETGEN_INLINE SIMD<double,2> operator+ (SIMD<double,2> a, SIMD<double,2> b) { return _mm_add_pd(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<double,2> operator- (SIMD<double,2> a, SIMD<double,2> b) { return _mm_sub_pd(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<double,2> operator* (SIMD<double,2> a, SIMD<double,2> b) { return _mm_mul_pd(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<double,2> operator/ (SIMD<double,2> a, SIMD<double,2> b) { return _mm_div_pd(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<double,2> operator* (double a, SIMD<double,2> b) { return _mm_set1_pd(a)*b; }
NETGEN_INLINE SIMD<double,2> operator* (SIMD<double,2> b, double a) { return _mm_set1_pd(a)*b; }
template<>
NETGEN_INLINE auto Unpack (SIMD<double,2> a, SIMD<double,2> b)
{
return std::make_tuple(SIMD<double,2>(_mm_unpacklo_pd(a.Data(),b.Data())),
SIMD<double,2>(_mm_unpackhi_pd(a.Data(),b.Data())));
}
NETGEN_INLINE __m128d my_mm_hadd_pd(__m128d a, __m128d b) {
#if defined(__SSE3__) || defined(__AVX__)
return _mm_hadd_pd(a,b);
#else
return _mm_add_pd( _mm_unpacklo_pd(a,b), _mm_unpackhi_pd(a,b) );
#endif
}
#ifndef __AVX__
NETGEN_INLINE __m128i my_mm_cmpgt_epi64(__m128i a, __m128i b) {
auto res_lo = _mm_cvtsi128_si64(a) > _mm_cvtsi128_si64(b) ? -1:0;
auto res_hi = _mm_cvtsi128_si64(_mm_srli_si128(a,8)) > _mm_cvtsi128_si64(_mm_srli_si128(b,8)) ? -1 : 0;
return _mm_set_epi64x(res_hi,res_lo);
}
#else
NETGEN_INLINE __m128i my_mm_cmpgt_epi64(__m128i a, __m128i b) {
return _mm_cmpgt_epi64(a,b);
}
#endif
NETGEN_INLINE SIMD<double,2> sqrt (SIMD<double,2> a) { return _mm_sqrt_pd(a.Data()); }
NETGEN_INLINE SIMD<double,2> fabs (SIMD<double,2> a) { return _mm_max_pd(a.Data(), (-a).Data()); }
using std::floor;
NETGEN_INLINE SIMD<double,2> floor (SIMD<double,2> a)
{ return ngcore::SIMD<double,2>([&](int i)->double { return floor(a[i]); } ); }
using std::ceil;
NETGEN_INLINE SIMD<double,2> ceil (SIMD<double,2> a)
{ return ngcore::SIMD<double,2>([&](int i)->double { return ceil(a[i]); } ); }
NETGEN_INLINE SIMD<mask64,2> operator<= (SIMD<double,2> a , SIMD<double,2> b)
{ return _mm_castpd_si128( _mm_cmple_pd(a.Data(),b.Data())); }
NETGEN_INLINE SIMD<mask64,2> operator< (SIMD<double,2> a , SIMD<double,2> b)
{ return _mm_castpd_si128( _mm_cmplt_pd(a.Data(),b.Data())); }
NETGEN_INLINE SIMD<mask64,2> operator>= (SIMD<double,2> a , SIMD<double,2> b)
{ return _mm_castpd_si128( _mm_cmpge_pd(a.Data(),b.Data())); }
NETGEN_INLINE SIMD<mask64,2> operator> (SIMD<double,2> a , SIMD<double,2> b)
{ return _mm_castpd_si128( _mm_cmpgt_pd(a.Data(),b.Data())); }
NETGEN_INLINE SIMD<mask64,2> operator== (SIMD<double,2> a , SIMD<double,2> b)
{ return _mm_castpd_si128( _mm_cmpeq_pd(a.Data(),b.Data())); }
NETGEN_INLINE SIMD<mask64,2> operator!= (SIMD<double,2> a , SIMD<double,2> b)
{ return _mm_castpd_si128( _mm_cmpneq_pd(a.Data(),b.Data())); }
NETGEN_INLINE SIMD<mask64,2> operator<= (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
{ return _mm_xor_si128(_mm_cmpgt_epi64(a.Data(),b.Data()),_mm_set1_epi32(-1)); }
NETGEN_INLINE SIMD<mask64,2> operator< (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
{ return my_mm_cmpgt_epi64(b.Data(),a.Data()); }
NETGEN_INLINE SIMD<mask64,2> operator>= (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
{ return _mm_xor_si128(_mm_cmpgt_epi64(b.Data(),a.Data()),_mm_set1_epi32(-1)); }
NETGEN_INLINE SIMD<mask64,2> operator> (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
{ return my_mm_cmpgt_epi64(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<mask64,2> operator== (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
{ return _mm_cmpeq_epi64(a.Data(),b.Data()); }
NETGEN_INLINE SIMD<mask64,2> operator!= (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
{ return _mm_xor_si128(_mm_cmpeq_epi64(a.Data(),b.Data()),_mm_set1_epi32(-1)); }
NETGEN_INLINE SIMD<mask64,2> operator&& (SIMD<mask64,2> a, SIMD<mask64,2> b)
{ return _mm_castpd_si128(_mm_and_pd (_mm_castsi128_pd(a.Data()),_mm_castsi128_pd( b.Data()))); }
NETGEN_INLINE SIMD<mask64,2> operator|| (SIMD<mask64,2> a, SIMD<mask64,2> b)
{ return _mm_castpd_si128(_mm_or_pd (_mm_castsi128_pd(a.Data()), _mm_castsi128_pd(b.Data()))); }
NETGEN_INLINE SIMD<mask64,2> operator! (SIMD<mask64,2> a)
{ return _mm_castpd_si128(_mm_xor_pd (_mm_castsi128_pd(a.Data()),_mm_castsi128_pd( _mm_cmpeq_epi64(a.Data(),a.Data())))); }
#ifdef __SSE4_1__
NETGEN_INLINE SIMD<double,2> If (SIMD<mask64,2> a, SIMD<double,2> b, SIMD<double,2> c)
{ return _mm_blendv_pd(c.Data(), b.Data(), _mm_castsi128_pd(a.Data())); }
#else
NETGEN_INLINE SIMD<double,2> If (SIMD<mask64,2> a, SIMD<double,2> b, SIMD<double,2> c)
{
return _mm_or_pd(
_mm_andnot_pd(_mm_castsi128_pd(a.Data()),c.Data()),
_mm_and_pd(b.Data(),_mm_castsi128_pd(a.Data()))
);}
#endif // __SSE4_1__
NETGEN_INLINE SIMD<double,2> IfPos (SIMD<double,2> a, SIMD<double,2> b, SIMD<double,2> c)
{ return ngcore::SIMD<double,2>([&](int i)->double { return a[i]>0 ? b[i] : c[i]; }); }
NETGEN_INLINE SIMD<double,2> IfZero (SIMD<double,2> a, SIMD<double,2> b, SIMD<double,2> c)
{ return ngcore::SIMD<double,2>([&](int i)->double { return a[i]==0. ? b[i] : c[i]; }); }
NETGEN_INLINE double HSum (SIMD<double,2> sd)
{
return _mm_cvtsd_f64 (my_mm_hadd_pd (sd.Data(), sd.Data()));
}
NETGEN_INLINE auto HSum (SIMD<double,2> sd1, SIMD<double,2> sd2)
{
__m128d hv2 = my_mm_hadd_pd(sd1.Data(), sd2.Data());
return SIMD<double,2> (hv2);
// return SIMD<double,2>(_mm_cvtsd_f64 (hv2), _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3)));
}
NETGEN_INLINE SIMD<int64_t, 2> If(SIMD<mask64, 2> a, SIMD<int64_t, 2> b,
SIMD<int64_t, 2> c) {
return _mm_or_si128(
_mm_andnot_si128(a.Data(),c.Data()),
_mm_and_si128(b.Data(),a.Data())
);
}
}
#endif // NETGEN_CORE_SIMD_SSE_HPP

View File

@ -11,7 +11,7 @@ target_sources(gen INTERFACE
install(FILES
ngarray.hpp autodiff.hpp autoptr.hpp ngbitarray.hpp
dynamicmem.hpp hashtabl.hpp mpi_interface.hpp myadt.hpp
ngsimd.hpp mystring.hpp netgenout.hpp ngpython.hpp
mystring.hpp netgenout.hpp ngpython.hpp
optmem.hpp parthreads.hpp seti.hpp sort.hpp
spbita2d.hpp stack.hpp table.hpp template.hpp
gzstream.h

View File

@ -47,7 +47,5 @@ namespace netgen
#include "netgenout.hpp"
#include "gzstream.h"
#include "ngsimd.hpp"
#endif

View File

@ -1,672 +0,0 @@
#ifndef FILE_NGSIMD
#define FILE_NGSIMD
/**************************************************************************/
/* File: ngsimd.hpp */
/* Author: Joachim Schoeberl */
/* Date: 25. Mar. 16 */
/**************************************************************************/
#include <immintrin.h>
#include <tuple>
#include <ostream>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <core/utils.hpp>
#ifdef WIN32
#ifndef AVX_OPERATORS_DEFINED
#define AVX_OPERATORS_DEFINED
NG_INLINE __m128d operator- (__m128d a) { return _mm_xor_pd(a, _mm_set1_pd(-0.0)); }
NG_INLINE __m128d operator+ (__m128d a, __m128d b) { return _mm_add_pd(a,b); }
NG_INLINE __m128d operator- (__m128d a, __m128d b) { return _mm_sub_pd(a,b); }
NG_INLINE __m128d operator* (__m128d a, __m128d b) { return _mm_mul_pd(a,b); }
NG_INLINE __m128d operator/ (__m128d a, __m128d b) { return _mm_div_pd(a,b); }
NG_INLINE __m128d operator* (double a, __m128d b) { return _mm_set1_pd(a)*b; }
NG_INLINE __m128d operator* (__m128d b, double a) { return _mm_set1_pd(a)*b; }
NG_INLINE __m128d operator+= (__m128d &a, __m128d b) { return a = a+b; }
NG_INLINE __m128d operator-= (__m128d &a, __m128d b) { return a = a-b; }
NG_INLINE __m128d operator*= (__m128d &a, __m128d b) { return a = a*b; }
NG_INLINE __m128d operator/= (__m128d &a, __m128d b) { return a = a/b; }
NG_INLINE __m256d operator- (__m256d a) { return _mm256_xor_pd(a, _mm256_set1_pd(-0.0)); }
NG_INLINE __m256d operator+ (__m256d a, __m256d b) { return _mm256_add_pd(a,b); }
NG_INLINE __m256d operator- (__m256d a, __m256d b) { return _mm256_sub_pd(a,b); }
NG_INLINE __m256d operator* (__m256d a, __m256d b) { return _mm256_mul_pd(a,b); }
NG_INLINE __m256d operator/ (__m256d a, __m256d b) { return _mm256_div_pd(a,b); }
NG_INLINE __m256d operator* (double a, __m256d b) { return _mm256_set1_pd(a)*b; }
NG_INLINE __m256d operator* (__m256d b, double a) { return _mm256_set1_pd(a)*b; }
NG_INLINE __m256d operator+= (__m256d &a, __m256d b) { return a = a+b; }
NG_INLINE __m256d operator-= (__m256d &a, __m256d b) { return a = a-b; }
NG_INLINE __m256d operator*= (__m256d &a, __m256d b) { return a = a*b; }
NG_INLINE __m256d operator/= (__m256d &a, __m256d b) { return a = a/b; }
#endif
#endif
namespace ngsimd
{
// MSVC does not define SSE. It's always present on 64bit cpus
#if (defined(_M_AMD64) || defined(_M_X64) || defined(__AVX__))
#ifndef __SSE__
#define __SSE__
#endif
#ifndef __SSE2__
#define __SSE2__
#endif
#endif
constexpr int GetDefaultSIMDSize() {
#if defined __AVX512F__
return 8;
#elif defined __AVX__
return 4;
#elif defined __SSE__
return 2;
#else
return 1;
#endif
}
#if defined __AVX512F__
typedef __m512 tAVX;
typedef __m512d tAVXd;
#elif defined __AVX__
typedef __m256 tAVX;
typedef __m256d tAVXd;
#elif defined __SSE__
typedef __m128 tAVX;
typedef __m128d tAVXd;
#endif
template <typename T, int N=GetDefaultSIMDSize()> class SIMD;
template <typename T>
struct has_call_operator
{
template <typename C> static std::true_type check( decltype( sizeof(&C::operator() )) ) { return std::true_type(); }
template <typename> static std::false_type check(...) { return std::false_type(); }
typedef decltype( check<T>(sizeof(char)) ) type;
static constexpr type value = type();
};
template <typename T1, typename T2, typename T3>
// a*b+c
NG_INLINE auto FMA(T1 a, T2 b, T3 c)
{
return a*b+c;
}
template<int N, typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
NG_INLINE SIMD<double,N> operator+ (T a, SIMD<double,N> b) { return SIMD<double,N>(a) + b; }
template<int N, typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
NG_INLINE SIMD<double,N> operator- (T a, SIMD<double,N> b) { return SIMD<double,N>(a) - b; }
template<int N, typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
NG_INLINE SIMD<double,N> operator* (T a, SIMD<double,N> b) { return SIMD<double,N>(a) * b; }
template<int N, typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
NG_INLINE SIMD<double,N> operator/ (T a, SIMD<double,N> b) { return SIMD<double,N>(a) / b; }
template<int N, typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
NG_INLINE SIMD<double,N> operator+ (SIMD<double,N> a, T b) { return a + SIMD<double,N>(b); }
template<int N, typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
NG_INLINE SIMD<double,N> operator- (SIMD<double,N> a, T b) { return a - SIMD<double,N>(b); }
template<int N, typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
NG_INLINE SIMD<double,N> operator* (SIMD<double,N> a, T b) { return a * SIMD<double,N>(b); }
template<int N, typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
NG_INLINE SIMD<double,N> operator/ (SIMD<double,N> a, T b) { return a / SIMD<double,N>(b); }
using std::sqrt;
using std::fabs;
class ExceptionNOSIMD : public std::runtime_error
{
public:
using std::runtime_error::runtime_error;
std::string What() { return what(); }
};
using std::exp;
template<int N> NG_INLINE SIMD<double,N> exp (SIMD<double,N> a)
{
return SIMD<double,N>([&](int i)->double { return exp(a[i]); } );
}
using std::log;
template<int N> NG_INLINE SIMD<double,N> log (SIMD<double,N> a)
{
return SIMD<double,N>([&](int i)->double { return log(a[i]); } );
}
using std::pow;
template<int N> NG_INLINE SIMD<double,N> pow (SIMD<double,N> a, double x)
{
return SIMD<double,N>([&](int i)->double { return pow(a[i],x); } );
}
using std::sin;
template<int N> NG_INLINE SIMD<double,N> sin (SIMD<double,N> a)
{
return SIMD<double,N>([&](int i)->double { return sin(a[i]); } );
}
using std::cos;
template<int N> NG_INLINE SIMD<double,N> cos (SIMD<double,N> a)
{
return SIMD<double,N>([&](int i)->double { return cos(a[i]); } );
}
using std::tan;
template<int N> NG_INLINE SIMD<double,N> tan (SIMD<double,N> a)
{
return SIMD<double,N>([&](int i)->double { return tan(a[i]); } );
}
using std::atan;
template<int N> NG_INLINE SIMD<double,N> atan (SIMD<double,N> a)
{
return SIMD<double,N>([&](int i)->double { return atan(a[i]); } );
}
/////////////////////////////////////////////////////////////////////////////
// SIMD width 1 (in case no AVX support is available)
/////////////////////////////////////////////////////////////////////////////
template<>
class SIMD<double,1>
{
double data;
public:
static constexpr int Size() { return 1; }
SIMD () = default;
SIMD (const SIMD &) = default;
SIMD & operator= (const SIMD &) = default;
// only called if T has a call operator of appropriate type
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
SIMD (const T & func)
{
data = func(0);
}
// only called if T is arithmetic (integral or floating point types)
template<typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
SIMD (const T & val)
{
data = val;
}
SIMD (double const * p)
{
data = *p;
}
NG_INLINE operator double() const { return data; }
NG_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
NG_INLINE double Data() const { return data; }
NG_INLINE double & Data() { return data; }
NG_INLINE SIMD<double,1> &operator+= (SIMD<double,1> b) { data+=b.Data(); return *this; }
NG_INLINE SIMD<double,1> &operator-= (SIMD<double,1> b) { data-=b.Data(); return *this; }
NG_INLINE SIMD<double,1> &operator*= (SIMD<double,1> b) { data*=b.Data(); return *this; }
NG_INLINE SIMD<double,1> &operator/= (SIMD<double,1> b) { data/=b.Data(); return *this; }
};
NG_INLINE SIMD<double,1> operator+ (SIMD<double,1> a, SIMD<double,1> b) { return a.Data()+b.Data(); }
NG_INLINE SIMD<double,1> operator- (SIMD<double,1> a, SIMD<double,1> b) { return a.Data()-b.Data(); }
NG_INLINE SIMD<double,1> operator- (SIMD<double,1> a) { return -a.Data(); }
NG_INLINE SIMD<double,1> operator* (SIMD<double,1> a, SIMD<double,1> b) { return a.Data()*b.Data(); }
NG_INLINE SIMD<double,1> operator/ (SIMD<double,1> a, SIMD<double,1> b) { return a.Data()/b.Data(); }
NG_INLINE SIMD<double,1> sqrt (SIMD<double,1> a) { return std::sqrt(a.Data()); }
NG_INLINE SIMD<double,1> floor (SIMD<double,1> a) { return std::floor(a.Data()); }
NG_INLINE SIMD<double,1> ceil (SIMD<double,1> a) { return std::ceil(a.Data()); }
NG_INLINE SIMD<double,1> fabs (SIMD<double,1> a) { return std::fabs(a.Data()); }
NG_INLINE SIMD<double,1> L2Norm2 (SIMD<double,1> a) { return a.Data()*a.Data(); }
NG_INLINE SIMD<double,1> Trans (SIMD<double,1> a) { return a; }
NG_INLINE SIMD<double,1> IfPos (SIMD<double,1> a, SIMD<double,1> b, SIMD<double,1> c)
{
return (a.Data() > 0) ? b : c;
}
NG_INLINE double HSum (SIMD<double,1> sd)
{
return sd.Data();
}
NG_INLINE auto HSum (SIMD<double,1> sd1, SIMD<double,1> sd2)
{
return std::make_tuple(sd1.Data(), sd2.Data());
}
NG_INLINE auto HSum (SIMD<double,1> sd1, SIMD<double,1> sd2, SIMD<double,1> sd3, SIMD<double,1> sd4)
{
return std::make_tuple(sd1.Data(), sd2.Data(), sd3.Data(), sd4.Data());
}
/////////////////////////////////////////////////////////////////////////////
// SSE - Simd width 2
/////////////////////////////////////////////////////////////////////////////
#ifdef __SSE__
template<>
class alignas(16) SIMD<double,2>
{
__m128d data;
public:
static constexpr int Size() { return 2; }
SIMD () = default;
SIMD (const SIMD &) = default;
SIMD & operator= (const SIMD &) = default;
SIMD (__m128d adata)
: data(adata)
{ ; }
// only called if T has a call operator of appropriate type
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
SIMD (const T & func)
{
data = _mm_set_pd(func(1), func(0));
}
// only called if T is arithmetic (integral or floating point types)
template<typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
SIMD (const T & val)
{
data = _mm_set1_pd(val);
}
SIMD (double const * p)
{
data = _mm_loadu_pd(p);
}
NG_INLINE operator __m128d() const { return data; }
NG_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
NG_INLINE double& operator[] (int i) { return ((double*)(&data))[i]; }
NG_INLINE __m128d Data() const { return data; }
NG_INLINE __m128d & Data() { return data; }
// NG_INLINE operator std::tuple<double&,double&,double&,double&> ()
// { return std::tuple<double&,double&,double&,double&>((*this)[0], (*this)[1], (*this)[2], (*this)[3]); }
NG_INLINE SIMD<double,2> &operator+= (SIMD<double,2> b) { data+=b.Data(); return *this; }
NG_INLINE SIMD<double,2> &operator-= (SIMD<double,2> b) { data-=b.Data(); return *this; }
NG_INLINE SIMD<double,2> &operator*= (SIMD<double,2> b) { data*=b.Data(); return *this; }
NG_INLINE SIMD<double,2> &operator/= (SIMD<double,2> b) { data/=b.Data(); return *this; }
};
NG_INLINE SIMD<double,2> operator+ (SIMD<double,2> a, SIMD<double,2> b) { return a.Data()+b.Data(); }
NG_INLINE SIMD<double,2> operator- (SIMD<double,2> a, SIMD<double,2> b) { return a.Data()-b.Data(); }
NG_INLINE SIMD<double,2> operator- (SIMD<double,2> a) { return -a.Data(); }
NG_INLINE SIMD<double,2> operator* (SIMD<double,2> a, SIMD<double,2> b) { return a.Data()*b.Data(); }
NG_INLINE SIMD<double,2> operator/ (SIMD<double,2> a, SIMD<double,2> b) { return a.Data()/b.Data(); }
/*
NG_INLINE SIMD<double,4> sqrt (SIMD<double,4> a) { return _mm256_sqrt_pd(a.Data()); }
NG_INLINE SIMD<double,4> floor (SIMD<double,4> a) { return _mm256_floor_pd(a.Data()); }
NG_INLINE SIMD<double,4> ceil (SIMD<double,4> a) { return _mm256_ceil_pd(a.Data()); }
NG_INLINE SIMD<double,4> fabs (SIMD<double,4> a) { return _mm256_max_pd(a.Data(), -a.Data()); }
NG_INLINE SIMD<double,4> L2Norm2 (SIMD<double,4> a) { return a.Data()*a.Data(); }
NG_INLINE SIMD<double,4> Trans (SIMD<double,4> a) { return a; }
NG_INLINE SIMD<double,4> IfPos (SIMD<double,4> a, SIMD<double,4> b, SIMD<double,4> c)
{
auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_GT_OS);
return _mm256_blendv_pd(c.Data(), b.Data(), cp);
}
NG_INLINE double HSum (SIMD<double,4> sd)
{
__m128d hv = _mm_add_pd (_mm256_extractf128_pd(sd.Data(),0), _mm256_extractf128_pd(sd.Data(),1));
return _mm_cvtsd_f64 (_mm_hadd_pd (hv, hv));
}
NG_INLINE auto HSum (SIMD<double,4> sd1, SIMD<double,4> sd2)
{
__m256d hv = _mm256_hadd_pd(sd1.Data(), sd2.Data());
__m128d hv2 = _mm_add_pd (_mm256_extractf128_pd(hv,0), _mm256_extractf128_pd(hv,1));
return std::make_tuple(_mm_cvtsd_f64 (hv2), _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3)));
}
NG_INLINE auto HSum (SIMD<double,4> v1, SIMD<double,4> v2, SIMD<double,4> v3, SIMD<double,4> v4)
{
__m256d hsum1 = _mm256_hadd_pd (v1.Data(), v2.Data());
__m256d hsum2 = _mm256_hadd_pd (v3.Data(), v4.Data());
__m256d hsum = _mm256_add_pd (_mm256_permute2f128_pd (hsum1, hsum2, 1+2*16),
_mm256_blend_pd (hsum1, hsum2, 12));
return SIMD<double,4>(hsum);
}
*/
#endif // __SSE__
/////////////////////////////////////////////////////////////////////////////
// AVX - Simd width 4
/////////////////////////////////////////////////////////////////////////////
#ifdef __AVX__
template<>
class alignas(32) SIMD<double,4>
{
__m256d data;
public:
static constexpr int Size() { return 4; }
SIMD () = default;
SIMD (const SIMD &) = default;
SIMD & operator= (const SIMD &) = default;
SIMD (__m256d adata)
: data(adata)
{ ; }
// only called if T has a call operator of appropriate type
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
SIMD (const T & func)
{
data = _mm256_set_pd(func(3), func(2), func(1), func(0));
}
// only called if T is arithmetic (integral or floating point types)
template<typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
SIMD (const T & val)
{
data = _mm256_set1_pd(val);
}
SIMD (double const * p)
{
data = _mm256_loadu_pd(p);
}
NG_INLINE operator __m256d() const { return data; }
NG_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
NG_INLINE double& operator[] (int i) { return ((double*)(&data))[i]; }
NG_INLINE __m256d Data() const { return data; }
NG_INLINE __m256d & Data() { return data; }
NG_INLINE operator std::tuple<double&,double&,double&,double&> ()
{ return std::tuple<double&,double&,double&,double&>((*this)[0], (*this)[1], (*this)[2], (*this)[3]); }
NG_INLINE SIMD<double,4> &operator+= (SIMD<double,4> b) { data+=b.Data(); return *this; }
NG_INLINE SIMD<double,4> &operator-= (SIMD<double,4> b) { data-=b.Data(); return *this; }
NG_INLINE SIMD<double,4> &operator*= (SIMD<double,4> b) { data*=b.Data(); return *this; }
NG_INLINE SIMD<double,4> &operator/= (SIMD<double,4> b) { data/=b.Data(); return *this; }
};
NG_INLINE SIMD<double,4> operator+ (SIMD<double,4> a, SIMD<double,4> b) { return a.Data()+b.Data(); }
NG_INLINE SIMD<double,4> operator- (SIMD<double,4> a, SIMD<double,4> b) { return a.Data()-b.Data(); }
NG_INLINE SIMD<double,4> operator- (SIMD<double,4> a) { return -a.Data(); }
NG_INLINE SIMD<double,4> operator* (SIMD<double,4> a, SIMD<double,4> b) { return a.Data()*b.Data(); }
NG_INLINE SIMD<double,4> operator/ (SIMD<double,4> a, SIMD<double,4> b) { return a.Data()/b.Data(); }
NG_INLINE SIMD<double,4> sqrt (SIMD<double,4> a) { return _mm256_sqrt_pd(a.Data()); }
NG_INLINE SIMD<double,4> floor (SIMD<double,4> a) { return _mm256_floor_pd(a.Data()); }
NG_INLINE SIMD<double,4> ceil (SIMD<double,4> a) { return _mm256_ceil_pd(a.Data()); }
NG_INLINE SIMD<double,4> fabs (SIMD<double,4> a) { return _mm256_max_pd(a.Data(), -a.Data()); }
NG_INLINE SIMD<double,4> L2Norm2 (SIMD<double,4> a) { return a.Data()*a.Data(); }
NG_INLINE SIMD<double,4> Trans (SIMD<double,4> a) { return a; }
NG_INLINE SIMD<double,4> IfPos (SIMD<double,4> a, SIMD<double,4> b, SIMD<double,4> c)
{
auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_GT_OS);
return _mm256_blendv_pd(c.Data(), b.Data(), cp);
}
NG_INLINE double HSum (SIMD<double,4> sd)
{
__m128d hv = _mm_add_pd (_mm256_extractf128_pd(sd.Data(),0), _mm256_extractf128_pd(sd.Data(),1));
return _mm_cvtsd_f64 (_mm_hadd_pd (hv, hv));
}
NG_INLINE auto HSum (SIMD<double,4> sd1, SIMD<double,4> sd2)
{
__m256d hv = _mm256_hadd_pd(sd1.Data(), sd2.Data());
__m128d hv2 = _mm_add_pd (_mm256_extractf128_pd(hv,0), _mm256_extractf128_pd(hv,1));
return std::make_tuple(_mm_cvtsd_f64 (hv2), _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3)));
}
NG_INLINE auto HSum (SIMD<double,4> v1, SIMD<double,4> v2, SIMD<double,4> v3, SIMD<double,4> v4)
{
__m256d hsum1 = _mm256_hadd_pd (v1.Data(), v2.Data());
__m256d hsum2 = _mm256_hadd_pd (v3.Data(), v4.Data());
__m256d hsum = _mm256_add_pd (_mm256_permute2f128_pd (hsum1, hsum2, 1+2*16),
_mm256_blend_pd (hsum1, hsum2, 12));
return SIMD<double,4>(hsum);
}
#endif // __AVX__
/////////////////////////////////////////////////////////////////////////////
// AVX512 - Simd width 8
/////////////////////////////////////////////////////////////////////////////
#ifdef __AVX512F__
template<>
class alignas(64) SIMD<double,8>
{
__m512d data;
public:
static constexpr int Size() { return 8; }
SIMD () = default;
SIMD (const SIMD &) = default;
SIMD & operator= (const SIMD &) = default;
SIMD (__m512d adata)
: data(adata)
{ ; }
// only called if T has a call operator of appropriate type
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
SIMD (const T & func)
{
data = _mm512_set_pd(func(7), func(6), func(5), func(4),
func(3), func(2), func(1), func(0));
}
// only called if T is arithmetic (integral or floating point types)
template<typename T, typename std::enable_if<std::is_arithmetic<T>::value, int>::type = 0>
SIMD (const T & val)
{
data = _mm512_set1_pd(val);
}
SIMD (double const * p)
{
data = _mm512_loadu_pd(p);
}
NG_INLINE operator __m512d() const { return data; }
NG_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
NG_INLINE __m512d Data() const { return data; }
NG_INLINE __m512d & Data() { return data; }
NG_INLINE SIMD<double,8> &operator+= (SIMD<double,8> b) { data+=b.Data(); return *this; }
NG_INLINE SIMD<double,8> &operator-= (SIMD<double,8> b) { data-=b.Data(); return *this; }
NG_INLINE SIMD<double,8> &operator*= (SIMD<double,8> b) { data*=b.Data(); return *this; }
NG_INLINE SIMD<double,8> &operator/= (SIMD<double,8> b) { data/=b.Data(); return *this; }
};
NG_INLINE SIMD<double,8> operator- (SIMD<double,8> a) { return _mm512_sub_pd(_mm512_setzero_pd(), a.Data()); }
NG_INLINE SIMD<double,8> operator+ (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_add_pd(a.Data(),b.Data()); }
NG_INLINE SIMD<double,8> operator- (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_sub_pd(a.Data(),b.Data()); }
NG_INLINE SIMD<double,8> operator* (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_mul_pd(a.Data(),b.Data()); }
NG_INLINE SIMD<double,8> operator/ (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_div_pd(a.Data(),b.Data()); }
NG_INLINE SIMD<double,8> sqrt (SIMD<double,8> a) { return _mm512_sqrt_pd(a.Data()); }
NG_INLINE SIMD<double,8> floor (SIMD<double,8> a) { return _mm512_floor_pd(a.Data()); }
NG_INLINE SIMD<double,8> ceil (SIMD<double,8> a) { return _mm512_ceil_pd(a.Data()); }
NG_INLINE SIMD<double,8> fabs (SIMD<double,8> a) { return _mm512_max_pd(a.Data(), -a.Data()); }
NG_INLINE SIMD<double,8> L2Norm2 (SIMD<double,8> a) { return a.Data()*a.Data(); }
NG_INLINE SIMD<double,8> Trans (SIMD<double,8> a) { return a; }
NG_INLINE SIMD<double,8> IfPos (SIMD<double,8> a, SIMD<double,8> b, SIMD<double,8> c)
{
auto cp = _mm512_cmp_pd_mask (a.Data(), _mm512_setzero_pd(), _MM_CMPINT_GT);
return _mm512_mask_blend_pd(cp, c.Data(), b.Data());
}
template<> NG_INLINE auto FMA (SIMD<double,8> a, SIMD<double,8> b, SIMD<double,8> c)
{
return _mm512_fmadd_pd (a.Data(), b.Data(), c.Data());
}
NG_INLINE double HSum (SIMD<double,8> sd)
{
SIMD<double,4> low = _mm512_extractf64x4_pd(sd.Data(),0);
SIMD<double,4> high = _mm512_extractf64x4_pd(sd.Data(),1);
return HSum(low)+HSum(high);
}
NG_INLINE auto HSum (SIMD<double,8> sd1, SIMD<double,8> sd2)
{
return std::make_tuple(HSum(sd1), HSum(sd2));
}
NG_INLINE SIMD<double,4> HSum (SIMD<double,8> v1, SIMD<double,8> v2, SIMD<double,8> v3, SIMD<double,8> v4)
{
SIMD<double,4> high1 = _mm512_extractf64x4_pd(v1.Data(),1);
SIMD<double,4> high2 = _mm512_extractf64x4_pd(v2.Data(),1);
SIMD<double,4> high3 = _mm512_extractf64x4_pd(v3.Data(),1);
SIMD<double,4> high4 = _mm512_extractf64x4_pd(v4.Data(),1);
SIMD<double,4> low1 = _mm512_extractf64x4_pd(v1.Data(),0);
SIMD<double,4> low2 = _mm512_extractf64x4_pd(v2.Data(),0);
SIMD<double,4> low3 = _mm512_extractf64x4_pd(v3.Data(),0);
SIMD<double,4> low4 = _mm512_extractf64x4_pd(v4.Data(),0);
return HSum(low1,low2,low3,low4) + HSum(high1,high2,high3,high4);
}
#endif // __AVX512F__
////////////////////////////////////////////////////////////////////////////////
// MultiSIMD - Multiple SIMD values in one struct using head-tail implementation
////////////////////////////////////////////////////////////////////////////////
template <int D, typename T>
class MultiSIMD
{
SIMD<T> head;
MultiSIMD<D-1,T> tail;
public:
MultiSIMD () = default;
MultiSIMD (const MultiSIMD & ) = default;
MultiSIMD (T v) : head(v), tail(v) { ; }
MultiSIMD (SIMD<T> _head, MultiSIMD<D-1,T> _tail)
: head(_head), tail(_tail) { ; }
SIMD<T> Head() const { return head; }
MultiSIMD<D-1,T> Tail() const { return tail; }
SIMD<T> & Head() { return head; }
MultiSIMD<D-1,T> & Tail() { return tail; }
template <int NR>
SIMD<T> Get() const { return NR==0 ? head : tail.template Get<NR-1>(); }
template <int NR>
SIMD<T> & Get() { return NR==0 ? head : tail.template Get<NR-1>(); }
auto MakeTuple() { return std::tuple_cat(std::tuple<SIMD<T>&> (head), tail.MakeTuple()); }
// not yet possible for MSVC
// operator auto () { return MakeTuple(); }
};
template <typename T>
class MultiSIMD<2,T>
{
SIMD<T> v0, v1;
public:
MultiSIMD () = default;
MultiSIMD (const MultiSIMD & ) = default;
MultiSIMD (T v) : v0(v), v1(v) { ; }
MultiSIMD (SIMD<T> _v0, SIMD<T> _v1) : v0(_v0), v1(_v1) { ; }
SIMD<T> Head() const { return v0; }
SIMD<T> Tail() const { return v1; }
SIMD<T> & Head() { return v0; }
SIMD<T> & Tail() { return v1; }
template <int NR>
SIMD<T> Get() const { return NR==0 ? v0 : v1; }
template <int NR>
SIMD<T> & Get() { return NR==0 ? v0 : v1; }
auto MakeTuple() { return std::tuple<SIMD<T>&, SIMD<T>&> (v0, v1); }
operator std::tuple<SIMD<T>&, SIMD<T>&>() { return MakeTuple(); }
};
template <int D> NG_INLINE MultiSIMD<D,double> operator+ (MultiSIMD<D,double> a, MultiSIMD<D,double> b)
{ return MultiSIMD<D,double> (a.Head()+b.Head(), a.Tail()+b.Tail()); }
template <int D> NG_INLINE MultiSIMD<D,double> operator+ (double a, MultiSIMD<D,double> b)
{ return MultiSIMD<D,double> (a+b.Head(), a+b.Tail()); }
template <int D> NG_INLINE MultiSIMD<D,double> operator+ (MultiSIMD<D,double> b, double a)
{ return MultiSIMD<D,double> (a+b.Head(), a+b.Tail()); }
template <int D> NG_INLINE MultiSIMD<D,double> operator- (MultiSIMD<D,double> a, MultiSIMD<D,double> b)
{ return MultiSIMD<D,double> (a.Head()-b.Head(), a.Tail()-b.Tail()); }
template <int D> NG_INLINE MultiSIMD<D,double> operator- (double a, MultiSIMD<D,double> b)
{ return MultiSIMD<D,double> (a-b.Head(), a-b.Tail()); }
template <int D> NG_INLINE MultiSIMD<D,double> operator- (MultiSIMD<D,double> b, double a)
{ return MultiSIMD<D,double> (b.Head()-a, b.Tail()-a); }
template <int D> NG_INLINE MultiSIMD<D,double> operator- (MultiSIMD<D,double> a)
{ return MultiSIMD<D,double> (-a.Head(), -a.Tail()); }
template <int D> NG_INLINE MultiSIMD<D,double> operator* (MultiSIMD<D,double> a, MultiSIMD<D,double> b)
{ return MultiSIMD<D,double> (a.Head()*b.Head(), a.Tail()*b.Tail()); }
template <int D> NG_INLINE MultiSIMD<D,double> operator/ (MultiSIMD<D,double> a, MultiSIMD<D,double> b)
{ return MultiSIMD<D,double> (a.Head()/b.Head(), a.Tail()/b.Tail()); }
template <int D> NG_INLINE MultiSIMD<D,double> operator* (double a, MultiSIMD<D,double> b)
{ return MultiSIMD<D,double> ( a*b.Head(), a*b.Tail()); }
template <int D> NG_INLINE MultiSIMD<D,double> operator* (MultiSIMD<D,double> b, double a)
{ return MultiSIMD<D,double> ( a*b.Head(), a*b.Tail()); }
template <int D> NG_INLINE MultiSIMD<D,double> & operator+= (MultiSIMD<D,double> & a, MultiSIMD<D,double> b)
{ a.Head()+=b.Head(); a.Tail()+=b.Tail(); return a; }
template <int D> NG_INLINE MultiSIMD<D,double> operator-= (MultiSIMD<D,double> & a, double b)
{ a.Head()-=b; a.Tail()-=b; return a; }
template <int D> NG_INLINE MultiSIMD<D,double> operator-= (MultiSIMD<D,double> & a, MultiSIMD<D,double> b)
{ a.Head()-=b.Head(); a.Tail()-=b.Tail(); return a; }
template <int D> NG_INLINE MultiSIMD<D,double> & operator*= (MultiSIMD<D,double> & a, MultiSIMD<D,double> b)
{ a.Head()*=b.Head(); a.Tail()*=b.Tail(); return a; }
template <int D> NG_INLINE MultiSIMD<D,double> & operator*= (MultiSIMD<D,double> & a, double b)
{ a.Head()*=b; a.Tail()*=b; return a; }
// NG_INLINE MultiSIMD<double> operator/= (MultiSIMD<double> & a, MultiSIMD<double> b) { return a.Data()/=b.Data(); }
NG_INLINE SIMD<double> HVSum (SIMD<double> a) { return a; }
template <int D>
NG_INLINE SIMD<double> HVSum (MultiSIMD<D,double> a) { return a.Head() + HVSum(a.Tail()); }
template <int D> NG_INLINE double HSum (MultiSIMD<D,double> a) { return HSum(HVSum(a)); }
template <int D> NG_INLINE auto HSum (MultiSIMD<D,double> a, MultiSIMD<D,double> b)
{ return HSum(HVSum(a), HVSum(b)); }
template <int D, typename T>
std::ostream & operator<< (std::ostream & ost, MultiSIMD<D,T> multi)
{
ost << multi.Head() << " " << multi.Tail();
return ost;
}
template <typename T>
std::ostream & operator<< (std::ostream & ost, SIMD<T> simd)
{
ost << simd[0];
for (int i = 1; i < simd.Size(); i++)
ost << " " << simd[i];
return ost;
}
}
namespace netgen
{
using namespace ngsimd;
}
#endif

View File

@ -779,29 +779,23 @@ namespace netgen
#ifdef __SSE__
#include <immintrin.h>
template<> DLL_HEADER void Ngx_Mesh ::
MultiElementTransformation<1,1> (int elnr, int npts,
const tAVXd * xi, size_t sxi,
tAVXd * x, size_t sx,
tAVXd * dxdxi, size_t sdxdxi) const
const SIMD<double> * xi, size_t sxi,
SIMD<double> * x, size_t sx,
SIMD<double> * dxdxi, size_t sdxdxi) const
{
cout << "multi-eltrafo simd called, 1,1,simd" << endl;
}
template<> DLL_HEADER void Ngx_Mesh ::
MultiElementTransformation<2,2> (int elnr, int npts,
const tAVXd * xi, size_t sxi,
tAVXd * x, size_t sx,
tAVXd * dxdxi, size_t sdxdxi) const
const SIMD<double> * xi, size_t sxi,
SIMD<double> * x, size_t sx,
SIMD<double> * dxdxi, size_t sdxdxi) const
{
mesh->GetCurvedElements().CalcMultiPointSurfaceTransformation<2>
(elnr, npts,
reinterpret_cast<const SIMD<double>*> (xi), sxi,
reinterpret_cast<SIMD<double>*> (x), sx,
reinterpret_cast<SIMD<double>*> (dxdxi), sdxdxi);
(elnr, npts, xi, sxi, x, sx, dxdxi, sdxdxi);
/*
for (int i = 0; i < npts; i++)
{
@ -828,15 +822,15 @@ namespace netgen
template<> DLL_HEADER void Ngx_Mesh ::
MultiElementTransformation<3,3> (int elnr, int npts,
const tAVXd * xi, size_t sxi,
tAVXd * x, size_t sx,
tAVXd * dxdxi, size_t sdxdxi) const
const SIMD<double> * xi, size_t sxi,
SIMD<double> * x, size_t sx,
SIMD<double> * dxdxi, size_t sdxdxi) const
{
mesh->GetCurvedElements().CalcMultiPointElementTransformation
(elnr, npts,
reinterpret_cast<const SIMD<double>*> (xi), sxi,
reinterpret_cast<SIMD<double>*> (x), sx,
reinterpret_cast<SIMD<double>*> (dxdxi), sdxdxi);
xi, sxi,
x, sx,
dxdxi, sdxdxi);
/*
for (int i = 0; i < npts; i++)
{
@ -863,33 +857,30 @@ namespace netgen
template<> DLL_HEADER void Ngx_Mesh ::
MultiElementTransformation<0,2> (int elnr, int npts,
const tAVXd *xi, size_t sxi,
tAVXd * x, size_t sx,
tAVXd * dxdxi, size_t sdxdxi) const
const SIMD<double> *xi, size_t sxi,
SIMD<double> * x, size_t sx,
SIMD<double> * dxdxi, size_t sdxdxi) const
{
cout << "MultiElementtransformation<0,2> simd not implemented" << endl;
}
template<> DLL_HEADER void Ngx_Mesh ::
MultiElementTransformation<0,1> (int elnr, int npts,
const tAVXd * xi, size_t sxi,
tAVXd * x, size_t sx,
tAVXd * dxdxi, size_t sdxdxi) const
const SIMD<double> * xi, size_t sxi,
SIMD<double> * x, size_t sx,
SIMD<double> * dxdxi, size_t sdxdxi) const
{
cout << "multi-eltrafo simd called, 0,1,simd" << endl;
}
template<> DLL_HEADER void Ngx_Mesh ::
MultiElementTransformation<1,3> (int elnr, int npts,
const tAVXd * xi, size_t sxi,
tAVXd * x, size_t sx,
tAVXd * dxdxi, size_t sdxdxi) const
const SIMD<double> * xi, size_t sxi,
SIMD<double> * x, size_t sx,
SIMD<double> * dxdxi, size_t sdxdxi) const
{
mesh->GetCurvedElements().CalcMultiPointSegmentTransformation<3>
(elnr, npts,
reinterpret_cast<const SIMD<double>*> (xi), sxi,
reinterpret_cast<SIMD<double>*> (x), sx,
reinterpret_cast<SIMD<double>*> (dxdxi), sdxdxi);
(elnr, npts, xi, sxi, x, sx, dxdxi, sdxdxi);
/*
double hxi[4][1];
double hx[4][3];
@ -912,15 +903,12 @@ namespace netgen
template<> DLL_HEADER void Ngx_Mesh ::
MultiElementTransformation<1,2> (int elnr, int npts,
const tAVXd * xi, size_t sxi,
tAVXd * x, size_t sx,
tAVXd * dxdxi, size_t sdxdxi) const
const SIMD<double> * xi, size_t sxi,
SIMD<double> * x, size_t sx,
SIMD<double> * dxdxi, size_t sdxdxi) const
{
mesh->GetCurvedElements().CalcMultiPointSegmentTransformation<2>
(elnr, npts,
reinterpret_cast<const SIMD<double>*> (xi), sxi,
reinterpret_cast<SIMD<double>*> (x), sx,
reinterpret_cast<SIMD<double>*> (dxdxi), sdxdxi);
(elnr, npts, xi, sxi, x, sx, dxdxi, sdxdxi);
/*
for (int i = 0; i < npts; i++)
{
@ -947,15 +935,12 @@ namespace netgen
template<> DLL_HEADER void Ngx_Mesh ::
MultiElementTransformation<2,3> (int elnr, int npts,
const tAVXd * xi, size_t sxi,
tAVXd * x, size_t sx,
tAVXd * dxdxi, size_t sdxdxi) const
const SIMD<double> * xi, size_t sxi,
SIMD<double> * x, size_t sx,
SIMD<double> * dxdxi, size_t sdxdxi) const
{
mesh->GetCurvedElements().CalcMultiPointSurfaceTransformation<3>
(elnr, npts,
reinterpret_cast<const SIMD<double>*> (xi), sxi,
reinterpret_cast<SIMD<double>*> (x), sx,
reinterpret_cast<SIMD<double>*> (dxdxi), sdxdxi);
(elnr, npts, xi, sxi, x, sx, dxdxi, sdxdxi);
/*
for (int i = 0; i < npts; i++)
{
@ -982,9 +967,9 @@ namespace netgen
template<> DLL_HEADER void Ngx_Mesh ::
MultiElementTransformation<0,3> (int elnr, int npts,
const tAVXd * xi, size_t sxi,
tAVXd * x, size_t sx,
tAVXd * dxdxi, size_t sdxdxi) const
const SIMD<double> * xi, size_t sxi,
SIMD<double> * x, size_t sx,
SIMD<double> * dxdxi, size_t sdxdxi) const
{
for (int i = 0; i < npts; i++)
{
@ -1004,10 +989,6 @@ namespace netgen
}
#endif

View File

@ -6,15 +6,6 @@ namespace netgen
{
using namespace std;
/*
#if defined __AVX512F__
typedef __m512 tAVX;
typedef __m512d tAVXd;
#elif defined __AVX__
typedef __m256 tAVX;
typedef __m256d tAVXd;
#endif
*/
class SolutionData
{
@ -101,17 +92,15 @@ namespace netgen
return res;
}
#ifdef __SSE__
virtual bool GetMultiSurfValue (size_t selnr, size_t facetnr, size_t npts,
const ngsimd::tAVXd * xref,
const ngsimd::tAVXd * x,
const ngsimd::tAVXd * dxdxref,
ngsimd::tAVXd * values)
const SIMD<double> * xref,
const SIMD<double> * x,
const SIMD<double> * dxdxref,
SIMD<double> * values)
{
cerr << "GetMultiSurfVaue not overloaded for SIMD<double>" << endl;
return false;
}
#endif
virtual bool GetSegmentValue (int segnr, double xref, double * values)
{ return false; }