Mirror of https://github.com/NGSolve/netgen.git, synced 2024-11-11 16:49:16 +05:00
Merge branch 'simd_fixes' into 'master'

Some fixes for odd SIMD sizes

See merge request jschoeberl/netgen!499
Commit 693135d52c
@@ -42,6 +42,7 @@ namespace ngcore
SIMD (const SIMD &) = default;
// SIMD (double v0, double v1) : data{v0,v1} { }
SIMD (double v0, double v1) : data{vcombine_f64(float64x1_t{v0}, float64x1_t{v1})} { }
+ SIMD (SIMD<double,1> v0, SIMD<double,1> v1) : data{vcombine_f64(float64x1_t{v0.Data()}, float64x1_t{v1.Data()})} { }
SIMD (std::array<double, 2> arr) : data{arr[0], arr[1]} { }

SIMD & operator= (const SIMD &) = default;
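Below is a minimal usage sketch of the two-half constructor added above. It is not part of the commit; the include path and the SIMD<double,1> broadcast constructor are assumptions taken from the surrounding ngcore headers.

#include <core/ngcore.hpp>   // assumed umbrella header for the ngcore SIMD types
using namespace ngcore;

SIMD<double,2> CombineHalves ()
{
  SIMD<double,1> lo(1.0), hi(2.0);                 // two scalar lanes
  SIMD<double,2> v(lo, hi);                        // new constructor: lanes {1.0, 2.0} (vcombine_f64 on NEON)
  SIMD<double,2> w(std::array<double,2>{3.0, 4.0});
  return v + w;                                    // element-wise add -> {4.0, 6.0}
}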
@@ -143,8 +143,6 @@ namespace ngcore
NETGEN_INLINE double & operator[] (int i) { return ((double*)(&data))[i]; }
// [[deprecated("don't write to individual elements of SIMD")]]
// NETGEN_INLINE double & operator[] (int i) { return ((double*)(&data))[i]; }
- template <int I>
- double Get() const { return ((double*)(&data))[I]; }
NETGEN_INLINE __m256d Data() const { return data; }
NETGEN_INLINE __m256d & Data() { return data; }

@@ -153,6 +151,13 @@ namespace ngcore

operator std::tuple<double&,double&,double&,double&> ()
{ return std::tuple<double&,double&,double&,double&>((*this)[0], (*this)[1], (*this)[2], (*this)[3]); }

+ template <int I>
+ double Get() const
+ {
+   static_assert(I>=0 && I<4, "Index out of range");
+   return (*this)[I];
+ }
};

NETGEN_INLINE auto Unpack (SIMD<double,4> a, SIMD<double,4> b)
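For illustration, a sketch of how the tuple-of-references conversion and the new compile-time checked Get<I>() can be used together. This example is not from the commit and assumes the ngcore SIMD headers are included as in the first sketch.

using namespace ngcore;

double LaneAccess (SIMD<double,4> & v)
{
  // view the four lanes as a tuple of references into v's data
  std::tuple<double&,double&,double&,double&> lanes = v;
  std::get<0>(lanes) = 42.0;     // writes lane 0 of v
  return v.Get<3>();             // bounds-checked read; Get<4>() would not compile
}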
@@ -92,6 +92,12 @@ namespace ngcore
SIMD (double const * p, SIMD<mask64,8> mask)
{ data = _mm512_mask_loadu_pd(_mm512_setzero_pd(), mask.Data(), p); }
SIMD (__m512d _data) { data = _data; }
+ SIMD (SIMD<double,4> v0, SIMD<double,4> v1)
+   : data(_mm512_set_pd(v1[3], v1[2], v1[1], v1[0], v0[3], v0[2], v0[1], v0[0]))
+ {}
+ SIMD (SIMD<double,6> v0, SIMD<double,2> v1)
+   : data(_mm512_set_pd(v1[1], v1[0], v0[5], v0[4], v0[3], v0[2], v0[1], v0[0]))
+ {}

template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
SIMD (const T & func)
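A short sketch of the new AVX-512 constructors that assemble a SIMD<double,8> from smaller pieces. Illustrative only; the broadcast constructors of the smaller SIMD types are assumed from the other ngcore headers.

using namespace ngcore;

void BuildFromParts ()
{
  SIMD<double,4> a(1.0), b(2.0);   // broadcast constructors (assumed)
  SIMD<double,8> ab(a, b);         // lanes 0..3 = 1.0, lanes 4..7 = 2.0

  SIMD<double,6> c(3.0);           // generic (4+2) type, even on AVX-512 hardware
  SIMD<double,2> d(4.0);
  SIMD<double,8> cd(c, d);         // lanes 0..5 = 3.0, lanes 6..7 = 4.0
}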
@@ -129,6 +135,12 @@ namespace ngcore
NETGEN_INLINE __m512d Data() const { return data; }
NETGEN_INLINE __m512d & Data() { return data; }

+ template <int I>
+ double Get() const
+ {
+   static_assert(I>=0 && I<8, "Index out of range");
+   return (*this)[I];
+ }
};

NETGEN_INLINE SIMD<double,8> operator- (SIMD<double,8> a) { return -a.Data(); }
@@ -28,6 +28,28 @@ namespace ngcore
#endif
}

+ constexpr bool IsNativeSIMDSize(int n) {
+   if(n==1) return true;
+ #if defined NETGEN_ARCH_AMD64 || defined __SSE__ || defined __aarch64__
+   if(n==2) return true;
+ #endif
+ #if defined __AVX__
+   if(n==4) return true;
+ #endif
+ #if defined __AVX512F__
+   if(n==8) return true;
+ #endif
+   return false;
+ }
+
+ // split n = k+l such that k is the largest natively supported simd size < n
+ constexpr int GetLargestNativeSIMDPart(int n) {
+   int k = n-1;
+   while(!IsNativeSIMDSize(k))
+     k--;
+   return k;
+ }

template <typename T, int N=GetDefaultSIMDSize()> class SIMD;
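To illustrate the split described by the comment above, a few sketched compile-time checks. The concrete values depend on which instruction sets are enabled at build time; these assume AVX without AVX-512, i.e. native sizes 1, 2 and 4.

using namespace ngcore;

// an "odd" size n is stored as a native part of size k plus a remainder of size n-k
static_assert(GetLargestNativeSIMDPart(3) == 2, "3 = 2 + 1");
static_assert(GetLargestNativeSIMDPart(5) == 4, "5 = 4 + 1");
static_assert(GetLargestNativeSIMDPart(6) == 4, "6 = 4 + 2");
static_assert(GetLargestNativeSIMDPart(8) == 4, "8 = 4 + 4 (only sizes strictly smaller than n are considered)");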
@@ -67,9 +89,9 @@ namespace ngcore

template <int N>
- class alignas(GetDefaultSIMDSize()*sizeof(int64_t)) SIMD<mask64,N>
+ class alignas(GetLargestNativeSIMDPart(N)*sizeof(int64_t)) SIMD<mask64,N>
{
- static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
+ static constexpr int N1 = GetLargestNativeSIMDPart(N);
static constexpr int N2 = N-N1;

SIMD<mask64,N1> lo;
@@ -123,9 +145,9 @@ namespace ngcore
};

template<int N>
- class alignas(GetDefaultSIMDSize()*sizeof(int64_t)) SIMD<int64_t,N>
+ class alignas(GetLargestNativeSIMDPart(N)*sizeof(int64_t)) SIMD<int64_t,N>
{
- static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
+ static constexpr int N1 = GetLargestNativeSIMDPart(N);
static constexpr int N2 = N-N1;

SIMD<int64_t,N1> lo;
@@ -240,9 +262,9 @@ namespace ngcore

template<int N>
- class alignas(GetDefaultSIMDSize()*sizeof(double)) SIMD<double, N>
+ class alignas(GetLargestNativeSIMDPart(N)*sizeof(double)) SIMD<double, N>
{
- static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
+ static constexpr int N1 = GetLargestNativeSIMDPart(N);
static constexpr int N2 = N-N1;

SIMD<double, N1> lo;
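A sketch of what the new split means for an odd size such as N=5. Illustrative only; the Lo()/Hi() accessors and the broadcast constructor of the generic class are assumed from simd_generic.hpp, and the concrete part sizes assume AVX.

using namespace ngcore;

double OddSize ()
{
  SIMD<double,5> v(1.0);        // broadcast constructor (assumed)
  // with AVX, N1 = GetLargestNativeSIMDPart(5) = 4 and N2 = 1:
  SIMD<double,4> lo = v.Lo();   // one native AVX register
  SIMD<double,1> hi = v.Hi();   // scalar tail
  // before this change, N1 was std::min(GetDefaultSIMDSize(), N/2) = 2, i.e. a 2+3 split
  return lo.Get<0>() + hi.Data();
}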
@@ -543,7 +565,7 @@ namespace ngcore

template <int i, typename T, int N>
- T get(SIMD<T,N> a) { return a[i]; }
+ T get(SIMD<T,N> a) { return a.template Get<i>(); }

template <int NUM, typename FUNC>
NETGEN_INLINE void Iterate2 (FUNC f)
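A small sketch of the effect of this change: get<i> now forwards to the bounds-checked Get<I>(), so an out-of-range lane index is rejected at compile time instead of reading past the register. The example is not from the commit.

using namespace ngcore;

double FirstAndLast (SIMD<double,4> v)
{
  double first = get<0>(v);
  double last  = get<3>(v);
  // get<4>(v) would now fail to compile with "Index out of range"
  return first + last;
}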
@@ -86,6 +86,9 @@ NETGEN_INLINE SIMD<int64_t,2> operator- (SIMD<int64_t,2> a, SIMD<int64_t,2> b) {
SIMD () {}
SIMD (const SIMD &) = default;
SIMD (double v0, double v1) { data = _mm_set_pd(v1,v0); }
+ SIMD (SIMD<double,1> v0, SIMD<double,1> v1)
+   : data{_mm_set_pd(v1.Data(), v0.Data())}
+ { }
SIMD (std::array<double, 2> arr)
  : data{_mm_set_pd(arr[1], arr[0])}
{}
@@ -137,6 +140,13 @@ NETGEN_INLINE SIMD<int64_t,2> operator- (SIMD<int64_t,2> a, SIMD<int64_t,2> b) {
NETGEN_INLINE __m128d Data() const { return data; }
NETGEN_INLINE __m128d & Data() { return data; }

+ template <int I>
+ double Get()
+ {
+   static_assert(I>=0 && I<2, "Index out of range");
+   return (*this)[I];
+ }

operator std::tuple<double&,double&> ()
{
  auto pdata = (double*)&data;