Merge branch 'simd_fixes' into 'master'

Some fixes for odd SIMD sizes. See merge request jschoeberl/netgen!499
2025-06-07 02:37:52 +05:00 · 2022-04-15 14:40:43 +00:00 · 2022-04-15 14:40:43 +00:00 · 693135d52c
commit 693135d52c
parent 39cc7ae0a3 e4ff37887b
5 changed files with 59 additions and 9 deletions
--- a/libsrc/core/simd_arm64.hpp
+++ b/libsrc/core/simd_arm64.hpp
@ -42,6 +42,7 @@ namespace ngcore
    SIMD (const SIMD &) = default;
    // Commented-out alternative that initializes data element-wise:
    // SIMD (double v0, double v1) : data{v0,v1} { }
    // Builds the two-lane vector from two scalars; v0 is passed as the first and
    // v1 as the second operand of vcombine_f64.
    SIMD (double v0, double v1) : data{vcombine_f64(float64x1_t{v0}, float64x1_t{v1})} { }
    // Same construction from two one-lane SIMD values, using their Data() payloads
    // (v0 first, v1 second).
    SIMD (SIMD<double,1> v0, SIMD<double,1> v1) : data{vcombine_f64(float64x1_t{v0.Data()}, float64x1_t{v1.Data()})} { }
    // Builds the vector from a std::array; arr[0] and arr[1] are listed in that
    // order in the initializer of data.
    SIMD (std::array<double, 2> arr) : data{arr[0], arr[1]} { } 
    SIMD & operator= (const SIMD &) = default;
--- a/libsrc/core/simd_avx.hpp
+++ b/libsrc/core/simd_avx.hpp
@ -143,8 +143,6 @@ namespace ngcore
    // Mutable lane access: returns a reference to lane i by viewing the storage
    // of data as an array of doubles. The index i is not range-checked.
    NETGEN_INLINE double & operator[] (int i) { return ((double*)(&data))[i]; }
    // Commented-out variant of the same operator carrying a [[deprecated]] attribute:
    // [[deprecated("don't write to individual elements of SIMD")]]
    // NETGEN_INLINE double & operator[] (int i) { return ((double*)(&data))[i]; }
    // Returns lane I by viewing the storage of data as an array of doubles.
    // I is a template argument but is not range-checked in this definition.
    // NOTE(review): a second Get<I>() with a static_assert on I appears further
    // down in this listing; presumably this is the definition it replaces - confirm.
    template <int I>
    double Get() const { return ((double*)(&data))[I]; }
    // Read access to the underlying __m256d value (returned by value).
    NETGEN_INLINE __m256d Data() const { return data; }
    // Mutable access to the underlying __m256d value (returned by reference).
    NETGEN_INLINE __m256d & Data() { return data; }
@ -153,6 +151,13 @@ namespace ngcore
    // Exposes the four lanes as a tuple of mutable references (lane 0 first),
    // obtained through the non-const operator[] of this object.
    operator std::tuple<double&,double&,double&,double&> ()
    {
      auto & self = *this;
      return std::tuple<double&,double&,double&,double&>(self[0], self[1], self[2], self[3]);
    }
    // Compile-time indexed lane read: returns lane I via operator[] and rejects
    // indices outside [0,4) with a static_assert.
    template <int I>
    double Get() const
    {
      static_assert(0<=I && I<4, "Index out of range");
      const auto & self = *this;
      return self[I];
    }
  };
  NETGEN_INLINE auto Unpack (SIMD<double,4> a, SIMD<double,4> b)
--- a/libsrc/core/simd_avx512.hpp
+++ b/libsrc/core/simd_avx512.hpp
@ -92,6 +92,12 @@ namespace ngcore
    // Masked load from p: calls _mm512_mask_loadu_pd with an all-zero source
    // vector, the mask's Data() and p. Per the Intel intrinsic's documented
    // semantics, lanes whose mask bit is clear take their value from the source
    // vector (here zero).
    SIMD (double const * p, SIMD<mask64,8> mask)
      { data = _mm512_mask_loadu_pd(_mm512_setzero_pd(), mask.Data(), p); }
    // Wraps an existing __m512d value.
    SIMD (__m512d _data) { data = _data; }
    // Concatenates two 4-wide vectors. _mm512_set_pd lists its arguments from the
    // highest lane down to lane 0, so v0[0..3] fill lanes 0..3 and v1[0..3] fill
    // lanes 4..7.
    SIMD (SIMD<double,4> v0, SIMD<double,4> v1)
        : data(_mm512_set_pd(v1[3], v1[2], v1[1], v1[0], v0[3], v0[2], v0[1], v0[0]))
    {}
    // Concatenates a 6-wide and a 2-wide vector with the same argument ordering:
    // v0[0..5] fill lanes 0..5 and v1[0..1] fill lanes 6..7.
    SIMD (SIMD<double,6> v0, SIMD<double,2> v1)
        : data(_mm512_set_pd(v1[1], v1[0], v0[5], v0[4], v0[3], v0[2], v0[1], v0[0]))
    {}
    template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
    SIMD (const T & func)
@ -129,6 +135,12 @@ namespace ngcore
    // Read access to the underlying __m512d value (returned by value).
    NETGEN_INLINE __m512d Data() const { return data; }
    // Mutable access to the underlying __m512d value (returned by reference).
    NETGEN_INLINE __m512d & Data() { return data; }
    // Compile-time indexed lane read: returns lane I via operator[] and rejects
    // indices outside [0,8) with a static_assert.
    template <int I>
    double Get() const
    {
      static_assert(0<=I && I<8, "Index out of range");
      const auto & self = *this;
      return self[I];
    }
  };
  // Unary minus for 8-wide double vectors: negates the underlying __m512d value
  // and returns it wrapped in a SIMD<double,8>.
  NETGEN_INLINE SIMD<double,8> operator- (SIMD<double,8> a)
  {
    const __m512d negated = -a.Data();
    return negated;
  }
--- a/libsrc/core/simd_generic.hpp
+++ b/libsrc/core/simd_generic.hpp
@ -28,6 +28,28 @@ namespace ngcore
 #endif
  }
  // Reports whether n is a vector width (in lanes) that this build treats as
  // native, as selected by the architecture macros below.
  constexpr bool IsNativeSIMDSize(int n) {
    switch(n) {
      case 1:  // width 1 is accepted unconditionally
        return true;
 #if defined NETGEN_ARCH_AMD64 || defined  __SSE__ || defined __aarch64__
      case 2:
        return true;
 #endif
 #if defined __AVX__
      case 4:
        return true;
 #endif
 #if defined __AVX512F__
      case 8:
        return true;
 #endif
      default:
        return false;
    }
  }
  // Split n = k+l such that k is the largest natively supported SIMD size < n.
  // Returns k. Width 1 is always reported as native, so for n >= 2 the search
  // stops at k == 1 at the latest. For n <= 1 there is no native width below
  // n; 0 is returned in that case instead of decrementing k without bound.
  constexpr int GetLargestNativeSIMDPart(int n) {
      if(n <= 1)
          return 0;
      int k = n-1;
      while(!IsNativeSIMDSize(k))
          k--;
      return k;
  }
  template <typename T, int N=GetDefaultSIMDSize()> class SIMD;
@ -67,9 +89,9 @@ namespace ngcore
  template <int N>
-  class alignas(GetDefaultSIMDSize()*sizeof(int64_t)) SIMD<mask64,N>
+  class alignas(GetLargestNativeSIMDPart(N)*sizeof(int64_t)) SIMD<mask64,N>
  {
-    static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
+    static constexpr int N1 = GetLargestNativeSIMDPart(N);
    static constexpr int N2 = N-N1;
    SIMD<mask64,N1> lo;
@ -123,9 +145,9 @@ namespace ngcore
  };
  template<int N>
-  class alignas(GetDefaultSIMDSize()*sizeof(int64_t)) SIMD<int64_t,N>
+  class alignas(GetLargestNativeSIMDPart(N)*sizeof(int64_t)) SIMD<int64_t,N>
  {
-    static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
+    static constexpr int N1 = GetLargestNativeSIMDPart(N);
    static constexpr int N2 = N-N1;
    SIMD<int64_t,N1> lo;
@ -240,9 +262,9 @@ namespace ngcore
  template<int N>
-  class alignas(GetDefaultSIMDSize()*sizeof(double)) SIMD<double, N>
+  class alignas(GetLargestNativeSIMDPart(N)*sizeof(double)) SIMD<double, N>
  {
-    static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
+    static constexpr int N1 = GetLargestNativeSIMDPart(N);
    static constexpr int N2 = N-N1;
    SIMD<double, N1> lo;
@ -543,7 +565,7 @@ namespace ngcore
  template <int i, typename T, int N>
-  T get(SIMD<T,N> a) { return a[i]; }
+  T get(SIMD<T,N> a) { return a.template Get<i>(); }
  template <int NUM, typename FUNC>
  NETGEN_INLINE void Iterate2 (FUNC f)
--- a/libsrc/core/simd_sse.hpp
+++ b/libsrc/core/simd_sse.hpp
@ -86,6 +86,9 @@ NETGEN_INLINE SIMD<int64_t,2> operator- (SIMD<int64_t,2> a, SIMD<int64_t,2> b) {
    SIMD () {}
    SIMD (const SIMD &) = default;
    // Builds the two-lane vector from two scalars. _mm_set_pd takes the upper
    // lane first, so passing (v1,v0) puts v0 in lane 0 and v1 in lane 1.
    SIMD (double v0, double v1) { data = _mm_set_pd(v1,v0); }
    // Builds the two-lane vector from two one-lane values with v0 in lane 0 and
    // v1 in lane 1, matching the (double,double) and std::array constructors of
    // this class. _mm_set_pd takes the upper lane first, so v1 has to be passed
    // before v0.
    SIMD (SIMD<double,1> v0, SIMD<double,1> v1)
        : data{_mm_set_pd(v1.Data(), v0.Data())}
    { }
    // Builds the vector from a std::array. _mm_set_pd takes the upper lane
    // first, so arr[0] ends up in lane 0 and arr[1] in lane 1.
    SIMD (std::array<double, 2> arr)
        : data{_mm_set_pd(arr[1], arr[0])}
    {}
@ -137,6 +140,13 @@ NETGEN_INLINE SIMD<int64_t,2> operator- (SIMD<int64_t,2> a, SIMD<int64_t,2> b) {
    // Read access to the underlying __m128d value (returned by value).
    NETGEN_INLINE __m128d Data() const { return data; }
    // Mutable access to the underlying __m128d value (returned by reference).
    NETGEN_INLINE __m128d & Data() { return data; }
    template <int I>
    double Get()
    {
      static_assert(I>=0 && I<2, "Index out of range");
      return (*this)[I];
    }
    operator std::tuple<double&,double&> ()
    {
      auto pdata = (double*)&data;