Some fixes for odd SIMD sizes

2025-06-03 08:27:51 +05:00 · 2022-04-15 15:27:44 +02:00 · 2022-04-15 15:27:44 +02:00 · e4ff37887b
commit e4ff37887b
parent 39cc7ae0a3
5 changed files with 59 additions and 9 deletions
--- a/libsrc/core/simd_arm64.hpp
+++ b/libsrc/core/simd_arm64.hpp
@ -42,6 +42,7 @@ namespace ngcore
    SIMD (const SIMD &) = default;
    // SIMD (double v0, double v1) : data{v0,v1} { }
    SIMD (double v0, double v1) : data{vcombine_f64(float64x1_t{v0}, float64x1_t{v1})} { }
+    SIMD (SIMD<double,1> v0, SIMD<double,1> v1) : data{vcombine_f64(float64x1_t{v0.Data()}, float64x1_t{v1.Data()})} { }  // build the 2-wide value from two 1-wide values; v0 and v1 are passed to vcombine_f64 in the same order as in the (double, double) constructor
    SIMD (std::array<double, 2> arr) : data{arr[0], arr[1]} { } 

    SIMD & operator= (const SIMD &) = default;
--- a/libsrc/core/simd_avx.hpp
+++ b/libsrc/core/simd_avx.hpp
@ -143,8 +143,6 @@ namespace ngcore
    NETGEN_INLINE double & operator[] (int i) { return ((double*)(&data))[i]; }
    // [[deprecated("don't write to individual elements of SIMD")]]
    // NETGEN_INLINE double & operator[] (int i) { return ((double*)(&data))[i]; }
-    template <int I>
-    double Get() const { return ((double*)(&data))[I]; }
    NETGEN_INLINE __m256d Data() const { return data; }
    NETGEN_INLINE __m256d & Data() { return data; }

@ -153,6 +151,13 @@ namespace ngcore

    operator std::tuple<double&,double&,double&,double&> ()
    { return std::tuple<double&,double&,double&,double&>((*this)[0], (*this)[1], (*this)[2], (*this)[3]); }
+
+    template <int I>
+    double Get() const  // read element I, with I fixed at compile time
+    {
+      static_assert(0<=I && I<4, "Index out of range");  // an I outside 0..3 fails to compile
+      return this->operator[](I);
+    }
  };

  NETGEN_INLINE auto Unpack (SIMD<double,4> a, SIMD<double,4> b)
--- a/libsrc/core/simd_avx512.hpp
+++ b/libsrc/core/simd_avx512.hpp
@ -92,6 +92,12 @@ namespace ngcore
    SIMD (double const * p, SIMD<mask64,8> mask)
      { data = _mm512_mask_loadu_pd(_mm512_setzero_pd(), mask.Data(), p); }
    SIMD (__m512d _data) { data = _data; }
+    SIMD (SIMD<double,4> v0, SIMD<double,4> v1)  // build one 8-wide value from two 4-wide values
+        : data(_mm512_set_pd(v1[3], v1[2], v1[1], v1[0], v0[3], v0[2], v0[1], v0[0]))  // NOTE(review): relies on _mm512_set_pd taking its arguments highest element first, so that v0 fills elements 0-3 and v1 elements 4-7
+    {}
+    SIMD (SIMD<double,6> v0, SIMD<double,2> v1)  // build one 8-wide value from a 6-wide and a 2-wide value
+        : data(_mm512_set_pd(v1[1], v1[0], v0[5], v0[4], v0[3], v0[2], v0[1], v0[0]))  // NOTE(review): relies on _mm512_set_pd taking its arguments highest element first, so that v0 fills elements 0-5 and v1 elements 6-7
+    {}

    template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
    SIMD (const T & func)
@ -129,6 +135,12 @@ namespace ngcore
    NETGEN_INLINE __m512d Data() const { return data; }
    NETGEN_INLINE __m512d & Data() { return data; }

+    template <int I>
+    double Get() const  // element I of the 8 doubles, with I chosen at compile time
+    {
+      static_assert(!(I<0 || I>=8), "Index out of range");  // an I outside 0..7 fails to compile
+      return this->operator[](I);
+    }
  };

  NETGEN_INLINE SIMD<double,8> operator- (SIMD<double,8> a) { return -a.Data(); }
--- a/libsrc/core/simd_generic.hpp
+++ b/libsrc/core/simd_generic.hpp
@ -28,6 +28,28 @@ namespace ngcore
 #endif
  }

+  constexpr bool IsNativeSIMDSize(int n) {  // true if SIMD size n counts as native given the architecture macros defined for this build
+    if(n==1) return true;  // size 1 is accepted unconditionally
+#if defined NETGEN_ARCH_AMD64 || defined  __SSE__ || defined __aarch64__
+    if(n==2) return true;  // only when one of the three macros above is defined
+#endif
+#if defined __AVX__
+    if(n==4) return true;  // only when __AVX__ is defined
+#endif
+#if defined __AVX512F__
+    if(n==8) return true;  // only when __AVX512F__ is defined
+#endif
+    return false;  // every other n, including n <= 0 and sizes whose macro is not defined
+  }
+
+  // Split n = k+l: returns k, the largest natively supported SIMD size strictly below n. Requires n >= 2.
+  constexpr int GetLargestNativeSIMDPart(int n) {
+      int k = n-1;  // first candidate: the largest size still below n
+      while(!IsNativeSIMDSize(k))  // stops at k==1 at the latest because size 1 is always native; for n <= 1 no k below n is native, so the search cannot succeed
+          k--;
+      return k;
+  }
+

  template <typename T, int N=GetDefaultSIMDSize()> class SIMD;

@ -67,9 +89,9 @@ namespace ngcore


  template <int N>
-  class alignas(GetDefaultSIMDSize()*sizeof(int64_t)) SIMD<mask64,N>
+  class alignas(GetLargestNativeSIMDPart(N)*sizeof(int64_t)) SIMD<mask64,N>
  {
-    static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
+    static constexpr int N1 = GetLargestNativeSIMDPart(N);
    static constexpr int N2 = N-N1;

    SIMD<mask64,N1> lo;
@ -123,9 +145,9 @@ namespace ngcore
  };

  template<int N>
-  class alignas(GetDefaultSIMDSize()*sizeof(int64_t)) SIMD<int64_t,N>
+  class alignas(GetLargestNativeSIMDPart(N)*sizeof(int64_t)) SIMD<int64_t,N>
  {
-    static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
+    static constexpr int N1 = GetLargestNativeSIMDPart(N);
    static constexpr int N2 = N-N1;

    SIMD<int64_t,N1> lo;
@ -240,9 +262,9 @@ namespace ngcore


  template<int N>
-  class alignas(GetDefaultSIMDSize()*sizeof(double)) SIMD<double, N>
+  class alignas(GetLargestNativeSIMDPart(N)*sizeof(double)) SIMD<double, N>
  {
-    static constexpr int N1 = std::min(GetDefaultSIMDSize(), N/2);
+    static constexpr int N1 = GetLargestNativeSIMDPart(N);
    static constexpr int N2 = N-N1;

    SIMD<double, N1> lo;
@ -543,7 +565,7 @@ namespace ngcore

  
  template <int i, typename T, int N>
-  T get(SIMD<T,N> a) { return a[i]; }
+  T get(SIMD<T,N> a) { return a.template Get<i>(); }  // forwards to the member template Get<i>() of a; NOTE(review): this needs a Get<I>() member on every SIMD<T,N> passed here - confirm for all SIMD types used with get<>

  template <int NUM, typename FUNC>
  NETGEN_INLINE void Iterate2 (FUNC f)
--- a/libsrc/core/simd_sse.hpp
+++ b/libsrc/core/simd_sse.hpp
@ -86,6 +86,9 @@ NETGEN_INLINE SIMD<int64_t,2> operator- (SIMD<int64_t,2> a, SIMD<int64_t,2> b) {
    SIMD () {}
    SIMD (const SIMD &) = default;
    SIMD (double v0, double v1) { data = _mm_set_pd(v1,v0); }
+    SIMD (SIMD<double,1> v0, SIMD<double,1> v1)  // build the 2-wide value from two 1-wide values: v0 -> element 0, v1 -> element 1
+        : data{_mm_set_pd(v1.Data(), v0.Data())}  // same (v1, v0) argument order as the (double, double) and std::array constructors of this class
+    { }
    SIMD (std::array<double, 2> arr)
        : data{_mm_set_pd(arr[1], arr[0])}
    {}
@ -137,6 +140,13 @@ NETGEN_INLINE SIMD<int64_t,2> operator- (SIMD<int64_t,2> a, SIMD<int64_t,2> b) {
    NETGEN_INLINE __m128d Data() const { return data; }
    NETGEN_INLINE __m128d & Data() { return data; }

+    template <int I>
+    double Get() const  // read element I (compile-time index); const-qualified so it can also be called on const SIMD objects
+    {
+      static_assert(I>=0 && I<2, "Index out of range");  // an I outside 0..1 fails to compile
+      return (*this)[I];
+    }
+
    operator std::tuple<double&,double&> ()
    {
      auto pdata = (double*)&data;