diff --git a/libsrc/core/simd_avx.hpp b/libsrc/core/simd_avx.hpp
index 0f3112a0..281b10c8 100644
--- a/libsrc/core/simd_avx.hpp
+++ b/libsrc/core/simd_avx.hpp
@@ -174,6 +174,42 @@ namespace ngcore
   NETGEN_INLINE SIMD ceil (SIMD a) { return _mm256_ceil_pd(a.Data()); }
   NETGEN_INLINE SIMD fabs (SIMD a) { return _mm256_max_pd(a.Data(), (-a).Data()); }
 
+  // FMA(a,b,c): lane-wise fused multiply-add a*b + c; only defined when the compiler reports FMA support
+#ifdef __FMA__
+  NETGEN_INLINE SIMD FMA (SIMD a, SIMD b, SIMD c)
+  {
+    return _mm256_fmadd_pd (a.Data(), b.Data(), c.Data());
+  }
+  NETGEN_INLINE SIMD FMA (const double & a, SIMD b, SIMD c)   // scalar a is broadcast to every lane first
+  {
+    return _mm256_fmadd_pd (_mm256_set1_pd(a), b.Data(), c.Data());
+  }
+#endif
+
+#if defined(__FMA__) && !defined(__AVX512F__)
+  // Force the accumulating ("231") form of fma, which updates the sum operand in place.
+  // Important in matrix kernels using 12 sum-registers, 3 a-values and an updated b-value.
+  // Not done for avx512: it has enough registers, and gcc seems to use only the first 16 z-regs.
+  NETGEN_INLINE void FMAasm (SIMD a, SIMD b, SIMD & sum)
+  {
+    asm ("vfmadd231pd %[a], %[b], %[sum]"   // sum = a*b + sum (AT&T operand order: destination last)
+         : [sum] "+x" (sum.Data())          // read-write operand held in a vector register
+         : [a] "x" (a.Data()), [b] "x" (b.Data())   // read-only inputs in vector registers
+         );
+  }
+
+  NETGEN_INLINE void FNMAasm (SIMD a, SIMD b, SIMD & sum)
+  {
+    asm ("vfnmadd231pd %[a], %[b], %[sum]"  // sum = -(a*b) + sum, i.e. the product is subtracted in place
+         : [sum] "+x" (sum.Data())          // read-write operand held in a vector register
+         : [a] "x" (a.Data()), [b] "x" (b.Data())   // read-only inputs in vector registers
+         );
+  }
+#endif
+
+
+
+
   NETGEN_INLINE SIMD operator<= (SIMD a , SIMD b)
   { return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_LE_OQ); }
   NETGEN_INLINE SIMD operator< (SIMD a , SIMD b)
diff --git a/libsrc/core/simd_generic.hpp b/libsrc/core/simd_generic.hpp
index 849e0922..e479590d 100644
--- a/libsrc/core/simd_generic.hpp
+++ b/libsrc/core/simd_generic.hpp
@@ -527,6 +527,14 @@ namespace ngcore
     sum = FMA(a,b,sum);
   }
 
+  // update form of fnma (negated multiply-add): sum = sum - a*b, written as a plain multiply and subtract
+  template
+  void FNMAasm (SIMD a, SIMD b, SIMD & sum)
+  {
+    sum -= a*b;
+  }
+
+
+
   template
   T get(SIMD a) { return a[i]; }