diff --git a/libsrc/core/simd_avx.hpp b/libsrc/core/simd_avx.hpp
index 281b10c8..37a0bab8 100644
--- a/libsrc/core/simd_avx.hpp
+++ b/libsrc/core/simd_avx.hpp
@@ -184,6 +184,15 @@ namespace ngcore
   {
     return _mm256_fmadd_pd (_mm256_set1_pd(a), b.Data(), c.Data());
   }
+  // negated multiply-add: c - a*b (the scalar overload broadcasts a first)
+  NETGEN_INLINE SIMD<double,4> FNMA (SIMD<double,4> a, SIMD<double,4> b, SIMD<double,4> c)
+  {
+    return _mm256_fnmadd_pd (a.Data(), b.Data(), c.Data());
+  }
+  NETGEN_INLINE SIMD<double,4> FNMA (const double & a, SIMD<double,4> b, SIMD<double,4> c)
+  {
+    return _mm256_fnmadd_pd (_mm256_set1_pd(a), b.Data(), c.Data());
+  }
 #endif
 
 #if defined(__FMA__) && !defined(__AVX512F__)
diff --git a/libsrc/core/simd_avx512.hpp b/libsrc/core/simd_avx512.hpp
index e453b7e4..bf57f4e1 100644
--- a/libsrc/core/simd_avx512.hpp
+++ b/libsrc/core/simd_avx512.hpp
@@ -234,6 +234,17 @@ namespace ngcore
   {
     return _mm512_fmadd_pd (_mm512_set1_pd(a), b.Data(), c.Data());
   }
+
+  // negated multiply-add: c - a*b (the scalar overload broadcasts a first)
+  NETGEN_INLINE SIMD<double,8> FNMA (SIMD<double,8> a, SIMD<double,8> b, SIMD<double,8> c)
+  {
+    return _mm512_fnmadd_pd (a.Data(), b.Data(), c.Data());
+  }
+  NETGEN_INLINE SIMD<double,8> FNMA (const double & a, SIMD<double,8> b, SIMD<double,8> c)
+  {
+    return _mm512_fnmadd_pd (_mm512_set1_pd(a), b.Data(), c.Data());
+  }
+
 }
 
 #endif // NETGEN_CORE_SIMD_AVX512_HPP
diff --git a/libsrc/core/simd_generic.hpp b/libsrc/core/simd_generic.hpp
index e479590d..1ad4ea99 100644
--- a/libsrc/core/simd_generic.hpp
+++ b/libsrc/core/simd_generic.hpp
@@ -513,11 +513,18 @@ namespace ngcore
   }
 
 
-  template <typename T1, typename T2, typename T3>
   // a*b+c
+  template <typename T1, typename T2, typename T3>
   NETGEN_INLINE auto FMA(T1 a, T2 b, T3 c)
   {
-    return a*b+c;
+    return c+a*b;
+  }
+
+  // c-a*b
+  template <typename T1, typename T2, typename T3>
+  NETGEN_INLINE auto FNMA(T1 a, T2 b, T3 c)
+  {
+    return c-a*b;
   }
 
   // update form of fma
@@ -531,7 +538,8 @@ namespace ngcore
   template <int N>
   void FNMAasm (SIMD<double,N> a, SIMD<double,N> b, SIMD<double,N> & sum)
   {
-    sum -= a*b;
+    // i.e. sum -= a*b, expressed through FNMA so the SIMD overloads are picked up
+    sum = FNMA(a,b,sum);
   }
 
 