Merge branch 'cuda' into 'master'

Some cuda fixes. See merge request ngsolve/netgen!556
2025-04-15 19:11:23 +05:00 · 2023-02-16 12:55:12 +01:00 · 2023-02-16 12:55:12 +01:00 · 33a76ad6ba
commit 33a76ad6ba
parent 5851b558bb 52763603d3
4 changed files with 23 additions and 8 deletions
--- a/libsrc/core/exception.hpp
+++ b/libsrc/core/exception.hpp
@@ -87,18 +87,18 @@ namespace ngcore
 // Convenience macro to append file name and line of exception origin to the string
 #define NG_EXCEPTION(s) ngcore::Exception(__FILE__ ":" NETGEN_CORE_NGEXEPTION_STR(__LINE__) "\t"+std::string(s))

-#ifdef NETGEN_ENABLE_CHECK_RANGE
+#if defined(NETGEN_ENABLE_CHECK_RANGE) && !defined(__CUDA_ARCH__)
 #define NETGEN_CHECK_RANGE(value, min, max_plus_one) \
  { if ((value)<(min) ||  (value)>=(max_plus_one)) \
      throw ngcore::RangeException(__FILE__ ":" NETGEN_CORE_NGEXEPTION_STR(__LINE__) "\t", (value), (min), (max_plus_one)); }
 #define NETGEN_CHECK_SHAPE(a,b) \
  { if(a.Shape() != b.Shape()) \
      throw ngcore::Exception(__FILE__": shape don't match"); }
-#else // NETGEN_ENABLE_CHECK_RANGE
+#else // defined(NETGEN_ENABLE_CHECK_RANGE) && !defined(__CUDA_ARCH__)
 #define NETGEN_CHECK_RANGE(value, min, max)
 #define NETGEN_CHECK_SHAPE(a,b)

-#endif // NETGEN_ENABLE_CHECK_RANGE
+#endif // defined(NETGEN_ENABLE_CHECK_RANGE) && !defined(__CUDA_ARCH__)


  
--- a/libsrc/core/localheap.hpp
+++ b/libsrc/core/localheap.hpp
@@ -136,7 +136,7 @@ public:


  /// free memory
-  NETGEN_INLINE virtual ~LocalHeap ()
+  virtual ~LocalHeap ()
  {
    if (owner)
      delete [] data;
--- a/libsrc/core/ngcore_api.hpp
+++ b/libsrc/core/ngcore_api.hpp
@@ -48,20 +48,29 @@
        #define NGCORE_API NGCORE_API_IMPORT
 #endif

+// Set __host__ __device__ for all inline functions
+#ifdef __CUDACC__
+  #define NETGEN_HD __host__ __device__
+#else // __CUDACC__
+  #define NETGEN_HD
+#endif // __CUDACC__
+
 #ifdef __INTEL_COMPILER
+  #define NETGEN_ALWAYS_INLINE __forceinline
+  #define NETGEN_INLINE __forceinline inline
  #ifdef WIN32
-    #define NETGEN_INLINE __forceinline inline
    #define NETGEN_LAMBDA_INLINE
  #else
-    #define NETGEN_INLINE __forceinline inline
    #define NETGEN_LAMBDA_INLINE __attribute__ ((__always_inline__))
  #endif
 #else
  #ifdef __GNUC__
-    #define NETGEN_INLINE __attribute__ ((__always_inline__)) inline
-    #define NETGEN_LAMBDA_INLINE __attribute__ ((__always_inline__))
+    #define NETGEN_ALWAYS_INLINE __attribute__ ((__always_inline__))
+    #define NETGEN_INLINE __attribute__ ((__always_inline__)) inline NETGEN_HD
+    #define NETGEN_LAMBDA_INLINE __attribute__ ((__always_inline__)) NETGEN_HD
    #define NETGEN_VLA
  #else
+    #define NETGEN_ALWAYS_INLINE
    #define NETGEN_INLINE inline
    #define NETGEN_LAMBDA_INLINE
  #endif
--- a/libsrc/core/simd.hpp
+++ b/libsrc/core/simd.hpp
@@ -11,6 +11,8 @@

 #include "simd_generic.hpp"

+#ifndef __CUDA_ARCH__
+
 #ifdef NETGEN_ARCH_AMD64
 #ifndef __SSE__
 #define __SSE__
@@ -30,8 +32,11 @@
 #include "simd_arm64.hpp"
 #endif

+#endif // __CUDA_ARCH__
+
 namespace ngcore
 {
+#ifndef __CUDA_ARCH__
 #ifdef NETGEN_ARCH_AMD64
  NETGEN_INLINE auto HSum (SIMD<double,2> v1, SIMD<double,2> v2, SIMD<double,2> v3, SIMD<double,2> v4)
  {
@@ -45,6 +50,7 @@ namespace ngcore
    return SIMD<mask64>::GetMaskFromBits(i);
  }
 #endif
+#endif // __CUDA_ARCH__

  NETGEN_INLINE void SIMDTranspose (SIMD<double,4> a1, SIMD<double,4> a2, SIMD <double,4> a3, SIMD<double,4> a4,
                                    SIMD<double,4> & b1, SIMD<double,4> & b2, SIMD<double,4> & b3, SIMD<double,4> & b4)