#ifndef NETGEN_CORE_SIMD_HPP
#define NETGEN_CORE_SIMD_HPP

/**************************************************************************/
/* File:   simd.hpp                                                       */
/* Author: Joachim Schoeberl, Matthias Hochsteger                         */
/* Date:   25. Mar. 16                                                    */
/**************************************************************************/

#include "ngcore_api.hpp"

#include "simd_generic.hpp"

#ifndef __CUDA_ARCH__

#ifdef NETGEN_ARCH_AMD64
#ifndef __SSE__
#define __SSE__
#endif
#include "simd_sse.hpp"
#endif

#ifdef __AVX__
#include "simd_avx.hpp"
#endif

#ifdef __AVX512F__
#include "simd_avx512.hpp"
#endif

#ifdef __aarch64__
#include "simd_arm64.hpp"
#endif

#endif // __CUDA_ARCH__

namespace ngcore
{
#ifndef __CUDA_ARCH__
#ifdef NETGEN_ARCH_AMD64
  /*
  NETGEN_INLINE auto HSum (SIMD<double,2> v1, SIMD<double,2> v2, SIMD<double,2> v3, SIMD<double,2> v4)
  {
    SIMD<double,2> hsum1 = my_mm_hadd_pd (v1.Data(), v2.Data());
    SIMD<double,2> hsum2 = my_mm_hadd_pd (v3.Data(), v4.Data());
    return SIMD<double,4> (hsum1, hsum2);
  }
  */
  
  NETGEN_INLINE auto GetMaskFromBits( unsigned int i )
  {
    return SIMD<mask64>::GetMaskFromBits(i);
  }
#endif
#endif // __CUDA_ARCH__
  
  NETGEN_INLINE void SIMDTranspose (SIMD<double,4> a1, SIMD<double,4> a2, SIMD <double,4> a3, SIMD<double,4> a4,
                                    SIMD<double,4> & b1, SIMD<double,4> & b2, SIMD<double,4> & b3, SIMD<double,4> & b4)
  {
    if constexpr (sizeof(a1.Lo()) == 16)
      {
        auto [h1,h2] = Unpack(a1,a2);
        auto [h3,h4] = Unpack(a3,a4);
        b1 = SIMD<double,4> (h1.Lo(), h3.Lo());
        b2 = SIMD<double,4> (h2.Lo(), h4.Lo());
        b3 = SIMD<double,4> (h1.Hi(), h3.Hi());
        b4 = SIMD<double,4> (h2.Hi(), h4.Hi());
      }
    else
      {
        b1 = SIMD<double,4> (a1[0], a2[0], a3[0], a4[0]);
        b2 = SIMD<double,4> (a1[1], a2[1], a3[1], a4[1]);
        b3 = SIMD<double,4> (a1[2], a2[2], a3[2], a4[2]);
        b4 = SIMD<double,4> (a1[3], a2[3], a3[3], a4[3]);
      }
  }
  
  template<int N>
  NETGEN_INLINE auto HSum (SIMD<double,N> s1, SIMD<double,N> s2)
  {
    return SIMD<double,2>(HSum(s1), HSum(s2));
  }

  template<int N>
  NETGEN_INLINE auto HSum (SIMD<double,N> s1, SIMD<double,N> s2, SIMD<double,N> s3, SIMD<double,N> s4 )
  {
    // return SIMD<double,4>(HSum(s1), HSum(s2), HSum(s3), HSum(s4));
    return SIMD<double,4>(HSum(s1, s2), HSum(s3,s4));
  }
}

#endif // NETGEN_CORE_SIMD_HPP