// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
// LICENSE and NOTICE for details. LLNL-CODE-806117.
//
// This file is part of the MFEM library. For more information and source code
// availability visit https://mfem.org.
//
// MFEM is free software; you can redistribute it and/or modify it under the
// terms of the BSD-3 license. We welcome feedback and contributions, see file
// CONTRIBUTING.md for details.

#ifndef MFEM_CUDA_HPP
#define MFEM_CUDA_HPP

#include "../config/config.hpp"
#include "error.hpp"

// CUDA block size used by MFEM (presumably the number of threads per block in
// kernel launches -- confirm at the use sites). Defined unconditionally, i.e.
// also when MFEM_USE_CUDA is not enabled.
#define MFEM_CUDA_BLOCKS 256

#if defined(MFEM_USE_CUDA) && defined(__CUDACC__)
// Flag macro set when this header is compiled by a CUDA compiler with
// MFEM_USE_CUDA enabled (the name suggests a HIP header sets it as well -- not
// visible from this file).
#define MFEM_USE_CUDA_OR_HIP
// Thin aliases for the CUDA function execution-space qualifiers.
#define MFEM_DEVICE __device__
#define MFEM_HOST __host__
// Note: MFEM_LAMBDA expands to __host__ only, not __host__ __device__.
#define MFEM_LAMBDA __host__
// #define MFEM_HOST_DEVICE __host__ __device__ // defined in config/config.hpp
// Host-side synchronization helpers. Both wrap the CUDA call in MFEM_GPU_CHECK
// so that a status other than cudaSuccess is reported:
//  - MFEM_DEVICE_SYNC waits on the whole device (cudaDeviceSynchronize);
//  - MFEM_STREAM_SYNC waits on stream handle 0, i.e. the default stream.
// Neither expansion ends with ';' -- the caller supplies it.
#define MFEM_DEVICE_SYNC MFEM_GPU_CHECK(cudaDeviceSynchronize())
#define MFEM_STREAM_SYNC MFEM_GPU_CHECK(cudaStreamSynchronize(0))
// CUDA error check macro: MFEM_GPU_CHECK(x) evaluates 'x' exactly once, where
// 'x' must yield a 'cudaError_t'. If the result differs from cudaSuccess, it is
// handed to mfem::mfem_cuda_error() together with the text of 'x' and the
// values of _MFEM_FUNC_NAME, __FILE__ and __LINE__ at the point of use.
// The do { ... } while (0) wrapper makes the expansion behave as a single
// statement; the caller supplies the terminating ';'.
#define MFEM_GPU_CHECK(x)                                                      \
  do                                                                           \
  {                                                                            \
    const cudaError_t mfem_gpu_check_status_ = (x);                            \
    if (mfem_gpu_check_status_ != cudaSuccess)                                 \
    {                                                                          \
      ::mfem::mfem_cuda_error(mfem_gpu_check_status_, #x, _MFEM_FUNC_NAME,     \
                              __FILE__, __LINE__);                             \
    }                                                                          \
  } while (0)

// Define the MFEM inner threading macros
#if defined(__CUDA_ARCH__)
// Device-side mapping of MFEM's intra-block threading macros onto the CUDA
// built-ins. The argument 'k' is pasted directly after the '.', so it must be
// a member name of the built-in variables (x, y or z).
#define MFEM_SHARED __shared__
// Expands to the __syncthreads() call without a trailing ';' (caller adds it).
#define MFEM_SYNC_THREAD __syncthreads()
#define MFEM_BLOCK_ID(k) blockIdx.k
#define MFEM_THREAD_ID(k) threadIdx.k
#define MFEM_THREAD_SIZE(k) blockDim.k
// Loop header: 'i' starts at threadIdx.k and advances by blockDim.k while
// i < N, so the threads of a block split the range [0,N) between them along
// 'k'. The bound 'N' is parenthesized so that arguments binding more loosely
// than '<' (e.g. 'c ? a : b', 'n & mask', 'a || b') are compared as a whole
// instead of being silently re-associated with the comparison.
#define MFEM_FOREACH_THREAD(i,k,N) for(int i=threadIdx.k; i<(N); i+=blockDim.k)
// Guard header (uses the C++17 if-with-initializer): the following statement
// runs at most once per thread, with 'i' equal to threadIdx.k, and only when
// i < N. There is no striding, so indices >= blockDim.k are never visited.
// 'N' is parenthesized for the same reason as in MFEM_FOREACH_THREAD.
#define MFEM_FOREACH_THREAD_DIRECT(i,k,N) if(const int i=threadIdx.k; i<(N))
#endif // defined(__CUDA_ARCH__)
#endif // defined(MFEM_USE_CUDA) && defined(__CUDACC__)

namespace mfem
{

#if defined(MFEM_USE_CUDA) && defined(__CUDACC__)
// Function used by the macro MFEM_GPU_CHECK to report a CUDA status other than
// cudaSuccess. The macro passes the status as 'err', the stringified checked
// expression as 'expr', _MFEM_FUNC_NAME as 'func', __FILE__ as 'file' and
// __LINE__ as 'line'. Only declared here; the definition is elsewhere.
void mfem_cuda_error(cudaError_t err, const char *expr, const char *func,
                     const char *file, int line);
#endif

// NOTE: the functions below are declared unconditionally, i.e. outside the
// MFEM_USE_CUDA guard used above. They are only declared in this header, so
// details marked "confirm" must be checked against their definitions.

/// Allocates @a bytes bytes of device memory and returns destination ptr.
/** Presumably the new address is also stored in @a *d_ptr -- confirm. */
void* CuMemAlloc(void **d_ptr, size_t bytes);

/// Allocates @a bytes bytes of managed device memory.
/** Presumably stores the address in @a *d_ptr and returns it, like
    CuMemAlloc() -- confirm. */
void* CuMallocManaged(void **d_ptr, size_t bytes);

/// Allocates @a bytes bytes of page-locked (pinned) host memory.
/** Presumably stores the address in @a *ptr and returns it -- confirm. */
void* CuMemAllocHostPinned(void **ptr, size_t bytes);

/// Frees the device memory pointed to by @a d_ptr and returns a pointer.
/** The returned pointer is presumably @a d_ptr itself, which must not be
    dereferenced after this call -- confirm. */
void* CuMemFree(void *d_ptr);

/// Frees the page-locked (pinned) host memory pointed to by @a ptr and returns
/// a pointer.
/** The returned pointer is presumably @a ptr itself, which must not be
    dereferenced after this call -- confirm. */
void* CuMemFreeHostPinned(void *ptr);

/// Copies @a bytes bytes from Host (@a h_src) to Device (@a d_dst) and returns
/// destination ptr.
void* CuMemcpyHtoD(void *d_dst, const void *h_src, size_t bytes);

/// Asynchronous counterpart (as named) of CuMemcpyHtoD(): copies memory from
/// Host to Device and returns destination ptr.
/** The stream used and the point at which the copy completes are determined
    by the definition, which is not visible here. */
void* CuMemcpyHtoDAsync(void *d_dst, const void *h_src, size_t bytes);

/// Copies @a bytes bytes from Device (@a d_src) to Device (@a d_dst).
/** Presumably returns @a d_dst, like CuMemcpyHtoD() -- confirm. */
void* CuMemcpyDtoD(void *d_dst, const void *d_src, size_t bytes);

/// Asynchronous counterpart (as named) of CuMemcpyDtoD(): copies memory from
/// Device to Device.
/** The stream used and the point at which the copy completes are determined
    by the definition, which is not visible here. */
void* CuMemcpyDtoDAsync(void *d_dst, const void *d_src, size_t bytes);

/// Copies @a bytes bytes from Device (@a d_src) to Host (@a h_dst).
/** Presumably returns @a h_dst -- confirm. */
void* CuMemcpyDtoH(void *h_dst, const void *d_src, size_t bytes);

/// Asynchronous counterpart (as named) of CuMemcpyDtoH(): copies memory from
/// Device to Host.
/** The stream used and the point at which the copy completes are determined
    by the definition, which is not visible here; presumably @a h_dst must not
    be read before that point -- confirm. */
void* CuMemcpyDtoHAsync(void *h_dst, const void *d_src, size_t bytes);

/// Check the error code returned by cudaGetLastError(), aborting on error.
void CuCheckLastError();

/// Get the number of CUDA devices
int CuGetDeviceCount();

} // namespace mfem

#endif // MFEM_CUDA_HPP
