#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H

#include <hip/hcc_detail/hip_common.h>
#ifndef __HIP_ENABLE_DEVICE_MALLOC__
#define __HIP_ENABLE_DEVICE_MALLOC__ 0
#endif
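// Device-side malloc/free stay disabled unless the build defines
// __HIP_ENABLE_DEVICE_MALLOC__=1; see the device malloc/free definitions
// further down in this header.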
#if __HCC_OR_HIP_CLANG__

#if !defined(__align__)
#define __align__(x) __attribute__((aligned(x)))
#endif

#define CUDA_SUCCESS hipSuccess

#endif  // __HCC_OR_HIP_CLANG__
#ifdef HIP_ENABLE_PRINTF
#define HCC_ENABLE_ACCELERATOR_PRINTF 1
#endif
#include "grid_launch.h"
#include "hc_printf.hpp"
#if GENERIC_GRID_LAUNCH == 0
#define hipLaunchParm grid_launch_parm
#else
namespace hip_impl {
struct Empty_launch_parm {};
}  // namespace hip_impl
#define hipLaunchParm hip_impl::Empty_launch_parm
#endif  // GENERIC_GRID_LAUNCH
#if defined(GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20) || GENERIC_GRID_LAUNCH == 1
#else  // Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0.
#error(HCC must support GRID_LAUNCH_20)
#endif  // GRID_LAUNCH_VERSION
#if GENERIC_GRID_LAUNCH == 1 && defined __HCC__
#include "grid_launch_GGL.hpp"
#endif  // GENERIC_GRID_LAUNCH
#if __HCC_OR_HIP_CLANG__
extern int HIP_TRACE_API;

#include <hip/hcc_detail/hip_ldg.h>
#include <hip/hcc_detail/hip_atomic.h>
#include <hip/hcc_detail/device_functions.h>
#include <hip/hcc_detail/surface_functions.h>
#include <hip/hcc_detail/math_functions.h>
#include <hip/hcc_detail/texture_functions.h>
#include <hip/hcc_detail/texture_fetch_functions.h>
#include <hip/hcc_detail/texture_indirect_functions.h>
#if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
#define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
#endif
// Device compile pass only: route assert to an abort on the device.
#if defined(__HCC__) && __HIP_DEVICE_COMPILE__ == 1
#define assert(COND)                                                                               \
    {                                                                                              \
        if (!(COND)) {                                                                             \
            abort();                                                                               \
        }                                                                                          \
    }
#endif
#if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__

// 32-bit atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)

// 64-bit atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)

// Doubles:
#define __HIP_ARCH_HAS_DOUBLES__ (1)

// Warp cross-lane operations:
#define __HIP_ARCH_HAS_WARP_VOTE__ (1)
#define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)

// Sync:
#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)

// Misc:
#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
#define __HIP_ARCH_HAS_3DGRID__ (1)
#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)

#endif  // Device feature flags
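// These act like CUDA's architecture feature macros: device code can be
// conditionally compiled against them, e.g. (illustrative):
//   #if __HIP_ARCH_HAS_WARP_SHUFFLE__
//   val += __shfl_down(val, 16);
//   #endif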
#define launch_bounds_impl0(requiredMaxThreadsPerBlock)                                            \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)                \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),                     \
                   amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...)                                                                     \
    select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
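// select_impl_ picks its third argument, so a single user argument selects
// launch_bounds_impl0 and two arguments select launch_bounds_impl1.
// Usage (illustrative):
//   __global__ void __launch_bounds__(256) k0(float* p);
//   __global__ void __launch_bounds__(256, 2) k1(float* p);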
#if defined(__cplusplus)
#elif defined(__STDC_VERSION__)
#endif

// Host-side stub: dynamic group memory only exists on the device.
__host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
#if __HIP_ARCH_GFX701__ == 0

__device__ unsigned __hip_ds_bpermute(int index, unsigned src);
__device__ float __hip_ds_bpermutef(int index, float src);
__device__ unsigned __hip_ds_permute(int index, unsigned src);
__device__ float __hip_ds_permutef(int index, float src);

template <int pattern>
__device__ unsigned __hip_ds_swizzle_N(unsigned int src);
template <int pattern>
__device__ float __hip_ds_swizzlef_N(float src);

template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ int __hip_move_dpp_N(int src);

#endif  // __HIP_ARCH_GFX701__ == 0
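// Illustrative use of the cross-lane primitives: __hip_ds_bpermute takes a
// byte-addressed lane index, so reading the value held by lane `srcLane` of
// the wavefront is
//   unsigned v = __hip_ds_bpermute(srcLane << 2, val);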
#endif  // __HCC_OR_HIP_CLANG__
#if defined __HCC__
namespace hip_impl {
// Function objects wrapping the HC coordinate intrinsics (the enclosing
// struct declarations are reconstructed; the members are original).
struct GroupId {
    using R = decltype(hc_get_group_id(0));

    __device__ R operator()(std::uint32_t x) const noexcept { return hc_get_group_id(x); }
};

struct GroupSize {
    using R = decltype(hc_get_group_size(0));

    __device__ R operator()(std::uint32_t x) const noexcept { return hc_get_group_size(x); }
};

struct NumGroups {
    using R = decltype(hc_get_num_groups(0));

    __device__ R operator()(std::uint32_t x) const noexcept { return hc_get_num_groups(x); }
};

struct WorkitemId {
    using R = decltype(hc_get_workitem_id(0));

    __device__ R operator()(std::uint32_t x) const noexcept { return hc_get_workitem_id(x); }
};
}  // namespace hip_impl

// Exposes the x/y/z components of a coordinate as objects implicitly
// convertible to the intrinsic's return type.
template <typename F>
struct Coordinates {
    using R = decltype(F{}(0));

    struct X { __device__ operator R() const noexcept { return F{}(0); } };
    struct Y { __device__ operator R() const noexcept { return F{}(1); } };
    struct Z { __device__ operator R() const noexcept { return F{}(2); } };

    static constexpr X x{};
    static constexpr Y y{};
    static constexpr Z z{};
};
// Folds gridDim.{x,y,z} * blockDim.{x,y,z}, in either operand order, into a
// single hc_get_grid_size call (operator signatures reconstructed; the bodies
// are original).
inline __device__ std::uint32_t operator*(Coordinates<hip_impl::NumGroups>::X,
                                          Coordinates<hip_impl::GroupSize>::X) noexcept {
    return hc_get_grid_size(0);
}
inline __device__ std::uint32_t operator*(Coordinates<hip_impl::GroupSize>::X,
                                          Coordinates<hip_impl::NumGroups>::X) noexcept {
    return hc_get_grid_size(0);
}
inline __device__ std::uint32_t operator*(Coordinates<hip_impl::NumGroups>::Y,
                                          Coordinates<hip_impl::GroupSize>::Y) noexcept {
    return hc_get_grid_size(1);
}
inline __device__ std::uint32_t operator*(Coordinates<hip_impl::GroupSize>::Y,
                                          Coordinates<hip_impl::NumGroups>::Y) noexcept {
    return hc_get_grid_size(1);
}
inline __device__ std::uint32_t operator*(Coordinates<hip_impl::NumGroups>::Z,
                                          Coordinates<hip_impl::GroupSize>::Z) noexcept {
    return hc_get_grid_size(2);
}
inline __device__ std::uint32_t operator*(Coordinates<hip_impl::GroupSize>::Z,
                                          Coordinates<hip_impl::NumGroups>::Z) noexcept {
    return hc_get_grid_size(2);
}

static constexpr Coordinates<hip_impl::GroupSize> blockDim{};
static constexpr Coordinates<hip_impl::GroupId> blockIdx{};
static constexpr Coordinates<hip_impl::NumGroups> gridDim{};
static constexpr Coordinates<hip_impl::WorkitemId> threadIdx{};
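// Net effect (illustrative): an expression such as gridDim.x * blockDim.x
// compiles to one hc_get_grid_size(0) call instead of two separate intrinsic
// reads multiplied together.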
#define hipThreadIdx_x (hc_get_workitem_id(0))
#define hipThreadIdx_y (hc_get_workitem_id(1))
#define hipThreadIdx_z (hc_get_workitem_id(2))

#define hipBlockIdx_x (hc_get_group_id(0))
#define hipBlockIdx_y (hc_get_group_id(1))
#define hipBlockIdx_z (hc_get_group_id(2))

#define hipBlockDim_x (hc_get_group_size(0))
#define hipBlockDim_y (hc_get_group_size(1))
#define hipBlockDim_z (hc_get_group_size(2))

#define hipGridDim_x (hc_get_num_groups(0))
#define hipGridDim_y (hc_get_num_groups(1))
#define hipGridDim_z (hc_get_num_groups(2))

#endif  // defined __HCC__
#if __HCC_OR_HIP_CLANG__
#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#if __HIP_ENABLE_DEVICE_MALLOC__
extern "C" __device__ void* __hip_malloc(size_t);
extern "C" __device__ void* __hip_free(void* ptr);
static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }
#else
// Device malloc/free disabled: trap if device code calls them.
static inline __device__ void* malloc(size_t size) { __builtin_trap(); return nullptr; }
static inline __device__ void* free(void* ptr) { __builtin_trap(); return nullptr; }
#endif  // __HIP_ENABLE_DEVICE_MALLOC__
#endif  // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#endif  // __HCC_OR_HIP_CLANG__
#define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)

#define HIP_KERNEL_NAME(...) (__VA_ARGS__)
#define HIP_SYMBOL(X) #X
#if defined __HCC_CPP__

#if GENERIC_GRID_LAUNCH == 0
// Routes the launch through ihipPreLaunchKernel, calls the grid_launch-style
// kernel, then lets ihipPostLaunchKernel tear the launch down.
#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...)      \
    do {                                                                                           \
        grid_launch_parm lp;                                                                       \
        lp.dynamic_group_mem_bytes = _groupMemBytes;                                               \
        hipStream_t trueStream =                                                                   \
            (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp, #_kernelName));          \
        _kernelName(lp, ##__VA_ARGS__);                                                            \
        ihipPostLaunchKernel(#_kernelName, trueStream, lp);                                        \
    } while (0)
#endif  // GENERIC_GRID_LAUNCH
#elif defined(__HCC_C__)
#elif defined(__clang__) && defined(__HIP__)

#define HIP_KERNEL_NAME(...) __VA_ARGS__
#define HIP_SYMBOL(X) X
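// HIP_KERNEL_NAME lets template kernels whose names contain commas pass
// through the launch macros as one argument, e.g. (illustrative):
//   hipLaunchKernelGGL(HIP_KERNEL_NAME(axpy<float>), dim3(blocks), dim3(256), 0, 0, x, y, n);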
typedef int hipLaunchParm;
template <std::size_t n, typename... Ts,
          typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>&, void*) {}

template <std::size_t n, typename... Ts,
          typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>& formals, void** _vargs) {
    using T = typename std::tuple_element<n, std::tuple<Ts...> >::type;

    static_assert(!std::is_reference<T>{},
                  "A __global__ function cannot have a reference as one of its "
                  "arguments.");
#if defined(HIP_STRICT)
    static_assert(std::is_trivially_copyable<T>{},
                  "Only TriviallyCopyable types can be arguments to a __global__ "
                  "function");
#endif
    _vargs[n] = const_cast<void*>(reinterpret_cast<const void*>(&std::get<n>(formals)));
    return pArgs<n + 1>(formals, _vargs);
}
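// pArgs walks the formals tuple at compile time, recording the address of each
// element in the void* array that the underlying hipLaunchKernel call
// consumes; the n == sizeof...(Ts) overload above terminates the recursion.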
template <typename... Formals, typename... Actuals>
std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...),
                                             std::tuple<Actuals...> actuals) {
    static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
    std::tuple<Formals...> to_formals{std::move(actuals)};
    return to_formals;
}
#if defined(HIP_TEMPLATE_KERNEL_LAUNCH)
template <typename... Args, typename F = void (*)(Args...)>
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
                        std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
    constexpr size_t count = sizeof...(Args);
    auto tup_ = std::tuple<Args...>{args...};
    auto tup = validateArgsCountType(kernel, tup_);
    void* _Args[count];  // filled with the address of each kernel argument
    pArgs<0>(tup, _Args);

    auto k = reinterpret_cast<void*>(kernel);
    hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
}
#else
#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...)  \
    do {                                                                                           \
        kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__);         \
    } while (0)

#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
#endif  // HIP_TEMPLATE_KERNEL_LAUNCH
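// Example launch (illustrative, for a hypothetical kernel
// __global__ void vecAdd(const float*, float*, int)):
//   hipLaunchKernelGGL(vecAdd, dim3(numBlocks), dim3(256), 0, 0, in, out, n);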
#pragma push_macro("__DEVICE__")
#define __DEVICE__ static __device__ __forceinline__

extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
__DEVICE__ uint __hip_get_thread_idx_x() { return __ockl_get_local_id(0); }
__DEVICE__ uint __hip_get_thread_idx_y() { return __ockl_get_local_id(1); }
__DEVICE__ uint __hip_get_thread_idx_z() { return __ockl_get_local_id(2); }

extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
__DEVICE__ uint __hip_get_block_idx_x() { return __ockl_get_group_id(0); }
__DEVICE__ uint __hip_get_block_idx_y() { return __ockl_get_group_id(1); }
__DEVICE__ uint __hip_get_block_idx_z() { return __ockl_get_group_id(2); }

extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
__DEVICE__ uint __hip_get_block_dim_x() { return __ockl_get_local_size(0); }
__DEVICE__ uint __hip_get_block_dim_y() { return __ockl_get_local_size(1); }
__DEVICE__ uint __hip_get_block_dim_z() { return __ockl_get_local_size(2); }

extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(uint);
__DEVICE__ uint __hip_get_grid_dim_x() { return __ockl_get_num_groups(0); }
__DEVICE__ uint __hip_get_grid_dim_y() { return __ockl_get_num_groups(1); }
__DEVICE__ uint __hip_get_grid_dim_z() { return __ockl_get_num_groups(2); }
// Declares a member DIMENSION whose reads route through a getter, courtesy of
// __declspec(property); threadIdx.x thus lowers to __ockl_get_local_id(0).
#define __HIP_DEVICE_BUILTIN(DIMENSION, FUNCTION)                                                  \
    __declspec(property(get = __get_##DIMENSION)) uint DIMENSION;                                  \
    __DEVICE__ uint __get_##DIMENSION(void) {                                                      \
        return FUNCTION;                                                                           \
    }

struct __hip_builtin_threadIdx_t {
    __HIP_DEVICE_BUILTIN(x,__hip_get_thread_idx_x());
    __HIP_DEVICE_BUILTIN(y,__hip_get_thread_idx_y());
    __HIP_DEVICE_BUILTIN(z,__hip_get_thread_idx_z());
};

struct __hip_builtin_blockIdx_t {
    __HIP_DEVICE_BUILTIN(x,__hip_get_block_idx_x());
    __HIP_DEVICE_BUILTIN(y,__hip_get_block_idx_y());
    __HIP_DEVICE_BUILTIN(z,__hip_get_block_idx_z());
};

struct __hip_builtin_blockDim_t {
    __HIP_DEVICE_BUILTIN(x,__hip_get_block_dim_x());
    __HIP_DEVICE_BUILTIN(y,__hip_get_block_dim_y());
    __HIP_DEVICE_BUILTIN(z,__hip_get_block_dim_z());
};

struct __hip_builtin_gridDim_t {
    __HIP_DEVICE_BUILTIN(x,__hip_get_grid_dim_x());
    __HIP_DEVICE_BUILTIN(y,__hip_get_grid_dim_y());
    __HIP_DEVICE_BUILTIN(z,__hip_get_grid_dim_z());
};

#undef __HIP_DEVICE_BUILTIN
#pragma pop_macro("__DEVICE__")

extern const __device__ __attribute__((weak)) __hip_builtin_threadIdx_t threadIdx;
extern const __device__ __attribute__((weak)) __hip_builtin_blockIdx_t blockIdx;
extern const __device__ __attribute__((weak)) __hip_builtin_blockDim_t blockDim;
extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
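// With the weak builtins above, device code uses the familiar CUDA-style
// spellings directly, e.g. (illustrative):
//   int tid = blockIdx.x * blockDim.x + threadIdx.x;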
#define hipThreadIdx_x threadIdx.x
#define hipThreadIdx_y threadIdx.y
#define hipThreadIdx_z threadIdx.z

#define hipBlockIdx_x blockIdx.x
#define hipBlockIdx_y blockIdx.y
#define hipBlockIdx_z blockIdx.z

#define hipBlockDim_x blockDim.x
#define hipBlockDim_y blockDim.y
#define hipBlockDim_z blockDim.z

#define hipGridDim_x gridDim.x
#define hipGridDim_y gridDim.y
#define hipGridDim_z gridDim.z
#include <hip/hcc_detail/math_functions.h>
#if __HIP_HCC_COMPAT_MODE__
// Define HCC's hc_get_* coordinate functions in terms of the HIP builtins above.
#pragma push_macro("__DEFINE_HCC_FUNC")
#define __DEFINE_HCC_FUNC(hc_fun, hip_var)                                                         \
    inline __device__ __attribute__((always_inline)) uint hc_get_##hc_fun(uint i) {                \
        if (i == 0)                                                                                \
            return hip_var.x;                                                                      \
        else if (i == 1)                                                                           \
            return hip_var.y;                                                                      \
        else                                                                                       \
            return hip_var.z;                                                                      \
    }

__DEFINE_HCC_FUNC(workitem_id, threadIdx)
__DEFINE_HCC_FUNC(group_id, blockIdx)
__DEFINE_HCC_FUNC(group_size, blockDim)
__DEFINE_HCC_FUNC(num_groups, gridDim)
#pragma pop_macro("__DEFINE_HCC_FUNC")

extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(uint);
// Absolute (grid-wide) work-item id for the given dimension.
inline __device__ __attribute__((always_inline)) uint hc_get_workitem_absolute_id(int dim)
{
    return (uint)__ockl_get_global_id(dim);
}

#endif  // __HIP_HCC_COMPAT_MODE__
#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__

#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
#pragma push_macro("__CUDA__")
#define __CUDA__
#include <__clang_cuda_math_forward_declares.h>
#include <__clang_cuda_complex_builtins.h>

// Workaround for using the C++ standard library with HIP-Clang.
#include <include/cuda_wrappers/algorithm>
#include <include/cuda_wrappers/complex>
#include <include/cuda_wrappers/new>
#undef __CUDA__
#pragma pop_macro("__CUDA__")
#endif  // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
#endif  // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#endif  // defined(__clang__) && defined(__HIP__)

#include <hip/hcc_detail/hip_memory.h>

#endif  // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H