HIP: Heterogenous-computing Interface for Portability
hip_runtime.h
Go to the documentation of this file.
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
28 //#pragma once
29 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H
30 #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H
31 
32 #include <hip/hcc_detail/hip_common.h>
33 
34 //---
35 // Top part of file can be compiled with any compiler
36 
37 //#include <cstring>
38 #if __cplusplus
39 #include <cmath>
40 #else
41 #include <math.h>
42 #include <string.h>
43 #include <stddef.h>
44 #endif //__cplusplus
45 
46 #if __HCC_OR_HIP_CLANG__
47 
48 #if __HIP__
49 #if !defined(__align__)
50 #define __align__(x) __attribute__((aligned(x)))
51 #endif
52 #endif
53 
54 #define CUDA_SUCCESS hipSuccess
55 
56 #include <hip/hip_runtime_api.h>
57 #endif // __HCC_OR_HIP_CLANG__
58 
59 #if __HCC__
60 // define HIP_ENABLE_PRINTF to enable printf
61 #ifdef HIP_ENABLE_PRINTF
62 #define HCC_ENABLE_ACCELERATOR_PRINTF 1
63 #endif
64 
65 //---
66 // Remainder of this file only compiles with HCC
67 #if defined __HCC__
68 #include "grid_launch.h"
69 #include "hc_printf.hpp"
70 // TODO-HCC-GL - change this to typedef.
71 // typedef grid_launch_parm hipLaunchParm ;
72 
73 #if GENERIC_GRID_LAUNCH == 0
74 #define hipLaunchParm grid_launch_parm
75 #else
76 namespace hip_impl {
77 struct Empty_launch_parm {};
78 } // namespace hip_impl
79 #define hipLaunchParm hip_impl::Empty_launch_parm
80 #endif // GENERIC_GRID_LAUNCH
81 
82 #if defined(GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20) || GENERIC_GRID_LAUNCH == 1
83 #else // Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0.
84 #error(HCC must support GRID_LAUNCH_20)
85 #endif // GRID_LAUNCH_VERSION
86 
87 #endif // HCC
88 
89 #if GENERIC_GRID_LAUNCH == 1 && defined __HCC__
90 #include "grid_launch_GGL.hpp"
91 #endif // GENERIC_GRID_LAUNCH
92 
93 #endif // HCC
94 
95 #if __HCC_OR_HIP_CLANG__
96 extern int HIP_TRACE_API;
97 
98 #ifdef __cplusplus
99 #include <hip/hcc_detail/hip_ldg.h>
100 #endif
101 #include <hip/hcc_detail/hip_atomic.h>
103 #include <hip/hcc_detail/device_functions.h>
104 #include <hip/hcc_detail/surface_functions.h>
105 #include <hip/hcc_detail/texture_functions.h>
106 #if __HCC__
107  #include <hip/hcc_detail/math_functions.h>
108 #endif
109 // TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
110 #if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
111 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
112 #endif
113 
114 // TODO-HCC add a dummy implementation of assert, need to replace with a proper kernel exit call.
115 #if __HIP_DEVICE_COMPILE__ == 1
116 #undef assert
117 #define assert(COND) \
118  { \
119  if (!(COND)) { \
120  abort(); \
121  } \
122  }
123 #endif
124 
125 
126 // Feature tests:
127 #if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
128 // Device compile and not host compile:
129 
130 // 32-bit Atomics:
131 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
132 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
133 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
134 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
135 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)
136 
137 // 64-bit Atomics:
138 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
139 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
140 
141 // Doubles
142 #define __HIP_ARCH_HAS_DOUBLES__ (1)
143 
144 // warp cross-lane operations:
145 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
146 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
147 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
148 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
149 
150 // sync
151 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
152 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
153 
154 // misc
155 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
156 #define __HIP_ARCH_HAS_3DGRID__ (1)
157 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
158 
159 #endif /* Device feature flags */
160 
161 
162 #define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
163  __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
164 #define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
165  __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
166  amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
167 #define select_impl_(_1, _2, impl_, ...) impl_
168 #define __launch_bounds__(...) \
169  select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
170 
171 // Detect if we are compiling C++ mode or C mode
172 #if defined(__cplusplus)
173 #define __HCC_CPP__
174 #elif defined(__STDC_VERSION__)
175 #define __HCC_C__
176 #endif
177 
178 __host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
179 
180 #if __HIP_ARCH_GFX701__ == 0
181 
182 __device__ unsigned __hip_ds_bpermute(int index, unsigned src);
183 __device__ float __hip_ds_bpermutef(int index, float src);
184 __device__ unsigned __hip_ds_permute(int index, unsigned src);
185 __device__ float __hip_ds_permutef(int index, float src);
186 
187 template <int pattern>
188 __device__ unsigned __hip_ds_swizzle_N(unsigned int src);
189 template <int pattern>
190 __device__ float __hip_ds_swizzlef_N(float src);
191 
192 template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
193 __device__ int __hip_move_dpp_N(int src);
194 
195 #endif //__HIP_ARCH_GFX803__ == 1
196 
197 #endif // __HCC_OR_HIP_CLANG__
198 
199 #if defined __HCC__
200 
201 template <
202  typename std::common_type<decltype(hc_get_group_id), decltype(hc_get_group_size),
203  decltype(hc_get_num_groups), decltype(hc_get_workitem_id)>::type f>
204 class Coordinates {
205  using R = decltype(f(0));
206 
207  struct X {
208  __device__ operator R() const { return f(0); }
209  };
210  struct Y {
211  __device__ operator R() const { return f(1); }
212  };
213  struct Z {
214  __device__ operator R() const { return f(2); }
215  };
216 
217  public:
218  static constexpr X x{};
219  static constexpr Y y{};
220  static constexpr Z z{};
221 };
222 
223 static constexpr Coordinates<hc_get_group_size> blockDim;
224 static constexpr Coordinates<hc_get_group_id> blockIdx;
225 static constexpr Coordinates<hc_get_num_groups> gridDim;
226 static constexpr Coordinates<hc_get_workitem_id> threadIdx;
227 
228 #define hipThreadIdx_x (hc_get_workitem_id(0))
229 #define hipThreadIdx_y (hc_get_workitem_id(1))
230 #define hipThreadIdx_z (hc_get_workitem_id(2))
231 
232 #define hipBlockIdx_x (hc_get_group_id(0))
233 #define hipBlockIdx_y (hc_get_group_id(1))
234 #define hipBlockIdx_z (hc_get_group_id(2))
235 
236 #define hipBlockDim_x (hc_get_group_size(0))
237 #define hipBlockDim_y (hc_get_group_size(1))
238 #define hipBlockDim_z (hc_get_group_size(2))
239 
240 #define hipGridDim_x (hc_get_num_groups(0))
241 #define hipGridDim_y (hc_get_num_groups(1))
242 #define hipGridDim_z (hc_get_num_groups(2))
243 
244 #endif // defined __HCC__
245 #if __HCC_OR_HIP_CLANG__
246 extern "C" __device__ void* __hip_malloc(size_t);
247 extern "C" __device__ void* __hip_free(void* ptr);
248 
249 static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
250 static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }
251 
252 #if defined(__HCC_ACCELERATOR__) && defined(HC_FEATURE_PRINTF)
253 template <typename... All>
254 static inline __device__ void printf(const char* format, All... all) {
255  hc::printf(format, all...);
256 }
257 #elif defined(__HCC_ACCELERATOR__) || __HIP__
258 template <typename... All>
259 static inline __device__ void printf(const char* format, All... all) {}
260 #endif
261 
262 #endif //__HCC_OR_HIP_CLANG__
263 
264 #ifdef __HCC__
265 
266 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
267 
268 #define HIP_KERNEL_NAME(...) (__VA_ARGS__)
269 #define HIP_SYMBOL(X) #X
270 
271 #if defined __HCC_CPP__
272 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block,
273  grid_launch_parm* lp, const char* kernelNameStr);
274 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block,
275  grid_launch_parm* lp, const char* kernelNameStr);
276 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block,
277  grid_launch_parm* lp, const char* kernelNameStr);
278 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block,
279  grid_launch_parm* lp, const char* kernelNameStr);
280 extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp);
281 
282 #if GENERIC_GRID_LAUNCH == 0
283 //#warning "Original hipLaunchKernel defined"
284 // Due to multiple overloaded versions of ihipPreLaunchKernel, the numBlocks3D and blockDim3D can be
285 // either size_t or dim3 types
286 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
287  do { \
288  grid_launch_parm lp; \
289  lp.dynamic_group_mem_bytes = _groupMemBytes; \
290  hipStream_t trueStream = \
291  (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp, #_kernelName)); \
292  _kernelName(lp, ##__VA_ARGS__); \
293  ihipPostLaunchKernel(#_kernelName, trueStream, lp); \
294  } while (0)
295 #endif // GENERIC_GRID_LAUNCH
296 
297 #elif defined(__HCC_C__)
298 
299 // TODO - develop C interface.
300 
301 #endif //__HCC_CPP__
302 
307 // extern int HIP_PRINT_ENV ; ///< Print all HIP-related environment variables.
308 // extern int HIP_TRACE_API; ///< Trace HIP APIs.
309 // extern int HIP_LAUNCH_BLOCKING ; ///< Make all HIP APIs host-synchronous
310 
316 // End doxygen API:
321 //
322 // hip-clang functions
323 //
324 #elif defined(__clang__) && defined(__HIP__)
325 
326 #define HIP_KERNEL_NAME(...) __VA_ARGS__
327 #define HIP_SYMBOL(X) #X
328 
329 typedef int hipLaunchParm;
330 
331 #define hipLaunchKernel(kernelName, numblocks, numthreads, memperblock, streamId, ...) \
332  do { \
333  kernelName<<<(numblocks), (numthreads), (memperblock), (streamId)>>>(hipLaunchParm{}, ##__VA_ARGS__); \
334  } while (0)
335 
336 #define hipLaunchKernelGGL(kernelName, numblocks, numthreads, memperblock, streamId, ...) \
337  do { \
338  kernelName<<<(numblocks), (numthreads), (memperblock), (streamId)>>>(__VA_ARGS__); \
339  } while (0)
340 
341 #include <hip/hip_runtime_api.h>
342 
343 #pragma push_macro("__DEVICE__")
344 #define __DEVICE__ static __device__ __forceinline__
345 
346 extern "C" __device__ size_t __ockl_get_local_id(uint);
347 __DEVICE__ uint __hip_get_thread_idx_x() { return __ockl_get_local_id(0); }
348 __DEVICE__ uint __hip_get_thread_idx_y() { return __ockl_get_local_id(1); }
349 __DEVICE__ uint __hip_get_thread_idx_z() { return __ockl_get_local_id(2); }
350 
351 extern "C" __device__ size_t __ockl_get_group_id(uint);
352 __DEVICE__ uint __hip_get_block_idx_x() { return __ockl_get_group_id(0); }
353 __DEVICE__ uint __hip_get_block_idx_y() { return __ockl_get_group_id(1); }
354 __DEVICE__ uint __hip_get_block_idx_z() { return __ockl_get_group_id(2); }
355 
356 extern "C" __device__ size_t __ockl_get_local_size(uint);
357 __DEVICE__ uint __hip_get_block_dim_x() { return __ockl_get_local_size(0); }
358 __DEVICE__ uint __hip_get_block_dim_y() { return __ockl_get_local_size(1); }
359 __DEVICE__ uint __hip_get_block_dim_z() { return __ockl_get_local_size(2); }
360 
361 extern "C" __device__ size_t __ockl_get_num_groups(uint);
362 __DEVICE__ uint __hip_get_grid_dim_x() { return __ockl_get_num_groups(0); }
363 __DEVICE__ uint __hip_get_grid_dim_y() { return __ockl_get_num_groups(1); }
364 __DEVICE__ uint __hip_get_grid_dim_z() { return __ockl_get_num_groups(2); }
365 
366 #define __HIP_DEVICE_BUILTIN(DIMENSION, FUNCTION) \
367  __declspec(property(get = __get_##DIMENSION)) uint DIMENSION; \
368  __DEVICE__ uint __get_##DIMENSION(void) { \
369  return FUNCTION; \
370  }
371 
372 struct __hip_builtin_threadIdx_t {
373  __HIP_DEVICE_BUILTIN(x,__hip_get_thread_idx_x());
374  __HIP_DEVICE_BUILTIN(y,__hip_get_thread_idx_y());
375  __HIP_DEVICE_BUILTIN(z,__hip_get_thread_idx_z());
376 };
377 
378 struct __hip_builtin_blockIdx_t {
379  __HIP_DEVICE_BUILTIN(x,__hip_get_block_idx_x());
380  __HIP_DEVICE_BUILTIN(y,__hip_get_block_idx_y());
381  __HIP_DEVICE_BUILTIN(z,__hip_get_block_idx_z());
382 };
383 
384 struct __hip_builtin_blockDim_t {
385  __HIP_DEVICE_BUILTIN(x,__hip_get_block_dim_x());
386  __HIP_DEVICE_BUILTIN(y,__hip_get_block_dim_y());
387  __HIP_DEVICE_BUILTIN(z,__hip_get_block_dim_z());
388 };
389 
390 struct __hip_builtin_gridDim_t {
391  __HIP_DEVICE_BUILTIN(x,__hip_get_grid_dim_x());
392  __HIP_DEVICE_BUILTIN(y,__hip_get_grid_dim_y());
393  __HIP_DEVICE_BUILTIN(z,__hip_get_grid_dim_z());
394 };
395 
396 #undef __HIP_DEVICE_BUILTIN
397 #pragma pop_macro("__DEVICE__")
398 
399 extern const __device__ __attribute__((weak)) __hip_builtin_threadIdx_t threadIdx;
400 extern const __device__ __attribute__((weak)) __hip_builtin_blockIdx_t blockIdx;
401 extern const __device__ __attribute__((weak)) __hip_builtin_blockDim_t blockDim;
402 extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
403 
404 
405 #define hipThreadIdx_x threadIdx.x
406 #define hipThreadIdx_y threadIdx.y
407 #define hipThreadIdx_z threadIdx.z
408 
409 #define hipBlockIdx_x blockIdx.x
410 #define hipBlockIdx_y blockIdx.y
411 #define hipBlockIdx_z blockIdx.z
412 
413 #define hipBlockDim_x blockDim.x
414 #define hipBlockDim_y blockDim.y
415 #define hipBlockDim_z blockDim.z
416 
417 #define hipGridDim_x gridDim.x
418 #define hipGridDim_y gridDim.y
419 #define hipGridDim_z gridDim.z
420 
421 #include <hip/hcc_detail/math_functions.h>
422 
423 #if __HIP_HCC_COMPAT_MODE__
424 // Define HCC work item functions in terms of HIP builtin variables.
425 #pragma push_macro("__DEFINE_HCC_FUNC")
426 #define __DEFINE_HCC_FUNC(hc_fun,hip_var) \
427 inline __device__ __attribute__((always_inline)) uint hc_get_##hc_fun(uint i) { \
428  if (i==0) \
429  return hip_var.x; \
430  else if(i==1) \
431  return hip_var.y; \
432  else \
433  return hip_var.z; \
434 }
435 
436 __DEFINE_HCC_FUNC(workitem_id, threadIdx)
437 __DEFINE_HCC_FUNC(group_id, blockIdx)
438 __DEFINE_HCC_FUNC(group_size, blockDim)
439 __DEFINE_HCC_FUNC(num_groups, gridDim)
440 #pragma pop_macro("__DEFINE_HCC_FUNC")
441 
442 extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(uint);
443 inline __device__ __attribute__((always_inline)) uint
444 hc_get_workitem_absolute_id(int dim)
445 {
446  return (uint)__ockl_get_global_id(dim);
447 }
448 
449 #endif
450 
451 // Support std::complex.
452 #pragma push_macro("__CUDA__")
453 #define __CUDA__
454 #include <__clang_cuda_math_forward_declares.h>
455 #include <__clang_cuda_complex_builtins.h>
456 #include <cuda_wrappers/algorithm>
457 #include <cuda_wrappers/complex>
458 #include <cuda_wrappers/new>
459 #undef __CUDA__
460 #pragma pop_macro("__CUDA__")
461 
462 hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
463  uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
464  uint32_t localWorkSizeX, uint32_t localWorkSizeY,
465  uint32_t localWorkSizeZ, size_t sharedMemBytes,
466  hipStream_t hStream, void** kernelParams, void** extra,
467  hipEvent_t startEvent = nullptr,
468  hipEvent_t stopEvent = nullptr,
469  uint32_t flags = 0);
470 
471 hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
472  uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
473  uint32_t localWorkSizeX, uint32_t localWorkSizeY,
474  uint32_t localWorkSizeZ, size_t sharedMemBytes,
475  hipStream_t hStream, void** kernelParams, void** extra,
476  hipEvent_t startEvent = nullptr,
477  hipEvent_t stopEvent = nullptr)
478  __attribute__((deprecated("use hipExtModuleLaunchKernel instead")));
479 
480 #endif // defined(__clang__) && defined(__HIP__)
481 
482 #include <hip/hcc_detail/hip_memory.h>
483 
484 #endif // HIP_HCC_DETAIL_RUNTIME_H
TODO-doc.
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Copies the size of symbol symbolName to size.
Definition: hip_fp16_math_fwd.h:53
hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra, hipEvent_t startEvent=nullptr, hipEvent_t stopEvent=nullptr, uint32_t flags=0)
launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelp...
Definition: hip_module.cpp:290
Definition: hip_module.cpp:106
Definition: hip_runtime_api.h:269
#define __host__
Definition: host_defines.h:41
Definition: grid_launch.h:31
Definition: hip_runtime_api.h:82
Definition: hip_hcc_internal.h:705
Definition: hip_hcc_internal.h:524
Definition: hip_runtime.h:204