HIP: Heterogeneous-computing Interface for Portability
device_functions.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
24 #define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
25 
26 #include "host_defines.h"
27 #include "math_fwd.h"
28 
29 #include <hip/hip_runtime_api.h>
30 #include <stddef.h>
31 
32 
33 #include <hip/hip_vector_types.h>
36 /*
37 Integer Intrinsics
38 */
39 
// Integer intrinsic functions: __popc, __clz, __ffs, __brev
41 __device__ static inline unsigned int __popc(unsigned int input) {
42  return __builtin_popcount(input);
43 }
44 __device__ static inline unsigned int __popcll(unsigned long long int input) {
45  return __builtin_popcountll(input);
46 }
47 
48 __device__ static inline int __clz(int input) {
49  return __ockl_clz_u32((uint)input);
50 }
51 
52 __device__ static inline int __clzll(long long int input) {
53  return __ockl_clz_u64((ulong)input);
54 }
55 
56 __device__ static inline unsigned int __ffs(unsigned int input) {
57  return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
58 }
59 
60 __device__ static inline unsigned int __ffsll(unsigned long long int input) {
61  return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
62 }
63 
64 __device__ static inline unsigned int __ffs(int input) {
65  return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
66 }
67 
68 __device__ static inline unsigned int __ffsll(long long int input) {
69  return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
70 }
71 
72 __device__ static inline unsigned int __brev(unsigned int input) {
73  return __llvm_bitrev_b32(input);
74 }
75 
76 __device__ static inline unsigned long long int __brevll(unsigned long long int input) {
77  return __llvm_bitrev_b64(input);
78 }
79 
80 __device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
81  return input == 0 ? -1 : __builtin_ctzl(input);
82 }
83 
// HSAIL-style bit extract: returns `width` bits of src0 starting at bit
// `offset` (both taken modulo 32). width == 0 yields 0.
// NOTE(review): when offset + width > 32 the left-shift count goes negative
// (undefined behavior) — presumably callers keep offset + width <= 32.
__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}

// 64-bit bit extract; offset/width taken modulo 64. width == 0 yields 0.
__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}
95 
96 __device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
97  uint32_t offset = src2 & 31;
98  uint32_t width = src3 & 31;
99  uint32_t mask = (1 << width) - 1;
100  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
101 }
102 
// 64-bit bit insert: replaces `width` bits of src0 at bit `offset`
// (both taken modulo 64) with the low bits of src1. width == 0 returns src0.
__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
    uint64_t offset = src2 & 63;
    uint64_t width = src3 & 63;
    uint64_t mask = (1ULL << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}
109 
// Forward declarations for the CUDA-compatible integer arithmetic helpers
// defined later in this header.
__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ static unsigned int __hadd(int x, int y);
__device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y, int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
123 
// Helper union: view a 32-bit word as 4 individual bytes (used by __byte_perm).
struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));

// Helper union: view two 32-bit words as 8 individual bytes.
struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));
137 
138 __device__
139 static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
140  struct uchar2Holder cHoldVal;
141  struct ucharHolder cHoldKey;
142  struct ucharHolder cHoldOut;
143  cHoldKey.ui = s;
144  cHoldVal.ui[0] = x;
145  cHoldVal.ui[1] = y;
146  cHoldOut.c[0] = cHoldVal.c[cHoldKey.c[0]];
147  cHoldOut.c[1] = cHoldVal.c[cHoldKey.c[1]];
148  cHoldOut.c[2] = cHoldVal.c[cHoldKey.c[2]];
149  cHoldOut.c[3] = cHoldVal.c[cHoldKey.c[3]];
150  return cHoldOut.ui;
151 }
152 
153 __device__ static inline unsigned int __hadd(int x, int y) {
154  int z = x + y;
155  int sign = z & 0x8000000;
156  int value = z & 0x7FFFFFFF;
157  return ((value) >> 1 || sign);
158 }
159 
// 24-bit signed multiply (low 32 bits of the product of the low 24 bits).
__device__ static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}

// High 64 bits of the full 128-bit signed product x * y, computed by
// schoolbook decomposition into 32-bit halves.
__device__ static inline long long __mul64hi(long long int x, long long int y) {
    ulong x0 = (ulong)x & 0xffffffffUL;   // low half of x (unsigned)
    long x1 = x >> 32;                    // high half of x (sign-carrying)
    ulong y0 = (ulong)y & 0xffffffffUL;
    long y1 = y >> 32;
    ulong z0 = x0*y0;
    long t = x1*y0 + (z0 >> 32);          // middle partial product + carry
    long z1 = t & 0xffffffffL;
    long z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}

// High 32 bits of the 64-bit signed product x * y.
__device__ static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}
180 
181 __device__ static inline int __rhadd(int x, int y) {
182  int z = x + y + 1;
183  int sign = z & 0x8000000;
184  int value = z & 0x7FFFFFFF;
185  return ((value) >> 1 || sign);
186 }
187 __device__ static inline unsigned int __sad(int x, int y, int z) {
188  return x > y ? x - y + z : y - x + z;
189 }
190 __device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
191  return (x + y) >> 1;
192 }
// 24-bit unsigned multiply (low 32 bits of the product of the low 24 bits).
// NOTE(review): returns int to match the forward declaration above, although
// the CUDA counterpart returns unsigned — confirm intended signedness.
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}

// High 64 bits of the full 128-bit unsigned product x * y, computed by
// schoolbook decomposition into 32-bit halves.
__device__
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    ulong x0 = x & 0xffffffffUL;   // low half of x
    ulong x1 = x >> 32;            // high half of x
    ulong y0 = y & 0xffffffffUL;
    ulong y1 = y >> 32;
    ulong z0 = x0*y0;
    ulong t = x1*y0 + (z0 >> 32);  // middle partial product + carry
    ulong z1 = t & 0xffffffffUL;
    ulong z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}

// High 32 bits of the 64-bit unsigned product x * y.
__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}
214 __device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
215  return (x + y + 1) >> 1;
216 }
// Unsigned sum of absolute difference: |x - y| + z.
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sad_u32(x, y, z);
}

// Hardware lane id within the wavefront, via the mbcnt (masked bit count) pair.
__device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
222 
223 /*
224 HIP specific device functions
225 */
226 
// Backward permute: each lane reads the value held by the lane addressed by
// `index` (a byte address — callers pre-shift lane ids by 2).
__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
    // Union moves the bit pattern through the int-typed intrinsic unchanged.
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.u;
}

// Backward permute of a float's bit pattern (value is not converted).
__device__ static inline float __hip_ds_bpermutef(int index, float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.f;
}

// Forward permute: each lane pushes its value to the lane addressed by `index`.
__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
    return tmp.u;
}
244 
245 __device__ static inline float __hip_ds_permutef(int index, float src) {
246  union { int i; unsigned u; float f; } tmp; tmp.u = src;
247  tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
248  return tmp.u;
249 }
250 
251 #define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
252 #define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
253 
// Swizzle lanes within the wavefront by the compile-time `pattern`.
// Templated because the backend intrinsic requires an immediate operand
// (the __hip_ds_swizzle macro above forwards the pattern here).
template <int pattern>
__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
#if defined(__HCC__)
    tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
#else
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
#endif
    return tmp.u;
}

// Float variant: swizzles the bit pattern, not the converted value.
template <int pattern>
__device__ static inline float __hip_ds_swizzlef_N(float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
#if defined(__HCC__)
    tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
#else
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
#endif
    return tmp.f;
}
275 
276 #define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
277  __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
278 
// Data-parallel-primitive move with the given control fields; all four are
// template parameters because the underlying op takes immediates
// (the __hip_move_dpp macro above forwards them here).
template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ static inline int __hip_move_dpp_N(int src) {
    return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask,
                                  bound_ctrl);
}
284 
285 static constexpr int warpSize = 64;
286 
// __shfl: every lane reads `var` from lane `src_lane` of its width-sized
// sub-group. Assumes width is a power of two <= warpSize (64).
__device__
inline
int __shfl(int var, int src_lane, int width = warpSize) {
    int self = __lane_id();
    // Add the sub-group base so src_lane is relative to this lane's group.
    int index = src_lane + (self & ~(width-1));
    // ds_bpermute addresses lanes in bytes, hence the << 2.
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
// Unsigned variant: shuffles the bit pattern via the int implementation.
__device__
inline
unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.u;
}
// Float variant: shuffles the bit pattern, not a converted value.
__device__
inline
float __shfl(float var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.f;
}
// Double variant: split into two 32-bit shuffles and reassemble.
__device__
inline
double __shfl(double var, int src_lane, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
322 
// __shfl_up: read `var` from the lane `lane_delta` below the caller; lanes
// whose source would fall outside their width-sized group keep their own value.
__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self - lane_delta;
    // Clamp to self when the source crosses below the group base.
    index = (index < (self & ~(width-1)))?self:index;
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
// Unsigned variant: moves the bit pattern via the int implementation.
__device__
inline
unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.u;
}
// Float variant: moves the bit pattern, not a converted value.
__device__
inline
float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.f;
}
// Double variant: split into two 32-bit shuffles and reassemble.
__device__
inline
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
359 
// __shfl_down: read `var` from the lane `lane_delta` above the caller; lanes
// whose source would fall outside their width-sized group keep their own value.
__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self + lane_delta;
    // Clamp to self when the source crosses past the end of the group.
    index = (int)((self&(width-1))+lane_delta) >= width?self:index;
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
// Unsigned variant: moves the bit pattern via the int implementation.
__device__
inline
unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.u;
}
// Float variant: moves the bit pattern, not a converted value.
__device__
inline
float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.f;
}
// Double variant: split into two 32-bit shuffles and reassemble.
__device__
inline
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
396 
// __shfl_xor: read `var` from lane (self ^ lane_mask); lanes whose source
// would land outside their width-sized group keep their own value.
__device__
inline
int __shfl_xor(int var, int lane_mask, int width = warpSize) {
    int self = __lane_id();
    int index = self^lane_mask;
    // (self + width) & ~(width - 1) is the first lane past this group.
    index = index >= ((self+width)&~(width-1))?self:index;
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
// Unsigned variant: moves the bit pattern via the int implementation.
__device__
inline
unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.u;
}
// Float variant: moves the bit pattern, not a converted value.
__device__
inline
float __shfl_xor(float var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.f;
}
// Double variant: split into two 32-bit shuffles and reassemble.
__device__
inline
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
433 
434 #define MASK1 0x00ff00ff
435 #define MASK2 0xff00ff00
436 
// Packed 8-bit ops over alternating byte lanes: MASK1/MASK2 isolate the
// even/odd bytes so each pair is combined and re-masked separately.
// NOTE(review): only the .w component of the char4 inputs/outputs is touched;
// x/y/z of `out` are left uninitialized and x/y/z of the inputs are ignored.
// That only makes sense if char4 here aliases a 32-bit lane — confirm against
// hip_vector_types before relying on these.
__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;   // even bytes; cross-byte carries masked off
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);   // odd bytes
    return out;
}

// Packed 8-bit subtract over alternating byte lanes (same caveats as add8pk).
__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}

// Packed 8-bit multiply over alternating byte lanes (same caveats as add8pk;
// products that overflow a byte spill into the masked-off neighbor before
// being discarded).
__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}
469 
470 /*
471  * Rounding modes are not yet supported in HIP
472  * TODO: Conversion functions are not correct, need to fix when BE is ready
473 */
474 
475 __device__ static inline float __double2float_rd(double x) { return (double)x; }
476 __device__ static inline float __double2float_rn(double x) { return (double)x; }
477 __device__ static inline float __double2float_ru(double x) { return (double)x; }
478 __device__ static inline float __double2float_rz(double x) { return (double)x; }
479 
// High 32 bits of a double's bit pattern (little-endian word order).
__device__ static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[1];
}
// Low 32 bits of a double's bit pattern.
__device__ static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[0];
}
496 
// Double-to-integer conversions. Directed rounding modes are not implemented
// yet (see TODO above): every variant currently truncates via a plain cast.
__device__ static inline int __double2int_rd(double x) { return (int)x; }
__device__ static inline int __double2int_rn(double x) { return (int)x; }
__device__ static inline int __double2int_ru(double x) { return (int)x; }
__device__ static inline int __double2int_rz(double x) { return (int)x; }

__device__ static inline long long int __double2ll_rd(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_rn(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_ru(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_rz(double x) { return (long long int)x; }

__device__ static inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
}
524 
// Reinterpret a double's bit pattern as a 64-bit integer (no value conversion).
__device__ static inline long long int __double_as_longlong(double x) {
    static_assert(sizeof(long long) == sizeof(double), "");

    long long tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
533 
534 /*
535 __device__ unsigned short __float2half_rn(float x);
536 __device__ float __half2float(unsigned short);
537 
The above device function declarations are not valid here.
539 Use
540 __device__ __half __float2half_rn(float x);
541 __device__ float __half2float(__half);
542 from hip_fp16.h
543 
544 CUDA implements half as unsigned short whereas, HIP doesn't.
545 
546 */
547 
// Float-to-int conversions with directed rounding, implemented by rounding
// in float first (floor/rint/ceil/trunc) and then truncating to int.
__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }

// Remaining float-to-integer variants: rounding modes not implemented yet
// (see TODO above); all truncate via a plain cast.
__device__ static inline long long int __float2ll_rd(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_rn(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_ru(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }

__device__ static inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __float2ull_rd(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_rn(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_ru(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_rz(float x) {
    return (unsigned long long int)x;
}
575 
// Reinterpret a float's bit pattern as a signed 32-bit integer.
__device__ static inline int __float_as_int(float x) {
    static_assert(sizeof(int) == sizeof(float), "");

    int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

// Reinterpret a float's bit pattern as an unsigned 32-bit integer.
__device__ static inline unsigned int __float_as_uint(float x) {
    static_assert(sizeof(unsigned int) == sizeof(float), "");

    unsigned int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
593 
// Assemble a double from its high and low 32-bit pattern words
// (inverse of __double2hiint / __double2loint).
__device__ static inline double __hiloint2double(int hi, int lo) {
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    // uint32_t cast keeps `lo` from sign-extending into the high word.
    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double tmp1;
    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));

    return tmp1;
}
603 
// Int-to-floating-point conversions. Rounding modes are not implemented yet
// (see TODO above); all variants use the default conversion.
__device__ static inline double __int2double_rn(int x) { return (double)x; }

__device__ static inline float __int2float_rd(int x) { return (float)x; }
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) { return (float)x; }
__device__ static inline float __int2float_rz(int x) { return (float)x; }

// Reinterpret a 32-bit integer's bit pattern as a float.
__device__ static inline float __int_as_float(int x) {
    static_assert(sizeof(float) == sizeof(int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
619 
// long long to floating-point conversions. Rounding modes are not implemented
// yet (see TODO above); all variants use the default conversion.
__device__ static inline double __ll2double_rd(long long int x) { return (double)x; }
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) { return (double)x; }
__device__ static inline double __ll2double_rz(long long int x) { return (double)x; }

__device__ static inline float __ll2float_rd(long long int x) { return (float)x; }
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) { return (float)x; }
__device__ static inline float __ll2float_rz(long long int x) { return (float)x; }

// Reinterpret a 64-bit integer's bit pattern as a double.
__device__ static inline double __longlong_as_double(long long int x) {
    static_assert(sizeof(double) == sizeof(long long), "");

    double tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
638 
639 __device__ static inline double __uint2double_rn(int x) { return (double)x; }
640 
// Unsigned-int-to-float conversions. Rounding modes are not implemented yet
// (see TODO above); all variants use the default conversion.
__device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; }

// Reinterpret a 32-bit unsigned integer's bit pattern as a float.
__device__ static inline float __uint_as_float(unsigned int x) {
    static_assert(sizeof(float) == sizeof(unsigned int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
654 
// unsigned long long to floating-point conversions. Rounding modes are not
// implemented yet (see TODO above); all variants use the default conversion.
__device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }

__device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
664 
665 #if defined(__HCC__)
666 #define __HCC_OR_HIP_CLANG__ 1
667 #elif defined(__clang__) && defined(__HIP__)
668 #define __HCC_OR_HIP_CLANG__ 1
669 #else
670 #define __HCC_OR_HIP_CLANG__ 0
671 #endif
672 
673 #ifdef __HCC_OR_HIP_CLANG__
674 
// Clock functions — declared here; device-side definitions follow under
// __HIP_DEVICE_COMPILE__.
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
// hip.amdgcn.bc - named sync (defined below for device compilation)
__device__ void __named_sync(int a, int b);
682 
683 #ifdef __HIP_DEVICE_COMPILE__
684 
// Clock functions
#if __HCC__
extern "C" uint64_t __clock_u64() __HC__;
#endif

// 64-bit monotonic device timestamp. On HIP-Clang this reads the
// wall-clock-based s_memrealtime counter, not a per-SM cycle counter.
__device__
inline __attribute((always_inline))
long long int __clock64() {
// ToDo: Unify HCC and HIP implementation.
#if __HCC__
    return (long long int) __clock_u64();
#else
    return (long long int) __builtin_amdgcn_s_memrealtime();
#endif
}

// Alias of __clock64 (same 64-bit counter).
__device__
inline __attribute((always_inline))
long long int __clock() { return __clock64(); }

// CUDA-compatible public name for __clock64.
__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }

// CUDA-compatible public name; forwards through __clock to __clock64.
__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }

// hip.amdgcn.bc - named sync
// Workgroup barrier; both arguments are currently ignored.
__device__
inline
void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }
717 
718 #endif // __HIP_DEVICE_COMPILE__
719 
// warp vote function __all __any __ballot
// Nonzero iff the predicate is nonzero on every active lane of the wavefront.
__device__
inline
int __all(int predicate) {
    return __ockl_wfall_i32(predicate);
}

// Nonzero iff the predicate is nonzero on any active lane of the wavefront.
__device__
inline
int __any(int predicate) {
    return __ockl_wfany_i32(predicate);
}

// XXX from llvm/include/llvm/IR/InstrTypes.h
#define ICMP_NE 33

// Per-lane predicate mask: bit i is set iff lane i's predicate != 0.
// Wavefronts are 64 lanes wide here, so the result fills 64 bits.
__device__
inline
unsigned long long int __ballot(int predicate) {
#if defined(__HCC__)
    return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE);
#else
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
#endif
}

// 64-bit ballot; identical implementation to __ballot.
__device__
inline
unsigned long long int __ballot64(int predicate) {
#if defined(__HCC__)
    return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE);
#else
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
#endif
}
755 
// hip.amdgcn.bc - lanemask
// Mask of active lanes strictly above the calling lane's active-lane index;
// 0 when this is the highest possible lane (index 63).
__device__
inline
int64_t __lanemask_gt()
{
    int32_t activelane = __ockl_activelane_u32();
    int64_t ballot = __ballot64(1);
    if (activelane != 63) {
        int64_t tmp = (~0ULL) << (activelane + 1);
        return tmp & ballot;
    }
    return 0;
}

// Mask of active lanes "below" the calling lane.
// NOTE(review): for a non-zero active-lane index this returns the FULL
// active-lane ballot, with no masking of lanes at or above the caller —
// unlike __lanemask_gt, which does mask. Looks suspicious; confirm intended
// semantics before relying on it.
__device__
inline
int64_t __lanemask_lt()
{
    int32_t activelane = __ockl_activelane_u32();
    int64_t ballot = __ballot64(1);
    if (activelane == 0)
        return 0;
    return ballot;
}
780 
// Address-space cast helper: local (LDS) pointer to generic. The generic
// representation is identical here, so this is a no-op pass-through.
__device__ inline void* __local_to_generic(void* p) { return p; }

#ifdef __HIP_DEVICE_COMPILE__
// Base pointer of the dynamically-allocated group (shared) memory segment:
// it begins immediately after the statically-declared group memory, whose
// size the compiler reports via groupstaticsize.
__device__
inline
void* __get_dynamicgroupbaseptr()
{
    // Get group segment base pointer.
    return (char*)__local_to_generic((void*)__to_local(__llvm_amdgcn_groupstaticsize()));
}
#else
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__

// Public alias kept for the amdgcn-prefixed name.
__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
}
801 
#if defined(__HCC__) && (__hcc_minor__ < 3)
// hip.amdgcn.bc - sync threads
// OpenCL-style fence flag: fence local (LDS) memory.
#define __CLK_LOCAL_MEM_FENCE 0x01
typedef unsigned __cl_mem_fence_flags;

// Visibility scopes for fences, mirroring the OpenCL memory-scope constants.
typedef enum __memory_scope {
    __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
    __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
    __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
    __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
    __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
} __memory_scope;

// enum values aligned with what clang uses in EmitAtomicExpr()
typedef enum __memory_order
{
    __memory_order_relaxed = __ATOMIC_RELAXED,
    __memory_order_acquire = __ATOMIC_ACQUIRE,
    __memory_order_release = __ATOMIC_RELEASE,
    __memory_order_acq_rel = __ATOMIC_ACQ_REL,
    __memory_order_seq_cst = __ATOMIC_SEQ_CST
} __memory_order;
824 
// OpenCL-style memory fence: applies `order` at the visibility level given by
// `scope`, dispatching to the matching per-scope LLVM fence intrinsic.
// `flags` is accepted for API compatibility but not consulted here.
__device__
inline
static void
__atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
{
    // We're tying global-happens-before and local-happens-before together as does HSA
    if (order != __memory_order_relaxed) {
        switch (scope) {
        case __memory_scope_work_item:
            // Single work-item: nothing to order.
            break;
        case __memory_scope_sub_group:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_sg(); break;
            case __memory_order_release: __llvm_fence_rel_sg(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_sg(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_sg(); break;
            }
            break;
        case __memory_scope_work_group:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_wg(); break;
            case __memory_order_release: __llvm_fence_rel_wg(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_wg(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_wg(); break;
            }
            break;
        case __memory_scope_device:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_dev(); break;
            case __memory_order_release: __llvm_fence_rel_dev(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_dev(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_dev(); break;
            }
            break;
        case __memory_scope_all_svm_devices:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_sys(); break;
            case __memory_order_release: __llvm_fence_rel_sys(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_sys(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_sys(); break;
            }
            break;
        }
    }
}
874 #endif
875 
876 // Memory Fence Functions
877 __device__
878 inline
879 static void __threadfence()
880 {
881  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
882 }
883 
__device__
inline
static void __threadfence_block()
{
    // Block-level memory fence: sequentially-consistent fence scoped to the
    // work-group (thread block) only.
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
}
890 
__device__
inline
static void __threadfence_system()
{
    // System-wide memory fence: sequentially-consistent fence at the widest
    // scope (all SVM devices, i.e. visible to the host and peer devices).
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
}
897 
898 // abort
__device__
inline
__attribute__((weak))
void abort() {
    // Terminate the wavefront immediately via a hardware trap.  Declared
    // weak so that another definition of device-side abort() can override
    // this default.
    return __builtin_trap();
}
905 
906 
907 #endif // __HCC_OR_HIP_CLANG__
908 
909 #ifdef __HCC__
910 
// Macro to replace extern __shared__ declarations
// to local variable definitions
// (HCC path: the dynamic group-segment base pointer is obtained from the
// HCC runtime helper and cast to the requested element type.)
#define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr();

// No extra attribute is needed on dynamic-shared declarations for HCC.
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
920 
921 
922 #elif defined(__clang__) && defined(__HIP__)
923 
// Save any pre-existing definition of __DEVICE__, then define it locally for
// the declarations below: extern "C" device functions, always inlined, weak
// so user/toolchain definitions can override them.
#pragma push_macro("__DEVICE__")
#define __DEVICE__ extern "C" __device__ __attribute__((always_inline)) \
 __attribute__((weak))
927 
__DEVICE__
inline
void __assert_fail(const char * __assertion,
                   const char *__file,
                   unsigned int __line,
                   const char *__function)
{
    // glibc-style assertion-failure hook for device code.  The message
    // arguments are ignored for now; the wavefront simply traps.
    __builtin_trap();
}
938 
__DEVICE__
inline
void __assertfail(const char * __assertion,
                  const char *__file,
                  unsigned int __line,
                  const char *__function,
                  size_t charsize)
{
    // CUDA-style assertion-failure hook (extra charsize parameter).  All
    // arguments are ignored for now; the wavefront simply traps.
    __builtin_trap();
}
950 
951 __device__
952 inline
953 static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
954 {
955  if (flags) {
956  __atomic_work_item_fence(flags, __memory_order_release, scope);
957  __builtin_amdgcn_s_barrier();
958  __atomic_work_item_fence(flags, __memory_order_acquire, scope);
959  } else {
960  __builtin_amdgcn_s_barrier();
961  }
962 }
963 
__device__
inline
static void __barrier(int n)
{
    // Thin wrapper: treat 'n' as OpenCL-style fence flags and issue a
    // work-group-scoped barrier.
    __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
}
970 
__device__
inline
__attribute__((noduplicate))
void __syncthreads()
{
    // Block-wide barrier with a local-memory (LDS) fence.  'noduplicate'
    // prevents the compiler from duplicating the barrier into divergent
    // paths, which would break its all-threads semantics.
    __barrier(__CLK_LOCAL_MEM_FENCE);
}
978 
979 // hip.amdgcn.bc - device routine
980 /*
981  HW_ID Register bit structure
982  WAVE_ID 3:0 Wave buffer slot number. 0-9.
983  SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
984  PIPE_ID 7:6 Pipeline from which the wave was dispatched.
985  CU_ID 11:8 Compute Unit the wave is assigned to.
986  SH_ID 12 Shader Array (within an SE) the wave is assigned to.
987  SE_ID 14:13 Shader Engine the wave is assigned to.
988  TG_ID 19:16 Thread-group ID
989  VM_ID 23:20 Virtual Memory ID
990  QUEUE_ID 26:24 Queue from which this wave was dispatched.
991  STATE_ID 29:27 State ID (graphics only, not compute).
992  ME_ID 31:30 Micro-engine ID.
993  */
994 
// Register index of the HW_ID hardware register (see bit layout above).
#define HW_ID 4

// CU_ID field: 4 bits at offset 8 of HW_ID.
#define HW_ID_CU_ID_SIZE 4
#define HW_ID_CU_ID_OFFSET 8

// SE_ID field: 2 bits at offset 13 of HW_ID.
#define HW_ID_SE_ID_SIZE 2
#define HW_ID_SE_ID_OFFSET 13

/*
   Encoding of parameter bitmask
   HW_ID        5:0     HW_ID
   OFFSET       10:6    Range: 0..31
   SIZE         15:11   Range: 1..32
 */

// Builds the immediate operand for s_getreg: SIZE in bits 15:11, OFFSET in
// bits 10:6, register index in bits 5:0.  Note SZ is the hardware SIZE field,
// which is (field width - 1); callers pass SIZE-1 accordingly.
#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
1012 /*
1013  __smid returns the wave's assigned Compute Unit and Shader Engine.
1014  The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
1015  Note: the results vary over time.
1016  SZ minus 1 since SIZE is 1-based.
1017 */
1018 __device__
1019 inline
1020 unsigned __smid(void)
1021 {
1022  unsigned cu_id = __builtin_amdgcn_s_getreg(
1023  GETREG_IMMED(HW_ID_CU_ID_SIZE-1, HW_ID_CU_ID_OFFSET, HW_ID));
1024  unsigned se_id = __builtin_amdgcn_s_getreg(
1025  GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
1026 
1027  /* Each shader engine has 16 CU */
1028  return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
1029 }
1030 
// Restore the previous definition of __DEVICE__ (pushed before the extern "C"
// device declarations above).  This was erroneously a second push_macro, which
// left the local __DEVICE__ definition live for the rest of the translation
// unit and the push/pop stack unbalanced; it must be pop_macro.
#pragma pop_macro("__DEVICE__")

// Macro to replace extern __shared__ declarations
// to local variable definitions
// (HIP-Clang path: the dynamic group-segment base pointer comes from the
// amdgcn device library and is cast to the requested element type.)
#define HIP_DYNAMIC_SHARED(type, var) \
    type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();

// No extra attribute is needed on dynamic-shared declarations for HIP-Clang.
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
1039 
1040 
1041 #endif //defined(__clang__) && defined(__HIP__)
1042 
1043 
1044 // loop unrolling
1045 static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
1046  auto dstPtr = static_cast<unsigned char*>(dst);
1047  auto srcPtr = static_cast<const unsigned char*>(src);
1048 
1049  while (size >= 4u) {
1050  dstPtr[0] = srcPtr[0];
1051  dstPtr[1] = srcPtr[1];
1052  dstPtr[2] = srcPtr[2];
1053  dstPtr[3] = srcPtr[3];
1054 
1055  size -= 4u;
1056  srcPtr += 4u;
1057  dstPtr += 4u;
1058  }
1059  switch (size) {
1060  case 3:
1061  dstPtr[2] = srcPtr[2];
1062  case 2:
1063  dstPtr[1] = srcPtr[1];
1064  case 1:
1065  dstPtr[0] = srcPtr[0];
1066  }
1067 
1068  return dst;
1069 }
1070 
1071 static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
1072  auto dstPtr = static_cast<unsigned char*>(dst);
1073 
1074  while (size >= 4u) {
1075  dstPtr[0] = val;
1076  dstPtr[1] = val;
1077  dstPtr[2] = val;
1078  dstPtr[3] = val;
1079 
1080  size -= 4u;
1081  dstPtr += 4u;
1082  }
1083  switch (size) {
1084  case 3:
1085  dstPtr[2] = val;
1086  case 2:
1087  dstPtr[1] = val;
1088  case 1:
1089  dstPtr[0] = val;
1090  }
1091 
1092  return dst;
1093 }
// Device-side memcpy entry point; delegates to the unrolled helper above.
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}
1097 
// Device-side memset entry point.  Narrows 'val' to a byte (libc memset
// semantics: the value is converted to unsigned char) and delegates to the
// unrolled helper above.
static inline __device__ void* memset(void* ptr, int val, size_t size) {
    unsigned char val8 = static_cast<unsigned char>(val);
    return __hip_hc_memset(ptr, val8, size);
}
1102 
1103 #endif
// NOTE(review): The trailing lines below are documentation-generator (Doxygen)
// extraction residue — tooltip/cross-reference text, not header source.  They
// are preserved here as comments so the file remains valid C++:
//   TODO-doc.
//   _Float16 __2f16 __attribute__((ext_vector_type(2)))
//   Copies the size of symbol symbolName to size.
//   Definition: hip_fp16_math_fwd.h:53
//   Contains declarations for types and functions in device library.
//   Definition: device_functions.h:124
//   Definition: device_functions.h:131
//   Contains declarations for wrapper functions for llvm intrinsics like llvm.amdgcn.s.barrier.