HIP: Heterogeneous-computing Interface for Portability
hip_hcc_internal.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_SRC_HIP_HCC_INTERNAL_H
24 #define HIP_SRC_HIP_HCC_INTERNAL_H
25 
26 #include <hc.hpp>
27 #include <hsa/hsa.h>
28 #include <unordered_map>
29 #include <stack>
30 
31 #include "hsa/hsa_ext_amd.h"
32 #include "hip/hip_runtime.h"
33 #include "hip_prof_api.h"
34 #include "hip_util.h"
35 #include "env.h"
36 #include <unordered_map>
37 
38 #if (__hcc_workweek__ < 16354)
39 #error("This version of HIP requires a newer version of HCC.");
40 #endif
41 
42 // Use the __appPtr field in the am memtracker to store the context.
43 // Requires a bug fix in HCC
44 #if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and \
45  (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0)
46 #define USE_APP_PTR_FOR_CTX 1
47 #endif
48 
49 
50 #define USE_IPC 1
51 
52 //---
53 // Environment variables:
54 
55 // Intended to distinguish whether an environment variable should be visible only in debug mode, or
56 // in debug+release.
57 // static const int debug = 0;
58 extern const int release;
59 
60 // TODO - this blocks both kernels and memory ops. Perhaps should have separate env var for
61 // kernels?
62 extern int HIP_LAUNCH_BLOCKING;
63 extern int HIP_API_BLOCKING;
64 
65 extern int HIP_PRINT_ENV;
66 // extern int HIP_TRACE_API;
67 extern int HIP_ATP;
68 extern int HIP_DB;
69 extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */
70 extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */
71 extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */
72 extern int HIP_FORCE_P2P_HOST;
73 
74 extern int HIP_HOST_COHERENT;
75 
76 extern int HIP_HIDDEN_FREE_MEM;
77 //---
78 // Chicken bits for disabling functionality to work around potential issues:
79 extern int HIP_SYNC_HOST_ALLOC;
80 extern int HIP_SYNC_STREAM_WAIT;
81 
82 extern int HIP_SYNC_NULL_STREAM;
83 extern int HIP_INIT_ALLOC;
84 extern int HIP_FORCE_NULL_STREAM;
85 
86 extern int HIP_SYNC_FREE;
87 
88 extern int HIP_DUMP_CODE_OBJECT;
89 
90 // TODO - remove when this is standard behavior.
91 extern int HCC_OPT_FLUSH;
92 
93 #define IMAGE_PITCH_ALIGNMENT 256
94 template <typename T> inline T alignDown(T value, size_t alignment) {
95  return (T)(value & ~(alignment - 1));
96 }
97 
98 template <typename T> inline T* alignDown(T* value, size_t alignment) {
99  return (T*)alignDown((intptr_t)value, alignment);
100 }
101 
102 template <typename T> inline T alignUp(T value, size_t alignment) {
103  return alignDown((T)(value + alignment - 1), alignment);
104 }
105 
106 template <typename T> inline T* alignUp(T* value, size_t alignment) {
107  return (T*)alignUp((intptr_t)value, alignment);
108 }
109 
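// --- Illustrative usage sketch (not part of the original header; examplePitchForWidth is a
// hypothetical helper): round a requested row width up to the 256-byte image pitch defined
// above, e.g. a 1000-byte row is padded to 1024 bytes.
static inline size_t examplePitchForWidth(size_t widthInBytes) {
    return alignUp(widthInBytes, static_cast<size_t>(IMAGE_PITCH_ALIGNMENT));
}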
110 size_t getNumChannels(hsa_ext_image_channel_order_t channelOrder) {
111  switch (channelOrder) {
112  case HSA_EXT_IMAGE_CHANNEL_ORDER_RG:
113  return 2;
114  case HSA_EXT_IMAGE_CHANNEL_ORDER_RGB:
115  return 3;
116  case HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA:
117  return 4;
118  case HSA_EXT_IMAGE_CHANNEL_ORDER_R:
119  default:
120  return 1;
121  }
122 }
123 
124 size_t getElementSize(hsa_ext_image_channel_order_t channelOrder, hsa_ext_image_channel_type_t channelType) {
125  size_t bytesPerPixel = getNumChannels(channelOrder);
126  switch (channelType) {
127  case HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8:
128  case HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
129  break;
130 
131  case HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32:
132  case HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32:
133  case HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT:
134  bytesPerPixel *= 4;
135  break;
136 
137  default:
138  bytesPerPixel *= 2;
139  break;
140  }
141  return bytesPerPixel;
142 }
143 
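// --- Worked example (illustrative sketch, not part of the original header;
// exampleRgbaFloatPixelSize is a hypothetical helper): an RGBA image with FLOAT channels has
// 4 channels of 4 bytes each, so getElementSize() returns 16; a single-channel
// UNSIGNED_INT8 image would return 1.
static inline size_t exampleRgbaFloatPixelSize() {
    return getElementSize(HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA,
                          HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT);  // 4 channels * 4 bytes
}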
144 // Class to assign a short TID to each new thread, for HIP debugging purposes.
145 class TidInfo {
146  public:
147  TidInfo();
148 
149  int tid() const { return _shortTid; };
150  pid_t pid() const { return _pid; };
151  uint64_t incApiSeqNum() { return ++_apiSeqNum; };
152  uint64_t apiSeqNum() const { return _apiSeqNum; };
153 
154  private:
155  int _shortTid;
156  pid_t _pid;
157 
158  // monotonically increasing API sequence number for this thread.
159  uint64_t _apiSeqNum;
160 };
161 
162 struct ProfTrigger {
163  static const uint64_t MAX_TRIGGER = std::numeric_limits<uint64_t>::max();
164 
165  void print(int tid) {
166  std::cout << "Enabling tracing for ";
167  for (auto iter = _profTrigger.begin(); iter != _profTrigger.end(); iter++) {
168  std::cout << "tid:" << tid << "." << *iter << ",";
169  }
170  std::cout << "\n";
171  };
172 
173  uint64_t nextTrigger() { return _profTrigger.empty() ? MAX_TRIGGER : _profTrigger.back(); };
174  void add(uint64_t trigger) { _profTrigger.push_back(trigger); };
175  void sort() { std::sort(_profTrigger.begin(), _profTrigger.end(), std::greater<uint64_t>()); };
176 
177  private:
178  std::vector<uint64_t> _profTrigger;
179 };
180 
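// --- Usage sketch (illustrative, not part of the original header; exampleNextTrigger is a
// hypothetical helper): triggers are kept sorted in descending order so that back() is always
// the smallest (next) API sequence number at which tracing should toggle.
static inline uint64_t exampleNextTrigger() {
    ProfTrigger t;
    t.add(500);
    t.add(100);
    t.add(300);
    t.sort();                // stored as {500, 300, 100}
    return t.nextTrigger();  // 100
}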
181 
182 //---
183 // Extern TLS
184 // Use a single struct to hold all TLS data. Attempt to reduce TLS accesses.
185 struct TlsData {
186  explicit TlsData() {
187  lastHipError = hipSuccess;
188  getPrimaryCtx = true;
189  defaultCtx = nullptr;
190  }
191 
192  hipError_t lastHipError;
193  TidInfo tidInfo;
194  // This is the implicit context used by all HIP commands.
195  // It can be set by hipSetDevice or by the CTX manipulation commands:
196  ihipCtx_t* defaultCtx;
197  // Stack of contexts
198  std::stack<ihipCtx_t*> ctxStack;
199  bool getPrimaryCtx;
200 };
201 TlsData* tls_get_ptr();
202 #define GET_TLS() TlsData *tls = tls_get_ptr()
203 
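// --- Usage sketch (hypothetical helper, not part of the original header): GET_TLS() declares
// the local `tls` pointer that macros defined later in this file (ihipGetTlsDefaultCtx,
// ihipSetTlsDefaultCtx, ihipLogStatus) expect to find in scope.
static inline hipError_t exampleLastErrorForThisThread() {
    GET_TLS();                 // expands to: TlsData *tls = tls_get_ptr()
    return tls->lastHipError;  // per-thread error recorded by ihipLogStatus()
}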
204 extern std::vector<ProfTrigger> g_dbStartTriggers;
205 extern std::vector<ProfTrigger> g_dbStopTriggers;
206 
207 //---
208 // Forward defs:
209 class ihipStream_t;
210 class ihipDevice_t;
211 class ihipCtx_t;
212 struct ihipEventData_t;
213 
214 // Color defs for debug messages:
215 #define KNRM "\x1B[0m"
216 #define KRED "\x1B[31m"
217 #define KGRN "\x1B[32m"
218 #define KYEL "\x1B[33m"
219 #define KBLU "\x1B[34m"
220 #define KMAG "\x1B[35m"
221 #define KCYN "\x1B[36m"
222 #define KWHT "\x1B[37m"
223 
224 extern const char* API_COLOR;
225 extern const char* API_COLOR_END;
226 
227 
228 // If set, thread-safety is enforced on all event/stream/ctx/device functions.
229 // Can disable for performance or functional experiments - in this case
230 // the code uses a dummy "no-op" mutex.
231 #define EVENT_THREAD_SAFE 1
232 
233 #define STREAM_THREAD_SAFE 1
234 
235 #define CTX_THREAD_SAFE 1
236 
237 #define DEVICE_THREAD_SAFE 1
238 
239 
240 // Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set.
241 // May be set to 0 to remove the debug 'if' checks - possible code size and performance difference?
242 #define COMPILE_HIP_DB 1
243 
244 
245 // Compile HIP tracing capability.
246 // 0x1 = print a string at function entry with arguments.
247 // 0x2 = print a simple message with function name + return code when the function exits.
248 // 0x3 = print both.
249 // Must be enabled at runtime with HIP_TRACE_API
250 #define COMPILE_HIP_TRACE_API 0x3
251 
252 //---
253 // HIP Trace modes - use with HIP_TRACE_API=...
254 #define TRACE_ALL 0 // 0x01
255 #define TRACE_KCMD 1 // 0x02, kernel command
256 #define TRACE_MCMD 2 // 0x04, memory command
257 #define TRACE_MEM 3 // 0x08, memory allocation or deallocation.
258 #define TRACE_SYNC 4 // 0x10, synchronization (host or hipStreamWaitEvent)
259 #define TRACE_QUERY 5 // 0x20, hipEventRecord, hipEventQuery, hipStreamQuery
260 
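// Worked example: HIP_TRACE_API is a bitmask over the TRACE_* indices above, so
// HIP_TRACE_API=0x1 traces every API call, while HIP_TRACE_API=0x6
// ((1<<TRACE_KCMD) | (1<<TRACE_MCMD)) traces only kernel and memory commands.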
261 
262 //---
263 // HIP_DB Debug flags:
264 #define DB_API 0 /* 0x01 - shortcut to enable HIP_TRACE_API on single switch */
265 #define DB_SYNC 1 /* 0x02 - trace synchronization pieces */
266 #define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */
267 #define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. */
268 #define DB_WARN 4 /* 0x10 - warn about sub-optimal or shady behavior */
269 #define DB_FB 5 /* 0x20 - trace loading fat binary */
270 #define DB_MAX_FLAG 6
271 // When adding a new debug flag, also add to the char name table below.
272 //
273 //
274 
275 struct DbName {
276  const char* _color;
277  const char* _shortName;
278 };
279 
280 // This table must be kept in-sync with the defines above.
281 static const DbName dbName[] = {
282  {KGRN, "api"}, // not used,
283  {KYEL, "sync"}, {KCYN, "mem"}, {KMAG, "copy"}, {KRED, "warn"},
284  {KBLU, "fatbin"},
285 };
286 
287 
288 #if COMPILE_HIP_DB
289 #define tprintf(trace_level, ...) \
290  { \
291  if (HIP_DB & (1 << (trace_level))) { \
292  GET_TLS(); \
293  char msgStr[1000]; \
294  snprintf(msgStr, sizeof(msgStr), __VA_ARGS__); \
295  fprintf(stderr, " %ship-%s pid:%d tid:%d:%s%s", dbName[trace_level]._color, \
296  dbName[trace_level]._shortName, tls->tidInfo.pid(), tls->tidInfo.tid(), msgStr, KNRM); \
297  } \
298  }
299 #else
300 /* Compile to empty code */
301 #define tprintf(trace_level, ...)
302 #endif
303 
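// --- Usage sketch (hypothetical helper, not part of the original header): HIP_DB is a bitmask
// over the DB_* indices above, e.g. HIP_DB=0x6 enables the "sync" and "mem" channels.
// tprintf() compiles away entirely when COMPILE_HIP_DB is 0, and otherwise prints only when
// its channel bit is set in HIP_DB at runtime.
static inline void exampleTraceAllocation(void* ptr, size_t sizeBytes) {
    tprintf(DB_MEM, "allocated %zu bytes at %p\n", sizeBytes, ptr);
}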
304 
305 static inline uint64_t getTicks() { return hc::get_system_ticks(); }
306 
307 //---
308 extern uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& apiStr);
309 
310 #if (COMPILE_HIP_TRACE_API & 0x1)
311 #define API_TRACE(forceTrace, ...) \
312  GET_TLS(); \
313  uint64_t hipApiStartTick = 0; \
314  { \
315  tls->tidInfo.incApiSeqNum(); \
316  if (forceTrace || \
317  (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL)))) { \
318  std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')'; \
319  std::string fullStr; \
320  hipApiStartTick = recordApiTrace(tls, &fullStr, apiStr); \
321  } \
322  }
323 
324 #else
325 // Swallow API_TRACE
326 #define API_TRACE(IS_CMD, ...) GET_TLS(); tls->tidInfo.incApiSeqNum();
327 #endif
328 
329 #define ihipGetTlsDefaultCtx() iihipGetTlsDefaultCtx(tls)
330 #define ihipSetTlsDefaultCtx(ctx) tls->defaultCtx = ctx
331 
332 #define HIP_SET_DEVICE() ihipDeviceSetState(tls);
333 
334 // This macro should be called at the beginning of every HIP API.
335 // It initializes the hip runtime (exactly once), and
336 // generates a trace string that can be output to stderr or to an ATP file.
337 #define HIP_INIT_API(cid, ...) \
338  hip_impl::hip_init(); \
339  API_TRACE(0, __VA_ARGS__); \
340  HIP_CB_SPAWNER_OBJECT(cid);
341 
342 
343 // Like above, but will trace with a specified "special" bit.
344 // Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU:
345 // kernel launches, copy commands, memory sets, etc.
346 #define HIP_INIT_SPECIAL_API(cid, tbit, ...) \
347  hip_impl::hip_init(); \
348  API_TRACE((HIP_TRACE_API & (1 << tbit)), __VA_ARGS__); \
349  HIP_CB_SPAWNER_OBJECT(cid);
350 
351 
352 // This macro should be called at the end of every HIP API, and only at the end of top-level HIP
353 // APIs (not internal HIP functions). It has a dual function: it logs the last error returned for
354 // use by hipGetLastError, and it prints the closing message when the debug trace is enabled.
355 #define ihipLogStatus(hipStatus) \
356  ({ \
357  hipError_t localHipStatus = hipStatus; /*local copy so hipStatus only evaluated once*/ \
358  tls->lastHipError = localHipStatus; \
359  \
360  if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API & (1 << TRACE_ALL)) { \
361  auto ticks = getTicks() - hipApiStartTick; \
362  fprintf(stderr, " %ship-api pid:%d tid:%d.%lu %-30s ret=%2d (%s)>> +%lu ns%s\n", \
363  (localHipStatus == 0) ? API_COLOR : KRED, tls->tidInfo.pid(), tls->tidInfo.tid(), \
364  tls->tidInfo.apiSeqNum(), __func__, localHipStatus, \
365  ihipErrorString(localHipStatus), ticks, API_COLOR_END); \
366  } \
367  localHipStatus; \
368  })
369 
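// --- Shape of a typical top-level HIP API built from the macros above (condensed sketch;
// hipSomeApi is a placeholder name, not a real entry point):
//     hipError_t hipSomeApi(int* outValue) {
//         HIP_INIT_API(hipSomeApi, outValue);  // init runtime once, trace entry, declare `tls`
//         hipError_t e = hipSuccess;
//         // ... perform the work, setting `e` on failure ...
//         return ihipLogStatus(e);             // record last error, trace exit
//     }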
370 
371 class ihipException : public std::exception {
372  public:
373  explicit ihipException(hipError_t e) : _code(e){};
374 
375  hipError_t _code;
376 };
377 
378 
379 #ifdef __cplusplus
380 extern "C" {
381 #endif
382 
383 
384 #ifdef __cplusplus
385 }
386 #endif
387 
388 const hipStream_t hipStreamNull = 0x0;
389 
390 
394 #define HIP_IPC_MEM_RESERVED_SIZE 24
395 class ihipIpcMemHandle_t {
396  public:
397 #if USE_IPC
398  hsa_amd_ipc_memory_t ipc_handle;
399 #endif
400  size_t psize;
401  char reserved[HIP_IPC_MEM_RESERVED_SIZE];
402 };
403 
407 #define HIP_IPC_EVENT_RESERVED_SIZE 32
408 class ihipIpcEventHandle_t {
409  public:
410 #if USE_IPC
411  char shmem_name[HIP_IPC_HANDLE_SIZE];
412 #endif
413 };
414 
415 struct ihipModule_t {
416  std::string fileName;
417  hsa_executable_t executable = {};
418  hsa_code_object_reader_t coReader = {};
419  std::string hash;
420  std::unordered_map<
421  std::string, std::vector<std::pair<std::size_t, std::size_t>>> kernargs;
422 
423  ~ihipModule_t() {
424  if (executable.handle) hsa_executable_destroy(executable);
425  if (coReader.handle) hsa_code_object_reader_destroy(coReader);
426  }
427 };
428 
429 
430 //---
431 // Used to remove locking, for performance experiments or to expose latent bugs.
432 class FakeMutex {
433  public:
434  void lock() {}
435  bool try_lock() { return true; }
436  void unlock() {}
437 };
438 
439 #if EVENT_THREAD_SAFE
440 typedef std::mutex EventMutex;
441 #else
442 #warning "Event thread-safe disabled"
443 typedef FakeMutex EventMutex;
444 #endif
445 
446 #if STREAM_THREAD_SAFE
447 typedef std::mutex StreamMutex;
448 #else
449 #warning "Stream thread-safe disabled"
450 typedef FakeMutex StreamMutex;
451 #endif
452 
453 // Device and Ctx thread-safety are paired together; they could also be toggled separately if desired.
454 #if CTX_THREAD_SAFE
455 typedef std::mutex CtxMutex;
456 #else
457 typedef FakeMutex CtxMutex;
458 #warning "Ctx thread-safe disabled"
459 #endif
460 
461 #if DEVICE_THREAD_SAFE
462 typedef std::mutex DeviceMutex;
463 #else
464 typedef FakeMutex DeviceMutex;
465 #warning "Device thread-safe disabled"
466 #endif
467 
468 //
469 //---
470 // Protects access to the member _criticalData with a lock acquired on construction and released on destruction.
471 // T must contain a _mutex field which meets the BasicLockable requirements (lock/unlock)
472 template <typename T>
473 class LockedAccessor {
474  public:
475  LockedAccessor(T& criticalData, bool autoUnlock = true)
476  : _criticalData(&criticalData),
477  _autoUnlock(autoUnlock)
478 
479  {
480  tprintf(DB_SYNC, "locking criticalData=%p for %s..\n", _criticalData,
481  ToString(_criticalData->_parent).c_str());
482  _criticalData->_mutex.lock();
483  };
484 
485  ~LockedAccessor() {
486  if (_autoUnlock) {
487  tprintf(DB_SYNC, "auto-unlocking criticalData=%p for %s...\n", _criticalData,
488  ToString(_criticalData->_parent).c_str());
489  _criticalData->_mutex.unlock();
490  }
491  }
492 
493  void unlock() {
494  tprintf(DB_SYNC, "unlocking criticalData=%p for %s...\n", _criticalData,
495  ToString(_criticalData->_parent).c_str());
496  _criticalData->_mutex.unlock();
497  }
498 
499  // Syntactic sugar so -> can be used to get the underlying type.
500  T* operator->() { return _criticalData; };
501 
502  private:
503  T* _criticalData;
504  bool _autoUnlock;
505 };
506 
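// --- Usage sketch: the locked_* methods later in this file follow this pattern (see
// ihipStream_t::locked_getAv()); the accessor's constructor takes the lock and the destructor
// releases it:
//     LockedAccessor_StreamCrit_t crit(_criticalData);  // locks _criticalData._mutex
//     crit->_av;                                        // operator-> reaches the guarded fields
//     // the lock is released when `crit` goes out of scope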
507 
508 template <typename MUTEX_TYPE>
509 struct LockedBase {
510  // Experts-only interface for explicit locking.
511  // Most uses should use the lock-accessor.
512  void lock() { _mutex.lock(); }
513  void unlock() { _mutex.unlock(); }
514  bool try_lock() { return _mutex.try_lock(); }
515 
516  MUTEX_TYPE _mutex;
517 };
518 
519 
520 template <typename MUTEX_TYPE>
521 class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE> {
522 public:
523  ihipStreamCriticalBase_t(ihipStream_t* parentStream, hc::accelerator_view av)
524  : _parent{parentStream}, _av{av}, _last_op_was_a_copy{false}
525  {}
526 
528 
529  ihipStreamCriticalBase_t<MUTEX_TYPE>* mlock() {
530  LockedBase<MUTEX_TYPE>::lock();
531  return this;
532  };
533 
534  void munlock() {
535  tprintf(DB_SYNC, "munlocking criticalData=%p for %s...\n", this,
536  ToString(this->_parent).c_str());
537  LockedBase<MUTEX_TYPE>::unlock();
538  };
539 
540  ihipStreamCriticalBase_t<MUTEX_TYPE>* mtry_lock() {
541  bool gotLock = LockedBase<MUTEX_TYPE>::try_lock();
542  tprintf(DB_SYNC, "mtry_locking=%d criticalData=%p for %s...\n", gotLock, this,
543  ToString(this->_parent).c_str());
544  return gotLock ? this : nullptr;
545  };
546 
547  ihipStream_t* _parent;
548  hc::accelerator_view _av;
549  bool _last_op_was_a_copy;
550 };
551 
552 
553 // If HIP code needs to acquire locks for both ihipCtx_t and ihipStream_t, it should first acquire
554 // the lock for the ihipCtx_t and then for the individual streams. The locks should not be acquired
555 // in reverse order or deadlock may occur. In some cases, it may be possible to reduce the range
556 // where the locks must be held. HIP routines should avoid acquiring and releasing the same lock
557 // during the execution of a single HIP API. Another option is to use try_lock in the innermost lock
558 // query.
559 
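// Sketch of the required order (hypothetical, condensed): acquire the ctx lock before the
// stream lock so every code path nests the locks the same way:
//     LockedAccessor_CtxCrit_t ctxCrit(ctx->criticalData());           // 1st: context
//     LockedAccessor_StreamCrit_t streamCrit(stream->criticalData());  // 2nd: stream
//     // ... both locks are released in reverse order as the accessors leave scope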
560 
561 typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
562 typedef LockedAccessor<ihipStreamCritical_t> LockedAccessor_StreamCrit_t;
563 
564 // do not change these two structs without changing the device library
565 struct mg_sync {
566  uint w0;
567  uint w1;
568 };
569 
570 struct mg_info {
571  struct mg_sync *mgs;
572  uint grid_id;
573  uint num_grids;
574  ulong prev_sum;
575  ulong all_sum;
576 };
577 
578 //---
579 // Internal stream structure.
580 class ihipStream_t {
581  public:
582  enum ScheduleMode { Auto, Spin, Yield };
583  typedef uint64_t SeqNum_t;
584 
585 // TODO - make av a reference to avoid shared_ptr overhead?
586  ihipStream_t(ihipCtx_t* ctx, hc::accelerator_view av, unsigned int flags);
587  ~ihipStream_t();
588 
589  // kind is hipMemcpyKind
590  void locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind,
591  bool resolveOn = true);
592 
593  bool locked_copy2DSync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind,
594  bool resolveOn = true);
595 
596  void locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
597 
598  bool locked_copy2DAsync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind);
599 
600  void lockedSymbolCopySync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
601  size_t offset, unsigned kind);
602  void lockedSymbolCopyAsync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
603  size_t offset, unsigned kind);
604 
605  //---
606  // Member functions that begin with locked_ are thread-safe accessors - these acquire / release
607  // the critical mutex.
608  LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
609  void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av, bool unlockNotNeeded = 0);
610 
611  void locked_wait(bool& waited);
612  void locked_wait();
613 
614  hc::accelerator_view* locked_getAv() {
615  LockedAccessor_StreamCrit_t crit(_criticalData);
616  return &(crit->_av);
617  };
618 
619  void locked_streamWaitEvent(ihipEventData_t& event);
620  hc::completion_future locked_recordEvent(hipEvent_t event);
621 
622  ihipStreamCritical_t& criticalData() { return _criticalData; };
623 
624  //---
625  hc::hcWaitMode waitMode() const;
626 
627  // Use this if we already have the stream critical data mutex:
628  void wait(LockedAccessor_StreamCrit_t& crit);
629 
630  void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal, uint32_t blockDimX,
631  uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX,
632  uint32_t gridDimY, uint32_t gridDimZ, uint32_t groupSegmentSize,
633  uint32_t sharedMemBytes, void* kernarg, size_t kernSize,
634  uint64_t kernel);
635 
636 
637  //-- Non-racy accessors:
638  // These functions access fields set at initialization time and are non-racy (so do not acquire
639  // mutex)
640  const ihipDevice_t* getDevice() const;
641  ihipCtx_t* getCtx() const;
642 
643  // Before calling this function, stream must be resolved from "0" to the actual stream:
644  bool isDefaultStream() const { return _id == 0; };
645 
646  std::vector<mg_info*> coopMemsTracker;
647 
648  public:
649  //---
650  // Public member vars - these are set at initialization and never change:
651  SeqNum_t _id; // monotonic sequence ID. 0 is the default stream.
652  unsigned _flags;
653 
654 
655  private:
656  // The unsigned return is hipMemcpyKind
657  unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem);
658  void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo* dstPtrInfo,
659  const hc::AmPointerInfo* srcPtrInfo, hc::hcCommandKind* hcCopyDir,
660  ihipCtx_t** copyDevice, bool* forceUnpinnedCopy);
661 
662  bool canSeeMemory(const ihipCtx_t* thisCtx, const hc::AmPointerInfo* dstInfo,
663  const hc::AmPointerInfo* srcInfo);
664 
665  void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes);
666 
667  private: // Data
668  // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t
669  ihipStreamCritical_t _criticalData;
670 
671  std::mutex _hasQueueLock;
672 
673  ihipCtx_t* _ctx; // parent context that owns this stream.
674 
675  // Friends:
676  friend std::ostream& operator<<(std::ostream& os, const ihipStream_t& s);
677  friend hipError_t hipStreamQuery(hipStream_t);
678 
679  ScheduleMode _scheduleMode;
680 };
681 
682 
683 //----
684 // Internal event structure:
685 enum hipEventStatus_t {
686  hipEventStatusUnitialized = 0, // event is uninitialized, must be "Created" before use.
687  hipEventStatusCreated = 1, // event created, but not yet Recorded
688  hipEventStatusRecording = 2, // event has been recorded into a stream but not completed yet.
689  hipEventStatusComplete = 3, // event has been recorded - timestamps are valid.
690 };
691 
692 // TODO - rename to ihip type of some kind
693 enum ihipEventType_t {
694  hipEventTypeIndependent,
695  hipEventTypeStartCommand,
696  hipEventTypeStopCommand,
697 };
698 
699 #define IPC_SIGNALS_PER_EVENT 32
700 typedef struct ihipIpcEventShmem_s {
701  std::atomic<int> owners;
702  std::atomic<int> read_index;
703  std::atomic<int> write_index;
704  std::atomic<int> signal[IPC_SIGNALS_PER_EVENT];
705 } ihipIpcEventShmem_t;
706 
707 
708 struct ihipEventData_t {
709  ihipEventData_t() {
710  _state = hipEventStatusCreated;
711  _stream = NULL;
712  _timestamp = 0;
713  _type = hipEventTypeIndependent;
714  _ipc_name = "";
715  _ipc_fd = 0;
716  _ipc_shmem = NULL;
717  };
718 
719  void marker(const hc::completion_future& marker) { _marker = marker; }
720  hc::completion_future& marker() { return _marker; }
721  uint64_t timestamp() const { return _timestamp; }
722  ihipEventType_t type() const { return _type; }
723 
724  ihipEventType_t _type;
725  hipEventStatus_t _state;
726  hipStream_t _stream; // Stream where the event is recorded. Null stream is resolved to actual
727  // stream when recorded
728  uint64_t _timestamp; // store timestamp, may be set on host or by marker.
729  std::string _ipc_name;
730  int _ipc_fd;
731  ihipIpcEventShmem_t *_ipc_shmem;
732  private:
733  hc::completion_future _marker;
734 };
735 
736 
737 //=============================================================================
738 // class ihipEventCriticalBase_t
739 template <typename MUTEX_TYPE>
740 class ihipEventCriticalBase_t : LockedBase<MUTEX_TYPE> {
741  public:
742  explicit ihipEventCriticalBase_t(const ihipEvent_t* parentEvent) : _parent(parentEvent) {}
744 
745  // Keep data in structure so it can be easily copied into snapshots
746  // (used to reduce lock contention and preserve correct lock order)
747  ihipEventData_t _eventData;
748 
749  private:
750  const ihipEvent_t* _parent;
752 };
753 
754 typedef ihipEventCriticalBase_t<EventMutex> ihipEventCritical_t;
755 
756 typedef LockedAccessor<ihipEventCritical_t> LockedAccessor_EventCrit_t;
757 
758 // internal hip event structure.
759 class ihipEvent_t {
760  public:
761  explicit ihipEvent_t(unsigned flags);
762  void attachToCompletionFuture(const hc::completion_future* cf, hipStream_t stream,
763  ihipEventType_t eventType);
764 
765  // Return a copy of the critical state. The critical data is locked during the copy.
766  ihipEventData_t locked_copyCrit() {
767  LockedAccessor_EventCrit_t crit(_criticalData);
768  return _criticalData._eventData;
769  };
770 
771  ihipEventCritical_t& criticalData() { return _criticalData; };
772 
773  public:
774  unsigned _flags;
775  int _deviceId;
776 
777  private:
778  ihipEventCritical_t _criticalData;
779 
780  friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
781 };
782 
783 
784 //=============================================================================
785 // class ihipDeviceCriticalBase_t
786 template <typename MUTEX_TYPE>
787 class ihipDeviceCriticalBase_t : LockedBase<MUTEX_TYPE> {
788  public:
789  explicit ihipDeviceCriticalBase_t(ihipDevice_t* parentDevice)
790  : _parent(parentDevice), _ctxCount(0){};
791 
793 
794  // Contexts:
795  void addContext(ihipCtx_t* ctx);
796  void removeContext(ihipCtx_t* ctx);
797  std::list<ihipCtx_t*>& ctxs() { return _ctxs; };
798  const std::list<ihipCtx_t*>& const_ctxs() const { return _ctxs; };
799  int getcount() { return _ctxCount; };
801 
802  private:
803  ihipDevice_t* _parent;
804 
805  //--- Context Tracker:
806  std::list<ihipCtx_t*> _ctxs; // contexts associated with this device across all threads.
807 
808  int _ctxCount;
809 };
810 
811 typedef ihipDeviceCriticalBase_t<DeviceMutex> ihipDeviceCritical_t;
812 
813 typedef LockedAccessor<ihipDeviceCritical_t> LockedAccessor_DeviceCrit_t;
814 
815 //----
816 // Properties of the HIP device.
817 // Multiple contexts can point to same device.
818 class ihipDevice_t {
819  public:
820  ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator& acc);
821  ~ihipDevice_t();
822 
823  // Accessors:
824  ihipCtx_t* getPrimaryCtx() const { return _primaryCtx; };
825  void locked_removeContext(ihipCtx_t* c);
826  void locked_reset();
827  ihipDeviceCritical_t& criticalData() { return _criticalData; };
828 
829  public:
830  unsigned _deviceId; // device ID
831 
832  hc::accelerator _acc;
833  hsa_agent_t _hsaAgent; // hsa agent handle
834 
835  //! Number of compute units supported by the device:
836  unsigned _computeUnits;
837  hipDeviceProp_t _props; // saved device properties.
838 
839  // Node id reported by kfd for this device
840  uint32_t _driver_node_id;
841 
842  ihipCtx_t* _primaryCtx;
843 
844  int _state; // 1 if device is set otherwise 0
845 
846  private:
847  hipError_t initProperties(hipDeviceProp_t* prop);
848 
849  private:
850  ihipDeviceCritical_t _criticalData;
851 };
852 //=============================================================================
853 
854 
855 //---
856 //
857 struct ihipExec_t {
858  dim3 _gridDim;
859  dim3 _blockDim;
860  size_t _sharedMem;
861  hipStream_t _hStream;
862  std::vector<char> _arguments;
863 };
864 
865 //=============================================================================
866 // class ihipCtxCriticalBase_t
867 template <typename MUTEX_TYPE>
868 class ihipCtxCriticalBase_t : LockedBase<MUTEX_TYPE> {
869  public:
870  ihipCtxCriticalBase_t(ihipCtx_t* parentCtx, unsigned deviceCnt)
871  : _parent(parentCtx), _peerCnt(0) {
872  _peerAgents = new hsa_agent_t[deviceCnt];
873  };
874 
875  ~ihipCtxCriticalBase_t() {
876  if (_peerAgents != nullptr) {
877  delete[] _peerAgents;
878  _peerAgents = nullptr;
879  }
880  _peerCnt = 0;
881  }
882 
883  // Streams:
884  void addStream(ihipStream_t* stream);
885  std::list<ihipStream_t*>& streams() { return _streams; };
886  const std::list<ihipStream_t*>& const_streams() const { return _streams; };
887 
888 
889  // Peer Accessor classes:
890  bool isPeerWatcher(const ihipCtx_t* peer); // returns True if peer has access to memory
891  // physically located on this device.
892  bool addPeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
893  bool removePeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
894  void resetPeerWatchers(ihipCtx_t* thisDevice);
895  void printPeerWatchers(FILE* f) const;
896 
897  uint32_t peerCnt() const { return _peerCnt; };
898  hsa_agent_t* peerAgents() const { return _peerAgents; };
899 
900 
901  // TODO - move private
902  std::list<ihipCtx_t*> _peers; // list of enabled peer devices.
903  //--- Execution stack:
904  std::stack<ihipExec_t> _execStack; // Execution stack for this device.
905 
907 
908  private:
909  ihipCtx_t* _parent;
910 
911  //--- Stream Tracker:
912  std::list<ihipStream_t*> _streams; // streams associated with this device.
913 
914 
915  //--- Peer Tracker:
916  // These reflect the currently Enabled set of peers for this GPU:
917  // Enabled peers have permissions to access the memory physically allocated on this device.
918  // Note the peers always contain the self agent for easy interfacing with HSA APIs.
919  uint32_t _peerCnt; // number of enabled peers
920  hsa_agent_t* _peerAgents; // efficient packed array of enabled agents (to use for allocations.)
921  private:
922  void recomputePeerAgents();
923 };
924 // Note Mutex type Real/Fake selected based on CtxMutex
925 typedef ihipCtxCriticalBase_t<CtxMutex> ihipCtxCritical_t;
926 
927 // This type is used by functions that need access to the critical device structures.
928 typedef LockedAccessor<ihipCtxCritical_t> LockedAccessor_CtxCrit_t;
929 //=============================================================================
930 
931 
932 //=============================================================================
933 // class ihipCtx_t:
934 // A HIP CTX (context) points at one of the existing devices and contains the streams,
935 // peer-to-peer mappings, creation flags. Multiple contexts can point to the same
936 // device.
937 //
938 class ihipCtx_t {
939  public: // Functions:
940  ihipCtx_t(ihipDevice_t* device, unsigned deviceCnt,
941  unsigned flags); // note: calls constructor for _criticalData
942  ~ihipCtx_t();
943 
944  // Functions which read or write the critical data are named locked_.
945 // (might be better called "locking_")
946  // ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a
947  // locked_ function from within a locked_ function. External functions which call several
948  // locked_ functions will acquire and release the lock for each function. if this occurs in
949  // performance-sensitive code we may want to refactor by adding non-locked functions and
950  // creating a new locked_ member function to call them all.
951  void locked_removeStream(ihipStream_t* s);
952  void locked_reset();
953  void locked_waitAllStreams();
954  void locked_syncDefaultStream(bool waitOnSelf, bool syncHost);
955 
956  ihipCtxCritical_t& criticalData() { return _criticalData; };
957 
958  const ihipDevice_t* getDevice() const { return _device; };
959  int getDeviceNum() const { return _device->_deviceId; };
960 
961  // TODO - review uses of getWriteableDevice(), can these be converted to getDevice()
962  ihipDevice_t* getWriteableDevice() const { return _device; };
963 
964  std::string toString() const;
965 
966  public: // Data
967  // The NULL stream is used if no other stream is specified.
968  // Default stream has special synchronization properties with other streams.
969  ihipStream_t* _defaultStream;
970 
971  // Flags specified when the context is created:
972  unsigned _ctxFlags;
973 
974  private:
975  ihipDevice_t* _device;
976 
977 
978  private: // Critical data, protected with locked access:
979  // Members of _protected data MUST be accessed through the LockedAccessor.
980  // Search for LockedAccessor<ihipCtxCritical_t> for examples; do not access _criticalData
981  // directly.
982  ihipCtxCritical_t _criticalData;
983 };
984 
985 
986 //=================================================================================================
987 // Global variable definition:
988 extern unsigned g_deviceCnt;
989 extern hsa_agent_t g_cpu_agent; // the CPU agent.
990 extern hsa_agent_t* g_allAgents; // CPU agents + all the visible GPU agents.
991 
992 //=================================================================================================
993 // Extern functions:
994 extern void ihipInit();
995 extern const char* ihipErrorString(hipError_t);
996 extern hipError_t ihipSynchronize(TlsData *tls);
997 extern void ihipCtxStackUpdate();
998 extern hipError_t ihipDeviceSetState(TlsData *tls);
999 
1000 extern ihipDevice_t* ihipGetDevice(int);
1001 ihipCtx_t* ihipGetPrimaryCtx(unsigned deviceIndex);
1002 hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
1003  const char* name, hsa_agent_t *agent);
1004 
1005 
1006 hipStream_t ihipSyncAndResolveStream(hipStream_t, bool lockAcquired = 0);
1007 hipError_t ihipStreamSynchronize(TlsData *tls, hipStream_t stream);
1008 
1018 hipError_t ihipGetGlobalVar(hipDeviceptr_t* dev_ptr, size_t* size_ptr, const char* hostVar,
1019  hipModule_t hmod = nullptr);
1020 
1021 // Stream printf functions:
1022 inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s) {
1023  os << "stream:";
1024  os << s.getDevice()->_deviceId;
1025  ;
1026  os << '.';
1027  os << s._id;
1028  return os;
1029 }
1030 
1031 inline std::ostream& operator<<(std::ostream& os, const dim3& s) {
1032  os << '{';
1033  os << s.x;
1034  os << ',';
1035  os << s.y;
1036  os << ',';
1037  os << s.z;
1038  os << '}';
1039  return os;
1040 }
1041 
1042 inline std::ostream& operator<<(std::ostream& os, const gl_dim3& s) {
1043  os << '{';
1044  os << s.x;
1045  os << ',';
1046  os << s.y;
1047  os << ',';
1048  os << s.z;
1049  os << '}';
1050  return os;
1051 }
1052 
1053 // Stream printf functions:
1054 inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) {
1055  os << "event:" << std::hex << static_cast<void*>(e);
1056  return os;
1057 }
1058 
1059 inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) {
1060  os << "ctx:" << static_cast<const void*>(c) << ".dev:" << c->getDevice()->_deviceId;
1061  return os;
1062 }
1063 
1064 
1065 // Helper functions that are used across src files:
1066 namespace hip_internal {
1067 hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
1068  hipStream_t stream);
1069 
1070 hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags, bool noSync = 0);
1071 
1072 hipError_t ihipHostFree(TlsData *tls, void* ptr);
1073 
1074 };
1075 
1076 #define MAX_COOPERATIVE_GPUs 255
1077 
1078 //---
1079 // TODO - review the context creation strategy here. Really should be:
1080 // - first "non-device" runtime call creates the context for this thread. Allowed to call
1081 // setDevice first.
1082 // - hipDeviceReset destroys the primary context for device?
1083 // - Then context is created again for next usage.
1084 static inline ihipCtx_t* iihipGetTlsDefaultCtx(TlsData* tls) {
1085  // Per-thread initialization of the TLS:
1086  if ((tls->defaultCtx == nullptr) && (g_deviceCnt > 0)) {
1087  tls->defaultCtx = ihipGetPrimaryCtx(0);
1088  }
1089  return tls->defaultCtx;
1090 }
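// --- Usage sketch (hypothetical helper, not part of the original header): the
// ihipGetTlsDefaultCtx() macro defined earlier expands to iihipGetTlsDefaultCtx(tls), so it
// needs a local `tls` from GET_TLS() or HIP_INIT_API(); the call resolves this thread's
// implicit context, falling back to the primary context of device 0.
static inline ihipCtx_t* exampleCurrentDefaultCtx() {
    GET_TLS();
    return ihipGetTlsDefaultCtx();
}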
1091 
1100 hipFunction_t ihipGetDeviceFunction(const void *hostFunction);
1101 
1102 #endif