HIP: Heterogeneous-computing Interface for Portability
hip_hcc_internal.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_SRC_HIP_HCC_INTERNAL_H
24 #define HIP_SRC_HIP_HCC_INTERNAL_H
25 
26 #include <hc.hpp>
27 #include <hsa/hsa.h>
28 #include <unordered_map>
29 #include <stack>
30 
31 #include "hsa/hsa_ext_amd.h"
32 #include "hip/hip_runtime.h"
33 #include "hip_prof_api.h"
34 #include "hip_util.h"
35 #include "env.h"
36 
37 
38 #if (__hcc_workweek__ < 16354)
39 #error("This version of HIP requires a newer version of HCC.");
40 #endif
41 
42 // Use the __appPtr field in the am memtracker to store the context.
43 // Requires a bug fix in HCC
44 #if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and \
45  (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0)
46 #define USE_APP_PTR_FOR_CTX 1
47 #endif
48 
49 
50 #define USE_IPC 1
51 
52 //---
53 // Environment variables:
54 
55 // Intended to distinguish whether an environment variable should be visible only in debug mode, or
56 // in debug+release.
57 // static const int debug = 0;
58 extern const int release;
59 
60 // TODO - this blocks both kernels and memory ops. Perhaps should have separate env var for
61 // kernels?
62 extern int HIP_LAUNCH_BLOCKING;
63 extern int HIP_API_BLOCKING;
64 
65 extern int HIP_PRINT_ENV;
66 extern int HIP_PROFILE_API;
67 // extern int HIP_TRACE_API;
68 extern int HIP_ATP;
69 extern int HIP_DB;
70 extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */
71 extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */
72 extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */
73 extern int HIP_FORCE_P2P_HOST;
74 
75 extern int HIP_HOST_COHERENT;
76 
77 extern int HIP_HIDDEN_FREE_MEM;
78 //---
79 // Chicken bits for disabling functionality to work around potential issues:
80 extern int HIP_SYNC_HOST_ALLOC;
81 extern int HIP_SYNC_STREAM_WAIT;
82 
83 extern int HIP_SYNC_NULL_STREAM;
84 extern int HIP_INIT_ALLOC;
85 extern int HIP_FORCE_NULL_STREAM;
86 
87 extern int HIP_SYNC_FREE;
88 
89 extern int HIP_DUMP_CODE_OBJECT;
90 
91 // TODO - remove when this is standard behavior.
92 extern int HCC_OPT_FLUSH;
93 
// Per-thread identity record used by HIP debug output: a short thread id, a
// process id, and a running count of API calls. All three fields are private
// and are only exposed through the accessors below; the constructor is
// defined out of line.
class TidInfo {
   public:
    TidInfo();

    // Short thread id held by this record.
    int tid() const { return _shortTid; }

    // Process id held by this record.
    pid_t pid() const { return _pid; }

    // Current API sequence number; does not modify it.
    uint64_t apiSeqNum() const { return _apiSeqNum; }

    // Advances the API sequence number by one and returns the new value.
    uint64_t incApiSeqNum() {
        _apiSeqNum += 1;
        return _apiSeqNum;
    }

   private:
    int _shortTid;
    pid_t _pid;

    // Monotonically increasing API sequence number for this thread.
    uint64_t _apiSeqNum;
};
111 
// Collection of 64-bit trigger values.
//
// sort() orders the values from largest to smallest, so that after sorting
// the smallest pending trigger sits at the back of the vector, which is the
// element nextTrigger() reports.
struct ProfTrigger {
    // Value reported by nextTrigger() when no trigger is stored.
    static const uint64_t MAX_TRIGGER = std::numeric_limits<uint64_t>::max();

    // Writes every stored trigger to std::cout as "tid:<tid>.<trigger>,".
    void print(int tid) const {
        std::cout << "Enabling tracing for ";
        for (const uint64_t trigger : _profTrigger) {
            std::cout << "tid:" << tid << "." << trigger << ",";
        }
        std::cout << "\n";
    }

    // Returns the trigger at the back of the list, or MAX_TRIGGER when the
    // list is empty. Written as two returns (not a conditional expression) so
    // MAX_TRIGGER is only read by value and needs no out-of-class definition.
    uint64_t nextTrigger() const {
        if (_profTrigger.empty()) {
            return MAX_TRIGGER;
        }
        return _profTrigger.back();
    }

    // Appends a trigger; call sort() afterwards to restore ordering.
    void add(uint64_t trigger) { _profTrigger.push_back(trigger); }

    // Sorts in descending order. The comparator must use the element type:
    // comparing through int would truncate triggers >= 2^32 and misorder them.
    void sort() {
        std::sort(_profTrigger.begin(), _profTrigger.end(), std::greater<uint64_t>());
    }

   private:
    std::vector<uint64_t> _profTrigger;
};
130 
131 
132 //---
133 // Extern tls
134 extern thread_local hipError_t tls_lastHipError;
135 extern thread_local TidInfo tls_tidInfo;
136 extern thread_local bool tls_getPrimaryCtx;
137 
138 extern std::vector<ProfTrigger> g_dbStartTriggers;
139 extern std::vector<ProfTrigger> g_dbStopTriggers;
140 
141 //---
142 // Forward defs:
143 class ihipStream_t;
144 class ihipDevice_t;
145 class ihipCtx_t;
146 struct ihipEventData_t;
147 
148 // Color defs for debug messages:
149 #define KNRM "\x1B[0m"
150 #define KRED "\x1B[31m"
151 #define KGRN "\x1B[32m"
152 #define KYEL "\x1B[33m"
153 #define KBLU "\x1B[34m"
154 #define KMAG "\x1B[35m"
155 #define KCYN "\x1B[36m"
156 #define KWHT "\x1B[37m"
157 
158 extern const char* API_COLOR;
159 extern const char* API_COLOR_END;
160 
161 
162 // If set, thread-safety is enforced on all event/stream/ctx/device functions.
163 // Can disable for performance or functional experiments - in this case
164 // the code uses a dummy "no-op" mutex.
165 #define EVENT_THREAD_SAFE 1
166 
167 #define STREAM_THREAD_SAFE 1
168 
169 #define CTX_THREAD_SAFE 1
170 
171 #define DEVICE_THREAD_SAFE 1
172 
173 
174 // Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set.
175 // May be set to 0 to remove debug if checks - possible code size and performance difference?
176 #define COMPILE_HIP_DB 1
177 
178 
179 // Compile HIP tracing capability.
180 // 0x1 = print a string at function entry with arguments.
181 // 0x2 = prints a simple message with function name + return code when function exits.
182 // 0x3 = print both.
183 // Must be enabled at runtime with HIP_TRACE_API
184 #define COMPILE_HIP_TRACE_API 0x3
185 
186 
187 // Compile code that generates trace markers for CodeXL ATP at HIP function begin/end.
188 // ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs.
189 #ifndef COMPILE_HIP_ATP_MARKER
190 #define COMPILE_HIP_ATP_MARKER 0
191 #endif
192 
193 
194 // Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function
195 // boundary.
196 // TODO - currently we print the trace message at the beginning. If we waited until the API
197 // returns, we could also print return codes and any values returned through ptr-to-args (i.e. the
198 // pointers allocated by hipMalloc).
199 #if COMPILE_HIP_ATP_MARKER
200 #include "CXLActivityLogger.h"
201 #define MARKER_BEGIN(markerName, group) amdtBeginMarker(markerName, group, nullptr);
202 #define MARKER_END() amdtEndMarker();
203 #define RESUME_PROFILING amdtResumeProfiling(AMDT_ALL_PROFILING);
204 #define STOP_PROFILING amdtStopProfiling(AMDT_ALL_PROFILING);
205 #else
206 // Swallow scoped markers:
207 #define MARKER_BEGIN(markerName, group)
208 #define MARKER_END()
209 #define RESUME_PROFILING
210 #define STOP_PROFILING
211 #endif
212 
213 
214 //---
215 // HIP Trace modes - use with HIP_TRACE_API=...
216 #define TRACE_ALL 0 // 0x01
217 #define TRACE_KCMD 1 // 0x02, kernel command
218 #define TRACE_MCMD 2 // 0x04, memory command
219 #define TRACE_MEM 3 // 0x08, memory allocation or deallocation.
220 #define TRACE_SYNC 4 // 0x10, synchronization (host or hipStreamWaitEvent)
221 #define TRACE_QUERY 5 // 0x20, hipEventRecord, hipEventQuery, hipStreamQuery
222 
223 
224 //---
225 // HIP_DB Debug flags:
226 #define DB_API 0 /* 0x01 - shortcut to enable HIP_TRACE_API on single switch */
227 #define DB_SYNC 1 /* 0x02 - trace synchronization pieces */
228 #define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */
229 #define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. . */
230 #define DB_WARN 4 /* 0x10 - warn about sub-optimal or shady behavior */
231 #define DB_FB 5 /* 0x20 - trace loading fat binary */
232 #define DB_MAX_FLAG 6
233 // When adding a new debug flag, also add to the char name table below.
234 //
235 //
236 
237 struct DbName {
238  const char* _color;
239  const char* _shortName;
240 };
241 
242 // This table must be kept in-sync with the defines above.
243 static const DbName dbName[] = {
244  {KGRN, "api"}, // not used,
245  {KYEL, "sync"}, {KCYN, "mem"}, {KMAG, "copy"}, {KRED, "warn"},
246  {KBLU, "fatbin"},
247 };
248 
249 
250 #if COMPILE_HIP_DB
251 #define tprintf(trace_level, ...) \
252  { \
253  if (HIP_DB & (1 << (trace_level))) { \
254  char msgStr[1000]; \
255  snprintf(msgStr, sizeof(msgStr), __VA_ARGS__); \
256  fprintf(stderr, " %ship-%s pid:%d tid:%d:%s%s", dbName[trace_level]._color, \
257  dbName[trace_level]._shortName, tls_tidInfo.pid(), tls_tidInfo.tid(), msgStr, KNRM); \
258  } \
259  }
260 #else
261 /* Compile to empty code */
262 #define tprintf(trace_level, ...)
263 #endif
264 
265 
266 static inline uint64_t getTicks() { return hc::get_system_ticks(); }
267 
268 //---
269 extern uint64_t recordApiTrace(std::string* fullStr, const std::string& apiStr);
270 
271 #if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
272 #define API_TRACE(forceTrace, ...) \
273  uint64_t hipApiStartTick = 0; \
274  { \
275  tls_tidInfo.incApiSeqNum(); \
276  if (forceTrace || \
277  (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL))))) { \
278  std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')'; \
279  std::string fullStr; \
280  hipApiStartTick = recordApiTrace(&fullStr, apiStr); \
281  if (HIP_PROFILE_API == 0x1) { \
282  MARKER_BEGIN(__func__, "HIP") \
283  } else if (HIP_PROFILE_API == 0x2) { \
284  MARKER_BEGIN(fullStr.c_str(), "HIP"); \
285  } \
286  } \
287  }
288 
289 #else
290 // Swallow API_TRACE
291 #define API_TRACE(IS_CMD, ...) tls_tidInfo.incApiSeqNum();
292 #endif
293 
294 #define HIP_SET_DEVICE() ihipDeviceSetState();
295 
296 // This macro should be called at the beginning of every HIP API.
297 // It initializes the hip runtime (exactly once), and
298 // generates a trace string that can be output to stderr or to ATP file.
299 #define HIP_INIT_API(cid, ...) \
300  hip_impl::hip_init(); \
301  API_TRACE(0, __VA_ARGS__); \
302  HIP_CB_SPAWNER_OBJECT(cid);
303 
304 
305 // Like above, but will trace with a specified "special" bit.
306 // Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU:
307 // kernel launches, copy commands, memory sets, etc.
308 #define HIP_INIT_SPECIAL_API(cid, tbit, ...) \
309  hip_impl::hip_init(); \
310  API_TRACE((HIP_TRACE_API & (1 << tbit)), __VA_ARGS__); \
311  HIP_CB_SPAWNER_OBJECT(cid);
312 
313 
314 // This macro should be called at the end of every HIP API, and only at the end of top-level hip
315 // APIS (not internal hip) It has dual function: logs the last error returned for use by
316 // hipGetLastError, and also prints the closing message when the debug trace is enabled.
317 #define ihipLogStatus(hipStatus) \
318  ({ \
319  hipError_t localHipStatus = hipStatus; /*local copy so hipStatus only evaluated once*/ \
320  tls_lastHipError = localHipStatus; \
321  \
322  if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API & (1 << TRACE_ALL)) { \
323  auto ticks = getTicks() - hipApiStartTick; \
324  fprintf(stderr, " %ship-api pid:%d tid:%d.%lu %-30s ret=%2d (%s)>> +%lu ns%s\n", \
325  (localHipStatus == 0) ? API_COLOR : KRED, tls_tidInfo.pid(), tls_tidInfo.tid(), \
326  tls_tidInfo.apiSeqNum(), __func__, localHipStatus, \
327  ihipErrorString(localHipStatus), ticks, API_COLOR_END); \
328  } \
329  if (HIP_PROFILE_API) { \
330  MARKER_END(); \
331  } \
332  localHipStatus; \
333  })
334 
335 
336 class ihipException : public std::exception {
337  public:
338  explicit ihipException(hipError_t e) : _code(e){};
339 
340  hipError_t _code;
341 };
342 
343 
344 #ifdef __cplusplus
345 extern "C" {
346 #endif
347 
348 
349 #ifdef __cplusplus
350 }
351 #endif
352 
353 const hipStream_t hipStreamNull = 0x0;
354 
355 
359 #define HIP_IPC_RESERVED_SIZE 24
361  public:
362 #if USE_IPC
363  hsa_amd_ipc_memory_t ipc_handle;
364 #endif
365  size_t psize;
366  char reserved[HIP_IPC_RESERVED_SIZE];
367 };
368 
369 
370 struct ihipModule_t {
371  std::string fileName;
372  hsa_executable_t executable = {};
373  hsa_code_object_reader_t coReader = {};
374  std::string hash;
375  std::unordered_map<
376  std::string, std::vector<std::pair<std::size_t, std::size_t>>> kernargs;
377 
378  ~ihipModule_t() {
379  if (executable.handle) hsa_executable_destroy(executable);
380  if (coReader.handle) hsa_code_object_reader_destroy(coReader);
381  }
382 };
383 
384 
385 //---
// No-op replacement for a mutex, used to remove locking for performance
// experiments or to provoke bugs. lock() and unlock() do nothing, and
// try_lock() always reports success.
class FakeMutex {
   public:
    void lock() { /* intentionally empty */ }

    void unlock() { /* intentionally empty */ }

    bool try_lock() {
        const bool acquired = true;
        return acquired;
    }
};
393 
394 #if EVENT_THREAD_SAFE
395 typedef std::mutex EventMutex;
396 #else
397 #warning "Stream thread-safe disabled"
398 typedef FakeMutex EventMutex;
399 #endif
400 
401 #if STREAM_THREAD_SAFE
402 typedef std::mutex StreamMutex;
403 #else
404 #warning "Stream thread-safe disabled"
405 typedef FakeMutex StreamMutex;
406 #endif
407 
408 // Pair Device and Ctx together, these could also be toggled separately if desired.
409 #if CTX_THREAD_SAFE
410 typedef std::mutex CtxMutex;
411 #else
412 typedef FakeMutex CtxMutex;
413 #warning "Ctx thread-safe disabled"
414 #endif
415 
416 #if DEVICE_THREAD_SAFE
417 typedef std::mutex DeviceMutex;
418 #else
419 typedef FakeMutex DeviceMutex;
420 #warning "Device thread-safe disabled"
421 #endif
422 
423 //
424 //---
425 // Protects access to the member _data with a lock acquired on construction and released on destruction.
426 // T must contain a _mutex field which meets the BasicLockable requirements (lock/unlock)
427 template <typename T>
429  public:
430  LockedAccessor(T& criticalData, bool autoUnlock = true)
431  : _criticalData(&criticalData),
432  _autoUnlock(autoUnlock)
433 
434  {
435  tprintf(DB_SYNC, "locking criticalData=%p for %s..\n", _criticalData,
436  ToString(_criticalData->_parent).c_str());
437  _criticalData->_mutex.lock();
438  };
439 
440  ~LockedAccessor() {
441  if (_autoUnlock) {
442  tprintf(DB_SYNC, "auto-unlocking criticalData=%p for %s...\n", _criticalData,
443  ToString(_criticalData->_parent).c_str());
444  _criticalData->_mutex.unlock();
445  }
446  }
447 
448  void unlock() {
449  tprintf(DB_SYNC, "unlocking criticalData=%p for %s...\n", _criticalData,
450  ToString(_criticalData->_parent).c_str());
451  _criticalData->_mutex.unlock();
452  }
453 
454  // Syntactic sugar so -> can be used to get the underlying type.
455  T* operator->() { return _criticalData; };
456 
457  private:
458  T* _criticalData;
459  bool _autoUnlock;
460 };
461 
462 
// Base that owns one mutex of type MUTEX_TYPE and forwards lock(), unlock()
// and try_lock() straight to it.
template <typename MUTEX_TYPE>
struct LockedBase {
    MUTEX_TYPE _mutex;

    // Experts-only interface for explicit locking.
    // Most uses should go through the lock-accessor instead.
    void lock() {
        this->_mutex.lock();
    }

    void unlock() {
        this->_mutex.unlock();
    }

    // Returns whatever the underlying mutex's try_lock() returns.
    bool try_lock() {
        const bool acquired = this->_mutex.try_lock();
        return acquired;
    }
};
473 
474 
475 template <typename MUTEX_TYPE>
476 class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE> {
477  public:
478  ihipStreamCriticalBase_t(ihipStream_t* parentStream, hc::accelerator_view av)
479  : _kernelCnt(0), _av(av), _parent(parentStream){};
480 
482 
485  return this;
486  };
487 
488  void munlock() {
489  tprintf(DB_SYNC, "munlocking criticalData=%p for %s...\n", this,
490  ToString(this->_parent).c_str());
492  };
493 
495  bool gotLock = LockedBase<MUTEX_TYPE>::try_lock();
496  tprintf(DB_SYNC, "mtry_locking=%d criticalData=%p for %s...\n", gotLock, this,
497  ToString(this->_parent).c_str());
498  return gotLock ? this : nullptr;
499  };
500 
501  public:
502  ihipStream_t* _parent;
503  uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait().
504 
505  hc::accelerator_view _av;
506 
507  private:
508 };
509 
510 
511 // if HIP code needs to acquire locks for both ihipCtx_t and ihipStream_t, it should first acquire
512 // the lock for the ihipCtx_t and then for the individual streams. The locks should not be acquired
513 // in reverse order or deadlock may occur. In some cases, it may be possible to reduce the range
514 // where the locks must be held. HIP routines should avoid acquiring and releasing the same lock
515 // during the execution of a single HIP API. Another option is to use try_lock in the innermost lock
516 // query.
517 
518 
521 
522 //---
523 // Internal stream structure.
525  public:
526  enum ScheduleMode { Auto, Spin, Yield };
527  typedef uint64_t SeqNum_t;
528 
529  // TODO - make av a reference to avoid shared_ptr overhead?
530  ihipStream_t(ihipCtx_t* ctx, hc::accelerator_view av, unsigned int flags);
531  ~ihipStream_t();
532 
533  // kind is hipMemcpyKind
534  void locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind,
535  bool resolveOn = true);
536 
537  bool locked_copy2DSync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind,
538  bool resolveOn = true);
539 
540  void locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
541 
542  bool locked_copy2DAsync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind);
543 
544  void lockedSymbolCopySync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
545  size_t offset, unsigned kind);
546  void lockedSymbolCopyAsync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
547  size_t offset, unsigned kind);
548 
549  //---
550  // Member functions that begin with locked_ are thread-safe accessors - these acquire / release
551  // the critical mutex.
552  LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
553  void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av, bool unlockNotNeeded = 0);
554 
555 
556  void locked_wait();
557 
558  hc::accelerator_view* locked_getAv() {
559  LockedAccessor_StreamCrit_t crit(_criticalData);
560  return &(crit->_av);
561  };
562 
563  void locked_streamWaitEvent(ihipEventData_t& event);
564  hc::completion_future locked_recordEvent(hipEvent_t event);
565 
566  bool locked_eventIsReady(hipEvent_t event);
567  void locked_eventWaitComplete(hc::completion_future& marker, hc::hcWaitMode waitMode);
568 
569  ihipStreamCritical_t& criticalData() { return _criticalData; };
570 
571  //---
572  hc::hcWaitMode waitMode() const;
573 
574  // Use this if we already have the stream critical data mutex:
575  void wait(LockedAccessor_StreamCrit_t& crit);
576 
577  void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal, uint32_t blockDimX,
578  uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX,
579  uint32_t gridDimY, uint32_t gridDimZ, uint32_t groupSegmentSize,
580  uint32_t sharedMemBytes, void* kernarg, size_t kernSize,
581  uint64_t kernel);
582 
583 
584  //-- Non-racy accessors:
585  // These functions access fields set at initialization time and are non-racy (so do not acquire
586  // mutex)
587  const ihipDevice_t* getDevice() const;
588  ihipCtx_t* getCtx() const;
589 
590  // Before calling this function, stream must be resolved from "0" to the actual stream:
591  bool isDefaultStream() const { return _id == 0; };
592 
593  public:
594  //---
595  // Public member vars - these are set at initialization and never change:
596  SeqNum_t _id; // monotonic sequence ID. 0 is the default stream.
597  unsigned _flags;
598 
599 
600  private:
601  // The unsigned return is hipMemcpyKind
602  unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem);
603  void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo* dstPtrInfo,
604  const hc::AmPointerInfo* srcPtrInfo, hc::hcCommandKind* hcCopyDir,
605  ihipCtx_t** copyDevice, bool* forceUnpinnedCopy);
606 
607  bool canSeeMemory(const ihipCtx_t* thisCtx, const hc::AmPointerInfo* dstInfo,
608  const hc::AmPointerInfo* srcInfo);
609 
610  void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes);
611 
612  private: // Data
613  // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t
614  ihipStreamCritical_t _criticalData;
615 
616  std::mutex _hasQueueLock;
617 
618  ihipCtx_t* _ctx; // parent context that owns this stream.
619 
620  // Friends:
621  friend std::ostream& operator<<(std::ostream& os, const ihipStream_t& s);
622  friend hipError_t hipStreamQuery(hipStream_t);
623 
624  ScheduleMode _scheduleMode;
625 };
626 
627 
628 //----
629 // Internal structure for stream callback handler
631  public:
632  ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void* userData)
633  : _stream(stream), _callback(callback), _userData(userData) {
634  };
635  hipStream_t _stream;
636  hipStreamCallback_t _callback;
637  void* _userData;
638 };
639 
640 
641 //----
642 // Internal event structure:
// Life-cycle states of an event, in the order an event moves through them.
enum hipEventStatus_t {
    // Event is uninitialized, must be "Created" before use.
    hipEventStatusUnitialized = 0,
    // Event created, but not yet Recorded.
    hipEventStatusCreated = 1,
    // Event has been recorded into a stream but not completed yet.
    hipEventStatusRecording = 2,
    // Event has been recorded - timestamps are valid.
    hipEventStatusComplete = 3,
};
649 
// Kind of an event. Values are spelled out; they match the implicit
// numbering of the enumerators.
// TODO - rename to ihip type of some kind
enum ihipEventType_t {
    hipEventTypeIndependent = 0,
    hipEventTypeStartCommand = 1,
    hipEventTypeStopCommand = 2,
};
656 
657 
659  ihipEventData_t() {
660  _state = hipEventStatusCreated;
661  _stream = NULL;
662  _timestamp = 0;
663  _type = hipEventTypeIndependent;
664  };
665 
666  void marker(const hc::completion_future& marker) { _marker = marker; };
667  hc::completion_future& marker() { return _marker; }
668  uint64_t timestamp() const { return _timestamp; };
669  ihipEventType_t type() const { return _type; };
670 
671  ihipEventType_t _type;
672  hipEventStatus_t _state;
673  hipStream_t _stream; // Stream where the event is recorded. Null stream is resolved to actual
674  // stream when recorded
675  uint64_t _timestamp; // store timestamp, may be set on host or by marker.
676  private:
677  hc::completion_future _marker;
678 };
679 
680 
681 //=============================================================================
682 // class ihipEventCriticalBase_t
683 template <typename MUTEX_TYPE>
684 class ihipEventCriticalBase_t : LockedBase<MUTEX_TYPE> {
685  public:
686  explicit ihipEventCriticalBase_t(const ihipEvent_t* parentEvent) : _parent(parentEvent) {}
688 
689  // Keep data in structure so it can be easily copied into snapshots
690  // (used to reduce lock contention and preserve correct lock order)
691  ihipEventData_t _eventData;
692 
693  private:
694  const ihipEvent_t* _parent;
695  friend class LockedAccessor<ihipEventCriticalBase_t>;
696 };
697 
699 
701 
702 // internal hip event structure.
703 class ihipEvent_t {
704  public:
705  explicit ihipEvent_t(unsigned flags);
706  void attachToCompletionFuture(const hc::completion_future* cf, hipStream_t stream,
707  ihipEventType_t eventType);
708  std::pair<hipEventStatus_t, uint64_t> refreshEventStatus(); // returns pair <state, timestamp>
709 
710 
711  // Return a copy of the critical state. The critical data is locked during the copy.
712  ihipEventData_t locked_copyCrit() {
713  LockedAccessor_EventCrit_t crit(_criticalData);
714  return _criticalData._eventData;
715  };
716 
717  ihipEventCritical_t& criticalData() { return _criticalData; };
718 
719  public:
720  unsigned _flags;
721 
722  private:
723  ihipEventCritical_t _criticalData;
724 
725  friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
726 };
727 
728 
729 //=============================================================================
730 // class ihipDeviceCriticalBase_t
731 template <typename MUTEX_TYPE>
733  public:
734  explicit ihipDeviceCriticalBase_t(ihipDevice_t* parentDevice)
735  : _parent(parentDevice), _ctxCount(0){};
736 
738 
739  // Contexts:
740  void addContext(ihipCtx_t* ctx);
741  void removeContext(ihipCtx_t* ctx);
742  std::list<ihipCtx_t*>& ctxs() { return _ctxs; };
743  const std::list<ihipCtx_t*>& const_ctxs() const { return _ctxs; };
744  int getcount() { return _ctxCount; };
745  friend class LockedAccessor<ihipDeviceCriticalBase_t>;
746 
747  private:
748  ihipDevice_t* _parent;
749 
750  //--- Context Tracker:
751  std::list<ihipCtx_t*> _ctxs; // contexts associated with this device across all threads.
752 
753  int _ctxCount;
754 };
755 
757 
759 
760 //----
761 // Properties of the HIP device.
762 // Multiple contexts can point to same device.
764  public:
765  ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator& acc);
766  ~ihipDevice_t();
767 
768  // Accessors:
769  ihipCtx_t* getPrimaryCtx() const { return _primaryCtx; };
770  void locked_removeContext(ihipCtx_t* c);
771  void locked_reset();
772  ihipDeviceCritical_t& criticalData() { return _criticalData; };
773 
774  public:
775  unsigned _deviceId; // device ID
776 
777  hc::accelerator _acc;
778  hsa_agent_t _hsaAgent; // hsa agent handle
779 
781  unsigned _computeUnits;
782  hipDeviceProp_t _props; // saved device properties.
783 
784  // TODO - report this through device properties, base on HCC API call.
785  int _isLargeBar;
786 
787  ihipCtx_t* _primaryCtx;
788 
789  int _state; // 1 if device is set otherwise 0
790 
791  private:
792  hipError_t initProperties(hipDeviceProp_t* prop);
793 
794  private:
795  ihipDeviceCritical_t _criticalData;
796 };
797 //=============================================================================
798 
799 
800 //---
801 //
802 struct ihipExec_t {  // Plain record of five fields; ihipCtxCriticalBase_t keeps a std::stack of these (_execStack).
803  dim3 _gridDim;  // presumably the launch grid dimensions - TODO confirm against users of _execStack
804  dim3 _blockDim;  // presumably the launch block dimensions - TODO confirm
805  size_t _sharedMem;  // presumably a shared-memory size in bytes - TODO confirm units
806  hipStream_t _hStream;  // stream handle stored with the record
807  std::vector<char> _arguments;  // raw byte buffer; presumably packed kernel arguments - TODO confirm
808 };
809 
810 //=============================================================================
811 // class ihipCtxCriticalBase_t
812 template <typename MUTEX_TYPE>
813 class ihipCtxCriticalBase_t : LockedBase<MUTEX_TYPE> {
814  public:
815  ihipCtxCriticalBase_t(ihipCtx_t* parentCtx, unsigned deviceCnt)
816  : _parent(parentCtx), _peerCnt(0) {
817  _peerAgents = new hsa_agent_t[deviceCnt];
818  };
819 
821  if (_peerAgents != nullptr) {
822  delete _peerAgents;
823  _peerAgents = nullptr;
824  }
825  _peerCnt = 0;
826  }
827 
828  // Streams:
829  void addStream(ihipStream_t* stream);
830  std::list<ihipStream_t*>& streams() { return _streams; };
831  const std::list<ihipStream_t*>& const_streams() const { return _streams; };
832 
833 
834  // Peer Accessor classes:
835  bool isPeerWatcher(const ihipCtx_t* peer); // returns True if peer has access to memory
836  // physically located on this device.
837  bool addPeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
838  bool removePeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
839  void resetPeerWatchers(ihipCtx_t* thisDevice);
840  void printPeerWatchers(FILE* f) const;
841 
842  uint32_t peerCnt() const { return _peerCnt; };
843  hsa_agent_t* peerAgents() const { return _peerAgents; };
844 
845 
846  // TODO - move private
847  std::list<ihipCtx_t*> _peers; // list of enabled peer devices.
848  //--- Execution stack:
849  std::stack<ihipExec_t> _execStack; // Execution stack for this device.
850 
851  friend class LockedAccessor<ihipCtxCriticalBase_t>;
852 
853  private:
854  ihipCtx_t* _parent;
855 
856  //--- Stream Tracker:
857  std::list<ihipStream_t*> _streams; // streams associated with this device.
858 
859 
860  //--- Peer Tracker:
861  // These reflect the currently Enabled set of peers for this GPU:
862  // Enabled peers have permissions to access the memory physically allocated on this device.
863  // Note the peers always contain the self agent for easy interfacing with HSA APIs.
864  uint32_t _peerCnt; // number of enabled peers
865  hsa_agent_t* _peerAgents; // efficient packed array of enabled agents (to use for allocations.)
866  private:
867  void recomputePeerAgents();
868 };
869 // Note Mutex type Real/Fake selected based on CtxMutex
871 
872 // This type is used by functions that need access to the critical device structures.
874 //=============================================================================
875 
876 
877 //=============================================================================
878 // class ihipCtx_t:
879 // A HIP CTX (context) points at one of the existing devices and contains the streams,
880 // peer-to-peer mappings, creation flags. Multiple contexts can point to the same
881 // device.
882 //
883 class ihipCtx_t {
884  public: // Functions:
885  ihipCtx_t(ihipDevice_t* device, unsigned deviceCnt,
886  unsigned flags); // note: calls constructor for _criticalData
887  ~ihipCtx_t();
888 
889  // Functions which read or write the critical data are named locked_.
890  // (might be better called "locking_"
891  // ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a
892  // locked_ function from within a locked_ function. External functions which call several
893  // locked_ functions will acquire and release the lock for each function. if this occurs in
894  // performance-sensitive code we may want to refactor by adding non-locked functions and
895  // creating a new locked_ member function to call them all.
896  void locked_removeStream(ihipStream_t* s);
897  void locked_reset();
898  void locked_waitAllStreams();
899  void locked_syncDefaultStream(bool waitOnSelf, bool syncHost);
900 
901  ihipCtxCritical_t& criticalData() { return _criticalData; };
902 
903  const ihipDevice_t* getDevice() const { return _device; };
904  int getDeviceNum() const { return _device->_deviceId; };
905 
906  // TODO - review uses of getWriteableDevice(), can these be converted to getDevice()
907  ihipDevice_t* getWriteableDevice() const { return _device; };
908 
909  std::string toString() const;
910 
911  public: // Data
912  // The NULL stream is used if no other stream is specified.
913  // Default stream has special synchronization properties with other streams.
914  ihipStream_t* _defaultStream;
915 
916  // Flags specified when the context is created:
917  unsigned _ctxFlags;
918 
919  private:
920  ihipDevice_t* _device;
921 
922 
923  private: // Critical data, protected with locked access:
924  // Members of _protected data MUST be accessed through the LockedAccessor.
925  // Search for LockedAccessor<ihipCtxCritical_t> for examples; do not access _criticalData
926  // directly.
927  ihipCtxCritical_t _criticalData;
928 };
929 
930 
931 //=================================================================================================
932 // Global variable definition:
933 extern unsigned g_deviceCnt;
934 extern hsa_agent_t g_cpu_agent; // the CPU agent.
935 extern hsa_agent_t* g_allAgents; // CPU agents + all the visible GPU agents.
936 
937 //=================================================================================================
938 // Extern functions:
939 extern void ihipInit();
940 extern const char* ihipErrorString(hipError_t);
941 extern ihipCtx_t* ihipGetTlsDefaultCtx();
942 extern void ihipSetTlsDefaultCtx(ihipCtx_t* ctx);
943 extern hipError_t ihipSynchronize(void);
944 extern void ihipCtxStackUpdate();
945 extern hipError_t ihipDeviceSetState();
946 
947 extern ihipDevice_t* ihipGetDevice(int);
948 ihipCtx_t* ihipGetPrimaryCtx(unsigned deviceIndex);
949 hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
950  const char* name, hsa_agent_t *agent);
951 
952 
953 hipStream_t ihipSyncAndResolveStream(hipStream_t, bool lockAcquired = 0);
954 hipError_t ihipStreamSynchronize(hipStream_t stream);
955 void ihipStreamCallbackHandler(ihipStreamCallback_t* cb);
956 
957 // Stream printf functions:
958 inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s) {
959  os << "stream:";
960  os << s.getDevice()->_deviceId;
961  ;
962  os << '.';
963  os << s._id;
964  return os;
965 }
966 
967 inline std::ostream& operator<<(std::ostream& os, const dim3& s) {
968  os << '{';
969  os << s.x;
970  os << ',';
971  os << s.y;
972  os << ',';
973  os << s.z;
974  os << '}';
975  return os;
976 }
977 
978 inline std::ostream& operator<<(std::ostream& os, const gl_dim3& s) {
979  os << '{';
980  os << s.x;
981  os << ',';
982  os << s.y;
983  os << ',';
984  os << s.z;
985  os << '}';
986  return os;
987 }
988 
989 // Stream printf functions:
990 inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) {
991  os << "event:" << std::hex << static_cast<void*>(e);
992  return os;
993 }
994 
995 inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) {
996  os << "ctx:" << static_cast<const void*>(c) << ".dev:" << c->getDevice()->_deviceId;
997  return os;
998 }
999 
1000 
1001 // Helper functions that are used across src files:
1002 namespace hip_internal {
1003 hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
1004  hipStream_t stream);
1005 };
1006 
1007 
1008 #endif
Definition: hip_hcc_internal.h:237
Definition: hip_hcc_internal.h:763
Definition: hip_hcc_internal.h:464
Definition: hip_hcc_internal.h:112
Definition: hip_hcc_internal.h:387
Definition: hip_hcc_internal.h:360
uint32_t x
x
Definition: hip_runtime_api.h:270
Definition: grid_launch.h:17
Definition: hip_module.cpp:106
Definition: hip_hcc_internal.h:802
Definition: hip_hcc_internal.h:883
Definition: hip_runtime_api.h:269
uint32_t y
y
Definition: hip_runtime_api.h:271
void(* hipStreamCallback_t)(hipStream_t stream, hipError_t status, void *userData)
Definition: hip_runtime_api.h:843
Definition: hip_hcc_internal.h:630
Definition: hip_hcc_internal.h:732
unsigned _computeUnits
Number of compute units supported by the device:
Definition: hip_hcc_internal.h:781
uint32_t z
z
Definition: hip_runtime_api.h:272
Definition: hip_runtime_api.h:83
Definition: hip_hcc_internal.h:658
Definition: hip_hcc_internal.h:684
Definition: hip_hcc_internal.h:1002
Definition: hip_hcc_internal.h:336
Definition: hip_hcc_internal.h:703
Definition: hip_hcc_internal.h:813
Definition: hip_hcc_internal.h:524
Definition: hip_hcc_internal.h:476
hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
Record an event in the specified stream.
Definition: hip_event.cpp:110
hsa_amd_ipc_memory_t ipc_handle
ipc memory handle on ROCr
Definition: hip_hcc_internal.h:363
Definition: hip_hcc_internal.h:428
Definition: hip_hcc_internal.h:370
hipError_t hipStreamQuery(hipStream_t stream)
Return #hipSuccess if all of the operations in the specified stream have completed, or #hipErrorNotReady if not.
Definition: hip_stream.cpp:161
Definition: hip_hcc_internal.h:95