HIP: Heterogeneous-computing Interface for Portability
hip_hcc_internal.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_SRC_HIP_HCC_INTERNAL_H
24 #define HIP_SRC_HIP_HCC_INTERNAL_H
25 
26 #include <hc.hpp>
27 #include <hsa/hsa.h>
28 #include <unordered_map>
29 #include <stack>
30 
31 #include "hsa/hsa_ext_amd.h"
32 #include "hip/hip_runtime.h"
33 #include "hip_prof_api.h"
34 #include "hip_util.h"
35 #include "env.h"
36 
37 
// Build-time guard: stop compilation when __hcc_workweek__ is older than 16354.
38 #if (__hcc_workweek__ < 16354)
39 #error("This version of HIP requires a newer version of HCC.");
40 #endif
41 
42 // Use the __appPtr field in the am memtracker to store the context.
43 // Requires a bug fix in HCC
// USE_APP_PTR_FOR_CTX is only defined when the compiler advertises a non-zero
// __HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE; it stays undefined otherwise.
44 #if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and \
45  (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0)
46 #define USE_APP_PTR_FOR_CTX 1
47 #endif
48 
49 
// USE_IPC gates the hsa_amd_ipc_memory_t handle member declared under "#if USE_IPC" later in
// this header.
50 #define USE_IPC 1
51 
52 //---
53 // Environment variables:
// Everything in this section is only declared here (extern); the definitions and the code that
// fills them in are not part of this header.
54 
55 // Intended to distinguish whether an environment variable should be visible only in debug mode, or
56 // in debug+release.
57 // static const int debug = 0;
58 extern const int release;
59 
60 // TODO - this blocks both kernels and memory ops. Perhaps should have separate env var for
61 // kernels?
62 extern int HIP_LAUNCH_BLOCKING;
63 extern int HIP_API_BLOCKING;
64 
65 extern int HIP_PRINT_ENV;
66 extern int HIP_PROFILE_API;
// NOTE(review): HIP_TRACE_API is commented out here but is still read by the API_TRACE,
// HIP_INIT_SPECIAL_API and ihipLogStatus macros below - presumably it is declared by one of the
// included headers; confirm.
67 // extern int HIP_TRACE_API;
68 extern int HIP_ATP;
// HIP_DB is the bit mask tested by tprintf(): bit N enables the channel whose DB_* value is N.
69 extern int HIP_DB;
70 extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */
71 extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */
// NOTE(review): declared as int although the comment describes a comma-separated list - confirm
// how the list is actually represented.
72 extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */
73 extern int HIP_FORCE_P2P_HOST;
74 
75 extern int HIP_HOST_COHERENT;
76 
77 extern int HIP_HIDDEN_FREE_MEM;
78 //---
79 // Chicken bits for disabling functionality to work around potential issues:
80 extern int HIP_SYNC_HOST_ALLOC;
81 extern int HIP_SYNC_STREAM_WAIT;
82 
83 extern int HIP_SYNC_NULL_STREAM;
84 extern int HIP_INIT_ALLOC;
85 extern int HIP_FORCE_NULL_STREAM;
86 
87 extern int HIP_SYNC_FREE;
88 
89 extern int HIP_DUMP_CODE_OBJECT;
90 
91 // TODO - remove when this is standard behavior.
92 extern int HCC_OPT_FLUSH;
93 
// Per-thread record used by the HIP debug output: it hands each thread a short id and keeps a
// running count of the API calls issued on that thread.
// The constructor is only declared here; how the ids are assigned is not visible in this header.
class TidInfo {
   public:
    TidInfo();

    // Short thread id stored in this record.
    int tid() const { return _shortTid; }

    // Process id stored in this record.
    pid_t pid() const { return _pid; }

    // Advances the per-thread API counter by one and returns the updated value.
    uint64_t incApiSeqNum() {
        _apiSeqNum += 1;
        return _apiSeqNum;
    }

    // Current value of the per-thread API counter (no modification).
    uint64_t apiSeqNum() const { return _apiSeqNum; }

   private:
    int _shortTid;
    pid_t _pid;

    // Monotonically increasing API sequence number for this thread.
    uint64_t _apiSeqNum;
};
111 
// Collection of API sequence numbers at which tracing is toggled.
// Usage visible in this struct: add() every trigger, call sort() once, then poll nextTrigger().
struct ProfTrigger {
    // Sentinel returned by nextTrigger() when no trigger is stored.
    static const uint64_t MAX_TRIGGER = std::numeric_limits<uint64_t>::max();

    // Writes all stored triggers to stdout on one line, each as "tid:<tid>.<trigger>,".
    void print(int tid) {
        std::cout << "Enabling tracing for ";
        for (const uint64_t trigger : _profTrigger) {
            std::cout << "tid:" << tid << "." << trigger << ",";
        }
        std::cout << "\n";
    }

    // Returns the last stored element, or MAX_TRIGGER when nothing is stored.
    // After sort() the last element is the smallest trigger.
    uint64_t nextTrigger() {
        if (_profTrigger.empty()) {
            return MAX_TRIGGER;
        }
        return _profTrigger.back();
    }

    // Appends a trigger; ordering is only established by a later sort().
    void add(uint64_t trigger) { _profTrigger.push_back(trigger); }

    // Sorts the triggers in descending order so that back() yields the smallest one.
    // The comparator has to operate on the full 64-bit value: comparing through
    // std::greater<int> narrows each element to int first, which misorders any trigger
    // that does not fit in an int.
    void sort() { std::sort(_profTrigger.begin(), _profTrigger.end(), std::greater<uint64_t>()); }

   private:
    std::vector<uint64_t> _profTrigger;
};
130 
131 
132 //---
133 // Extern tls
// Thread-local state shared across the HIP sources; only declared here.
// tls_lastHipError is written by the ihipLogStatus macro below.
134 extern thread_local hipError_t tls_lastHipError;
// tls_tidInfo supplies the pid/tid/sequence number printed by tprintf and ihipLogStatus.
135 extern thread_local TidInfo tls_tidInfo;
136 extern thread_local bool tls_getPrimaryCtx;
137 
// NOTE(review): presumably one ProfTrigger per traced thread for start/stop of tracing - the
// indexing scheme is not visible in this header; confirm.
138 extern std::vector<ProfTrigger> g_dbStartTriggers;
139 extern std::vector<ProfTrigger> g_dbStopTriggers;
140 
141 //---
142 // Forward defs:
// Needed because the classes below refer to each other by pointer before they are defined.
143 class ihipStream_t;
144 class ihipDevice_t;
145 class ihipCtx_t;
146 struct ihipEventData_t;
147 
148 // Color defs for debug messages:
// Each macro is a string literal holding an ANSI escape sequence; KNRM is appended by tprintf
// after every message, and the other colours are referenced from the dbName table.
149 #define KNRM "\x1B[0m"
150 #define KRED "\x1B[31m"
151 #define KGRN "\x1B[32m"
152 #define KYEL "\x1B[33m"
153 #define KBLU "\x1B[34m"
154 #define KMAG "\x1B[35m"
155 #define KCYN "\x1B[36m"
156 #define KWHT "\x1B[37m"
157 
// Prefix/suffix used by ihipLogStatus for the closing API message: API_COLOR is chosen when the
// status is 0 (KRED otherwise) and API_COLOR_END terminates the line. Defined elsewhere.
158 extern const char* API_COLOR;
159 extern const char* API_COLOR_END;
160 
161 
162 // If set, thread-safety is enforced on all event/stream/ctx/device functions.
163 // Can disable for performance or functional experiments - in this case
164 // the code uses a dummy "no-op" mutex.
// Each switch selects std::mutex (non-zero) or FakeMutex (zero) for the matching
// EventMutex / StreamMutex / CtxMutex / DeviceMutex typedef later in this header.
165 #define EVENT_THREAD_SAFE 1
166 
167 #define STREAM_THREAD_SAFE 1
168 
169 #define CTX_THREAD_SAFE 1
170 
171 #define DEVICE_THREAD_SAFE 1
172 
173 
174 // Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set.
175 // May be set to 0 to remove debug if checks - possible code size and performance difference?
// When 0, the tprintf macro below expands to nothing.
176 #define COMPILE_HIP_DB 1
177 
178 
179 // Compile HIP tracing capability.
180 // 0x1 = print a string at function entry with arguments.
181 // 0x2 = prints a simple message with function name + return code when function exits.
182 // 0x3 = print both.
183 // Must be enabled at runtime with HIP_TRACE_API
// Bit 0x1 selects the full API_TRACE expansion; bit 0x2 is tested inside ihipLogStatus.
184 #define COMPILE_HIP_TRACE_API 0x3
185 
186 
187 // Compile code that generates trace markers for CodeXL ATP at HIP function begin/end.
188 // ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs.
// Overridable from the build command line: the default of 0 only applies when it is not defined.
189 #ifndef COMPILE_HIP_ATP_MARKER
190 #define COMPILE_HIP_ATP_MARKER 0
191 #endif
192 
193 
194 // Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function
195 // boundary.
196 // TODO - currently we print the trace message at the beginning. if we waited, we could also
197 // report return codes, and any values returned through ptr-to-args (ie the pointers allocated
198 // by hipMalloc).
// With COMPILE_HIP_ATP_MARKER set, each macro expands to a complete amdt* call statement
// (the trailing ';' is part of the expansion); otherwise each expands to nothing.
199 #if COMPILE_HIP_ATP_MARKER
200 #include "CXLActivityLogger.h"
201 #define MARKER_BEGIN(markerName, group) amdtBeginMarker(markerName, group, nullptr);
202 #define MARKER_END() amdtEndMarker();
203 #define RESUME_PROFILING amdtResumeProfiling(AMDT_ALL_PROFILING);
204 #define STOP_PROFILING amdtStopProfiling(AMDT_ALL_PROFILING);
205 #else
206 // Swallow scoped markers:
207 #define MARKER_BEGIN(markerName, group)
208 #define MARKER_END()
209 #define RESUME_PROFILING
210 #define STOP_PROFILING
211 #endif
212 
213 
214 //---
215 // HIP Trace modes - use with HIP_TRACE_API=...
// The values are bit positions, not masks: users test HIP_TRACE_API & (1 << TRACE_x).
// The hex value in each trailing comment is the resulting mask.
216 #define TRACE_ALL 0 // 0x01
217 #define TRACE_KCMD 1 // 0x02, kernel command
218 #define TRACE_MCMD 2 // 0x04, memory command
219 #define TRACE_MEM 3 // 0x08, memory allocation or deallocation.
220 #define TRACE_SYNC 4 // 0x10, synchronization (host or hipStreamWaitEvent)
221 #define TRACE_QUERY 5 // 0x20, hipEventRecord, hipEventQuery, hipStreamQuery
222 
223 
224 //---
225 // HIP_DB Debug flags:
// Also bit positions: tprintf tests HIP_DB & (1 << flag) and uses the flag as the index into
// the dbName table.
226 #define DB_API 0 /* 0x01 - shortcut to enable HIP_TRACE_API on single switch */
227 #define DB_SYNC 1 /* 0x02 - trace synchronization pieces */
228 #define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */
229 #define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. */
230 #define DB_WARN 4 /* 0x10 - warn about sub-optimal or shady behavior */
231 #define DB_FB 5 /* 0x20 - trace loading fat binary */
232 #define DB_MAX_FLAG 6
233 // When adding a new debug flag, also add to the char name table below.
234 //
235 //
236 
237 struct DbName {
238  const char* _color;
239  const char* _shortName;
240 };
241 
242 // This table must be kept in-sync with the defines above.
243 static const DbName dbName[] = {
244  {KGRN, "api"}, // not used,
245  {KYEL, "sync"}, {KCYN, "mem"}, {KMAG, "copy"}, {KRED, "warn"},
246  {KBLU, "fatbin"},
247 };
248 
249 
// tprintf(trace_level, fmt, ...): printf-style debug message on stderr, emitted only when bit
// trace_level of HIP_DB is set. The message is formatted into a 1000-byte stack buffer with
// snprintf (longer messages are truncated) and printed with the channel colour/name taken from
// dbName[trace_level], the pid/tid from tls_tidInfo, and a trailing KNRM colour reset.
// NOTE(review): the expansion is a bare { ... } block rather than do { ... } while (0), so
// "if (x) tprintf(...); else ..." will not parse as intended.
250 #if COMPILE_HIP_DB
251 #define tprintf(trace_level, ...) \
252  { \
253  if (HIP_DB & (1 << (trace_level))) { \
254  char msgStr[1000]; \
255  snprintf(msgStr, sizeof(msgStr), __VA_ARGS__); \
256  fprintf(stderr, " %ship-%s pid:%d tid:%d:%s%s", dbName[trace_level]._color, \
257  dbName[trace_level]._shortName, tls_tidInfo.pid(), tls_tidInfo.tid(), msgStr, KNRM); \
258  } \
259  }
260 #else
261 /* Compile to empty code */
262 #define tprintf(trace_level, ...)
263 #endif
264 
265 
266 static inline uint64_t getTicks() { return hc::get_system_ticks(); }
267 
268 //---
269 extern uint64_t recordApiTrace(std::string* fullStr, const std::string& apiStr);
270 
// API_TRACE(forceTrace, args...)
//
// Expanded at the top of every HIP API (through HIP_INIT_API / HIP_INIT_SPECIAL_API).
// Both variants:
//   - declare `hipApiStartTick` in the enclosing scope; ihipLogStatus reads it when it prints
//     the elapsed time, so it must exist no matter which variant is compiled;
//   - bump the per-thread API sequence number.
// The full variant additionally builds "<func> (<args>)" via ToString, hands it to
// recordApiTrace (whose return value becomes hipApiStartTick) and opens an ATP marker:
// HIP_PROFILE_API == 0x1 uses the bare function name, 0x2 uses the full trace string.
// Tracing happens when forceTrace is non-zero, HIP_PROFILE_API is set, or the TRACE_ALL bit of
// HIP_TRACE_API is set (the latter only when COMPILE_HIP_DB is enabled).
#if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
#define API_TRACE(forceTrace, ...)                                                             \
    uint64_t hipApiStartTick = 0;                                                              \
    {                                                                                          \
        tls_tidInfo.incApiSeqNum();                                                            \
        if ((forceTrace) ||                                                                    \
            (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL))))) {     \
            std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')';   \
            std::string fullStr;                                                               \
            hipApiStartTick = recordApiTrace(&fullStr, apiStr);                                \
            if (HIP_PROFILE_API == 0x1) {                                                      \
                MARKER_BEGIN(__func__, "HIP")                                                  \
            } else if (HIP_PROFILE_API == 0x2) {                                               \
                MARKER_BEGIN(fullStr.c_str(), "HIP");                                          \
            }                                                                                  \
        }                                                                                      \
    }

#else
// Swallow API_TRACE: no trace string is built. hipApiStartTick is still declared (and marked
// used) so that ihipLogStatus, which always references it, keeps compiling in this
// configuration.
#define API_TRACE(IS_CMD, ...)    \
    uint64_t hipApiStartTick = 0; \
    (void)hipApiStartTick;        \
    tls_tidInfo.incApiSeqNum();
#endif
293 
// Expands to a call of ihipDeviceSetState() (declared near the end of this header) followed by ';'.
294 #define HIP_SET_DEVICE() ihipDeviceSetState();
295 
296 // This macro should be called at the beginning of every HIP API.
297 // It initializes the hip runtime (exactly once), and
298 // generates a trace string that can be output to stderr or to ATP file.
// Expansion order: hip_impl::hip_init(), API_TRACE with forceTrace = 0, then
// HIP_CB_SPAWNER_OBJECT(cid). The variadic arguments are forwarded to API_TRACE unchanged.
// The macro expands to several statements, so it must not be used as the sole body of an
// unbraced if/else.
299 #define HIP_INIT_API(cid, ...) \
300  hip_impl::hip_init(); \
301  API_TRACE(0, __VA_ARGS__); \
302  HIP_CB_SPAWNER_OBJECT(cid);
303 
304 
305 // Like above, but will trace with a specified "special" bit.
306 // Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU:
307 // kernel launches, copy commands, memory sets, etc.
// tbit is a TRACE_* bit position; tracing is forced when that bit is set in HIP_TRACE_API.
// NOTE(review): tbit is not parenthesised inside (1 << tbit), so pass a plain identifier or
// literal rather than a compound expression.
308 #define HIP_INIT_SPECIAL_API(cid, tbit, ...) \
309  hip_impl::hip_init(); \
310  API_TRACE((HIP_TRACE_API & (1 << tbit)), __VA_ARGS__); \
311  HIP_CB_SPAWNER_OBJECT(cid);
312 
313 
314 // This macro should be called at the end of every HIP API, and only at the end of top-level hip
315 // APIS (not internal hip) It has dual function: logs the last error returned for use by
316 // hipGetLastError, and also prints the closing message when the debug trace is enabled.
// Implemented as a ({ ... }) statement expression whose value is the status, so it can be used
// directly in a return statement. hipStatus is evaluated exactly once.
// Steps: store the status in tls_lastHipError; if bit 0x2 of COMPILE_HIP_TRACE_API and the
// TRACE_ALL bit of HIP_TRACE_API are set, print pid/tid/sequence number, function name, status,
// its ihipErrorString text and the ticks elapsed since hipApiStartTick; finally close the ATP
// marker when HIP_PROFILE_API is set.
// Requires hipApiStartTick to be in scope, i.e. API_TRACE must have been expanded earlier in
// the same function.
// NOTE(review): the sequence number and tick count are printed with %lu - confirm this matches
// uint64_t on every supported target.
317 #define ihipLogStatus(hipStatus) \
318  ({ \
319  hipError_t localHipStatus = hipStatus; /*local copy so hipStatus only evaluated once*/ \
320  tls_lastHipError = localHipStatus; \
321  \
322  if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API & (1 << TRACE_ALL)) { \
323  auto ticks = getTicks() - hipApiStartTick; \
324  fprintf(stderr, " %ship-api pid:%d tid:%d.%lu %-30s ret=%2d (%s)>> +%lu ns%s\n", \
325  (localHipStatus == 0) ? API_COLOR : KRED, tls_tidInfo.pid(), tls_tidInfo.tid(), \
326  tls_tidInfo.apiSeqNum(), __func__, localHipStatus, \
327  ihipErrorString(localHipStatus), ticks, API_COLOR_END); \
328  } \
329  if (HIP_PROFILE_API) { \
330  MARKER_END(); \
331  } \
332  localHipStatus; \
333  })
334 
335 
336 class ihipException : public std::exception {
337  public:
338  explicit ihipException(hipError_t e) : _code(e){};
339 
340  hipError_t _code;
341 };
342 
343 
// The extern "C" region below is currently empty: it opens and closes without declaring anything.
344 #ifdef __cplusplus
345 extern "C" {
346 #endif
347 
348 
349 #ifdef __cplusplus
350 }
351 #endif
352 
// Stream handle with value 0. ihipStream_t documents id 0 as the default stream.
// Declared const at namespace scope, so every translation unit gets its own copy.
353 const hipStream_t hipStreamNull = 0x0;
354 
355 
359 #define HIP_IPC_RESERVED_SIZE 24
361  public:
362 #if USE_IPC
363  hsa_amd_ipc_memory_t ipc_handle;
364 #endif
365  size_t psize;
366  char reserved[HIP_IPC_RESERVED_SIZE];
367 };
368 
369 
370 struct ihipModule_t {
371  std::string fileName;
372  hsa_executable_t executable = {};
373  hsa_code_object_reader_t coReader = {};
374  std::string hash;
375  std::unordered_map<
376  std::string, std::vector<std::pair<std::size_t, std::size_t>>> kernargs;
377 
378  ~ihipModule_t() {
379  if (executable.handle) hsa_executable_destroy(executable);
380  if (coReader.handle) hsa_code_object_reader_destroy(coReader);
381  }
382 };
383 
384 
385 //---
386 // Used to remove lock, for performance or stimulating bugs.
// Drop-in stand-in for std::mutex that never blocks and never excludes anyone.
// Used to remove lock, for performance or stimulating bugs.
class FakeMutex {
   public:
    // No-op: nothing is acquired.
    void lock() {}

    // Always reports success, since there is nothing to contend for.
    bool try_lock() {
        return true;
    }

    // No-op: nothing to release.
    void unlock() {}
};
393 
394 #if EVENT_THREAD_SAFE
395 typedef std::mutex EventMutex;
396 #else
397 #warning "Stream thread-safe disabled"
398 typedef FakeMutex EventMutex;
399 #endif
400 
401 #if STREAM_THREAD_SAFE
402 typedef std::mutex StreamMutex;
403 #else
404 #warning "Stream thread-safe disabled"
405 typedef FakeMutex StreamMutex;
406 #endif
407 
408 // Pair Device and Ctx together, these could also be toggled separately if desired.
409 #if CTX_THREAD_SAFE
410 typedef std::mutex CtxMutex;
411 #else
412 typedef FakeMutex CtxMutex;
413 #warning "Ctx thread-safe disabled"
414 #endif
415 
416 #if DEVICE_THREAD_SAFE
417 typedef std::mutex DeviceMutex;
418 #else
419 typedef FakeMutex DeviceMutex;
420 #warning "Device thread-safe disabled"
421 #endif
422 
423 //
424 //---
425 // Protects access to the member _data with a lock acquired on construction/destruction.
426 // T must contain a _mutex field which meets the BasicLockable requirements (lock/unlock)
427 template <typename T>
429  public:
430  LockedAccessor(T& criticalData, bool autoUnlock = true)
431  : _criticalData(&criticalData),
432  _autoUnlock(autoUnlock)
433 
434  {
435  tprintf(DB_SYNC, "locking criticalData=%p for %s..\n", _criticalData,
436  ToString(_criticalData->_parent).c_str());
437  _criticalData->_mutex.lock();
438  };
439 
440  ~LockedAccessor() {
441  if (_autoUnlock) {
442  tprintf(DB_SYNC, "auto-unlocking criticalData=%p for %s...\n", _criticalData,
443  ToString(_criticalData->_parent).c_str());
444  _criticalData->_mutex.unlock();
445  }
446  }
447 
448  void unlock() {
449  tprintf(DB_SYNC, "unlocking criticalData=%p for %s...\n", _criticalData,
450  ToString(_criticalData->_parent).c_str());
451  _criticalData->_mutex.unlock();
452  }
453 
454  // Syntactic sugar so -> can be used to get the underlying type.
455  T* operator->() { return _criticalData; };
456 
457  private:
458  T* _criticalData;
459  bool _autoUnlock;
460 };
461 
462 
// Base for "critical data" structures: owns the mutex and forwards the three locking
// operations to it. MUTEX_TYPE must provide lock(), unlock() and try_lock().
template <typename MUTEX_TYPE>
struct LockedBase {
    // Experts-only interface for explicit locking.
    // Most uses should use the lock-accessor.

    // Blocks until the underlying mutex is acquired.
    void lock() {
        _mutex.lock();
    }

    // Releases the underlying mutex.
    void unlock() {
        _mutex.unlock();
    }

    // Non-blocking acquire; returns whatever the underlying mutex reports.
    bool try_lock() {
        const bool acquired = _mutex.try_lock();
        return acquired;
    }

    MUTEX_TYPE _mutex;
};
473 
474 
// Lock-protected part of a stream: the owning stream, the in-flight kernel count and the
// accelerator_view. Inherits lock()/unlock()/try_lock() and the mutex from LockedBase.
// NOTE(review): listing lines 481, 483-484, 491 and 494 are absent from this copy, so the
// signatures of the first and third member functions and one statement of munlock() are not
// visible here; restore them from the original header before compiling.
475 template <typename MUTEX_TYPE>
476 class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE> {
477  public:
// NOTE(review): the initializer list order differs from the member declaration order
// (_parent, _kernelCnt, _av); members are always initialised in declaration order.
478  ihipStreamCriticalBase_t(ihipStream_t* parentStream, hc::accelerator_view av)
479  : _kernelCnt(0), _av(av), _parent(parentStream){};
480 
482 
// (function header missing from the listing) - the surviving part returns this.
485  return this;
486  };
487 
// Logs on the DB_SYNC channel. NOTE(review): the statement on listing line 491 is missing;
// presumably it releases the lock - confirm against the original header.
488  void munlock() {
489  tprintf(DB_SYNC, "munlocking criticalData=%p for %s...\n", this,
490  ToString(this->_parent).c_str());
492  };
493 
// (function header missing from the listing) - tries the lock without blocking, logs the
// outcome on the DB_SYNC channel and returns this on success or nullptr on failure.
495  bool gotLock = LockedBase<MUTEX_TYPE>::try_lock();
496  tprintf(DB_SYNC, "mtry_locking=%d criticalData=%p for %s...\n", gotLock, this,
497  ToString(this->_parent).c_str());
498  return gotLock ? this : nullptr;
499  };
500 
501  public:
502  ihipStream_t* _parent;
503  uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait().
504 
505  hc::accelerator_view _av;
506 
507  private:
508 };
509 
510 
511 // if HIP code needs to acquire locks for both ihipCtx_t and ihipStream_t, it should first acquire
512 // the lock for the ihipCtx_t and then for the individual streams. The locks should not be acquired
513 // in reverse order or deadlock may occur. In some cases, it may be possible to reduce the range
514 // where the locks must be held. HIP routines should avoid acquiring and releasing the same lock
515 // during the execution of a single HIP API. Another option is to use try_lock in the innermost lock
516 // query.
517 
518 
521 
522 //---
523 // Internal stream structure.
525  public:
526  enum ScheduleMode { Auto, Spin, Yield };
527  typedef uint64_t SeqNum_t;
528 
529  // TODOD -make av a reference to avoid shared_ptr overhead?
530  ihipStream_t(ihipCtx_t* ctx, hc::accelerator_view av, unsigned int flags);
531  ~ihipStream_t();
532 
533  // kind is hipMemcpyKind
534  void locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind,
535  bool resolveOn = true);
536 
537  bool locked_copy2DSync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind,
538  bool resolveOn = true);
539 
540  void locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
541 
542  bool locked_copy2DAsync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind);
543 
544  void lockedSymbolCopySync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
545  size_t offset, unsigned kind);
546  void lockedSymbolCopyAsync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
547  size_t offset, unsigned kind);
548 
549  //---
550  // Member functions that begin with locked_ are thread-safe accessors - these acquire / release
551  // the critical mutex.
552  LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
553  void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av);
554 
555 
556  void locked_wait();
557 
558  hc::accelerator_view* locked_getAv() {
559  LockedAccessor_StreamCrit_t crit(_criticalData);
560  return &(crit->_av);
561  };
562 
563  void locked_streamWaitEvent(ihipEventData_t& event);
564  hc::completion_future locked_recordEvent(hipEvent_t event);
565 
566  bool locked_eventIsReady(hipEvent_t event);
567  void locked_eventWaitComplete(hc::completion_future& marker, hc::hcWaitMode waitMode);
568 
569  ihipStreamCritical_t& criticalData() { return _criticalData; };
570 
571  //---
572  hc::hcWaitMode waitMode() const;
573 
574  // Use this if we already have the stream critical data mutex:
575  void wait(LockedAccessor_StreamCrit_t& crit);
576 
577  void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal, uint32_t blockDimX,
578  uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX,
579  uint32_t gridDimY, uint32_t gridDimZ, uint32_t groupSegmentSize,
580  uint32_t sharedMemBytes, void* kernarg, size_t kernSize,
581  uint64_t kernel);
582 
583 
584  //-- Non-racy accessors:
585  // These functions access fields set at initialization time and are non-racy (so do not acquire
586  // mutex)
587  const ihipDevice_t* getDevice() const;
588  ihipCtx_t* getCtx() const;
589 
590  // Before calling this function, stream must be resolved from "0" to the actual stream:
591  bool isDefaultStream() const { return _id == 0; };
592 
593  public:
594  //---
595  // Public member vars - these are set at initialization and never change:
596  SeqNum_t _id; // monotonic sequence ID. 0 is the default stream.
597  unsigned _flags;
598 
599 
600  private:
601  // The unsigned return is hipMemcpyKind
602  unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem);
603  void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo* dstPtrInfo,
604  const hc::AmPointerInfo* srcPtrInfo, hc::hcCommandKind* hcCopyDir,
605  ihipCtx_t** copyDevice, bool* forceUnpinnedCopy);
606 
607  bool canSeeMemory(const ihipCtx_t* thisCtx, const hc::AmPointerInfo* dstInfo,
608  const hc::AmPointerInfo* srcInfo);
609 
610  void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes);
611 
612 
613  public: // TODO - move private
614  // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t
615  ihipStreamCritical_t _criticalData;
616 
617  private: // Data
618  std::mutex _hasQueueLock;
619 
620  ihipCtx_t* _ctx; // parent context that owns this stream.
621 
622  // Friends:
623  friend std::ostream& operator<<(std::ostream& os, const ihipStream_t& s);
624  friend hipError_t hipStreamQuery(hipStream_t);
625 
626  ScheduleMode _scheduleMode;
627 };
628 
629 
630 //----
631 // Internal structure for stream callback handler
633  public:
634  ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void* userData)
635  : _stream(stream), _callback(callback), _userData(userData) {
636  };
637  hipStream_t _stream;
638  hipStreamCallback_t _callback;
639  void* _userData;
640 };
641 
642 
643 //----
644 // Internal event structure:
// Life-cycle states of an internal event. The numeric values are fixed explicitly.
enum hipEventStatus_t {
    // event is uninitialized, must be "Created" before use.
    hipEventStatusUnitialized = 0,
    // event created, but not yet Recorded
    hipEventStatusCreated = 1,
    // event has been recorded into a stream but not completed yet.
    hipEventStatusRecording = 2,
    // event has been recorded - timestamps are valid.
    hipEventStatusComplete = 3,
};
651 
// TODO - rename to ihip type of some kind
// Kind of event. Values are spelled out; they match the implicit numbering 0, 1, 2.
enum ihipEventType_t {
    hipEventTypeIndependent = 0,
    hipEventTypeStartCommand = 1,
    hipEventTypeStopCommand = 2,
};
658 
659 
661  ihipEventData_t() {
662  _state = hipEventStatusCreated;
663  _stream = NULL;
664  _timestamp = 0;
665  _type = hipEventTypeIndependent;
666  };
667 
668  void marker(const hc::completion_future& marker) { _marker = marker; };
669  hc::completion_future& marker() { return _marker; }
670  uint64_t timestamp() const { return _timestamp; };
671  ihipEventType_t type() const { return _type; };
672 
673  ihipEventType_t _type;
674  hipEventStatus_t _state;
675  hipStream_t _stream; // Stream where the event is recorded. Null stream is resolved to actual
676  // stream when recorded
677  uint64_t _timestamp; // store timestamp, may be set on host or by marker.
678  private:
679  hc::completion_future _marker;
680 };
681 
682 
683 //=============================================================================
684 // class ihipEventCriticalBase_t
// Lock-protected part of an event: wraps the ihipEventData_t payload together with the mutex
// inherited (privately) from LockedBase. LockedAccessor is befriended so it can reach the
// mutex and the private _parent pointer.
// NOTE(review): listing line 689 is absent from this copy; check the original header for a
// member declared there.
685 template <typename MUTEX_TYPE>
686 class ihipEventCriticalBase_t : LockedBase<MUTEX_TYPE> {
687  public:
688  explicit ihipEventCriticalBase_t(const ihipEvent_t* parentEvent) : _parent(parentEvent) {}
690 
691  // Keep data in structure so it can be easily copied into snapshots
692  // (used to reduce lock contention and preserve correct lock order)
693  ihipEventData_t _eventData;
694 
695  private:
// Event that owns this critical section; set once by the constructor.
696  const ihipEvent_t* _parent;
697  friend class LockedAccessor<ihipEventCriticalBase_t>;
698 };
699 
701 
703 
704 // internal hip event structure.
705 class ihipEvent_t {
706  public:
707  explicit ihipEvent_t(unsigned flags);
708  void attachToCompletionFuture(const hc::completion_future* cf, hipStream_t stream,
709  ihipEventType_t eventType);
710  std::pair<hipEventStatus_t, uint64_t> refreshEventStatus(); // returns pair <state, timestamp>
711 
712 
713  // Return a copy of the critical state. The critical data is locked during the copy.
714  ihipEventData_t locked_copyCrit() {
715  LockedAccessor_EventCrit_t crit(_criticalData);
716  return _criticalData._eventData;
717  };
718 
719  ihipEventCritical_t& criticalData() { return _criticalData; };
720 
721  public:
722  unsigned _flags;
723 
724  private:
725  ihipEventCritical_t _criticalData;
726 
727  friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
728 };
729 
730 
731 //=============================================================================
732 // class ihipDeviceCriticalBase_t
733 template <typename MUTEX_TYPE>
735  public:
736  explicit ihipDeviceCriticalBase_t(ihipDevice_t* parentDevice)
737  : _parent(parentDevice), _ctxCount(0){};
738 
740 
741  // Contexts:
742  void addContext(ihipCtx_t* ctx);
743  void removeContext(ihipCtx_t* ctx);
744  std::list<ihipCtx_t*>& ctxs() { return _ctxs; };
745  const std::list<ihipCtx_t*>& const_ctxs() const { return _ctxs; };
746  int getcount() { return _ctxCount; };
747  friend class LockedAccessor<ihipDeviceCriticalBase_t>;
748 
749  private:
750  ihipDevice_t* _parent;
751 
752  //--- Context Tracker:
753  std::list<ihipCtx_t*> _ctxs; // contexts associated with this device across all threads.
754 
755  int _ctxCount;
756 };
757 
759 
761 
762 //----
763 // Properties of the HIP device.
764 // Multiple contexts can point to same device.
766  public:
767  ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator& acc);
768  ~ihipDevice_t();
769 
770  // Accessors:
771  ihipCtx_t* getPrimaryCtx() const { return _primaryCtx; };
772  void locked_removeContext(ihipCtx_t* c);
773  void locked_reset();
774  ihipDeviceCritical_t& criticalData() { return _criticalData; };
775 
776  public:
777  unsigned _deviceId; // device ID
778 
779  hc::accelerator _acc;
780  hsa_agent_t _hsaAgent; // hsa agent handle
781 
783  unsigned _computeUnits;
784  hipDeviceProp_t _props; // saved device properties.
785 
786  // TODO - report this through device properties, base on HCC API call.
787  int _isLargeBar;
788 
789  ihipCtx_t* _primaryCtx;
790 
791  int _state; // 1 if device is set otherwise 0
792 
793  private:
794  hipError_t initProperties(hipDeviceProp_t* prop);
795 
796  private:
797  ihipDeviceCritical_t _criticalData;
798 };
799 //=============================================================================
800 
801 
802 //---
803 //
804 struct ihipExec_t {
805  dim3 _gridDim;
806  dim3 _blockDim;
807  size_t _sharedMem;
808  hipStream_t _hStream;
809  std::vector<char> _arguments;
810 };
811 
812 //=============================================================================
813 // class ihipCtxCriticalBase_t
814 template <typename MUTEX_TYPE>
815 class ihipCtxCriticalBase_t : LockedBase<MUTEX_TYPE> {
816  public:
817  ihipCtxCriticalBase_t(ihipCtx_t* parentCtx, unsigned deviceCnt)
818  : _parent(parentCtx), _peerCnt(0) {
819  _peerAgents = new hsa_agent_t[deviceCnt];
820  };
821 
823  if (_peerAgents != nullptr) {
824  delete _peerAgents;
825  _peerAgents = nullptr;
826  }
827  _peerCnt = 0;
828  }
829 
830  // Streams:
831  void addStream(ihipStream_t* stream);
832  std::list<ihipStream_t*>& streams() { return _streams; };
833  const std::list<ihipStream_t*>& const_streams() const { return _streams; };
834 
835 
836  // Peer Accessor classes:
837  bool isPeerWatcher(const ihipCtx_t* peer); // returns True if peer has access to memory
838  // physically located on this device.
839  bool addPeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
840  bool removePeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
841  void resetPeerWatchers(ihipCtx_t* thisDevice);
842  void printPeerWatchers(FILE* f) const;
843 
844  uint32_t peerCnt() const { return _peerCnt; };
845  hsa_agent_t* peerAgents() const { return _peerAgents; };
846 
847 
848  // TODO - move private
849  std::list<ihipCtx_t*> _peers; // list of enabled peer devices.
850  //--- Execution stack:
851  std::stack<ihipExec_t> _execStack; // Execution stack for this device.
852 
853  friend class LockedAccessor<ihipCtxCriticalBase_t>;
854 
855  private:
856  ihipCtx_t* _parent;
857 
858  //--- Stream Tracker:
859  std::list<ihipStream_t*> _streams; // streams associated with this device.
860 
861 
862  //--- Peer Tracker:
863  // These reflect the currently Enabled set of peers for this GPU:
864  // Enabled peers have permissions to access the memory physically allocated on this device.
865  // Note the peers always contain the self agent for easy interfacing with HSA APIs.
866  uint32_t _peerCnt; // number of enabled peers
867  hsa_agent_t* _peerAgents; // efficient packed array of enabled agents (to use for allocations.)
868  private:
869  void recomputePeerAgents();
870 };
871 // Note Mutex type Real/Fake selected based on CtxMutex
873 
874 // This type is used by functions that need access to the critical device structures.
876 //=============================================================================
877 
878 
879 //=============================================================================
880 // class ihipCtx_t:
881 // A HIP CTX (context) points at one of the existing devices and contains the streams,
882 // peer-to-peer mappings, creation flags. Multiple contexts can point to the same
883 // device.
884 //
885 class ihipCtx_t {
886  public: // Functions:
887  ihipCtx_t(ihipDevice_t* device, unsigned deviceCnt,
888  unsigned flags); // note: calls constructor for _criticalData
889  ~ihipCtx_t();
890 
891  // Functions which read or write the critical data are named locked_.
892  // (might be better called "locking_"
893  // ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a
894  // locked_ function from within a locked_ function. External functions which call several
895  // locked_ functions will acquire and release the lock for each function. if this occurs in
896  // performance-sensitive code we may want to refactor by adding non-locked functions and
897  // creating a new locked_ member function to call them all.
898  void locked_removeStream(ihipStream_t* s);
899  void locked_reset();
900  void locked_waitAllStreams();
901  void locked_syncDefaultStream(bool waitOnSelf, bool syncHost);
902 
903  ihipCtxCritical_t& criticalData() { return _criticalData; };
904 
905  const ihipDevice_t* getDevice() const { return _device; };
906  int getDeviceNum() const { return _device->_deviceId; };
907 
908  // TODO - review uses of getWriteableDevice(), can these be converted to getDevice()
909  ihipDevice_t* getWriteableDevice() const { return _device; };
910 
911  std::string toString() const;
912 
913  public: // Data
914  // The NULL stream is used if no other stream is specified.
915  // Default stream has special synchronization properties with other streams.
916  ihipStream_t* _defaultStream;
917 
918  // Flags specified when the context is created:
919  unsigned _ctxFlags;
920 
921  private:
922  ihipDevice_t* _device;
923 
924 
925  private: // Critical data, protected with locked access:
926  // Members of _protected data MUST be accessed through the LockedAccessor.
927  // Search for LockedAccessor<ihipCtxCritical_t> for examples; do not access _criticalData
928  // directly.
929  ihipCtxCritical_t _criticalData;
930 };
931 
932 
933 //=================================================================================================
934 // Global variable declarations (the definitions are not in this header):
935 extern unsigned g_deviceCnt;
936 extern hsa_agent_t g_cpu_agent; // the CPU agent.
937 extern hsa_agent_t* g_allAgents; // CPU agents + all the visible GPU agents.
938 
939 //=================================================================================================
940 // Extern functions:
// Declarations only; the implementations live in other source files.
941 extern void ihipInit();
// Used by the ihipLogStatus macro to print a textual form of the status.
942 extern const char* ihipErrorString(hipError_t);
943 extern ihipCtx_t* ihipGetTlsDefaultCtx();
944 extern void ihipSetTlsDefaultCtx(ihipCtx_t* ctx);
945 extern hipError_t ihipSynchronize(void);
946 extern void ihipCtxStackUpdate();
// This is the call the HIP_SET_DEVICE() macro expands to.
947 extern hipError_t ihipDeviceSetState();
948 
949 extern ihipDevice_t* ihipGetDevice(int);
950 ihipCtx_t* ihipGetPrimaryCtx(unsigned deviceIndex);
951 hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
952  const char* name, hsa_agent_t *agent);
953 
954 
// NOTE(review): judging by the name and by the "stream must be resolved from 0 to the actual
// stream" remark on ihipStream_t::isDefaultStream, this presumably maps the null stream handle
// to a real stream - confirm in the implementation.
955 hipStream_t ihipSyncAndResolveStream(hipStream_t);
956 hipError_t ihipStreamSynchronize(hipStream_t stream);
957 void ihipStreamCallbackHandler(ihipStreamCallback_t* cb);
958 
959 // Stream printf functions:
960 inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s) {
961  os << "stream:";
962  os << s.getDevice()->_deviceId;
963  ;
964  os << '.';
965  os << s._id;
966  return os;
967 }
968 
969 inline std::ostream& operator<<(std::ostream& os, const dim3& s) {
970  os << '{';
971  os << s.x;
972  os << ',';
973  os << s.y;
974  os << ',';
975  os << s.z;
976  os << '}';
977  return os;
978 }
979 
980 inline std::ostream& operator<<(std::ostream& os, const gl_dim3& s) {
981  os << '{';
982  os << s.x;
983  os << ',';
984  os << s.y;
985  os << ',';
986  os << s.z;
987  os << '}';
988  return os;
989 }
990 
991 // Stream printf functions:
992 inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) {
993  os << "event:" << std::hex << static_cast<void*>(e);
994  return os;
995 }
996 
997 inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) {
998  os << "ctx:" << static_cast<const void*>(c) << ".dev:" << c->getDevice()->_deviceId;
999  return os;
1000 }
1001 
1002 
1003 // Helper functions that are used across src files:
1004 namespace hip_internal {
1005 hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
1006  hipStream_t stream);
1007 };
1008 
1009 
1010 #endif
Definition: hip_hcc_internal.h:237
Definition: hip_hcc_internal.h:765
Definition: hip_hcc_internal.h:464
Definition: hip_hcc_internal.h:112
Definition: hip_hcc_internal.h:387
Definition: hip_hcc_internal.h:360
uint32_t x
x
Definition: hip_runtime_api.h:270
Definition: grid_launch.h:17
Definition: hip_module.cpp:106
Definition: hip_hcc_internal.h:804
Definition: hip_hcc_internal.h:885
Definition: hip_runtime_api.h:269
uint32_t y
y
Definition: hip_runtime_api.h:271
void(* hipStreamCallback_t)(hipStream_t stream, hipError_t status, void *userData)
Definition: hip_runtime_api.h:843
Definition: hip_hcc_internal.h:632
Definition: hip_hcc_internal.h:734
unsigned _computeUnits
Number of compute units supported by the device:
Definition: hip_hcc_internal.h:783
uint32_t z
z
Definition: hip_runtime_api.h:272
Definition: hip_runtime_api.h:83
Definition: hip_hcc_internal.h:660
Definition: hip_hcc_internal.h:686
Definition: hip_hcc_internal.h:1004
Definition: hip_hcc_internal.h:336
Definition: hip_hcc_internal.h:705
Definition: hip_hcc_internal.h:815
Definition: hip_hcc_internal.h:524
Definition: hip_hcc_internal.h:476
hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
Record an event in the specified stream.
Definition: hip_event.cpp:110
hsa_amd_ipc_memory_t ipc_handle
ipc memory handle on ROCr
Definition: hip_hcc_internal.h:363
Definition: hip_hcc_internal.h:428
Definition: hip_hcc_internal.h:370
hipError_t hipStreamQuery(hipStream_t stream)
Return #hipSuccess if all of the operations in the specified stream have completed, or #hipErrorNotReady if not.
Definition: hip_stream.cpp:157
Definition: hip_hcc_internal.h:95