HIP: Heterogeneous-computing Interface for Portability
hip_hcc_internal.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_SRC_HIP_HCC_INTERNAL_H
24 #define HIP_SRC_HIP_HCC_INTERNAL_H
25 
26 #include <hc.hpp>
27 #include <hsa/hsa.h>
28 #include <unordered_map>
29 #include <stack>
30 
31 #include "hsa/hsa_ext_amd.h"
32 #include "hip/hip_runtime.h"
33 #include "hip_prof_api.h"
34 #include "hip_util.h"
35 #include "env.h"
36 
37 
38 #if (__hcc_workweek__ < 16354)
39 #error("This version of HIP requires a newer version of HCC.");
40 #endif
41 
42 // Use the __appPtr field in the am memtracker to store the context.
43 // Requires a bug fix in HCC
44 #if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and \
45  (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0)
46 #define USE_APP_PTR_FOR_CTX 1
47 #endif
48 
49 
50 #define USE_IPC 1
51 
52 //---
53 // Environment variables:
54 
55 // Intended to distinguish whether an environment variable should be visible only in debug mode, or
56 // in debug+release.
57 // static const int debug = 0;
58 extern const int release;
59 
60 // TODO - this blocks both kernels and memory ops. Perhaps should have separate env var for
61 // kernels?
62 extern int HIP_LAUNCH_BLOCKING;
63 extern int HIP_API_BLOCKING;
64 
65 extern int HIP_PRINT_ENV;
66 extern int HIP_PROFILE_API;
67 // extern int HIP_TRACE_API;
68 extern int HIP_ATP;
69 extern int HIP_DB;
70 extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */
71 extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */
72 extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */
73 extern int HIP_FORCE_P2P_HOST;
74 
75 extern int HIP_HOST_COHERENT;
76 
77 extern int HIP_HIDDEN_FREE_MEM;
78 //---
79 // Chicken bits for disabling functionality to work around potential issues:
80 extern int HIP_SYNC_HOST_ALLOC;
81 extern int HIP_SYNC_STREAM_WAIT;
82 
83 extern int HIP_SYNC_NULL_STREAM;
84 extern int HIP_INIT_ALLOC;
85 extern int HIP_FORCE_NULL_STREAM;
86 
87 extern int HIP_SYNC_FREE;
88 
89 extern int HIP_DUMP_CODE_OBJECT;
90 
91 // TODO - remove when this is standard behavior.
92 extern int HCC_OPT_FLUSH;
93 
94 // Class to assign a short TID to each new thread, for HIP debugging purposes.
// Assigns a short TID to each new thread, for HIP debugging purposes, and
// keeps a per-thread API sequence counter. The constructor is defined
// out-of-line (not visible in this header).
class TidInfo {
   public:
    TidInfo();

    // Short thread id stored in this object.
    int tid() const { return _shortTid; }

    // Process id stored in this object.
    pid_t pid() const { return _pid; }

    // Advances the API sequence number by one and returns the new value.
    uint64_t incApiSeqNum() {
        _apiSeqNum += 1;
        return _apiSeqNum;
    }

    // Current API sequence number, without modifying it.
    uint64_t apiSeqNum() const { return _apiSeqNum; }

   private:
    int _shortTid;
    pid_t _pid;

    // Monotonically increasing API sequence number for this thread.
    uint64_t _apiSeqNum;
};
111 
// Holds a list of API sequence numbers ("triggers"). After sort() the list is
// in descending order, so the smallest remaining trigger sits at the back and
// is what nextTrigger() reports.
struct ProfTrigger {
    // Returned by nextTrigger() when no trigger is stored.
    static const uint64_t MAX_TRIGGER = std::numeric_limits<uint64_t>::max();

    // Prints every stored trigger to stdout as "tid:<tid>.<trigger>,".
    void print(int tid) const {
        std::cout << "Enabling tracing for ";
        for (const uint64_t trigger : _profTrigger) {
            std::cout << "tid:" << tid << "." << trigger << ",";
        }
        std::cout << "\n";
    }

    // Returns the trigger at the back of the list, or MAX_TRIGGER when empty.
    uint64_t nextTrigger() const {
        // Returned by value (not through a conditional lvalue) so MAX_TRIGGER
        // never needs an out-of-class definition.
        if (_profTrigger.empty()) {
            return MAX_TRIGGER;
        }
        return _profTrigger.back();
    }

    // Appends a trigger; call sort() afterwards to restore the ordering.
    void add(uint64_t trigger) { _profTrigger.push_back(trigger); }

    // Sorts triggers in descending order. The comparator must operate on
    // uint64_t: comparing as int would truncate values to 32 bits and
    // mis-order triggers above INT_MAX.
    void sort() {
        std::sort(_profTrigger.begin(), _profTrigger.end(), std::greater<uint64_t>());
    }

   private:
    std::vector<uint64_t> _profTrigger;
};
130 
131 
132 //---
133 // Extern TLS
134 // Use a single struct to hold all TLS data. Attempt to reduce TLS accesses.
135 struct TlsData {
136  explicit TlsData() {
137  lastHipError = hipSuccess;
138  getPrimaryCtx = true;
139  defaultCtx = nullptr;
140  }
141 
142  hipError_t lastHipError;
143  TidInfo tidInfo;
144  // This is the implicit context used by all HIP commands.
145  // It can be set by hipSetDevice or by the CTX manipulation commands:
146  ihipCtx_t* defaultCtx;
147  // Stack of contexts
148  std::stack<ihipCtx_t*> ctxStack;
149  bool getPrimaryCtx;
150 };
151 TlsData* tls_get_ptr();
152 #define GET_TLS() TlsData *tls = tls_get_ptr()
153 
154 extern std::vector<ProfTrigger> g_dbStartTriggers;
155 extern std::vector<ProfTrigger> g_dbStopTriggers;
156 
157 //---
158 // Forward defs:
159 class ihipStream_t;
160 class ihipDevice_t;
161 class ihipCtx_t;
162 struct ihipEventData_t;
163 
164 // Color defs for debug messages:
165 #define KNRM "\x1B[0m"
166 #define KRED "\x1B[31m"
167 #define KGRN "\x1B[32m"
168 #define KYEL "\x1B[33m"
169 #define KBLU "\x1B[34m"
170 #define KMAG "\x1B[35m"
171 #define KCYN "\x1B[36m"
172 #define KWHT "\x1B[37m"
173 
174 extern const char* API_COLOR;
175 extern const char* API_COLOR_END;
176 
177 
178 // If set, thread-safety is enforced on all event/stream/ctx/device functions.
179 // Can disable for performance or functional experiments - in this case
180 // the code uses a dummy "no-op" mutex.
181 #define EVENT_THREAD_SAFE 1
182 
183 #define STREAM_THREAD_SAFE 1
184 
185 #define CTX_THREAD_SAFE 1
186 
187 #define DEVICE_THREAD_SAFE 1
188 
189 
190 // Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set.
191 // May be set to 0 to remove debug if checks - possible code size and performance difference?
192 #define COMPILE_HIP_DB 1
193 
194 
195 // Compile HIP tracing capability.
196 // 0x1 = print a string at function entry with arguments.
197 // 0x2 = prints a simple message with function name + return code when function exits.
198 // 0x3 = print both.
199 // Must be enabled at runtime with HIP_TRACE_API
200 #define COMPILE_HIP_TRACE_API 0x3
201 
202 
203 // Compile code that generates trace markers for CodeXL ATP at HIP function begin/end.
204 // ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs.
205 #ifndef COMPILE_HIP_ATP_MARKER
206 #define COMPILE_HIP_ATP_MARKER 0
207 #endif
208 
209 
210 // Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function
211 // boundary.
212 // TODO - currently we print the trace message at the beginning. If we waited, we could also
213 // print return codes, and any values returned through ptr-to-args (i.e. the pointers allocated
214 // by hipMalloc).
215 #if COMPILE_HIP_ATP_MARKER
216 #include "CXLActivityLogger.h"
217 #define MARKER_BEGIN(markerName, group) amdtBeginMarker(markerName, group, nullptr);
218 #define MARKER_END() amdtEndMarker();
219 #define RESUME_PROFILING amdtResumeProfiling(AMDT_ALL_PROFILING);
220 #define STOP_PROFILING amdtStopProfiling(AMDT_ALL_PROFILING);
221 #else
222 // Swallow scoped markers:
223 #define MARKER_BEGIN(markerName, group)
224 #define MARKER_END()
225 #define RESUME_PROFILING
226 #define STOP_PROFILING
227 #endif
228 
229 
230 //---
231 // HIP Trace modes - use with HIP_TRACE_API=...
232 #define TRACE_ALL 0 // 0x01
233 #define TRACE_KCMD 1 // 0x02, kernel command
234 #define TRACE_MCMD 2 // 0x04, memory command
235 #define TRACE_MEM 3 // 0x08, memory allocation or deallocation.
236 #define TRACE_SYNC 4 // 0x10, synchronization (host or hipStreamWaitEvent)
237 #define TRACE_QUERY 5 // 0x20, hipEventRecord, hipEventQuery, hipStreamQuery
238 
239 
240 //---
241 // HIP_DB Debug flags:
242 #define DB_API 0 /* 0x01 - shortcut to enable HIP_TRACE_API on single switch */
243 #define DB_SYNC 1 /* 0x02 - trace synchronization pieces */
244 #define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */
245 #define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. . */
246 #define DB_WARN 4 /* 0x10 - warn about sub-optimal or shady behavior */
247 #define DB_FB 5 /* 0x20 - trace loading fat binary */
248 #define DB_MAX_FLAG 6
249 // When adding a new debug flag, also add to the char name table below.
250 //
251 //
252 
253 struct DbName {
254  const char* _color;
255  const char* _shortName;
256 };
257 
258 // This table must be kept in-sync with the defines above.
259 static const DbName dbName[] = {
260  {KGRN, "api"}, // not used,
261  {KYEL, "sync"}, {KCYN, "mem"}, {KMAG, "copy"}, {KRED, "warn"},
262  {KBLU, "fatbin"},
263 };
264 
265 
266 #if COMPILE_HIP_DB
267 #define tprintf(trace_level, ...) \
268  { \
269  if (HIP_DB & (1 << (trace_level))) { \
270  GET_TLS(); \
271  char msgStr[1000]; \
272  snprintf(msgStr, sizeof(msgStr), __VA_ARGS__); \
273  fprintf(stderr, " %ship-%s pid:%d tid:%d:%s%s", dbName[trace_level]._color, \
274  dbName[trace_level]._shortName, tls->tidInfo.pid(), tls->tidInfo.tid(), msgStr, KNRM); \
275  } \
276  }
277 #else
278 /* Compile to empty code */
279 #define tprintf(trace_level, ...)
280 #endif
281 
282 
283 static inline uint64_t getTicks() { return hc::get_system_ticks(); }
284 
285 //---
286 extern uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& apiStr);
287 
288 #if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
289 #define API_TRACE(forceTrace, ...) \
290  GET_TLS(); \
291  uint64_t hipApiStartTick = 0; \
292  { \
293  tls->tidInfo.incApiSeqNum(); \
294  if (forceTrace || \
295  (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL))))) { \
296  std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')'; \
297  std::string fullStr; \
298  hipApiStartTick = recordApiTrace(tls, &fullStr, apiStr); \
299  if (HIP_PROFILE_API == 0x1) { \
300  MARKER_BEGIN(__func__, "HIP") \
301  } else if (HIP_PROFILE_API == 0x2) { \
302  MARKER_BEGIN(fullStr.c_str(), "HIP"); \
303  } \
304  } \
305  }
306 
307 #else
308 // Swallow API_TRACE
309 #define API_TRACE(IS_CMD, ...) GET_TLS(); tls->tidInfo.incApiSeqNum();
310 #endif
311 
312 #define ihipGetTlsDefaultCtx() iihipGetTlsDefaultCtx(tls)
313 #define ihipSetTlsDefaultCtx(ctx) tls->defaultCtx = ctx
314 
315 #define HIP_SET_DEVICE() ihipDeviceSetState(tls);
316 
317 // This macro should be called at the beginning of every HIP API.
318 // It initializes the hip runtime (exactly once), and
319 // generates a trace string that can be output to stderr or to ATP file.
320 #define HIP_INIT_API(cid, ...) \
321  hip_impl::hip_init(); \
322  API_TRACE(0, __VA_ARGS__); \
323  HIP_CB_SPAWNER_OBJECT(cid);
324 
325 
326 // Like above, but will trace with a specified "special" bit.
327 // Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU:
328 // kernel launches, copy commands, memory sets, etc.
329 #define HIP_INIT_SPECIAL_API(cid, tbit, ...) \
330  hip_impl::hip_init(); \
331  API_TRACE((HIP_TRACE_API & (1 << tbit)), __VA_ARGS__); \
332  HIP_CB_SPAWNER_OBJECT(cid);
333 
334 
335 // This macro should be called at the end of every HIP API, and only at the end of top-level hip
336 // APIS (not internal hip) It has dual function: logs the last error returned for use by
337 // hipGetLastError, and also prints the closing message when the debug trace is enabled.
338 #define ihipLogStatus(hipStatus) \
339  ({ \
340  hipError_t localHipStatus = hipStatus; /*local copy so hipStatus only evaluated once*/ \
341  tls->lastHipError = localHipStatus; \
342  \
343  if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API & (1 << TRACE_ALL)) { \
344  auto ticks = getTicks() - hipApiStartTick; \
345  fprintf(stderr, " %ship-api pid:%d tid:%d.%lu %-30s ret=%2d (%s)>> +%lu ns%s\n", \
346  (localHipStatus == 0) ? API_COLOR : KRED, tls->tidInfo.pid(), tls->tidInfo.tid(), \
347  tls->tidInfo.apiSeqNum(), __func__, localHipStatus, \
348  ihipErrorString(localHipStatus), ticks, API_COLOR_END); \
349  } \
350  if (HIP_PROFILE_API) { \
351  MARKER_END(); \
352  } \
353  localHipStatus; \
354  })
355 
356 
357 class ihipException : public std::exception {
358  public:
359  explicit ihipException(hipError_t e) : _code(e){};
360 
361  hipError_t _code;
362 };
363 
364 
365 #ifdef __cplusplus
366 extern "C" {
367 #endif
368 
369 
370 #ifdef __cplusplus
371 }
372 #endif
373 
374 const hipStream_t hipStreamNull = 0x0;
375 
376 
380 #define HIP_IPC_RESERVED_SIZE 24
382  public:
383 #if USE_IPC
384  hsa_amd_ipc_memory_t ipc_handle;
385 #endif
386  size_t psize;
387  char reserved[HIP_IPC_RESERVED_SIZE];
388 };
389 
390 
391 struct ihipModule_t {
392  std::string fileName;
393  hsa_executable_t executable = {};
394  hsa_code_object_reader_t coReader = {};
395  std::string hash;
396  std::unordered_map<
397  std::string, std::vector<std::pair<std::size_t, std::size_t>>> kernargs;
398 
399  ~ihipModule_t() {
400  if (executable.handle) hsa_executable_destroy(executable);
401  if (coReader.handle) hsa_code_object_reader_destroy(coReader);
402  }
403 };
404 
405 
406 //---
407 // Used to remove lock, for performance or stimulating bugs.
// No-op stand-in with the lock/try_lock/unlock interface of std::mutex.
// Used to remove locking, for performance or for stimulating bugs.
class FakeMutex {
   public:
    void lock() {}
    bool try_lock() { return true; }  // always reports success
    void unlock() {}
};

// Each *_THREAD_SAFE switch selects a real std::mutex or the no-op FakeMutex
// for the corresponding object family. A warning is emitted whenever locking
// is compiled out.
#if EVENT_THREAD_SAFE
typedef std::mutex EventMutex;
#else
#warning "Event thread-safe disabled"
typedef FakeMutex EventMutex;
#endif

#if STREAM_THREAD_SAFE
typedef std::mutex StreamMutex;
#else
#warning "Stream thread-safe disabled"
typedef FakeMutex StreamMutex;
#endif

// Pair Device and Ctx together, these could also be toggled separately if desired.
#if CTX_THREAD_SAFE
typedef std::mutex CtxMutex;
#else
#warning "Ctx thread-safe disabled"
typedef FakeMutex CtxMutex;
#endif

#if DEVICE_THREAD_SAFE
typedef std::mutex DeviceMutex;
#else
#warning "Device thread-safe disabled"
typedef FakeMutex DeviceMutex;
#endif
443 
444 //
445 //---
446 // Protects access to the member _data with a lock acquired on construction and released on destruction (unless autoUnlock is false).
447 // T must contain a _mutex field which meets the BasicLockable requirements (lock/unlock)
448 template <typename T>
450  public:
451  LockedAccessor(T& criticalData, bool autoUnlock = true)
452  : _criticalData(&criticalData),
453  _autoUnlock(autoUnlock)
454 
455  {
456  tprintf(DB_SYNC, "locking criticalData=%p for %s..\n", _criticalData,
457  ToString(_criticalData->_parent).c_str());
458  _criticalData->_mutex.lock();
459  };
460 
461  ~LockedAccessor() {
462  if (_autoUnlock) {
463  tprintf(DB_SYNC, "auto-unlocking criticalData=%p for %s...\n", _criticalData,
464  ToString(_criticalData->_parent).c_str());
465  _criticalData->_mutex.unlock();
466  }
467  }
468 
469  void unlock() {
470  tprintf(DB_SYNC, "unlocking criticalData=%p for %s...\n", _criticalData,
471  ToString(_criticalData->_parent).c_str());
472  _criticalData->_mutex.unlock();
473  }
474 
475  // Syntactic sugar so -> can be used to get the underlying type.
476  T* operator->() { return _criticalData; };
477 
478  private:
479  T* _criticalData;
480  bool _autoUnlock;
481 };
482 
483 
// Base that owns a mutex of the given type and forwards the three locking
// calls to it.
template <typename MUTEX_TYPE>
struct LockedBase {
    // The mutex being wrapped.
    MUTEX_TYPE _mutex;

    // Experts-only interface for explicit locking.
    // Most uses should go through the lock-accessor instead.
    void lock() { this->_mutex.lock(); }

    bool try_lock() {
        const bool acquired = this->_mutex.try_lock();
        return acquired;
    }

    void unlock() { this->_mutex.unlock(); }
};
494 
495 
496 template <typename MUTEX_TYPE>
497 class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE> {
498  public:
499  ihipStreamCriticalBase_t(ihipStream_t* parentStream, hc::accelerator_view av)
500  : _av(av), _parent(parentStream){};
501 
503 
506  return this;
507  };
508 
509  void munlock() {
510  tprintf(DB_SYNC, "munlocking criticalData=%p for %s...\n", this,
511  ToString(this->_parent).c_str());
513  };
514 
516  bool gotLock = LockedBase<MUTEX_TYPE>::try_lock();
517  tprintf(DB_SYNC, "mtry_locking=%d criticalData=%p for %s...\n", gotLock, this,
518  ToString(this->_parent).c_str());
519  return gotLock ? this : nullptr;
520  };
521 
522  public:
523  ihipStream_t* _parent;
524 
525  hc::accelerator_view _av;
526 
527  private:
528 };
529 
530 
531 // if HIP code needs to acquire locks for both ihipCtx_t and ihipStream_t, it should first acquire
532 // the lock for the ihipCtx_t and then for the individual streams. The locks should not be acquired
533 // in reverse order or deadlock may occur. In some cases, it may be possible to reduce the range
534 // where the locks must be held. HIP routines should avoid acquiring and releasing the same lock
535 // during the execution of a single HIP API. Another option is to use try_lock in the innermost lock
536 // query.
537 
538 
541 
542 //---
543 // Internal stream structure.
545  public:
546  enum ScheduleMode { Auto, Spin, Yield };
547  typedef uint64_t SeqNum_t;
548 
549  // TODOD -make av a reference to avoid shared_ptr overhead?
550  ihipStream_t(ihipCtx_t* ctx, hc::accelerator_view av, unsigned int flags);
551  ~ihipStream_t();
552 
553  // kind is hipMemcpyKind
554  void locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind,
555  bool resolveOn = true);
556 
557  bool locked_copy2DSync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind,
558  bool resolveOn = true);
559 
560  void locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
561 
562  bool locked_copy2DAsync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind);
563 
564  void lockedSymbolCopySync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
565  size_t offset, unsigned kind);
566  void lockedSymbolCopyAsync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
567  size_t offset, unsigned kind);
568 
569  //---
570  // Member functions that begin with locked_ are thread-safe accessors - these acquire / release
571  // the critical mutex.
572  LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
573  void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av, bool unlockNotNeeded = 0);
574 
575 
576  void locked_wait();
577 
578  hc::accelerator_view* locked_getAv() {
579  LockedAccessor_StreamCrit_t crit(_criticalData);
580  return &(crit->_av);
581  };
582 
583  void locked_streamWaitEvent(ihipEventData_t& event);
584  hc::completion_future locked_recordEvent(hipEvent_t event);
585 
586  bool locked_eventIsReady(hipEvent_t event);
587 
588  ihipStreamCritical_t& criticalData() { return _criticalData; };
589 
590  //---
591  hc::hcWaitMode waitMode() const;
592 
593  // Use this if we already have the stream critical data mutex:
594  void wait(LockedAccessor_StreamCrit_t& crit);
595 
596  void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal, uint32_t blockDimX,
597  uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX,
598  uint32_t gridDimY, uint32_t gridDimZ, uint32_t groupSegmentSize,
599  uint32_t sharedMemBytes, void* kernarg, size_t kernSize,
600  uint64_t kernel);
601 
602 
603  //-- Non-racy accessors:
604  // These functions access fields set at initialization time and are non-racy (so do not acquire
605  // mutex)
606  const ihipDevice_t* getDevice() const;
607  ihipCtx_t* getCtx() const;
608 
609  // Before calling this function, stream must be resolved from "0" to the actual stream:
610  bool isDefaultStream() const { return _id == 0; };
611 
612  public:
613  //---
614  // Public member vars - these are set at initialization and never change:
615  SeqNum_t _id; // monotonic sequence ID. 0 is the default stream.
616  unsigned _flags;
617 
618 
619  private:
620  // The unsigned return is hipMemcpyKind
621  unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem);
622  void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo* dstPtrInfo,
623  const hc::AmPointerInfo* srcPtrInfo, hc::hcCommandKind* hcCopyDir,
624  ihipCtx_t** copyDevice, bool* forceUnpinnedCopy);
625 
626  bool canSeeMemory(const ihipCtx_t* thisCtx, const hc::AmPointerInfo* dstInfo,
627  const hc::AmPointerInfo* srcInfo);
628 
629  void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes);
630 
631  private: // Data
632  // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t
633  ihipStreamCritical_t _criticalData;
634 
635  std::mutex _hasQueueLock;
636 
637  ihipCtx_t* _ctx; // parent context that owns this stream.
638 
639  // Friends:
640  friend std::ostream& operator<<(std::ostream& os, const ihipStream_t& s);
641  friend hipError_t hipStreamQuery(hipStream_t);
642 
643  ScheduleMode _scheduleMode;
644 };
645 
646 
647 //----
648 // Internal structure for stream callback handler
650  public:
651  ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void* userData)
652  : _stream(stream), _callback(callback), _userData(userData) {
653  };
654  hipStream_t _stream;
655  hipStreamCallback_t _callback;
656  void* _userData;
657 };
658 
659 
660 //----
661 // Internal event structure:
// Lifecycle states of an internal HIP event.
enum hipEventStatus_t {
    // Event is uninitialized, must be "Created" before use.
    hipEventStatusUnitialized = 0,
    // Event created, but not yet Recorded.
    hipEventStatusCreated = 1,
    // Event has been recorded into a stream but not completed yet.
    hipEventStatusRecording = 2,
    // Event has been recorded - timestamps are valid.
    hipEventStatusComplete = 3,
};
668 
669 // TODO - rename to ihip type of some kind
// Kind of internal event. Values are spelled out explicitly; they match the
// implicit numbering of the enumerators.
enum ihipEventType_t {
    hipEventTypeIndependent = 0,
    hipEventTypeStartCommand = 1,
    hipEventTypeStopCommand = 2,
};
675 
676 
678  ihipEventData_t() {
679  _state = hipEventStatusCreated;
680  _stream = NULL;
681  _timestamp = 0;
682  _type = hipEventTypeIndependent;
683  };
684 
685  void marker(const hc::completion_future& marker) { _marker = marker; };
686  hc::completion_future& marker() { return _marker; }
687  uint64_t timestamp() const { return _timestamp; };
688  ihipEventType_t type() const { return _type; };
689 
690  ihipEventType_t _type;
691  hipEventStatus_t _state;
692  hipStream_t _stream; // Stream where the event is recorded. Null stream is resolved to actual
693  // stream when recorded
694  uint64_t _timestamp; // store timestamp, may be set on host or by marker.
695  private:
696  hc::completion_future _marker;
697 };
698 
699 
700 //=============================================================================
701 // class ihipEventCriticalBase_t
702 template <typename MUTEX_TYPE>
703 class ihipEventCriticalBase_t : LockedBase<MUTEX_TYPE> {
704  public:
705  explicit ihipEventCriticalBase_t(const ihipEvent_t* parentEvent) : _parent(parentEvent) {}
707 
708  // Keep data in structure so it can be easily copied into snapshots
709  // (used to reduce lock contention and preserve correct lock order)
710  ihipEventData_t _eventData;
711 
712  private:
713  const ihipEvent_t* _parent;
714  friend class LockedAccessor<ihipEventCriticalBase_t>;
715 };
716 
718 
720 
721 // internal hip event structure.
722 class ihipEvent_t {
723  public:
724  explicit ihipEvent_t(unsigned flags);
725  void attachToCompletionFuture(const hc::completion_future* cf, hipStream_t stream,
726  ihipEventType_t eventType);
727  std::pair<hipEventStatus_t, uint64_t> refreshEventStatus(); // returns pair <state, timestamp>
728 
729 
730  // Return a copy of the critical state. The critical data is locked during the copy.
731  ihipEventData_t locked_copyCrit() {
732  LockedAccessor_EventCrit_t crit(_criticalData);
733  return _criticalData._eventData;
734  };
735 
736  ihipEventCritical_t& criticalData() { return _criticalData; };
737 
738  public:
739  unsigned _flags;
740 
741  private:
742  ihipEventCritical_t _criticalData;
743 
744  friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
745 };
746 
747 
748 //=============================================================================
749 // class ihipDeviceCriticalBase_t
750 template <typename MUTEX_TYPE>
752  public:
753  explicit ihipDeviceCriticalBase_t(ihipDevice_t* parentDevice)
754  : _parent(parentDevice), _ctxCount(0){};
755 
757 
758  // Contexts:
759  void addContext(ihipCtx_t* ctx);
760  void removeContext(ihipCtx_t* ctx);
761  std::list<ihipCtx_t*>& ctxs() { return _ctxs; };
762  const std::list<ihipCtx_t*>& const_ctxs() const { return _ctxs; };
763  int getcount() { return _ctxCount; };
764  friend class LockedAccessor<ihipDeviceCriticalBase_t>;
765 
766  private:
767  ihipDevice_t* _parent;
768 
769  //--- Context Tracker:
770  std::list<ihipCtx_t*> _ctxs; // contexts associated with this device across all threads.
771 
772  int _ctxCount;
773 };
774 
776 
778 
779 //----
780 // Properties of the HIP device.
781 // Multiple contexts can point to same device.
783  public:
784  ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator& acc);
785  ~ihipDevice_t();
786 
787  // Accessors:
788  ihipCtx_t* getPrimaryCtx() const { return _primaryCtx; };
789  void locked_removeContext(ihipCtx_t* c);
790  void locked_reset();
791  ihipDeviceCritical_t& criticalData() { return _criticalData; };
792 
793  public:
794  unsigned _deviceId; // device ID
795 
796  hc::accelerator _acc;
797  hsa_agent_t _hsaAgent; // hsa agent handle
798 
800  unsigned _computeUnits;
801  hipDeviceProp_t _props; // saved device properties.
802 
803  // TODO - report this through device properties, base on HCC API call.
804  int _isLargeBar;
805 
806  ihipCtx_t* _primaryCtx;
807 
808  int _state; // 1 if device is set otherwise 0
809 
810  private:
811  hipError_t initProperties(hipDeviceProp_t* prop);
812 
813  private:
814  ihipDeviceCritical_t _criticalData;
815 };
816 //=============================================================================
817 
818 
819 //---
820 //
821 struct ihipExec_t {
822  dim3 _gridDim;
823  dim3 _blockDim;
824  size_t _sharedMem;
825  hipStream_t _hStream;
826  std::vector<char> _arguments;
827 };
828 
829 //=============================================================================
830 // class ihipCtxCriticalBase_t
831 template <typename MUTEX_TYPE>
832 class ihipCtxCriticalBase_t : LockedBase<MUTEX_TYPE> {
833  public:
834  ihipCtxCriticalBase_t(ihipCtx_t* parentCtx, unsigned deviceCnt)
835  : _parent(parentCtx), _peerCnt(0) {
836  _peerAgents = new hsa_agent_t[deviceCnt];
837  };
838 
840  if (_peerAgents != nullptr) {
841  delete _peerAgents;
842  _peerAgents = nullptr;
843  }
844  _peerCnt = 0;
845  }
846 
847  // Streams:
848  void addStream(ihipStream_t* stream);
849  std::list<ihipStream_t*>& streams() { return _streams; };
850  const std::list<ihipStream_t*>& const_streams() const { return _streams; };
851 
852 
853  // Peer Accessor classes:
854  bool isPeerWatcher(const ihipCtx_t* peer); // returns True if peer has access to memory
855  // physically located on this device.
856  bool addPeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
857  bool removePeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
858  void resetPeerWatchers(ihipCtx_t* thisDevice);
859  void printPeerWatchers(FILE* f) const;
860 
861  uint32_t peerCnt() const { return _peerCnt; };
862  hsa_agent_t* peerAgents() const { return _peerAgents; };
863 
864 
865  // TODO - move private
866  std::list<ihipCtx_t*> _peers; // list of enabled peer devices.
867  //--- Execution stack:
868  std::stack<ihipExec_t> _execStack; // Execution stack for this device.
869 
870  friend class LockedAccessor<ihipCtxCriticalBase_t>;
871 
872  private:
873  ihipCtx_t* _parent;
874 
875  //--- Stream Tracker:
876  std::list<ihipStream_t*> _streams; // streams associated with this device.
877 
878 
879  //--- Peer Tracker:
880  // These reflect the currently Enabled set of peers for this GPU:
881  // Enabled peers have permissions to access the memory physically allocated on this device.
882  // Note the peers always contain the self agent for easy interfacing with HSA APIs.
883  uint32_t _peerCnt; // number of enabled peers
884  hsa_agent_t* _peerAgents; // efficient packed array of enabled agents (to use for allocations.)
885  private:
886  void recomputePeerAgents();
887 };
888 // Note Mutex type Real/Fake selected based on CtxMutex
890 
891 // This type is used by functions that need access to the critical device structures.
893 //=============================================================================
894 
895 
896 //=============================================================================
897 // class ihipCtx_t:
898 // A HIP CTX (context) points at one of the existing devices and contains the streams,
899 // peer-to-peer mappings, creation flags. Multiple contexts can point to the same
900 // device.
901 //
902 class ihipCtx_t {
903  public: // Functions:
904  ihipCtx_t(ihipDevice_t* device, unsigned deviceCnt,
905  unsigned flags); // note: calls constructor for _criticalData
906  ~ihipCtx_t();
907 
908  // Functions which read or write the critical data are named locked_.
909  // (might be better called "locking_"
910  // ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a
911  // locked_ function from within a locked_ function. External functions which call several
912  // locked_ functions will acquire and release the lock for each function. if this occurs in
913  // performance-sensitive code we may want to refactor by adding non-locked functions and
914  // creating a new locked_ member function to call them all.
915  void locked_removeStream(ihipStream_t* s);
916  void locked_reset();
917  void locked_waitAllStreams();
918  void locked_syncDefaultStream(bool waitOnSelf, bool syncHost);
919 
920  ihipCtxCritical_t& criticalData() { return _criticalData; };
921 
922  const ihipDevice_t* getDevice() const { return _device; };
923  int getDeviceNum() const { return _device->_deviceId; };
924 
925  // TODO - review uses of getWriteableDevice(), can these be converted to getDevice()
926  ihipDevice_t* getWriteableDevice() const { return _device; };
927 
928  std::string toString() const;
929 
930  public: // Data
931  // The NULL stream is used if no other stream is specified.
932  // Default stream has special synchronization properties with other streams.
933  ihipStream_t* _defaultStream;
934 
935  // Flags specified when the context is created:
936  unsigned _ctxFlags;
937 
938  private:
939  ihipDevice_t* _device;
940 
941 
942  private: // Critical data, protected with locked access:
943  // Members of _protected data MUST be accessed through the LockedAccessor.
944  // Search for LockedAccessor<ihipCtxCritical_t> for examples; do not access _criticalData
945  // directly.
946  ihipCtxCritical_t _criticalData;
947 };
948 
949 
950 //=================================================================================================
951 // Global variable definition:
952 extern unsigned g_deviceCnt;
953 extern hsa_agent_t g_cpu_agent; // the CPU agent.
954 extern hsa_agent_t* g_allAgents; // CPU agents + all the visible GPU agents.
955 
956 //=================================================================================================
957 // Extern functions:
958 extern void ihipInit();
959 extern const char* ihipErrorString(hipError_t);
960 extern hipError_t ihipSynchronize(TlsData *tls);
961 extern void ihipCtxStackUpdate();
962 extern hipError_t ihipDeviceSetState(TlsData *tls);
963 
964 extern ihipDevice_t* ihipGetDevice(int);
965 ihipCtx_t* ihipGetPrimaryCtx(unsigned deviceIndex);
966 hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
967  const char* name, hsa_agent_t *agent);
968 
969 
970 hipStream_t ihipSyncAndResolveStream(hipStream_t, bool lockAcquired = 0);
971 hipError_t ihipStreamSynchronize(TlsData *tls, hipStream_t stream);
972 void ihipStreamCallbackHandler(ihipStreamCallback_t* cb);
973 
974 // Stream printf functions:
975 inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s) {
976  os << "stream:";
977  os << s.getDevice()->_deviceId;
978  ;
979  os << '.';
980  os << s._id;
981  return os;
982 }
983 
984 inline std::ostream& operator<<(std::ostream& os, const dim3& s) {
985  os << '{';
986  os << s.x;
987  os << ',';
988  os << s.y;
989  os << ',';
990  os << s.z;
991  os << '}';
992  return os;
993 }
994 
995 inline std::ostream& operator<<(std::ostream& os, const gl_dim3& s) {
996  os << '{';
997  os << s.x;
998  os << ',';
999  os << s.y;
1000  os << ',';
1001  os << s.z;
1002  os << '}';
1003  return os;
1004 }
1005 
1006 // Stream printf functions:
1007 inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) {
1008  os << "event:" << std::hex << static_cast<void*>(e);
1009  return os;
1010 }
1011 
1012 inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) {
1013  os << "ctx:" << static_cast<const void*>(c) << ".dev:" << c->getDevice()->_deviceId;
1014  return os;
1015 }
1016 
1017 
1018 // Helper functions that are used across src files:
1019 namespace hip_internal {
1020 hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
1021  hipStream_t stream);
1022 };
1023 
1024 //---
1025 // TODO - review the context creation strategy here. Really should be:
1026 // - first "non-device" runtime call creates the context for this thread. Allowed to call
1027 // setDevice first.
1028 // - hipDeviceReset destroys the primary context for device?
1029 // - Then context is created again for next usage.
1030 static inline ihipCtx_t* iihipGetTlsDefaultCtx(TlsData* tls) {
1031  // Per-thread initialization of the TLS:
1032  if ((tls->defaultCtx == nullptr) && (g_deviceCnt > 0)) {
1033  tls->defaultCtx = ihipGetPrimaryCtx(0);
1034  }
1035  return tls->defaultCtx;
1036 }
1037 
1038 #endif
Definition: hip_hcc_internal.h:253
Definition: hip_hcc_internal.h:782
Definition: hip_hcc_internal.h:485
Definition: hip_hcc_internal.h:112
Definition: hip_hcc_internal.h:408
Definition: hip_hcc_internal.h:381
uint32_t x
x
Definition: hip_runtime_api.h:271
Definition: grid_launch.h:17
Definition: hip_module.cpp:107
Definition: hip_hcc_internal.h:135
Definition: hip_hcc_internal.h:821
Definition: hip_hcc_internal.h:902
Definition: hip_runtime_api.h:270
uint32_t y
y
Definition: hip_runtime_api.h:272
void(* hipStreamCallback_t)(hipStream_t stream, hipError_t status, void *userData)
Definition: hip_runtime_api.h:844
Definition: hip_hcc_internal.h:649
Definition: hip_hcc_internal.h:751
unsigned _computeUnits
Number of compute units supported by the device:
Definition: hip_hcc_internal.h:800
uint32_t z
z
Definition: hip_runtime_api.h:273
Definition: hip_runtime_api.h:83
Definition: hip_hcc_internal.h:677
Definition: hip_hcc_internal.h:703
Definition: hip_hcc_internal.h:1019
Definition: hip_hcc_internal.h:357
Definition: hip_hcc_internal.h:722
Definition: hip_hcc_internal.h:832
Definition: hip_hcc_internal.h:544
Definition: hip_hcc_internal.h:497
hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
Record an event in the specified stream.
Definition: hip_event.cpp:110
hsa_amd_ipc_memory_t ipc_handle
ipc memory handle on ROCr
Definition: hip_hcc_internal.h:384
Definition: hip_hcc_internal.h:449
Definition: hip_hcc_internal.h:391
hipError_t hipStreamQuery(hipStream_t stream)
Return #hipSuccess if all of the operations in the specified stream have completed, or #hipErrorNotReady if not.
Definition: hip_stream.cpp:160
Definition: hip_hcc_internal.h:95