HIP: Heterogeneous-computing Interface for Portability
hip_hcc_internal.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_SRC_HIP_HCC_INTERNAL_H
24 #define HIP_SRC_HIP_HCC_INTERNAL_H
25 
26 #include <hc.hpp>
27 #include <hsa/hsa.h>
28 #include <unordered_map>
29 #include <stack>
30 
31 #include "hsa/hsa_ext_amd.h"
32 #include "hip/hip_runtime.h"
33 #include "hip_prof_api.h"
34 #include "hip_util.h"
35 #include "env.h"
36 
37 
38 #if (__hcc_workweek__ < 16354)
39 #error("This version of HIP requires a newer version of HCC.");
40 #endif
41 
42 // Use the __appPtr field in the am memtracker to store the context.
43 // Requires a bug fix in HCC
44 #if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and \
45  (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0)
46 #define USE_APP_PTR_FOR_CTX 1
47 #endif
48 
49 
50 #define USE_IPC 1
51 
52 //---
53 // Environment variables:
54 
55 // Intended to distinguish whether an environment variable should be visible only in debug mode, or
56 // in debug+release.
57 // static const int debug = 0;
58 extern const int release;
59 
60 // TODO - this blocks both kernels and memory ops. Perhaps should have separate env var for
61 // kernels?
62 extern int HIP_LAUNCH_BLOCKING;
63 extern int HIP_API_BLOCKING;
64 
65 extern int HIP_PRINT_ENV;
66 extern int HIP_PROFILE_API;
67 // extern int HIP_TRACE_API;
68 extern int HIP_ATP;
69 extern int HIP_DB;
70 extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */
71 extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */
72 extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */
73 extern int HIP_FORCE_P2P_HOST;
74 
75 extern int HIP_HOST_COHERENT;
76 
77 extern int HIP_HIDDEN_FREE_MEM;
78 //---
79 // Chicken bits for disabling functionality to work around potential issues:
80 extern int HIP_SYNC_HOST_ALLOC;
81 extern int HIP_SYNC_STREAM_WAIT;
82 
83 extern int HIP_SYNC_NULL_STREAM;
84 extern int HIP_INIT_ALLOC;
85 extern int HIP_FORCE_NULL_STREAM;
86 
87 extern int HIP_SYNC_FREE;
88 
89 extern int HIP_DUMP_CODE_OBJECT;
90 
91 // TODO - remove when this is standard behavior.
92 extern int HCC_OPT_FLUSH;
93 
94 // Class to assign a short TID to each new thread, for HIP debugging purposes.
class TidInfo {
   public:
    TidInfo();

    // Identity of the calling thread as recorded in this object.
    pid_t pid() const { return _pid; }
    int tid() const { return _shortTid; }

    // Read the current API sequence number without modifying it.
    uint64_t apiSeqNum() const { return _apiSeqNum; }

    // Advance the API sequence number by one and return the new value.
    uint64_t incApiSeqNum() {
        _apiSeqNum += 1;
        return _apiSeqNum;
    }

   private:
    int _shortTid;
    pid_t _pid;

    // Monotonically increasing API sequence number for this thread.
    uint64_t _apiSeqNum;
};
111 
// Ordered list of API sequence numbers at which profiling should be toggled.
// After sort() the list is in descending order, so the smallest pending trigger
// sits at the back and is what nextTrigger() reports.
struct ProfTrigger {
    // Sentinel returned by nextTrigger() when no trigger is registered.
    static const uint64_t MAX_TRIGGER = std::numeric_limits<uint64_t>::max();

    // Print every registered trigger to stdout, prefixed with the given thread id.
    void print(int tid) const {
        std::cout << "Enabling tracing for ";
        for (const uint64_t trigger : _profTrigger) {
            std::cout << "tid:" << tid << "." << trigger << ",";
        }
        std::cout << "\n";
    }

    // Return the trigger stored at the back of the list, or MAX_TRIGGER when empty.
    uint64_t nextTrigger() const {
        // Plain return (rather than ?:) reads MAX_TRIGGER by value, so the
        // in-class initialized constant never needs an out-of-class definition.
        if (_profTrigger.empty()) {
            return MAX_TRIGGER;
        }
        return _profTrigger.back();
    }

    // Register a new trigger; call sort() afterwards to restore ordering.
    void add(uint64_t trigger) { _profTrigger.push_back(trigger); }

    // Sort triggers in descending order. The comparator must operate on the full
    // 64-bit value: comparing through int truncates and mis-orders large triggers.
    void sort() { std::sort(_profTrigger.begin(), _profTrigger.end(), std::greater<uint64_t>()); }

   private:
    std::vector<uint64_t> _profTrigger;
};
130 
131 
132 //---
133 // Extern TLS
134 // Use a single struct to hold all TLS data. Attempt to reduce TLS accesses.
135 struct TlsData {
136  explicit TlsData() {
137  lastHipError = hipSuccess;
138  getPrimaryCtx = true;
139  defaultCtx = nullptr;
140  }
141 
142  hipError_t lastHipError;
143  TidInfo tidInfo;
144  // This is the implicit context used by all HIP commands.
145  // It can be set by hipSetDevice or by the CTX manipulation commands:
146  ihipCtx_t* defaultCtx;
147  // Stack of contexts
148  std::stack<ihipCtx_t*> ctxStack;
149  bool getPrimaryCtx;
150 };
151 TlsData* tls_get_ptr();
152 #define GET_TLS() TlsData *tls = tls_get_ptr()
153 
154 extern std::vector<ProfTrigger> g_dbStartTriggers;
155 extern std::vector<ProfTrigger> g_dbStopTriggers;
156 
157 //---
158 // Forward defs:
159 class ihipStream_t;
160 class ihipDevice_t;
161 class ihipCtx_t;
162 struct ihipEventData_t;
163 
164 // Color defs for debug messages:
165 #define KNRM "\x1B[0m"
166 #define KRED "\x1B[31m"
167 #define KGRN "\x1B[32m"
168 #define KYEL "\x1B[33m"
169 #define KBLU "\x1B[34m"
170 #define KMAG "\x1B[35m"
171 #define KCYN "\x1B[36m"
172 #define KWHT "\x1B[37m"
173 
174 extern const char* API_COLOR;
175 extern const char* API_COLOR_END;
176 
177 
178 // If set, thread-safety is enforced on all event/stream/ctx/device functions.
179 // Can disable for performance or functional experiments - in this case
180 // the code uses a dummy "no-op" mutex.
181 #define EVENT_THREAD_SAFE 1
182 
183 #define STREAM_THREAD_SAFE 1
184 
185 #define CTX_THREAD_SAFE 1
186 
187 #define DEVICE_THREAD_SAFE 1
188 
189 
190 // Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set.
191 // May be set to 0 to remove debug if checks - possible code size and performance difference?
192 #define COMPILE_HIP_DB 1
193 
194 
195 // Compile HIP tracing capability.
196 // 0x1 = print a string at function entry with arguments.
197 // 0x2 = prints a simple message with function name + return code when function exits.
198 // 0x3 = print both.
199 // Must be enabled at runtime with HIP_TRACE_API
200 #define COMPILE_HIP_TRACE_API 0x3
201 
202 
203 // Compile code that generates trace markers for CodeXL ATP at HIP function begin/end.
204 // ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs.
205 #ifndef COMPILE_HIP_ATP_MARKER
206 #define COMPILE_HIP_ATP_MARKER 0
207 #endif
208 
209 
210 // Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function
211 // boundary.
212 // TODO - currently we print the trace message at the beginning. If we waited, we could also
213 // print return codes, and any values returned through ptr-to-args (i.e. the pointers allocated
214 // by hipMalloc).
215 #if COMPILE_HIP_ATP_MARKER
216 #include "CXLActivityLogger.h"
217 #define MARKER_BEGIN(markerName, group) amdtBeginMarker(markerName, group, nullptr);
218 #define MARKER_END() amdtEndMarker();
219 #define RESUME_PROFILING amdtResumeProfiling(AMDT_ALL_PROFILING);
220 #define STOP_PROFILING amdtStopProfiling(AMDT_ALL_PROFILING);
221 #else
222 // Swallow scoped markers:
223 #define MARKER_BEGIN(markerName, group)
224 #define MARKER_END()
225 #define RESUME_PROFILING
226 #define STOP_PROFILING
227 #endif
228 
229 
230 //---
231 // HIP Trace modes - use with HIP_TRACE_API=...
232 #define TRACE_ALL 0 // 0x01
233 #define TRACE_KCMD 1 // 0x02, kernel command
234 #define TRACE_MCMD 2 // 0x04, memory command
235 #define TRACE_MEM 3 // 0x08, memory allocation or deallocation.
236 #define TRACE_SYNC 4 // 0x10, synchronization (host or hipStreamWaitEvent)
237 #define TRACE_QUERY 5 // 0x20, hipEventRecord, hipEventQuery, hipStreamQuery
238 
239 
240 //---
241 // HIP_DB Debug flags:
242 #define DB_API 0 /* 0x01 - shortcut to enable HIP_TRACE_API on single switch */
243 #define DB_SYNC 1 /* 0x02 - trace synchronization pieces */
244 #define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */
245 #define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. . */
246 #define DB_WARN 4 /* 0x10 - warn about sub-optimal or shady behavior */
247 #define DB_FB 5 /* 0x20 - trace loading fat binary */
248 #define DB_MAX_FLAG 6
249 // When adding a new debug flag, also add to the char name table below.
250 //
251 //
252 
253 struct DbName {
254  const char* _color;
255  const char* _shortName;
256 };
257 
258 // This table must be kept in-sync with the defines above.
259 static const DbName dbName[] = {
260  {KGRN, "api"}, // not used,
261  {KYEL, "sync"}, {KCYN, "mem"}, {KMAG, "copy"}, {KRED, "warn"},
262  {KBLU, "fatbin"},
263 };
264 
265 
266 #if COMPILE_HIP_DB
267 #define tprintf(trace_level, ...) \
268  { \
269  if (HIP_DB & (1 << (trace_level))) { \
270  GET_TLS(); \
271  char msgStr[1000]; \
272  snprintf(msgStr, sizeof(msgStr), __VA_ARGS__); \
273  fprintf(stderr, " %ship-%s pid:%d tid:%d:%s%s", dbName[trace_level]._color, \
274  dbName[trace_level]._shortName, tls->tidInfo.pid(), tls->tidInfo.tid(), msgStr, KNRM); \
275  } \
276  }
277 #else
278 /* Compile to empty code */
279 #define tprintf(trace_level, ...)
280 #endif
281 
282 
283 static inline uint64_t getTicks() { return hc::get_system_ticks(); }
284 
285 //---
286 extern uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& apiStr);
287 
288 #if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
289 #define API_TRACE(forceTrace, ...) \
290  GET_TLS(); \
291  uint64_t hipApiStartTick = 0; \
292  { \
293  tls->tidInfo.incApiSeqNum(); \
294  if (forceTrace || \
295  (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL))))) { \
296  std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')'; \
297  std::string fullStr; \
298  hipApiStartTick = recordApiTrace(tls, &fullStr, apiStr); \
299  if (HIP_PROFILE_API == 0x1) { \
300  MARKER_BEGIN(__func__, "HIP") \
301  } else if (HIP_PROFILE_API == 0x2) { \
302  MARKER_BEGIN(fullStr.c_str(), "HIP"); \
303  } \
304  } \
305  }
306 
307 #else
308 // Swallow API_TRACE
309 #define API_TRACE(IS_CMD, ...) GET_TLS(); tls->tidInfo.incApiSeqNum();
310 #endif
311 
312 #define ihipGetTlsDefaultCtx() iihipGetTlsDefaultCtx(tls)
313 #define ihipSetTlsDefaultCtx(ctx) tls->defaultCtx = ctx
314 
315 #define HIP_SET_DEVICE() ihipDeviceSetState(tls);
316 
317 // This macro should be called at the beginning of every HIP API.
318 // It initializes the hip runtime (exactly once), and
319 // generates a trace string that can be output to stderr or to ATP file.
320 #define HIP_INIT_API(cid, ...) \
321  hip_impl::hip_init(); \
322  API_TRACE(0, __VA_ARGS__); \
323  HIP_CB_SPAWNER_OBJECT(cid);
324 
325 
326 // Like above, but will trace with a specified "special" bit.
327 // Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU:
328 // kernel launches, copy commands, memory sets, etc.
329 #define HIP_INIT_SPECIAL_API(cid, tbit, ...) \
330  hip_impl::hip_init(); \
331  API_TRACE((HIP_TRACE_API & (1 << tbit)), __VA_ARGS__); \
332  HIP_CB_SPAWNER_OBJECT(cid);
333 
334 
335 // This macro should be called at the end of every HIP API, and only at the end of top-level hip
336 // APIS (not internal hip) It has dual function: logs the last error returned for use by
337 // hipGetLastError, and also prints the closing message when the debug trace is enabled.
338 #define ihipLogStatus(hipStatus) \
339  ({ \
340  hipError_t localHipStatus = hipStatus; /*local copy so hipStatus only evaluated once*/ \
341  tls->lastHipError = localHipStatus; \
342  \
343  if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API & (1 << TRACE_ALL)) { \
344  auto ticks = getTicks() - hipApiStartTick; \
345  fprintf(stderr, " %ship-api pid:%d tid:%d.%lu %-30s ret=%2d (%s)>> +%lu ns%s\n", \
346  (localHipStatus == 0) ? API_COLOR : KRED, tls->tidInfo.pid(), tls->tidInfo.tid(), \
347  tls->tidInfo.apiSeqNum(), __func__, localHipStatus, \
348  ihipErrorString(localHipStatus), ticks, API_COLOR_END); \
349  } \
350  if (HIP_PROFILE_API) { \
351  MARKER_END(); \
352  } \
353  localHipStatus; \
354  })
355 
356 
357 class ihipException : public std::exception {
358  public:
359  explicit ihipException(hipError_t e) : _code(e){};
360 
361  hipError_t _code;
362 };
363 
364 
365 #ifdef __cplusplus
366 extern "C" {
367 #endif
368 
369 
370 #ifdef __cplusplus
371 }
372 #endif
373 
374 const hipStream_t hipStreamNull = 0x0;
375 
376 
380 #define HIP_IPC_RESERVED_SIZE 24
382  public:
383 #if USE_IPC
384  hsa_amd_ipc_memory_t ipc_handle;
385 #endif
386  size_t psize;
387  char reserved[HIP_IPC_RESERVED_SIZE];
388 };
389 
390 
391 struct ihipModule_t {
392  std::string fileName;
393  hsa_executable_t executable = {};
394  hsa_code_object_reader_t coReader = {};
395  std::string hash;
396  std::unordered_map<
397  std::string, std::vector<std::pair<std::size_t, std::size_t>>> kernargs;
398 
399  ~ihipModule_t() {
400  if (executable.handle) hsa_executable_destroy(executable);
401  if (coReader.handle) hsa_code_object_reader_destroy(coReader);
402  }
403 };
404 
405 
406 //---
407 // Used to remove lock, for performance or stimulating bugs.
// Drop-in stand-in for a mutex that performs no locking at all.
class FakeMutex {
   public:
    // Always reports success; nothing is actually acquired.
    bool try_lock() { return true; }

    // Intentionally empty.
    void lock() {}

    // Intentionally empty.
    void unlock() {}
};
414 
415 #if EVENT_THREAD_SAFE
416 typedef std::mutex EventMutex;
417 #else
418 #warning "Stream thread-safe disabled"
419 typedef FakeMutex EventMutex;
420 #endif
421 
422 #if STREAM_THREAD_SAFE
423 typedef std::mutex StreamMutex;
424 #else
425 #warning "Stream thread-safe disabled"
426 typedef FakeMutex StreamMutex;
427 #endif
428 
429 // Pair Device and Ctx together, these could also be toggled separately if desired.
430 #if CTX_THREAD_SAFE
431 typedef std::mutex CtxMutex;
432 #else
433 typedef FakeMutex CtxMutex;
434 #warning "Ctx thread-safe disabled"
435 #endif
436 
437 #if DEVICE_THREAD_SAFE
438 typedef std::mutex DeviceMutex;
439 #else
440 typedef FakeMutex DeviceMutex;
441 #warning "Device thread-safe disabled"
442 #endif
443 
444 //
445 //---
446 // Protects access to the member _data with a lock acquired on construction/destruction.
447 // T must contain a _mutex field which meets the BasicLockable requirements (lock/unlock)
448 template <typename T>
450  public:
451  LockedAccessor(T& criticalData, bool autoUnlock = true)
452  : _criticalData(&criticalData),
453  _autoUnlock(autoUnlock)
454 
455  {
456  tprintf(DB_SYNC, "locking criticalData=%p for %s..\n", _criticalData,
457  ToString(_criticalData->_parent).c_str());
458  _criticalData->_mutex.lock();
459  };
460 
461  ~LockedAccessor() {
462  if (_autoUnlock) {
463  tprintf(DB_SYNC, "auto-unlocking criticalData=%p for %s...\n", _criticalData,
464  ToString(_criticalData->_parent).c_str());
465  _criticalData->_mutex.unlock();
466  }
467  }
468 
469  void unlock() {
470  tprintf(DB_SYNC, "unlocking criticalData=%p for %s...\n", _criticalData,
471  ToString(_criticalData->_parent).c_str());
472  _criticalData->_mutex.unlock();
473  }
474 
475  // Syntactic sugar so -> can be used to get the underlying type.
476  T* operator->() { return _criticalData; };
477 
478  private:
479  T* _criticalData;
480  bool _autoUnlock;
481 };
482 
483 
// Base for "critical data" structures: owns the mutex and forwards the three
// locking primitives to it.
template <typename MUTEX_TYPE>
struct LockedBase {
    // Experts-only interface for explicit locking.
    // Most uses should use the lock-accessor.

    // Attempt to take the mutex; the result is whatever the mutex reports.
    bool try_lock() {
        const bool acquired = _mutex.try_lock();
        return acquired;
    }

    // Acquire the mutex.
    void lock() { _mutex.lock(); }

    // Release the mutex.
    void unlock() { _mutex.unlock(); }

    MUTEX_TYPE _mutex;
};
494 
495 
496 template <typename MUTEX_TYPE>
497 class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE> {
498  public:
499  ihipStreamCriticalBase_t(ihipStream_t* parentStream, hc::accelerator_view av)
500  : _av(av), _parent(parentStream){};
501 
503 
506  return this;
507  };
508 
509  void munlock() {
510  tprintf(DB_SYNC, "munlocking criticalData=%p for %s...\n", this,
511  ToString(this->_parent).c_str());
513  };
514 
516  bool gotLock = LockedBase<MUTEX_TYPE>::try_lock();
517  tprintf(DB_SYNC, "mtry_locking=%d criticalData=%p for %s...\n", gotLock, this,
518  ToString(this->_parent).c_str());
519  return gotLock ? this : nullptr;
520  };
521 
522  public:
523  ihipStream_t* _parent;
524 
525  hc::accelerator_view _av;
526 
527  private:
528 };
529 
530 
531 // if HIP code needs to acquire locks for both ihipCtx_t and ihipStream_t, it should first acquire
532 // the lock for the ihipCtx_t and then for the individual streams. The locks should not be acquired
533 // in reverse order or deadlock may occur. In some cases, it may be possible to reduce the range
534 // where the locks must be held. HIP routines should avoid acquiring and releasing the same lock
535 // during the execution of a single HIP API. Another option is to use try_lock in the innermost lock
536 // query.
537 
538 
541 
542 //---
543 // Internal stream structure.
545  public:
546  enum ScheduleMode { Auto, Spin, Yield };
547  typedef uint64_t SeqNum_t;
548 
549  // TODO - make av a reference to avoid shared_ptr overhead?
550  ihipStream_t(ihipCtx_t* ctx, hc::accelerator_view av, unsigned int flags);
551  ~ihipStream_t();
552 
553  // kind is hipMemcpyKind
554  void locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind,
555  bool resolveOn = true);
556 
557  bool locked_copy2DSync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind,
558  bool resolveOn = true);
559 
560  void locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
561 
562  bool locked_copy2DAsync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind);
563 
564  void lockedSymbolCopySync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
565  size_t offset, unsigned kind);
566  void lockedSymbolCopyAsync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
567  size_t offset, unsigned kind);
568 
569  //---
570  // Member functions that begin with locked_ are thread-safe accessors - these acquire / release
571  // the critical mutex.
572  LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
573  void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av, bool unlockNotNeeded = 0);
574 
575 
576  void locked_wait();
577 
578  hc::accelerator_view* locked_getAv() {
579  LockedAccessor_StreamCrit_t crit(_criticalData);
580  return &(crit->_av);
581  };
582 
583  void locked_streamWaitEvent(ihipEventData_t& event);
584  hc::completion_future locked_recordEvent(hipEvent_t event);
585 
586  ihipStreamCritical_t& criticalData() { return _criticalData; };
587 
588  //---
589  hc::hcWaitMode waitMode() const;
590 
591  // Use this if we already have the stream critical data mutex:
592  void wait(LockedAccessor_StreamCrit_t& crit);
593 
594  void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal, uint32_t blockDimX,
595  uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX,
596  uint32_t gridDimY, uint32_t gridDimZ, uint32_t groupSegmentSize,
597  uint32_t sharedMemBytes, void* kernarg, size_t kernSize,
598  uint64_t kernel);
599 
600 
601  //-- Non-racy accessors:
602  // These functions access fields set at initialization time and are non-racy (so do not acquire
603  // mutex)
604  const ihipDevice_t* getDevice() const;
605  ihipCtx_t* getCtx() const;
606 
607  // Before calling this function, stream must be resolved from "0" to the actual stream:
608  bool isDefaultStream() const { return _id == 0; };
609 
610  public:
611  //---
612  // Public member vars - these are set at initialization and never change:
613  SeqNum_t _id; // monotonic sequence ID. 0 is the default stream.
614  unsigned _flags;
615 
616 
617  private:
618  // The unsigned return is hipMemcpyKind
619  unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem);
620  void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo* dstPtrInfo,
621  const hc::AmPointerInfo* srcPtrInfo, hc::hcCommandKind* hcCopyDir,
622  ihipCtx_t** copyDevice, bool* forceUnpinnedCopy);
623 
624  bool canSeeMemory(const ihipCtx_t* thisCtx, const hc::AmPointerInfo* dstInfo,
625  const hc::AmPointerInfo* srcInfo);
626 
627  void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes);
628 
629  private: // Data
630  // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t
631  ihipStreamCritical_t _criticalData;
632 
633  std::mutex _hasQueueLock;
634 
635  ihipCtx_t* _ctx; // parent context that owns this stream.
636 
637  // Friends:
638  friend std::ostream& operator<<(std::ostream& os, const ihipStream_t& s);
639  friend hipError_t hipStreamQuery(hipStream_t);
640 
641  ScheduleMode _scheduleMode;
642 };
643 
644 
645 //----
646 // Internal structure for stream callback handler
648  public:
649  ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void* userData)
650  : _stream(stream), _callback(callback), _userData(userData) {
651  };
652  hipStream_t _stream;
653  hipStreamCallback_t _callback;
654  void* _userData;
655 };
656 
657 
658 //----
659 // Internal event structure:
660 enum hipEventStatus_t {
661  hipEventStatusUnitialized = 0, // event is uninitialized, must be "Created" before use.
662  hipEventStatusCreated = 1, // event created, but not yet Recorded
663  hipEventStatusRecording = 2, // event has been recorded into a stream but not completed yet.
664  hipEventStatusComplete = 3, // event has been recorded - timestamps are valid.
665 };
666 
667 // TODO - rename to ihip type of some kind
668 enum ihipEventType_t {
669  hipEventTypeIndependent,
670  hipEventTypeStartCommand,
671  hipEventTypeStopCommand,
672 };
673 
674 
676  ihipEventData_t() {
677  _state = hipEventStatusCreated;
678  _stream = NULL;
679  _timestamp = 0;
680  _type = hipEventTypeIndependent;
681  };
682 
683  void marker(const hc::completion_future& marker) { _marker = marker; };
684  hc::completion_future& marker() { return _marker; }
685  uint64_t timestamp() const { return _timestamp; };
686  ihipEventType_t type() const { return _type; };
687 
688  ihipEventType_t _type;
689  hipEventStatus_t _state;
690  hipStream_t _stream; // Stream where the event is recorded. Null stream is resolved to actual
691  // stream when recorded
692  uint64_t _timestamp; // store timestamp, may be set on host or by marker.
693  private:
694  hc::completion_future _marker;
695 };
696 
697 
698 //=============================================================================
699 // class ihipEventCriticalBase_t
700 template <typename MUTEX_TYPE>
701 class ihipEventCriticalBase_t : LockedBase<MUTEX_TYPE> {
702  public:
703  explicit ihipEventCriticalBase_t(const ihipEvent_t* parentEvent) : _parent(parentEvent) {}
705 
706  // Keep data in structure so it can be easily copied into snapshots
707  // (used to reduce lock contention and preserve correct lock order)
708  ihipEventData_t _eventData;
709 
710  private:
711  const ihipEvent_t* _parent;
712  friend class LockedAccessor<ihipEventCriticalBase_t>;
713 };
714 
716 
718 
719 // internal hip event structure.
720 class ihipEvent_t {
721  public:
722  explicit ihipEvent_t(unsigned flags);
723  void attachToCompletionFuture(const hc::completion_future* cf, hipStream_t stream,
724  ihipEventType_t eventType);
725 
726  // Return a copy of the critical state. The critical data is locked during the copy.
727  ihipEventData_t locked_copyCrit() {
728  LockedAccessor_EventCrit_t crit(_criticalData);
729  return _criticalData._eventData;
730  };
731 
732  ihipEventCritical_t& criticalData() { return _criticalData; };
733 
734  public:
735  unsigned _flags;
736 
737  private:
738  ihipEventCritical_t _criticalData;
739 
740  friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
741 };
742 
743 
744 //=============================================================================
745 // class ihipDeviceCriticalBase_t
746 template <typename MUTEX_TYPE>
748  public:
749  explicit ihipDeviceCriticalBase_t(ihipDevice_t* parentDevice)
750  : _parent(parentDevice), _ctxCount(0){};
751 
753 
754  // Contexts:
755  void addContext(ihipCtx_t* ctx);
756  void removeContext(ihipCtx_t* ctx);
757  std::list<ihipCtx_t*>& ctxs() { return _ctxs; };
758  const std::list<ihipCtx_t*>& const_ctxs() const { return _ctxs; };
759  int getcount() { return _ctxCount; };
760  friend class LockedAccessor<ihipDeviceCriticalBase_t>;
761 
762  private:
763  ihipDevice_t* _parent;
764 
765  //--- Context Tracker:
766  std::list<ihipCtx_t*> _ctxs; // contexts associated with this device across all threads.
767 
768  int _ctxCount;
769 };
770 
772 
774 
775 //----
776 // Properties of the HIP device.
777 // Multiple contexts can point to same device.
779  public:
780  ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator& acc);
781  ~ihipDevice_t();
782 
783  // Accessors:
784  ihipCtx_t* getPrimaryCtx() const { return _primaryCtx; };
785  void locked_removeContext(ihipCtx_t* c);
786  void locked_reset();
787  ihipDeviceCritical_t& criticalData() { return _criticalData; };
788 
789  public:
790  unsigned _deviceId; // device ID
791 
792  hc::accelerator _acc;
793  hsa_agent_t _hsaAgent; // hsa agent handle
794 
796  unsigned _computeUnits;
797  hipDeviceProp_t _props; // saved device properties.
798 
799  // TODO - report this through device properties, base on HCC API call.
800  int _isLargeBar;
801 
802  ihipCtx_t* _primaryCtx;
803 
804  int _state; // 1 if device is set otherwise 0
805 
806  private:
807  hipError_t initProperties(hipDeviceProp_t* prop);
808 
809  private:
810  ihipDeviceCritical_t _criticalData;
811 };
812 //=============================================================================
813 
814 
815 //---
816 //
817 struct ihipExec_t {
818  dim3 _gridDim;
819  dim3 _blockDim;
820  size_t _sharedMem;
821  hipStream_t _hStream;
822  std::vector<char> _arguments;
823 };
824 
825 //=============================================================================
826 // class ihipCtxCriticalBase_t
827 template <typename MUTEX_TYPE>
828 class ihipCtxCriticalBase_t : LockedBase<MUTEX_TYPE> {
829  public:
830  ihipCtxCriticalBase_t(ihipCtx_t* parentCtx, unsigned deviceCnt)
831  : _parent(parentCtx), _peerCnt(0) {
832  _peerAgents = new hsa_agent_t[deviceCnt];
833  };
834 
836  if (_peerAgents != nullptr) {
837  delete _peerAgents;
838  _peerAgents = nullptr;
839  }
840  _peerCnt = 0;
841  }
842 
843  // Streams:
844  void addStream(ihipStream_t* stream);
845  std::list<ihipStream_t*>& streams() { return _streams; };
846  const std::list<ihipStream_t*>& const_streams() const { return _streams; };
847 
848 
849  // Peer Accessor classes:
850  bool isPeerWatcher(const ihipCtx_t* peer); // returns True if peer has access to memory
851  // physically located on this device.
852  bool addPeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
853  bool removePeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
854  void resetPeerWatchers(ihipCtx_t* thisDevice);
855  void printPeerWatchers(FILE* f) const;
856 
857  uint32_t peerCnt() const { return _peerCnt; };
858  hsa_agent_t* peerAgents() const { return _peerAgents; };
859 
860 
861  // TODO - move private
862  std::list<ihipCtx_t*> _peers; // list of enabled peer devices.
863  //--- Execution stack:
864  std::stack<ihipExec_t> _execStack; // Execution stack for this device.
865 
866  friend class LockedAccessor<ihipCtxCriticalBase_t>;
867 
868  private:
869  ihipCtx_t* _parent;
870 
871  //--- Stream Tracker:
872  std::list<ihipStream_t*> _streams; // streams associated with this device.
873 
874 
875  //--- Peer Tracker:
876  // These reflect the currently Enabled set of peers for this GPU:
877  // Enabled peers have permissions to access the memory physically allocated on this device.
878  // Note the peers always contain the self agent for easy interfacing with HSA APIs.
879  uint32_t _peerCnt; // number of enabled peers
880  hsa_agent_t* _peerAgents; // efficient packed array of enabled agents (to use for allocations.)
881  private:
882  void recomputePeerAgents();
883 };
884 // Note Mutex type Real/Fake selected based on CtxMutex
886 
887 // This type is used by functions that need access to the critical device structures.
889 //=============================================================================
890 
891 
892 //=============================================================================
893 // class ihipCtx_t:
894 // A HIP CTX (context) points at one of the existing devices and contains the streams,
895 // peer-to-peer mappings, creation flags. Multiple contexts can point to the same
896 // device.
897 //
898 class ihipCtx_t {
899  public: // Functions:
900  ihipCtx_t(ihipDevice_t* device, unsigned deviceCnt,
901  unsigned flags); // note: calls constructor for _criticalData
902  ~ihipCtx_t();
903 
904  // Functions which read or write the critical data are named locked_.
905  // (might be better called "locking_"
906  // ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a
907  // locked_ function from within a locked_ function. External functions which call several
908  // locked_ functions will acquire and release the lock for each function. if this occurs in
909  // performance-sensitive code we may want to refactor by adding non-locked functions and
910  // creating a new locked_ member function to call them all.
911  void locked_removeStream(ihipStream_t* s);
912  void locked_reset();
913  void locked_waitAllStreams();
914  void locked_syncDefaultStream(bool waitOnSelf, bool syncHost);
915 
916  ihipCtxCritical_t& criticalData() { return _criticalData; };
917 
918  const ihipDevice_t* getDevice() const { return _device; };
919  int getDeviceNum() const { return _device->_deviceId; };
920 
921  // TODO - review uses of getWriteableDevice(), can these be converted to getDevice()
922  ihipDevice_t* getWriteableDevice() const { return _device; };
923 
924  std::string toString() const;
925 
926  public: // Data
927  // The NULL stream is used if no other stream is specified.
928  // Default stream has special synchronization properties with other streams.
929  ihipStream_t* _defaultStream;
930 
931  // Flags specified when the context is created:
932  unsigned _ctxFlags;
933 
934  private:
935  ihipDevice_t* _device;
936 
937 
938  private: // Critical data, protected with locked access:
939  // Members of _protected data MUST be accessed through the LockedAccessor.
940  // Search for LockedAccessor<ihipCtxCritical_t> for examples; do not access _criticalData
941  // directly.
942  ihipCtxCritical_t _criticalData;
943 };
944 
945 
946 //=================================================================================================
947 // Global variable definition:
948 extern unsigned g_deviceCnt;
949 extern hsa_agent_t g_cpu_agent; // the CPU agent.
950 extern hsa_agent_t* g_allAgents; // CPU agents + all the visible GPU agents.
951 
952 //=================================================================================================
953 // Extern functions:
954 extern void ihipInit();
955 extern const char* ihipErrorString(hipError_t);
956 extern hipError_t ihipSynchronize(TlsData *tls);
957 extern void ihipCtxStackUpdate();
958 extern hipError_t ihipDeviceSetState(TlsData *tls);
959 
960 extern ihipDevice_t* ihipGetDevice(int);
961 ihipCtx_t* ihipGetPrimaryCtx(unsigned deviceIndex);
962 hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
963  const char* name, hsa_agent_t *agent);
964 
965 
966 hipStream_t ihipSyncAndResolveStream(hipStream_t, bool lockAcquired = 0);
967 hipError_t ihipStreamSynchronize(TlsData *tls, hipStream_t stream);
968 void ihipStreamCallbackHandler(ihipStreamCallback_t* cb);
969 
970 // Stream printf functions:
971 inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s) {
972  os << "stream:";
973  os << s.getDevice()->_deviceId;
974  ;
975  os << '.';
976  os << s._id;
977  return os;
978 }
979 
980 inline std::ostream& operator<<(std::ostream& os, const dim3& s) {
981  os << '{';
982  os << s.x;
983  os << ',';
984  os << s.y;
985  os << ',';
986  os << s.z;
987  os << '}';
988  return os;
989 }
990 
991 inline std::ostream& operator<<(std::ostream& os, const gl_dim3& s) {
992  os << '{';
993  os << s.x;
994  os << ',';
995  os << s.y;
996  os << ',';
997  os << s.z;
998  os << '}';
999  return os;
1000 }
1001 
1002 // Stream printf functions:
1003 inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) {
1004  os << "event:" << std::hex << static_cast<void*>(e);
1005  return os;
1006 }
1007 
1008 inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) {
1009  os << "ctx:" << static_cast<const void*>(c) << ".dev:" << c->getDevice()->_deviceId;
1010  return os;
1011 }
1012 
1013 
1014 // Helper functions that are used across src files:
1015 namespace hip_internal {
1016 hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
1017  hipStream_t stream);
1018 
1019 hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags);
1020 
1021 hipError_t ihipHostFree(TlsData *tls, void* ptr);
1022 
1023 };
1024 
1025 #define MAX_COOPERATIVE_GPUs 255
1026 
1027 // do not change these two structs without changing the device library
1028 struct mg_sync {
1029  uint w0;
1030  uint w1;
1031 };
1032 
1033 struct mg_info {
1034  struct mg_sync *mgs;
1035  uint grid_id;
1036  uint num_grids;
1037  ulong prev_sum;
1038  ulong all_sum;
1039 };
1040 
1041 //---
1042 // TODO - review the context creation strategy here. Really should be:
1043 // - first "non-device" runtime call creates the context for this thread. Allowed to call
1044 // setDevice first.
1045 // - hipDeviceReset destroys the primary context for device?
1046 // - Then context is created again for next usage.
1047 static inline ihipCtx_t* iihipGetTlsDefaultCtx(TlsData* tls) {
1048  // Per-thread initialization of the TLS:
1049  if ((tls->defaultCtx == nullptr) && (g_deviceCnt > 0)) {
1050  tls->defaultCtx = ihipGetPrimaryCtx(0);
1051  }
1052  return tls->defaultCtx;
1053 }
1054 
1055 #endif
Definition: hip_hcc_internal.h:253
Definition: hip_hcc_internal.h:1028
Definition: hip_hcc_internal.h:778
Definition: hip_hcc_internal.h:485
Definition: hip_hcc_internal.h:112
Definition: hip_hcc_internal.h:408
Definition: hip_hcc_internal.h:381
uint32_t x
x
Definition: hip_runtime_api.h:270
Definition: grid_launch.h:17
Definition: hip_hcc_internal.h:1033
Definition: hip_module.cpp:107
Definition: hip_hcc_internal.h:135
Definition: hip_hcc_internal.h:817
Definition: hip_hcc_internal.h:898
Definition: hip_runtime_api.h:269
uint32_t y
y
Definition: hip_runtime_api.h:271
void(* hipStreamCallback_t)(hipStream_t stream, hipError_t status, void *userData)
Definition: hip_runtime_api.h:843
Definition: hip_hcc_internal.h:647
Definition: hip_hcc_internal.h:747
unsigned _computeUnits
Number of compute units supported by the device:
Definition: hip_hcc_internal.h:796
uint32_t z
z
Definition: hip_runtime_api.h:272
Definition: hip_runtime_api.h:83
Definition: hip_hcc_internal.h:675
Definition: hip_hcc_internal.h:701
Definition: hip_hcc_internal.h:1015
Definition: hip_hcc_internal.h:357
Definition: hip_hcc_internal.h:720
Definition: hip_hcc_internal.h:828
Definition: hip_hcc_internal.h:544
Definition: hip_hcc_internal.h:497
hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
Record an event in the specified stream.
Definition: hip_event.cpp:104
hsa_amd_ipc_memory_t ipc_handle
ipc memory handle on ROCr
Definition: hip_hcc_internal.h:384
Definition: hip_hcc_internal.h:449
Definition: hip_hcc_internal.h:391
hipError_t hipStreamQuery(hipStream_t stream)
Return #hipSuccess if all of the operations in the specified stream have completed, or #hipErrorNotReady if not.
Definition: hip_stream.cpp:160
Definition: hip_hcc_internal.h:95