HIP: Heterogeneous-computing Interface for Portability
hip_hcc_internal.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_SRC_HIP_HCC_INTERNAL_H
24 #define HIP_SRC_HIP_HCC_INTERNAL_H
25 
26 #include <hc.hpp>
27 #include <hsa/hsa.h>
28 #include <unordered_map>
29 #include <stack>
30 
31 #include "hsa/hsa_ext_amd.h"
32 #include "hip/hip_runtime.h"
33 #include "hip_util.h"
34 #include "env.h"
35 
36 
37 #if (__hcc_workweek__ < 16354)
38 #error("This version of HIP requires a newer version of HCC.");
39 #endif
40 
41 // Use the __appPtr field in the am memtracker to store the context.
42 // Requires a bug fix in HCC
43 #if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and \
44  (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0)
45 #define USE_APP_PTR_FOR_CTX 1
46 #endif
47 
48 
49 #define USE_IPC 1
50 
51 //---
52 // Environment variables:
53 
54 // Intended to distinguish whether an environment variable should be visible only in debug mode, or
55 // in debug+release.
56 // static const int debug = 0;
57 extern const int release;
58 
59 // TODO - this blocks both kernels and memory ops. Perhaps should have separate env var for
60 // kernels?
61 extern int HIP_LAUNCH_BLOCKING;
62 extern int HIP_API_BLOCKING;
63 
64 extern int HIP_PRINT_ENV;
65 extern int HIP_PROFILE_API;
66 // extern int HIP_TRACE_API;
67 extern int HIP_ATP;
68 extern int HIP_DB;
69 extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */
70 extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */
71 extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */
72 extern int HIP_FORCE_P2P_HOST;
73 
74 extern int HIP_HOST_COHERENT;
75 
76 extern int HIP_HIDDEN_FREE_MEM;
77 //---
78 // Chicken bits for disabling functionality to work around potential issues:
79 extern int HIP_SYNC_HOST_ALLOC;
80 extern int HIP_SYNC_STREAM_WAIT;
81 
82 extern int HIP_SYNC_NULL_STREAM;
83 extern int HIP_INIT_ALLOC;
84 extern int HIP_FORCE_NULL_STREAM;
85 
86 extern int HIP_SYNC_FREE;
87 
88 extern int HIP_DUMP_CODE_OBJECT;
89 
90 // TODO - remove when this is standard behavior.
91 extern int HCC_OPT_FLUSH;
92 
// Per-thread bookkeeping used by HIP debugging output: a short thread id, a pid,
// and a running count of API calls issued by the thread.
class TidInfo {
   public:
    TidInfo();

    // Short thread id stored in this object.
    int tid() const { return _shortTid; }

    // pid_t value stored in this object.
    pid_t pid() const { return _pid; }

    // Advance the API sequence number by one and return the updated value.
    uint64_t incApiSeqNum() {
        _apiSeqNum += 1;
        return _apiSeqNum;
    }

    // Read the current API sequence number without modifying it.
    uint64_t apiSeqNum() const { return _apiSeqNum; }

   private:
    int _shortTid;
    pid_t _pid;

    // Monotonically increasing API sequence number for this thread.
    uint64_t _apiSeqNum;
};
110 
// Collection of 64-bit trigger values (compared against API sequence numbers by the
// code that owns g_dbStartTriggers / g_dbStopTriggers).
//
// sort() orders the triggers from largest to smallest, so that after sorting the
// smallest outstanding trigger sits at the back of the vector, which is what
// nextTrigger() returns.
struct ProfTrigger {
    // Sentinel returned by nextTrigger() when no trigger is registered.
    static const uint64_t MAX_TRIGGER = std::numeric_limits<uint64_t>::max();

    // Write all registered triggers for thread `tid` to stdout, each formatted as
    // "tid:<tid>.<trigger>,", followed by a newline.
    void print(int tid) {
        std::cout << "Enabling tracing for ";
        for (const uint64_t trigger : _profTrigger) {
            std::cout << "tid:" << tid << "." << trigger << ",";
        }
        std::cout << "\n";
    }

    // Return the trigger stored last (the smallest one once sort() has run), or
    // MAX_TRIGGER when the list is empty.
    uint64_t nextTrigger() {
        if (_profTrigger.empty()) {
            return MAX_TRIGGER;
        }
        return _profTrigger.back();
    }

    // Append a trigger; call sort() afterwards to restore the descending order.
    void add(uint64_t trigger) { _profTrigger.push_back(trigger); }

    // Sort in descending order.  The comparator must operate on uint64_t: comparing
    // through std::greater<int> truncates the values and mis-orders any trigger that
    // does not fit in an int.
    void sort() {
        std::sort(_profTrigger.begin(), _profTrigger.end(), std::greater<uint64_t>());
    }

   private:
    std::vector<uint64_t> _profTrigger;
};
129 
130 
131 //---
132 // Extern tls
133 extern thread_local hipError_t tls_lastHipError;
134 extern thread_local TidInfo tls_tidInfo;
135 extern thread_local bool tls_getPrimaryCtx;
136 
137 extern std::vector<ProfTrigger> g_dbStartTriggers;
138 extern std::vector<ProfTrigger> g_dbStopTriggers;
139 
140 //---
141 // Forward defs:
142 class ihipStream_t;
143 class ihipDevice_t;
144 class ihipCtx_t;
145 struct ihipEventData_t;
146 
147 // Color defs for debug messages:
148 #define KNRM "\x1B[0m"
149 #define KRED "\x1B[31m"
150 #define KGRN "\x1B[32m"
151 #define KYEL "\x1B[33m"
152 #define KBLU "\x1B[34m"
153 #define KMAG "\x1B[35m"
154 #define KCYN "\x1B[36m"
155 #define KWHT "\x1B[37m"
156 
157 extern const char* API_COLOR;
158 extern const char* API_COLOR_END;
159 
160 
161 // If set, thread-safety is enforced on all event/stream/ctx/device functions.
162 // Can disable for performance or functional experiments - in this case
163 // the code uses a dummy "no-op" mutex.
164 #define EVENT_THREAD_SAFE 1
165 
166 #define STREAM_THREAD_SAFE 1
167 
168 #define CTX_THREAD_SAFE 1
169 
170 #define DEVICE_THREAD_SAFE 1
171 
172 
173 // Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set.
174 // May be set to 0 to remove debug if checks - possible code size and performance difference?
175 #define COMPILE_HIP_DB 1
176 
177 
178 // Compile HIP tracing capability.
179 // 0x1 = print a string at function entry with arguments.
180 // 0x2 = prints a simple message with function name + return code when function exits.
181 // 0x3 = print both.
182 // Must be enabled at runtime with HIP_TRACE_API
183 #define COMPILE_HIP_TRACE_API 0x3
184 
185 
186 // Compile code that generates trace markers for CodeXL ATP at HIP function begin/end.
187 // ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs.
188 #ifndef COMPILE_HIP_ATP_MARKER
189 #define COMPILE_HIP_ATP_MARKER 0
190 #endif
191 
192 
193 // Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function
194 // boundary.
195 // TODO - currently we print the trace message at the beginning. If we waited, we could also
196 // print return codes, and any values returned through ptr-to-args (i.e. the pointers allocated
197 // by hipMalloc).
198 #if COMPILE_HIP_ATP_MARKER
199 #include "CXLActivityLogger.h"
200 #define MARKER_BEGIN(markerName, group) amdtBeginMarker(markerName, group, nullptr);
201 #define MARKER_END() amdtEndMarker();
202 #define RESUME_PROFILING amdtResumeProfiling(AMDT_ALL_PROFILING);
203 #define STOP_PROFILING amdtStopProfiling(AMDT_ALL_PROFILING);
204 #else
205 // Swallow scoped markers:
206 #define MARKER_BEGIN(markerName, group)
207 #define MARKER_END()
208 #define RESUME_PROFILING
209 #define STOP_PROFILING
210 #endif
211 
212 
213 //---
214 // HIP Trace modes - use with HIP_TRACE_API=...
215 #define TRACE_ALL 0 // 0x01
216 #define TRACE_KCMD 1 // 0x02, kernel command
217 #define TRACE_MCMD 2 // 0x04, memory command
218 #define TRACE_MEM 3 // 0x08, memory allocation or deallocation.
219 #define TRACE_SYNC 4 // 0x10, synchronization (host or hipStreamWaitEvent)
220 #define TRACE_QUERY 5 // 0x20, hipEventRecord, hipEventQuery, hipStreamQuery
221 
222 
223 //---
224 // HIP_DB Debug flags:
225 #define DB_API 0 /* 0x01 - shortcut to enable HIP_TRACE_API on single switch */
226 #define DB_SYNC 1 /* 0x02 - trace synchronization pieces */
227 #define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */
228 #define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. . */
229 #define DB_WARN 4 /* 0x10 - warn about sub-optimal or shady behavior */
230 #define DB_FB 5 /* 0x20 - trace loading fat binary */
231 #define DB_MAX_FLAG 6
232 // When adding a new debug flag, also add to the char name table below.
233 //
234 //
235 
236 struct DbName {
237  const char* _color;
238  const char* _shortName;
239 };
240 
241 // This table must be kept in-sync with the defines above.
242 static const DbName dbName[] = {
243  {KGRN, "api"}, // not used,
244  {KYEL, "sync"}, {KCYN, "mem"}, {KMAG, "copy"}, {KRED, "warn"},
245  {KBLU, "fatbin"},
246 };
247 
248 
249 #if COMPILE_HIP_DB
250 #define tprintf(trace_level, ...) \
251  { \
252  if (HIP_DB & (1 << (trace_level))) { \
253  char msgStr[1000]; \
254  snprintf(msgStr, sizeof(msgStr), __VA_ARGS__); \
255  fprintf(stderr, " %ship-%s pid:%d tid:%d:%s%s", dbName[trace_level]._color, \
256  dbName[trace_level]._shortName, tls_tidInfo.pid(), tls_tidInfo.tid(), msgStr, KNRM); \
257  } \
258  }
259 #else
260 /* Compile to empty code */
261 #define tprintf(trace_level, ...)
262 #endif
263 
264 
265 static inline uint64_t getTicks() { return hc::get_system_ticks(); }
266 
267 //---
268 extern uint64_t recordApiTrace(std::string* fullStr, const std::string& apiStr);
269 
270 #if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
271 #define API_TRACE(forceTrace, ...) \
272  uint64_t hipApiStartTick = 0; \
273  { \
274  tls_tidInfo.incApiSeqNum(); \
275  if (forceTrace || \
276  (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL))))) { \
277  std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')'; \
278  std::string fullStr; \
279  hipApiStartTick = recordApiTrace(&fullStr, apiStr); \
280  if (HIP_PROFILE_API == 0x1) { \
281  MARKER_BEGIN(__func__, "HIP") \
282  } else if (HIP_PROFILE_API == 0x2) { \
283  MARKER_BEGIN(fullStr.c_str(), "HIP"); \
284  } \
285  } \
286  }
287 
288 #else
289 // Swallow API_TRACE
290 #define API_TRACE(IS_CMD, ...) tls_tidInfo.incApiSeqNum();
291 #endif
292 
293 #define HIP_SET_DEVICE() ihipDeviceSetState();
294 
295 // This macro should be called at the beginning of every HIP API.
296 // It initializes the hip runtime (exactly once), and
297 // generates a trace string that can be output to stderr or to ATP file.
298 #define HIP_INIT_API(cid, ...) \
299  hip_impl::hip_init(); \
300  API_TRACE(0, __VA_ARGS__); \
301  HIP_CB_SPAWNER_OBJECT(cid);
302 
303 
// Like HIP_INIT_API, but also forces the API trace when bit `tbit` (one of the TRACE_*
// values) is set in HIP_TRACE_API.
// Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU:
// kernel launches, copy commands, memory sets, etc.
// `tbit` is parenthesized so that an expression argument is shifted as a single unit.
#define HIP_INIT_SPECIAL_API(cid, tbit, ...)                                                       \
    hip_impl::hip_init();                                                                          \
    API_TRACE((HIP_TRACE_API & (1 << (tbit))), __VA_ARGS__);                                       \
    HIP_CB_SPAWNER_OBJECT(cid);
311 
312 
313 // This macro should be called at the end of every HIP API, and only at the end of top-level HIP
314 // APIs (not internal HIP functions). It has a dual function: it logs the last error returned for
315 // use by hipGetLastError, and also prints the closing message when the debug trace is enabled.
316 #define ihipLogStatus(hipStatus) \
317  ({ \
318  hipError_t localHipStatus = hipStatus; /*local copy so hipStatus only evaluated once*/ \
319  tls_lastHipError = localHipStatus; \
320  \
321  if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API & (1 << TRACE_ALL)) { \
322  auto ticks = getTicks() - hipApiStartTick; \
323  fprintf(stderr, " %ship-api pid:%d tid:%d.%lu %-30s ret=%2d (%s)>> +%lu ns%s\n", \
324  (localHipStatus == 0) ? API_COLOR : KRED, tls_tidInfo.pid(), tls_tidInfo.tid(), \
325  tls_tidInfo.apiSeqNum(), __func__, localHipStatus, \
326  ihipErrorString(localHipStatus), ticks, API_COLOR_END); \
327  } \
328  if (HIP_PROFILE_API) { \
329  MARKER_END(); \
330  } \
331  localHipStatus; \
332  })
333 
334 
335 class ihipException : public std::exception {
336  public:
337  explicit ihipException(hipError_t e) : _code(e){};
338 
339  hipError_t _code;
340 };
341 
342 
343 #ifdef __cplusplus
344 extern "C" {
345 #endif
346 
347 
348 #ifdef __cplusplus
349 }
350 #endif
351 
352 const hipStream_t hipStreamNull = 0x0;
353 
354 
358 #define HIP_IPC_RESERVED_SIZE 24
360  public:
361 #if USE_IPC
362  hsa_amd_ipc_memory_t ipc_handle;
363 #endif
364  size_t psize;
365  char reserved[HIP_IPC_RESERVED_SIZE];
366 };
367 
368 
369 struct ihipModule_t {
370  std::string fileName;
371  hsa_executable_t executable = {};
372  hsa_code_object_reader_t coReader = {};
373  std::string hash;
374 
375  ~ihipModule_t() {
376  if (executable.handle) hsa_executable_destroy(executable);
377  if (coReader.handle) hsa_code_object_reader_destroy(coReader);
378  }
379 };
380 
381 
//---
// No-op stand-in for a real mutex: every operation returns immediately.
// Used to remove lock, for performance or stimulating bugs.
class FakeMutex {
   public:
    // Does nothing.
    void lock() {
    }

    // Always reports success.
    bool try_lock() {
        return true;
    }

    // Does nothing.
    void unlock() {
    }
};
390 
391 #if EVENT_THREAD_SAFE
392 typedef std::mutex EventMutex;
393 #else
394 #warning "Stream thread-safe disabled"
395 typedef FakeMutex EventMutex;
396 #endif
397 
398 #if STREAM_THREAD_SAFE
399 typedef std::mutex StreamMutex;
400 #else
401 #warning "Stream thread-safe disabled"
402 typedef FakeMutex StreamMutex;
403 #endif
404 
405 // Pair Device and Ctx together, these could also be toggled separately if desired.
406 #if CTX_THREAD_SAFE
407 typedef std::mutex CtxMutex;
408 #else
409 typedef FakeMutex CtxMutex;
410 #warning "Ctx thread-safe disabled"
411 #endif
412 
413 #if DEVICE_THREAD_SAFE
414 typedef std::mutex DeviceMutex;
415 #else
416 typedef FakeMutex DeviceMutex;
417 #warning "Device thread-safe disabled"
418 #endif
419 
420 //
421 //---
422 // Protects access to the wrapped critical data with a lock acquired on construction and (when autoUnlock is set) released on destruction.
423 // T must contain a _mutex field which meets the BasicLockable requirements (lock/unlock)
424 template <typename T>
426  public:
427  LockedAccessor(T& criticalData, bool autoUnlock = true)
428  : _criticalData(&criticalData),
429  _autoUnlock(autoUnlock)
430 
431  {
432  tprintf(DB_SYNC, "locking criticalData=%p for %s..\n", _criticalData,
433  ToString(_criticalData->_parent).c_str());
434  _criticalData->_mutex.lock();
435  };
436 
437  ~LockedAccessor() {
438  if (_autoUnlock) {
439  tprintf(DB_SYNC, "auto-unlocking criticalData=%p for %s...\n", _criticalData,
440  ToString(_criticalData->_parent).c_str());
441  _criticalData->_mutex.unlock();
442  }
443  }
444 
445  void unlock() {
446  tprintf(DB_SYNC, "unlocking criticalData=%p for %s...\n", _criticalData,
447  ToString(_criticalData->_parent).c_str());
448  _criticalData->_mutex.unlock();
449  }
450 
451  // Syntactic sugar so -> can be used to get the underlying type.
452  T* operator->() { return _criticalData; };
453 
454  private:
455  T* _criticalData;
456  bool _autoUnlock;
457 };
458 
459 
// Base that embeds a mutex of type MUTEX_TYPE and forwards lock / unlock / try_lock to it.
template <typename MUTEX_TYPE>
struct LockedBase {
    // Experts-only interface for explicit locking.
    // Most uses should use the lock-accessor.
    void lock() {
        this->_mutex.lock();
    }

    void unlock() {
        this->_mutex.unlock();
    }

    // Returns whatever the underlying mutex's try_lock() reports.
    bool try_lock() {
        const bool acquired = this->_mutex.try_lock();
        return acquired;
    }

    MUTEX_TYPE _mutex;
};
470 
471 
472 template <typename MUTEX_TYPE>
473 class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE> {
474  public:
475  ihipStreamCriticalBase_t(ihipStream_t* parentStream, hc::accelerator_view av)
476  : _kernelCnt(0), _av(av), _parent(parentStream){};
477 
479 
482  return this;
483  };
484 
485  void munlock() {
486  tprintf(DB_SYNC, "munlocking criticalData=%p for %s...\n", this,
487  ToString(this->_parent).c_str());
489  };
490 
492  bool gotLock = LockedBase<MUTEX_TYPE>::try_lock();
493  tprintf(DB_SYNC, "mtry_locking=%d criticalData=%p for %s...\n", gotLock, this,
494  ToString(this->_parent).c_str());
495  return gotLock ? this : nullptr;
496  };
497 
498  public:
499  ihipStream_t* _parent;
500  uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait().
501 
502  hc::accelerator_view _av;
503 
504  private:
505 };
506 
507 
508 // if HIP code needs to acquire locks for both ihipCtx_t and ihipStream_t, it should first acquire
509 // the lock for the ihipCtx_t and then for the individual streams. The locks should not be acquired
510 // in reverse order or deadlock may occur. In some cases, it may be possible to reduce the range
511 // where the locks must be held. HIP routines should avoid acquiring and releasing the same lock
512 // during the execution of a single HIP API. Another option is to use try_lock in the innermost lock
513 // query.
514 
515 
518 
519 //---
520 // Internal stream structure.
522  public:
523  enum ScheduleMode { Auto, Spin, Yield };
524  typedef uint64_t SeqNum_t;
525 
526 // TODO - make av a reference to avoid shared_ptr overhead?
527  ihipStream_t(ihipCtx_t* ctx, hc::accelerator_view av, unsigned int flags);
528  ~ihipStream_t();
529 
530  // kind is hipMemcpyKind
531  void locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind,
532  bool resolveOn = true);
533 
534  bool locked_copy2DSync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind,
535  bool resolveOn = true);
536 
537  void locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
538 
539  bool locked_copy2DAsync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind);
540 
541  void lockedSymbolCopySync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
542  size_t offset, unsigned kind);
543  void lockedSymbolCopyAsync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
544  size_t offset, unsigned kind);
545 
546  //---
547  // Member functions that begin with locked_ are thread-safe accessors - these acquire / release
548  // the critical mutex.
549  LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
550  void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av);
551 
552 
553  void locked_wait();
554 
555  hc::accelerator_view* locked_getAv() {
556  LockedAccessor_StreamCrit_t crit(_criticalData);
557  return &(crit->_av);
558  };
559 
560  void locked_streamWaitEvent(ihipEventData_t& event);
561  hc::completion_future locked_recordEvent(hipEvent_t event);
562 
563  bool locked_eventIsReady(hipEvent_t event);
564  void locked_eventWaitComplete(hc::completion_future& marker, hc::hcWaitMode waitMode);
565 
566  ihipStreamCritical_t& criticalData() { return _criticalData; };
567 
568  //---
569  hc::hcWaitMode waitMode() const;
570 
571  // Use this if we already have the stream critical data mutex:
572  void wait(LockedAccessor_StreamCrit_t& crit);
573 
574  void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal, uint32_t blockDimX,
575  uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX,
576  uint32_t gridDimY, uint32_t gridDimZ, uint32_t groupSegmentSize,
577  uint32_t sharedMemBytes, void* kernarg, size_t kernSize,
578  uint64_t kernel);
579 
580 
581  //-- Non-racy accessors:
582  // These functions access fields set at initialization time and are non-racy (so do not acquire
583  // mutex)
584  const ihipDevice_t* getDevice() const;
585  ihipCtx_t* getCtx() const;
586 
587  // Before calling this function, stream must be resolved from "0" to the actual stream:
588  bool isDefaultStream() const { return _id == 0; };
589 
590  public:
591  //---
592  // Public member vars - these are set at initialization and never change:
593  SeqNum_t _id; // monotonic sequence ID. 0 is the default stream.
594  unsigned _flags;
595 
596 
597  private:
598  // The unsigned return is hipMemcpyKind
599  unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem);
600  void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo* dstPtrInfo,
601  const hc::AmPointerInfo* srcPtrInfo, hc::hcCommandKind* hcCopyDir,
602  ihipCtx_t** copyDevice, bool* forceUnpinnedCopy);
603 
604  bool canSeeMemory(const ihipCtx_t* thisCtx, const hc::AmPointerInfo* dstInfo,
605  const hc::AmPointerInfo* srcInfo);
606 
607  void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes);
608 
609 
610  public: // TODO - move private
611  // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t
612  ihipStreamCritical_t _criticalData;
613 
614  private: // Data
615  std::mutex _hasQueueLock;
616 
617  ihipCtx_t* _ctx; // parent context that owns this stream.
618 
619  // Friends:
620  friend std::ostream& operator<<(std::ostream& os, const ihipStream_t& s);
621  friend hipError_t hipStreamQuery(hipStream_t);
622 
623  ScheduleMode _scheduleMode;
624 };
625 
626 
627 //----
628 // Internal structure for stream callback handler
630  public:
631  ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void* userData)
632  : _stream(stream), _callback(callback), _userData(userData) {
633  };
634  hipStream_t _stream;
635  hipStreamCallback_t _callback;
636  void* _userData;
637 };
638 
639 
640 //----
641 // Internal event structure:
642 enum hipEventStatus_t {
643  hipEventStatusUnitialized = 0, // event is uninitialized, must be "Created" before use.
644  hipEventStatusCreated = 1, // event created, but not yet Recorded
645  hipEventStatusRecording = 2, // event has been recorded into a stream but not completed yet.
646  hipEventStatusComplete = 3, // event has been recorded - timestamps are valid.
647 };
648 
649 // TODO - rename to ihip type of some kind
650 enum ihipEventType_t {
651  hipEventTypeIndependent,
652  hipEventTypeStartCommand,
653  hipEventTypeStopCommand,
654 };
655 
656 
658  ihipEventData_t() {
659  _state = hipEventStatusCreated;
660  _stream = NULL;
661  _timestamp = 0;
662  _type = hipEventTypeIndependent;
663  };
664 
665  void marker(const hc::completion_future& marker) { _marker = marker; };
666  hc::completion_future& marker() { return _marker; }
667  uint64_t timestamp() const { return _timestamp; };
668  ihipEventType_t type() const { return _type; };
669 
670  ihipEventType_t _type;
671  hipEventStatus_t _state;
672  hipStream_t _stream; // Stream where the event is recorded. Null stream is resolved to actual
673  // stream when recorded
674  uint64_t _timestamp; // store timestamp, may be set on host or by marker.
675  private:
676  hc::completion_future _marker;
677 };
678 
679 
680 //=============================================================================
681 // class ihipEventCriticalBase_t
682 template <typename MUTEX_TYPE>
683 class ihipEventCriticalBase_t : LockedBase<MUTEX_TYPE> {
684  public:
685  explicit ihipEventCriticalBase_t(const ihipEvent_t* parentEvent) : _parent(parentEvent) {}
687 
688  // Keep data in structure so it can be easily copied into snapshots
689  // (used to reduce lock contention and preserve correct lock order)
690  ihipEventData_t _eventData;
691 
692  private:
693  const ihipEvent_t* _parent;
694  friend class LockedAccessor<ihipEventCriticalBase_t>;
695 };
696 
698 
700 
701 // internal hip event structure.
702 class ihipEvent_t {
703  public:
704  explicit ihipEvent_t(unsigned flags);
705  void attachToCompletionFuture(const hc::completion_future* cf, hipStream_t stream,
706  ihipEventType_t eventType);
707  std::pair<hipEventStatus_t, uint64_t> refreshEventStatus(); // returns pair <state, timestamp>
708 
709 
710  // Return a copy of the critical state. The critical data is locked during the copy.
711  ihipEventData_t locked_copyCrit() {
712  LockedAccessor_EventCrit_t crit(_criticalData);
713  return _criticalData._eventData;
714  };
715 
716  ihipEventCritical_t& criticalData() { return _criticalData; };
717 
718  public:
719  unsigned _flags;
720 
721  private:
722  ihipEventCritical_t _criticalData;
723 
724  friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
725 };
726 
727 
728 //=============================================================================
729 // class ihipDeviceCriticalBase_t
730 template <typename MUTEX_TYPE>
732  public:
733  explicit ihipDeviceCriticalBase_t(ihipDevice_t* parentDevice)
734  : _parent(parentDevice), _ctxCount(0){};
735 
737 
738  // Contexts:
739  void addContext(ihipCtx_t* ctx);
740  void removeContext(ihipCtx_t* ctx);
741  std::list<ihipCtx_t*>& ctxs() { return _ctxs; };
742  const std::list<ihipCtx_t*>& const_ctxs() const { return _ctxs; };
743  int getcount() { return _ctxCount; };
744  friend class LockedAccessor<ihipDeviceCriticalBase_t>;
745 
746  private:
747  ihipDevice_t* _parent;
748 
749  //--- Context Tracker:
750  std::list<ihipCtx_t*> _ctxs; // contexts associated with this device across all threads.
751 
752  int _ctxCount;
753 };
754 
756 
758 
759 //----
760 // Properties of the HIP device.
761 // Multiple contexts can point to same device.
763  public:
764  ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator& acc);
765  ~ihipDevice_t();
766 
767  // Accessors:
768  ihipCtx_t* getPrimaryCtx() const { return _primaryCtx; };
769  void locked_removeContext(ihipCtx_t* c);
770  void locked_reset();
771  ihipDeviceCritical_t& criticalData() { return _criticalData; };
772 
773  public:
774  unsigned _deviceId; // device ID
775 
776  hc::accelerator _acc;
777  hsa_agent_t _hsaAgent; // hsa agent handle
778 
780  unsigned _computeUnits;
781  hipDeviceProp_t _props; // saved device properties.
782 
783  // TODO - report this through device properties, base on HCC API call.
784  int _isLargeBar;
785 
786  ihipCtx_t* _primaryCtx;
787 
788  int _state; // 1 if device is set otherwise 0
789 
790  private:
791  hipError_t initProperties(hipDeviceProp_t* prop);
792 
793  private:
794  ihipDeviceCritical_t _criticalData;
795 };
796 //=============================================================================
797 
798 
799 //---
800 //
801 struct ihipExec_t {
802  dim3 _gridDim;
803  dim3 _blockDim;
804  size_t _sharedMem;
805  hipStream_t _hStream;
806  std::vector<char> _arguments;
807 };
808 
809 //=============================================================================
810 // class ihipCtxCriticalBase_t
811 template <typename MUTEX_TYPE>
812 class ihipCtxCriticalBase_t : LockedBase<MUTEX_TYPE> {
813  public:
814  ihipCtxCriticalBase_t(ihipCtx_t* parentCtx, unsigned deviceCnt)
815  : _parent(parentCtx), _peerCnt(0) {
816  _peerAgents = new hsa_agent_t[deviceCnt];
817  };
818 
820  if (_peerAgents != nullptr) {
821  delete _peerAgents;
822  _peerAgents = nullptr;
823  }
824  _peerCnt = 0;
825  }
826 
827  // Streams:
828  void addStream(ihipStream_t* stream);
829  std::list<ihipStream_t*>& streams() { return _streams; };
830  const std::list<ihipStream_t*>& const_streams() const { return _streams; };
831 
832 
833  // Peer Accessor classes:
834  bool isPeerWatcher(const ihipCtx_t* peer); // returns True if peer has access to memory
835  // physically located on this device.
836  bool addPeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
837  bool removePeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
838  void resetPeerWatchers(ihipCtx_t* thisDevice);
839  void printPeerWatchers(FILE* f) const;
840 
841  uint32_t peerCnt() const { return _peerCnt; };
842  hsa_agent_t* peerAgents() const { return _peerAgents; };
843 
844 
845  // TODO - move private
846  std::list<ihipCtx_t*> _peers; // list of enabled peer devices.
847  //--- Execution stack:
848  std::stack<ihipExec_t> _execStack; // Execution stack for this device.
849 
850  friend class LockedAccessor<ihipCtxCriticalBase_t>;
851 
852  private:
853  ihipCtx_t* _parent;
854 
855  //--- Stream Tracker:
856  std::list<ihipStream_t*> _streams; // streams associated with this device.
857 
858 
859  //--- Peer Tracker:
860  // These reflect the currently Enabled set of peers for this GPU:
861  // Enabled peers have permissions to access the memory physically allocated on this device.
862  // Note the peers always contain the self agent for easy interfacing with HSA APIs.
863  uint32_t _peerCnt; // number of enabled peers
864  hsa_agent_t* _peerAgents; // efficient packed array of enabled agents (to use for allocations.)
865  private:
866  void recomputePeerAgents();
867 };
868 // Note Mutex type Real/Fake selected based on CtxMutex
870 
871 // This type is used by functions that need access to the critical device structures.
873 //=============================================================================
874 
875 
876 //=============================================================================
877 // class ihipCtx_t:
878 // A HIP CTX (context) points at one of the existing devices and contains the streams,
879 // peer-to-peer mappings, creation flags. Multiple contexts can point to the same
880 // device.
881 //
882 class ihipCtx_t {
883  public: // Functions:
884  ihipCtx_t(ihipDevice_t* device, unsigned deviceCnt,
885  unsigned flags); // note: calls constructor for _criticalData
886  ~ihipCtx_t();
887 
888  // Functions which read or write the critical data are named locked_.
889  // (might be better called "locking_"
890  // ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a
891  // locked_ function from within a locked_ function. External functions which call several
892  // locked_ functions will acquire and release the lock for each function. if this occurs in
893  // performance-sensitive code we may want to refactor by adding non-locked functions and
894  // creating a new locked_ member function to call them all.
895  void locked_removeStream(ihipStream_t* s);
896  void locked_reset();
897  void locked_waitAllStreams();
898  void locked_syncDefaultStream(bool waitOnSelf, bool syncHost);
899 
900  ihipCtxCritical_t& criticalData() { return _criticalData; };
901 
902  const ihipDevice_t* getDevice() const { return _device; };
903  int getDeviceNum() const { return _device->_deviceId; };
904 
905  // TODO - review uses of getWriteableDevice(), can these be converted to getDevice()
906  ihipDevice_t* getWriteableDevice() const { return _device; };
907 
908  std::string toString() const;
909 
910  public: // Data
911  // The NULL stream is used if no other stream is specified.
912  // Default stream has special synchronization properties with other streams.
913  ihipStream_t* _defaultStream;
914 
915  // Flags specified when the context is created:
916  unsigned _ctxFlags;
917 
918  private:
919  ihipDevice_t* _device;
920 
921 
922  private: // Critical data, protected with locked access:
923  // Members of _protected data MUST be accessed through the LockedAccessor.
924  // Search for LockedAccessor<ihipCtxCritical_t> for examples; do not access _criticalData
925  // directly.
926  ihipCtxCritical_t _criticalData;
927 };
928 
929 
930 //=================================================================================================
931 // Global variable definition:
932 extern unsigned g_deviceCnt;
933 extern hsa_agent_t g_cpu_agent; // the CPU agent.
934 extern hsa_agent_t* g_allAgents; // CPU agents + all the visible GPU agents.
935 
936 //=================================================================================================
937 // Extern functions:
938 extern void ihipInit();
939 extern const char* ihipErrorString(hipError_t);
940 extern ihipCtx_t* ihipGetTlsDefaultCtx();
941 extern void ihipSetTlsDefaultCtx(ihipCtx_t* ctx);
942 extern hipError_t ihipSynchronize(void);
943 extern void ihipCtxStackUpdate();
944 extern hipError_t ihipDeviceSetState();
945 
946 extern ihipDevice_t* ihipGetDevice(int);
947 ihipCtx_t* ihipGetPrimaryCtx(unsigned deviceIndex);
948 hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
949  const char* name, hsa_agent_t *agent);
950 
951 
952 hipStream_t ihipSyncAndResolveStream(hipStream_t);
953 hipError_t ihipStreamSynchronize(hipStream_t stream);
954 void ihipStreamCallbackHandler(ihipStreamCallback_t* cb);
955 
956 // Stream printf functions:
957 inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s) {
958  os << "stream:";
959  os << s.getDevice()->_deviceId;
960  ;
961  os << '.';
962  os << s._id;
963  return os;
964 }
965 
966 inline std::ostream& operator<<(std::ostream& os, const dim3& s) {
967  os << '{';
968  os << s.x;
969  os << ',';
970  os << s.y;
971  os << ',';
972  os << s.z;
973  os << '}';
974  return os;
975 }
976 
977 inline std::ostream& operator<<(std::ostream& os, const gl_dim3& s) {
978  os << '{';
979  os << s.x;
980  os << ',';
981  os << s.y;
982  os << ',';
983  os << s.z;
984  os << '}';
985  return os;
986 }
987 
988 // Stream printf functions:
989 inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) {
990  os << "event:" << std::hex << static_cast<void*>(e);
991  return os;
992 }
993 
994 inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) {
995  os << "ctx:" << static_cast<const void*>(c) << ".dev:" << c->getDevice()->_deviceId;
996  return os;
997 }
998 
999 
1000 // Helper functions that are used across src files:
1001 namespace hip_internal {
1002 hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
1003  hipStream_t stream);
1004 };
1005 
1006 
1007 #endif
Definition: hip_hcc_internal.h:236
Definition: hip_hcc_internal.h:762
Definition: hip_hcc_internal.h:461
Definition: hip_hcc_internal.h:111
Definition: hip_hcc_internal.h:384
Definition: hip_hcc_internal.h:359
uint32_t x
x
Definition: hip_runtime_api.h:266
Definition: grid_launch.h:17
Definition: hip_module.cpp:89
Definition: hip_hcc_internal.h:801
Definition: hip_hcc_internal.h:882
Definition: hip_runtime_api.h:265
uint32_t y
y
Definition: hip_runtime_api.h:267
void(* hipStreamCallback_t)(hipStream_t stream, hipError_t status, void *userData)
Definition: hip_runtime_api.h:816
Definition: hip_hcc_internal.h:629
Definition: hip_hcc_internal.h:731
unsigned _computeUnits
Number of compute units supported by the device:
Definition: hip_hcc_internal.h:780
uint32_t z
z
Definition: hip_runtime_api.h:268
Definition: hip_runtime_api.h:83
Definition: hip_hcc_internal.h:657
Definition: hip_hcc_internal.h:683
Definition: hip_hcc_internal.h:1001
Definition: hip_hcc_internal.h:335
Definition: hip_hcc_internal.h:702
Definition: hip_hcc_internal.h:812
Definition: hip_hcc_internal.h:521
Definition: hip_hcc_internal.h:473
hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
Record an event in the specified stream.
Definition: hip_event.cpp:110
hsa_amd_ipc_memory_t ipc_handle
ipc memory handle on ROCr
Definition: hip_hcc_internal.h:362
Definition: hip_hcc_internal.h:425
Definition: hip_hcc_internal.h:369
hipError_t hipStreamQuery(hipStream_t stream)
Return #hipSuccess if all of the operations in the specified stream have completed, or #hipErrorNotReady if not.
Definition: hip_stream.cpp:157
Definition: hip_hcc_internal.h:94