| //===---- omptarget-nvptx.h - NVPTX OpenMP GPU initialization ---- CUDA -*-===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is dual licensed under the MIT and the University of Illinois Open |
| // Source Licenses. See LICENSE.txt for details. |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file contains the declarations of all library macros, types, |
| // and functions. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef __OMPTARGET_NVPTX_H |
| #define __OMPTARGET_NVPTX_H |
| |
| // std includes |
| #include <stdint.h> |
| #include <stdlib.h> |
| |
| #include <inttypes.h> |
| |
| // cuda includes |
| #include <cuda.h> |
| #include <math.h> |
| |
| // local includes |
| #include "counter_group.h" |
| #include "debug.h" // debug |
| #include "interface.h" // interfaces with omp, compiler, and user |
| #include "option.h" // choices we have |
| #include "state-queue.h" |
| #include "support.h" |
| |
| #define OMPTARGET_NVPTX_VERSION 1.1 |
| |
| // used by the library for the interface with the app |
| #define DISPATCH_FINISHED 0 |
| #define DISPATCH_NOTFINISHED 1 |
| |
| // used by dynamic scheduling |
| #define FINISHED 0 |
| #define NOT_FINISHED 1 |
| #define LAST_CHUNK 2 |
| |
| #define BARRIER_COUNTER 0 |
| #define ORDERED_COUNTER 1 |
| |
| // Macros for Cuda intrinsics |
| // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. |
| // Also, __ballot(1) in Cuda 8.0 is replaced with __activemask(). |
| #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 |
| #define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane)) |
| #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ |
| __shfl_down_sync((mask), (var), (delta), (width)) |
| #define __BALLOT_SYNC(mask, predicate) __ballot_sync((mask), (predicate)) |
| #define __ACTIVEMASK() __activemask() |
| #else |
| #define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane)) |
| #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ |
| __shfl_down((var), (delta), (width)) |
| #define __BALLOT_SYNC(mask, predicate) __ballot((predicate)) |
| #define __ACTIVEMASK() __ballot(1) |
| #endif |
| |
| // arguments needed for L0 parallelism only. |
| class omptarget_nvptx_SharedArgs { |
| public: |
| // All these methods must be called by the master thread only. |
| INLINE void Init() { |
| args = buffer; |
| nArgs = MAX_SHARED_ARGS; |
| } |
| INLINE void DeInit() { |
| // Free any memory allocated for outlined parallel function with a large |
| // number of arguments. |
| if (nArgs > MAX_SHARED_ARGS) { |
| SafeFree(args, (char *)"new extended args"); |
| Init(); |
| } |
| } |
| INLINE void EnsureSize(size_t size) { |
| if (size > nArgs) { |
| if (nArgs > MAX_SHARED_ARGS) { |
| SafeFree(args, (char *)"new extended args"); |
| } |
| args = (void **) SafeMalloc(size * sizeof(void *), |
| (char *)"new extended args"); |
| nArgs = size; |
| } |
| } |
| // Called by all threads. |
| INLINE void **GetArgs() { return args; }; |
| private: |
| // buffer of pre-allocated arguments. |
| void *buffer[MAX_SHARED_ARGS]; |
| // pointer to arguments buffer. |
| // starts off as a pointer to 'buffer' but can be dynamically allocated. |
| void **args; |
| // starts off as MAX_SHARED_ARGS but can increase in size. |
| uint32_t nArgs; |
| }; |
| |
| extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; |
| |
| // Data sharing related quantities, need to match what is used in the compiler. |
| enum DATA_SHARING_SIZES { |
| // The maximum number of workers in a kernel. |
| DS_Max_Worker_Threads = 992, |
| // The size reserved for data in a shared memory slot. |
| DS_Slot_Size = 256, |
| // The slot size that should be reserved for a working warp. |
| DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, |
| // The maximum number of warps in use |
| DS_Max_Warp_Number = 32, |
| }; |
| |
| // Data structure to keep in shared memory that traces the current slot, stack, |
| // and frame pointer as well as the active threads that didn't exit the current |
| // environment. |
| struct DataSharingStateTy { |
| __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; |
| void *StackPtr[DS_Max_Warp_Number]; |
| void *FramePtr[DS_Max_Warp_Number]; |
| int32_t ActiveThreads[DS_Max_Warp_Number]; |
| }; |
| // Additional worker slot type which is initialized with the default worker slot |
| // size of 4*32 bytes. |
| struct __kmpc_data_sharing_worker_slot_static { |
| __kmpc_data_sharing_slot *Next; |
| __kmpc_data_sharing_slot *Prev; |
| void *PrevSlotStackPtr; |
| void *DataEnd; |
| char Data[DS_Worker_Warp_Slot_Size]; |
| }; |
| // Additional master slot type which is initialized with the default master slot |
| // size of 4 bytes. |
| struct __kmpc_data_sharing_master_slot_static { |
| __kmpc_data_sharing_slot *Next; |
| __kmpc_data_sharing_slot *Prev; |
| void *PrevSlotStackPtr; |
| void *DataEnd; |
| char Data[DS_Slot_Size]; |
| }; |
| extern __device__ __shared__ DataSharingStateTy DataSharingState; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // task ICV and (implicit & explicit) task state |
| |
| class omptarget_nvptx_TaskDescr { |
| public: |
| // methods for flags |
| INLINE omp_sched_t GetRuntimeSched(); |
| INLINE void SetRuntimeSched(omp_sched_t sched); |
| INLINE int IsDynamic() { return items.flags & TaskDescr_IsDynamic; } |
| INLINE void SetDynamic() { |
| items.flags = items.flags | TaskDescr_IsDynamic; |
| } |
| INLINE void ClearDynamic() { |
| items.flags = items.flags & (~TaskDescr_IsDynamic); |
| } |
| INLINE int InParallelRegion() { return items.flags & TaskDescr_InPar; } |
| INLINE int InL2OrHigherParallelRegion() { |
| return items.flags & TaskDescr_InParL2P; |
| } |
| INLINE int IsParallelConstruct() { |
| return items.flags & TaskDescr_IsParConstr; |
| } |
| INLINE int IsTaskConstruct() { return !IsParallelConstruct(); } |
| // methods for other fields |
| INLINE uint16_t &NThreads() { return items.nthreads; } |
| INLINE uint16_t &ThreadLimit() { return items.threadlimit; } |
| INLINE uint16_t &ThreadId() { return items.threadId; } |
| INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; } |
| INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } |
| INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() { return prev; } |
| INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { |
| prev = taskDescr; |
| } |
| // init & copy |
| INLINE void InitLevelZeroTaskDescr(); |
| INLINE void InitLevelOneTaskDescr(uint16_t tnum, |
| omptarget_nvptx_TaskDescr *parentTaskDescr); |
| INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); |
| INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); |
| INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); |
| INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); |
| INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr, |
| uint16_t tnum); |
| INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); |
| INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, |
| uint16_t tid, uint16_t tnum); |
| INLINE void SaveLoopData(); |
| INLINE void RestoreLoopData() const; |
| |
| private: |
| // bits for flags: (7 used, 1 free) |
| // 3 bits (SchedMask) for runtime schedule |
| // 1 bit (IsDynamic) for dynamic schedule (false = static) |
| // 1 bit (InPar) if this thread has encountered one or more parallel region |
| // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task) |
| // 1 bit (InParL2+) if this thread has encountered L2 or higher parallel |
| // region |
| static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4); |
| static const uint8_t TaskDescr_IsDynamic = 0x8; |
| static const uint8_t TaskDescr_InPar = 0x10; |
| static const uint8_t TaskDescr_IsParConstr = 0x20; |
| static const uint8_t TaskDescr_InParL2P = 0x40; |
| |
| struct SavedLoopDescr_items { |
| int64_t loopUpperBound; |
| int64_t nextLowerBound; |
| int64_t chunk; |
| int64_t stride; |
| kmp_sched_t schedule; |
| } loopData; |
| |
| struct TaskDescr_items { |
| uint8_t flags; // 6 bit used (see flag above) |
| uint8_t unused; |
| uint16_t nthreads; // thread num for subsequent parallel regions |
| uint16_t threadlimit; // thread limit ICV |
| uint16_t threadId; // thread id |
| uint16_t threadsInTeam; // threads in current team |
| uint64_t runtimeChunkSize; // runtime chunk size |
| } items; |
| omptarget_nvptx_TaskDescr *prev; |
| }; |
| |
| // build on kmp |
| typedef struct omptarget_nvptx_ExplicitTaskDescr { |
| omptarget_nvptx_TaskDescr |
| taskDescr; // omptarget_nvptx task description (must be first) |
| kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last) |
| } omptarget_nvptx_ExplicitTaskDescr; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // Descriptor of a parallel region (worksharing in general) |
| |
| class omptarget_nvptx_WorkDescr { |
| |
| public: |
| // access to data |
| INLINE omptarget_nvptx_CounterGroup &CounterGroup() { return cg; } |
| INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; } |
| // init |
| INLINE void InitWorkDescr(); |
| |
| private: |
| omptarget_nvptx_CounterGroup cg; // for barrier (no other needed) |
| omptarget_nvptx_TaskDescr masterTaskICV; |
| bool hasCancel; |
| }; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| class omptarget_nvptx_TeamDescr { |
| public: |
| // access to data |
| INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { |
| return &levelZeroTaskDescr; |
| } |
| INLINE omptarget_nvptx_WorkDescr &WorkDescr() { |
| return workDescrForActiveParallel; |
| } |
| INLINE omp_lock_t *CriticalLock() { return &criticalLock; } |
| INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; } |
| |
| // init |
| INLINE void InitTeamDescr(); |
| |
| INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) { |
| // If this is invoked by the master thread of the master warp then intialize |
| // it with a smaller slot. |
| if (IsMasterThread) { |
| // Do not initalize this slot again if it has already been initalized. |
| if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size) |
| return 0; |
| // Initialize the pointer to the end of the slot given the size of the |
| // data section. DataEnd is non-inclusive. |
| master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size; |
| // We currently do not have a next slot. |
| master_rootS[0].Next = 0; |
| master_rootS[0].Prev = 0; |
| master_rootS[0].PrevSlotStackPtr = 0; |
| return (__kmpc_data_sharing_slot *)&master_rootS[0]; |
| } |
| // Do not initalize this slot again if it has already been initalized. |
| if (worker_rootS[wid].DataEnd == |
| &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size) |
| return 0; |
| // Initialize the pointer to the end of the slot given the size of the data |
| // section. DataEnd is non-inclusive. |
| worker_rootS[wid].DataEnd = |
| &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; |
| // We currently do not have a next slot. |
| worker_rootS[wid].Next = 0; |
| worker_rootS[wid].Prev = 0; |
| worker_rootS[wid].PrevSlotStackPtr = 0; |
| return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; |
| } |
| |
| INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) { |
| worker_rootS[wid].DataEnd = |
| &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; |
| // We currently do not have a next slot. |
| worker_rootS[wid].Next = 0; |
| worker_rootS[wid].Prev = 0; |
| worker_rootS[wid].PrevSlotStackPtr = 0; |
| return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; |
| } |
| |
| private: |
| omptarget_nvptx_TaskDescr |
| levelZeroTaskDescr; // icv for team master initial thread |
| omptarget_nvptx_WorkDescr |
| workDescrForActiveParallel; // one, ONLY for the active par |
| omp_lock_t criticalLock; |
| uint64_t lastprivateIterBuffer; |
| |
| __align__(16) |
| __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE]; |
| __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1]; |
| }; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // thread private data (struct of arrays for better coalescing) |
| // tid refers here to the global thread id |
| // do not support multiple concurrent kernel a this time |
| class omptarget_nvptx_ThreadPrivateContext { |
| public: |
| // task |
| INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { |
| return &levelOneTaskDescr[tid]; |
| } |
| INLINE void SetTopLevelTaskDescr(int tid, |
| omptarget_nvptx_TaskDescr *taskICV) { |
| topTaskDescr[tid] = taskICV; |
| } |
| INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid); |
| // parallel |
| INLINE uint16_t &NumThreadsForNextParallel(int tid) { |
| return nextRegion.tnum[tid]; |
| } |
| // simd |
| INLINE uint16_t &SimdLimitForNextSimd(int tid) { |
| return nextRegion.slim[tid]; |
| } |
| // sync |
| INLINE Counter &Priv(int tid) { return priv[tid]; } |
| INLINE void IncrementPriv(int tid, Counter val) { priv[tid] += val; } |
| // schedule (for dispatch) |
| INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } |
| INLINE int64_t &Chunk(int tid) { return chunk[tid]; } |
| INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } |
| INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; } |
| INLINE int64_t &Stride(int tid) { return stride[tid]; } |
| |
| INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } |
| |
| INLINE void InitThreadPrivateContext(int tid); |
| INLINE void SetSourceQueue(uint64_t Src) { SourceQueue = Src; } |
| INLINE uint64_t GetSourceQueue() { return SourceQueue; } |
| |
| private: |
| // team context for this team |
| omptarget_nvptx_TeamDescr teamContext; |
| // task ICV for implict threads in the only parallel region |
| omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM]; |
| // pointer where to find the current task ICV (top of the stack) |
| omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM]; |
| union { |
| // Only one of the two is live at the same time. |
| // parallel |
| uint16_t tnum[MAX_THREADS_PER_TEAM]; |
| // simd limit |
| uint16_t slim[MAX_THREADS_PER_TEAM]; |
| } nextRegion; |
| // sync |
| Counter priv[MAX_THREADS_PER_TEAM]; |
| // schedule (for dispatch) |
| kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for |
| int64_t chunk[MAX_THREADS_PER_TEAM]; |
| int64_t loopUpperBound[MAX_THREADS_PER_TEAM]; |
| // state for dispatch with dyn/guided OR static (never use both at a time) |
| int64_t nextLowerBound[MAX_THREADS_PER_TEAM]; |
| int64_t stride[MAX_THREADS_PER_TEAM]; |
| // Queue to which this object must be returned. |
| uint64_t SourceQueue; |
| }; |
| |
| /// Device envrionment data |
| struct omptarget_device_environmentTy { |
| int32_t debug_level; |
| }; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // global device envrionment |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| extern __device__ omptarget_device_environmentTy omptarget_device_environment; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // global data tables |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| extern __device__ __shared__ |
| omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; |
| extern __device__ __shared__ uint32_t execution_param; |
| extern __device__ __shared__ void *ReductionScratchpadPtr; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // work function (outlined parallel/simd functions) and arguments. |
| // needed for L1 parallelism only. |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| typedef void *omptarget_nvptx_WorkFn; |
| extern volatile __device__ __shared__ omptarget_nvptx_WorkFn |
| omptarget_nvptx_workFn; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // get private data structures |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); |
| INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); |
| INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(); |
| INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // inlined implementation |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| #include "counter_groupi.h" |
| #include "omptarget-nvptxi.h" |
| #include "supporti.h" |
| |
| #endif |